# Get Images from GitHub

In [None]:
import os, shutil, subprocess, re
import cv2



# 1) Download images (image1.jpg … image100.jpg)
# The input folder is recreated from scratch so files from earlier runs are not counted.
INPUT_DIR = "/content/input_folder"
if os.path.exists(INPUT_DIR):
    shutil.rmtree(INPUT_DIR)
os.makedirs(INPUT_DIR, exist_ok=True)

failed_downloads = []
for i in range(1, 101):
    url = f"https://raw.githubusercontent.com/DeveloperClyde246/Image-Compression-Using-DCT/master/input_folder/image{i}.jpg"
    # wget exits non-zero on failure; record it instead of ignoring it silently.
    if subprocess.run(["wget", "-q", url, "-P", INPUT_DIR]).returncode != 0:
        failed_downloads.append(f"image{i}.jpg")

print(f"Downloaded {len(os.listdir(INPUT_DIR))} images into {INPUT_DIR}")
if failed_downloads:
    print(f"⚠️ {len(failed_downloads)} download(s) failed: {', '.join(failed_downloads)}")

# Prompt for parameters (an empty answer selects the default shown in the prompt).
# block_sizes and retain are read by the later benchmark cells.
smallest   = int(input("Enter smallest square dimension (px), e.g. 8: ") or 8)
step       = int(input("Enter size increment between images (px), e.g. 24: ") or 24)
max_images = int(input("How many images to process [1–100]? ") or 100)
block_sizes = int (input("Block Size set to (e.g. 8)? : ") or 8)
retain = int (input("Retain Coefficients set to (e.g. 4)? :") or 4)

# Reject values that would break the resize step or make the native tiling loops
# (which advance by block_size) never terminate.
if smallest < 1 or step < 0 or block_sizes < 1:
    raise ValueError("smallest and block size must be >= 1 and step must be >= 0")

# Keep max_images inside the prompted range; a negative value would otherwise
# silently drop files through list slicing.
if not 1 <= max_images <= 100:
    clamped = min(max(max_images, 1), 100)
    print(f"⚠️ max_images={max_images} is outside [1, 100]; using {clamped}")
    max_images = clamped

# Prepare output dir (emptied on every run)
OUTPUT_DIR = "/content/images"
if os.path.exists(OUTPUT_DIR):
    shutil.rmtree(OUTPUT_DIR)
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Gather & sort by file‐size (smallest file first)
IMG_EXTS = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.tif', '.gif'}
files = [
    f for f in os.listdir(INPUT_DIR)
    if os.path.splitext(f)[1].lower() in IMG_EXTS
]
files.sort(key=lambda f: os.path.getsize(os.path.join(INPUT_DIR, f)))

# Resize/rename: the idx-th file becomes a square image of side
# smallest + (idx - 1) * step, saved as image<idx>.jpg.
for idx, fname in enumerate(files[:max_images], start=1):
    src = os.path.join(INPUT_DIR, fname)
    img = cv2.imread(src)
    if img is None:
        # cv2.imread returns None for files it cannot decode.
        print(f"⚠️ Skipped unreadable {fname}")
        continue

    new_dim = smallest + (idx - 1) * step
    resized = cv2.resize(img, (new_dim, new_dim), interpolation=cv2.INTER_AREA)

    out_name = f"image{idx}.jpg"
    out_path = os.path.join(OUTPUT_DIR, out_name)
    cv2.imwrite(out_path, resized, [int(cv2.IMWRITE_JPEG_QUALITY), 90])

    size = os.path.getsize(src)
    print(f"{idx:3d}. {fname} ({size} bytes) → {out_name} ({new_dim}x{new_dim})")

print(f"\nAll done! Resized images are in {OUTPUT_DIR}")


# Serial Code

## Serial Algorithm

In [None]:
# Install pybind11 and OpenCV for Python, plus system headers for C++
!pip install pybind11 opencv-python matplotlib
!apt-get update && apt-get install -y libopencv-dev


In [None]:
%%writefile dct_compression.cpp
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include <opencv2/opencv.hpp>
#include <cmath>
#include <cstring>      // for memcpy
#include <stdexcept>    // for runtime_error

using namespace std;
namespace py = pybind11;

const double PI = 3.14159265358979323846;

// Forward 2D DCT-II of a square block.
// The side length is taken from block.rows and elements are read as float,
// so the caller is expected to pass a square CV_32F matrix.
// Returns a new CV_32F matrix of the same size holding the coefficients.
cv::Mat dct2D(const cv::Mat& block) {
    const int n = block.rows;

    // Scale factor per frequency index: sqrt(1/n) for index 0, sqrt(2/n) otherwise.
    auto alpha = [n](int k) -> double {
        return (k == 0) ? sqrt(1.0 / n) : sqrt(2.0 / n);
    };

    cv::Mat coeffs = cv::Mat::zeros(n, n, CV_32F);
    for (int fr = 0; fr < n; ++fr) {
        for (int fc = 0; fc < n; ++fc) {
            // Accumulate in double; the result is narrowed to float on store.
            double acc = 0.0;
            for (int r = 0; r < n; ++r) {
                for (int c = 0; c < n; ++c) {
                    acc += block.at<float>(r, c)
                         * cos(PI * (2 * r + 1) * fr / (2.0 * n))
                         * cos(PI * (2 * c + 1) * fc / (2.0 * n));
                }
            }
            const double scale_r = alpha(fr);
            const double scale_c = alpha(fc);
            coeffs.at<float>(fr, fc) = scale_r * scale_c * acc;
        }
    }
    return coeffs;
}

// Inverse 2D DCT (Type-III) of a square coefficient block.
// The side length is taken from block.rows and elements are read as float,
// so the caller is expected to pass a square CV_32F matrix.
// Returns a new CV_32F matrix of the same size holding the reconstructed samples.
cv::Mat idct2D(const cv::Mat& block) {
    const int n = block.rows;

    // Scale factor per frequency index: sqrt(1/n) for index 0, sqrt(2/n) otherwise.
    auto alpha = [n](int k) -> double {
        return (k == 0) ? sqrt(1.0 / n) : sqrt(2.0 / n);
    };

    cv::Mat samples = cv::Mat::zeros(n, n, CV_32F);
    for (int r = 0; r < n; ++r) {
        for (int c = 0; c < n; ++c) {
            // Accumulate in double; the result is narrowed to float on store.
            double acc = 0.0;
            for (int fr = 0; fr < n; ++fr) {
                for (int fc = 0; fc < n; ++fc) {
                    acc += alpha(fr) * alpha(fc)
                         * block.at<float>(fr, fc)
                         * cos(PI * (2 * r + 1) * fr / (2.0 * n))
                         * cos(PI * (2 * c + 1) * fc / (2.0 * n));
                }
            }
            samples.at<float>(r, c) = acc;
        }
    }
    return samples;
}

// Quantization: keep only the top-left retain×retain coefficients of the block
// and set every other coefficient to zero (in place).
void quantizeBlock(cv::Mat& coeffs, int block_size, int retain) {
    for (int row = 0; row < block_size; ++row) {
        for (int col = 0; col < block_size; ++col) {
            const bool keep = (row < retain) && (col < retain);
            if (!keep) {
                coeffs.at<float>(row, col) = 0.0f;
            }
        }
    }
}

// Apply block-wise DCT compression to a 2D grayscale image.
//
// For every complete block_size×block_size tile: forward DCT, zero all
// coefficients outside the top-left retain×retain corner, inverse DCT.
//
// Parameters:
//   input      - 2D uint8 NumPy array (any memory layout; copied if not C-contiguous)
//   block_size - tile side length in pixels, must be > 0
//   retain     - number of low-frequency rows/columns kept per tile
// Returns: a new uint8 array of the same shape. Pixels that do not belong to a
//   complete tile (right/bottom remainder) are copied through unchanged.
// Throws: runtime_error for a non-2D input, a non-positive block_size, or an
//   input that cannot be converted to uint8.
py::array_t<unsigned char> compress_with_dct(
    py::array_t<unsigned char> input,
    int block_size = 8,
    int retain = 4
) {
    // A non-positive step would make the tiling loops below never terminate.
    if (block_size <= 0)
        throw runtime_error("block_size must be a positive integer");

    // cv::Mat below wraps the raw pointer assuming densely packed rows, so make
    // sure the data really is C-contiguous (sliced/transposed arrays are copied).
    auto contiguous = py::array_t<unsigned char,
                                  py::array::c_style | py::array::forcecast>::ensure(input);
    if (!contiguous)
        throw runtime_error("Input could not be converted to a contiguous uint8 array");

    auto buf = contiguous.request();
    if (buf.ndim != 2)
        throw runtime_error("Input must be a 2D grayscale array");
    int rows = static_cast<int>(buf.shape[0]);
    int cols = static_cast<int>(buf.shape[1]);

    cv::Mat img(rows, cols, CV_8UC1, buf.ptr);

    // Start from the original pixels so the remainder outside complete tiles
    // passes through instead of turning black.
    cv::Mat out_f;
    img.convertTo(out_f, CV_32F);

    for (int i = 0; i + block_size <= rows; i += block_size) {
        for (int j = 0; j + block_size <= cols; j += block_size) {
            const cv::Rect tile(j, i, block_size, block_size);

            // Extract block and convert to float
            cv::Mat f;
            img(tile).convertTo(f, CV_32F);

            // Forward DCT
            cv::Mat d = dct2D(f);

            // Quantize (zero high-frequency coefficients)
            quantizeBlock(d, block_size, retain);

            // Inverse DCT, written back into the matching tile of the output
            cv::Mat id = idct2D(d);
            id.copyTo(out_f(tile));
        }
    }

    // Convert back to 8-bit (convertTo rounds and saturates) and copy to a NumPy array
    cv::Mat out;
    out_f.convertTo(out, CV_8UC1);
    py::array_t<unsigned char> result({rows, cols});
    auto rbuf = result.request();
    memcpy(rbuf.ptr, out.data, static_cast<size_t>(rows) * static_cast<size_t>(cols));
    return result;
}

// Python module definition for the serial implementation.
// Exposes compress_with_dct(input, block_size=8, retain=4) as
// dct_compression.compress_with_dct; the keyword names and defaults below
// mirror the C++ signature.
PYBIND11_MODULE(dct_compression, m) {
    m.def("compress_with_dct", &compress_with_dct,
          "Serial DCT compression",
          py::arg("input"),
          py::arg("block_size") = 8,
          py::arg("retain") = 4);
}


In [None]:
# Build dct_compression.cpp into the shared object dct_compression.so.
# Include paths come from pybind11; OpenCV compile/link flags come from pkg-config.
!g++ -O3 -shared -fPIC \
    $(python3 -m pybind11 --includes) \
    dct_compression.cpp -o dct_compression.so \
    $(pkg-config --cflags --libs opencv4)


## Serial Code Call and Looping

In [None]:

import os, re, time
import cv2
import numpy as np
import matplotlib.pyplot as plt
import dct_compression
from google.colab.patches import cv2_imshow

INPUT_DIR   = '/content/images'
OUTPUT_DIR  = '/content/serial_outputs'
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Select only image<number>.<ext> files and order them by their number.
pattern = re.compile(r'^image(\d+)\.(png|jpe?g|bmp)$', re.IGNORECASE)
files = [(int(m.group(1)), fn)
         for fn in os.listdir(INPUT_DIR)
         if (m:=pattern.match(fn))]
files.sort(key=lambda x: x[0])

# Per-image timing (ms) and pixel count, collected for the final plot.
serial_times, serial_sizes = [], []

for num, fn in files:
    path = os.path.join(INPUT_DIR, fn)
    img_color = cv2.imread(path)
    if img_color is None:
        # cv2.imread returns None for files it cannot decode; skip them
        # rather than letting cvtColor abort the whole batch.
        print(f"⚠️ Skipped unreadable {fn}")
        continue
    gray      = cv2.cvtColor(img_color, cv2.COLOR_BGR2GRAY)
    h, w      = gray.shape
    size_px   = h * w

    # Run the serial DCT; only the native call is timed.
    # block_sizes and retain come from the parameter prompts in the setup cell.
    t0   = time.perf_counter()
    comp = dct_compression.compress_with_dct(gray, block_size=block_sizes, retain=retain)
    t_ms = (time.perf_counter() - t0) * 1000

    # Save original & compressed as both JPEG and PNG so on-disk sizes can be compared.
    orig_j_path = os.path.join(OUTPUT_DIR, f"image{num}_orig.jpg")
    comp_j_path = os.path.join(OUTPUT_DIR, f"image{num}_comp.jpg")
    orig_p_path = os.path.join(OUTPUT_DIR, f"image{num}_orig.png")
    comp_p_path = os.path.join(OUTPUT_DIR, f"image{num}_comp.png")

    cv2.imwrite(orig_j_path, gray)
    cv2.imwrite(comp_j_path, comp)
    cv2.imwrite(orig_p_path, gray)
    cv2.imwrite(comp_p_path, comp)

    # Measure file sizes
    orig_j = os.path.getsize(orig_j_path)
    comp_j = os.path.getsize(comp_j_path)
    orig_p = os.path.getsize(orig_p_path)
    comp_p = os.path.getsize(comp_p_path)

    # In‐memory sizes & raw reduction
    orig_raw     = gray.nbytes
    comp_raw     = comp.nbytes
    raw_reduction = 1 - (comp_raw / orig_raw)

    # Pixel‐diff stats (cast to int first so the uint8 subtraction cannot wrap)
    diff = np.abs(comp.astype(int) - gray.astype(int))
    diff_min, diff_max = diff.min(), diff.max()
    diff_nonzero = np.count_nonzero(diff)

    # Print metrics
    print(f"--- {fn} ({h}×{w}, {size_px} px) ---")
    print(f"Time (ms):               {t_ms:.1f}")
    print(f"JPEG sizes (bytes):      orig={orig_j}, comp={comp_j}, red={100*(1-comp_j/orig_j):.1f}%")
    print(f"PNG sizes (bytes):       orig={orig_p}, comp={comp_p}, red={100*(1-comp_p/orig_p):.1f}%")
    print(f"In‐memory raw (bytes):   orig={orig_raw}, comp={comp_raw}, red={100*raw_reduction:.1f}%")
    print(f"Pixel diff min/max:      {diff_min}/{diff_max}, changed pixels={diff_nonzero}")

    # Display the actual saved JPEGs
    print("Original (JPEG) →")
    orig_j_img = cv2.imread(orig_j_path)
    cv2_imshow(orig_j_img)

    print("Compressed (JPEG) →")
    comp_j_img = cv2.imread(comp_j_path)
    cv2_imshow(comp_j_img)

    # Collect for the final plot
    serial_times.append(t_ms)
    serial_sizes.append(size_px)

# Final plot: time vs image size
plt.figure(figsize=(8,5))
plt.plot(serial_sizes, serial_times, 'o-', label='Serial DCT')
plt.xlabel('Image Size (pixels)')
plt.ylabel('Execution Time (ms)')
plt.title('Serial DCT Compression: Time vs Image Size')
plt.grid(True)
plt.tight_layout()
plt.show()


## Serial Plot Graph

In [None]:
# Re-draw the serial timing curve (execution time against pixel count)
# from the lists filled by the serial benchmark cell.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(serial_sizes, serial_times, 'o-')
ax.set_xlabel('Image size (pixels)')
ax.set_ylabel('Execution time (ms)')
ax.set_title('Serial DCT Compression: Time vs Image Size')
ax.grid(True)
plt.show()

# CUDA

## CUDA Algorithm

In [None]:
# 1. Install dependencies
!pip install pybind11 opencv-python matplotlib
!apt-get update && apt-get install -y libopencv-dev

In [None]:
%%writefile dct_compression_cuda.cu
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include <opencv2/opencv.hpp>
#include <cuda_runtime.h>
#include <cmath>
#include <stdexcept>
#include <vector>

namespace py = pybind11;
#define PI 3.14159265358979323846

// Forward 2D DCT-II of one block_size×block_size tile.
// blockIdx (y, x) selects the tile and threadIdx (y, x) selects the output
// coefficient, so each thread computes exactly one coefficient of its tile.
// `in` and `out` are row-major float images with `cols` elements per row;
// `rows` is accepted for signature symmetry but not read here.
__global__
void dct2D_kernel(const float* __restrict__ in,
                  float* __restrict__ out,
                  int rows, int cols,
                  int block_size)
{
    const int fr = threadIdx.y;
    const int fc = threadIdx.x;
    if (fr >= block_size || fc >= block_size) return;

    // Top-left pixel of the tile handled by this thread block.
    const int row0 = blockIdx.y * block_size;
    const int col0 = blockIdx.x * block_size;

    double acc = 0.0;
    for (int r = 0; r < block_size; ++r) {
        for (int c = 0; c < block_size; ++c) {
            const float pix = in[(row0 + r) * cols + (col0 + c)];
            acc += pix
                 * cos(PI * (2 * r + 1) * fr / (2.0 * block_size))
                 * cos(PI * (2 * c + 1) * fc / (2.0 * block_size));
        }
    }

    // Scale factors: sqrt(1/N) for index 0, sqrt(2/N) otherwise.
    const double scale_r = (fr == 0) ? sqrt(1.0 / block_size) : sqrt(2.0 / block_size);
    const double scale_c = (fc == 0) ? sqrt(1.0 / block_size) : sqrt(2.0 / block_size);
    out[(row0 + fr) * cols + (col0 + fc)] = scale_r * scale_c * acc;
}

// Quantization: within each tile, zero every coefficient whose row or column
// index inside the tile is >= retain. Operates in place on `coeffs`
// (row-major, `cols` elements per row); `rows` is not read here.
__global__
void quantize_kernel(float* coeffs,
                     int rows, int cols,
                     int block_size,
                     int retain)
{
    const int u = threadIdx.y;
    const int v = threadIdx.x;
    if (u >= block_size || v >= block_size) return;

    // Coefficients inside the retained top-left corner are left untouched.
    const bool keep = (u < retain) && (v < retain);
    if (keep) return;

    const int row = blockIdx.y * block_size + u;
    const int col = blockIdx.x * block_size + v;
    coeffs[row * cols + col] = 0.0f;
}

// Inverse 2D DCT (Type-III) of one block_size×block_size tile.
// blockIdx (y, x) selects the tile and threadIdx (y, x) selects the output
// pixel, so each thread reconstructs exactly one pixel of its tile.
// `in` holds coefficients and `out` receives pixels; both are row-major with
// `cols` elements per row. `rows` is not read here.
__global__
void idct2D_kernel(const float* __restrict__ in,
                   float* __restrict__ out,
                   int rows, int cols,
                   int block_size)
{
    const int r = threadIdx.y;
    const int c = threadIdx.x;
    if (r >= block_size || c >= block_size) return;

    // Top-left element of the tile handled by this thread block.
    const int row0 = blockIdx.y * block_size;
    const int col0 = blockIdx.x * block_size;

    double acc = 0.0;
    for (int fr = 0; fr < block_size; ++fr) {
        for (int fc = 0; fc < block_size; ++fc) {
            // Scale factors: sqrt(1/N) for index 0, sqrt(2/N) otherwise.
            const double scale_r = (fr == 0) ? sqrt(1.0 / block_size) : sqrt(2.0 / block_size);
            const double scale_c = (fc == 0) ? sqrt(1.0 / block_size) : sqrt(2.0 / block_size);
            const float coeff = in[(row0 + fr) * cols + (col0 + fc)];
            acc += scale_r * scale_c * coeff
                 * cos(PI * (2 * r + 1) * fr / (2.0 * block_size))
                 * cos(PI * (2 * c + 1) * fc / (2.0 * block_size));
        }
    }
    out[(row0 + r) * cols + (col0 + c)] = acc;
}

// Host wrapper
void compressWithDCT_CUDA(const unsigned char* h_in_uc,
                          unsigned char*       h_out_uc,
                          int rows, int cols,
                          int block_size,
                          int retain)
{
    int N = rows * cols;
    size_t sz_f  = N * sizeof(float);

    // Convert input to float
    std::vector<float> h_in_f(N), h_buf_f(N);
    for(int i=0; i<N; i++) h_in_f[i] = (float)h_in_uc[i];

    // Allocate device buffers
    float *d_in, *d_buf;
    cudaMalloc(&d_in, sz_f);
    cudaMalloc(&d_buf, sz_f);
    cudaMemcpy(d_in, h_in_f.data(), sz_f, cudaMemcpyHostToDevice);

    dim3 grid(cols/block_size, rows/block_size);
    dim3 block(block_size, block_size);

    // 1) Forward DCT
    dct2D_kernel<<<grid,block>>>(d_in, d_buf, rows, cols, block_size);
    cudaDeviceSynchronize();

    // 2) Quantize
    quantize_kernel<<<grid,block>>>(d_buf, rows, cols, block_size, retain);
    cudaDeviceSynchronize();

    // 3) Inverse DCT
    idct2D_kernel<<<grid,block>>>(d_buf, d_in, rows, cols, block_size);
    cudaDeviceSynchronize();

    // Copy back to CPU & clamp to [0,255]
    cudaMemcpy(h_buf_f.data(), d_in, sz_f, cudaMemcpyDeviceToHost);
    for(int i=0; i<N; i++){
        float v = h_buf_f[i];
        if (v < 0)   v = 0;
        if (v > 255) v = 255;
        h_out_uc[i] = (unsigned char)v;
    }

    // Cleanup
    cudaFree(d_in);
    cudaFree(d_buf);
}

// Python-facing entry point for the CUDA implementation.
//
// Parameters:
//   input      - 2D uint8 NumPy array (any memory layout; copied if not C-contiguous)
//   block_size - tile side length passed to the GPU pipeline
//   retain     - number of low-frequency rows/columns kept per tile
// Returns: a new uint8 array with the same shape as the input.
// Throws: std::runtime_error for a non-2D input or one that cannot be
//   converted to uint8; errors from compressWithDCT_CUDA propagate.
py::array_t<unsigned char>
compress_with_dct_cuda(py::array_t<unsigned char> input,
                       int block_size = 8,
                       int retain     = 4)
{
    // The host wrapper indexes the data as one densely packed row-major buffer,
    // so sliced or transposed arrays must be copied into C-contiguous form first.
    auto contiguous = py::array_t<unsigned char,
                                  py::array::c_style | py::array::forcecast>::ensure(input);
    if (!contiguous)
        throw std::runtime_error("Input could not be converted to a contiguous uint8 array");

    auto buf = contiguous.request();
    if (buf.ndim != 2)
        throw std::runtime_error("Input must be 2D (grayscale)");
    int rows = static_cast<int>(buf.shape[0]);
    int cols = static_cast<int>(buf.shape[1]);
    unsigned char* h_in  = (unsigned char*) buf.ptr;

    py::array_t<unsigned char> result({rows, cols});
    auto rbuf = result.request();
    unsigned char* h_out = (unsigned char*) rbuf.ptr;

    compressWithDCT_CUDA(h_in, h_out, rows, cols, block_size, retain);
    return result;
}

// Python module definition for the CUDA implementation.
// The C++ function compress_with_dct_cuda is exported under the Python name
// compress_with_dct, with the same keyword names and defaults as its signature.
PYBIND11_MODULE(dct_compression_cuda, m) {
    m.def("compress_with_dct", &compress_with_dct_cuda,
          "CUDA-accelerated DCT compression",
          py::arg("input"),
          py::arg("block_size") = 8,
          py::arg("retain")     = 4);
}

In [None]:
%%bash
# Build dct_compression_cuda.cu into the shared object dct_compression_cuda.so.
# -w / -Xcompiler -w suppress warnings from nvcc and from the host compiler.
# NOTE(review): -gencode targets compute capability 7.5 only; presumably chosen
# for the GPU this notebook ran on — confirm or adjust for other hardware.
nvcc -w -Xcompiler -w -O3 --compiler-options "-fPIC" \
    -gencode arch=compute_75,code=sm_75 \
    $(python3 -m pybind11 --includes) \
    dct_compression_cuda.cu \
    -o dct_compression_cuda.so \
    -shared \
    $(pkg-config --cflags --libs opencv4)

## CUDA Looping

In [None]:
# Colab Cell: Batch‐process image1→image100 with CUDA + full metrics + final plot
import os, re, time
import cv2
import numpy as np
import matplotlib.pyplot as plt
import dct_compression_cuda as dct_cuda
from google.colab.patches import cv2_imshow

# adjust if needed
INPUT_DIR   = '/content/images'
OUTPUT_DIR  = '/content/cuda_outputs'
os.makedirs(OUTPUT_DIR, exist_ok=True)


# Pick only image<number>.<ext> files and order them by their number.
pattern = re.compile(r'^image(\d+)\.(png|jpe?g|bmp)$', re.IGNORECASE)
files = sorted(
    [(int(m.group(1)), fn) for fn in os.listdir(INPUT_DIR)
                    if (m := pattern.match(fn))],
    key=lambda x: x[0]
)

# Per-image timing (ms) and pixel count, collected for the final plot.
cuda_times, cuda_sizes = [], []

# Warm-up once so one-time CUDA start-up cost is not charged to the first
# timed image. Skipped when there is nothing to process.
if files:
    first_gray = cv2.imread(os.path.join(INPUT_DIR, files[0][1]), cv2.IMREAD_GRAYSCALE)
    if first_gray is not None:
        _ = dct_cuda.compress_with_dct(first_gray, block_size=block_sizes, retain=retain)

for num, fn in files:
    path = os.path.join(INPUT_DIR, fn)
    gray = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    if gray is None:
        # cv2.imread returns None for files it cannot decode.
        print(f"⚠️ Skipped unreadable {fn}")
        continue
    h, w = gray.shape
    size_px = h * w

    # Time CUDA DCT with the user-selected parameters (the same ones used by the
    # warm-up and by the serial run), so the implementations are comparable.
    t0   = time.perf_counter()
    comp = dct_cuda.compress_with_dct(gray, block_size=block_sizes, retain=retain)
    t_ms = (time.perf_counter() - t0) * 1000

    # Save originals & compressed
    orig_j = os.path.join(OUTPUT_DIR, f"image{num}_orig.jpg")
    comp_j = os.path.join(OUTPUT_DIR, f"image{num}_comp.jpg")
    orig_p = os.path.join(OUTPUT_DIR, f"image{num}_orig.png")
    comp_p = os.path.join(OUTPUT_DIR, f"image{num}_comp.png")

    cv2.imwrite(orig_j, gray)
    cv2.imwrite(comp_j, comp)
    cv2.imwrite(orig_p, gray)
    cv2.imwrite(comp_p, comp)

    # Measure on‐disk sizes
    size_orig_j = os.path.getsize(orig_j)
    size_comp_j = os.path.getsize(comp_j)
    size_orig_p = os.path.getsize(orig_p)
    size_comp_p = os.path.getsize(comp_p)

    # In-memory raw sizes
    raw_orig = gray.nbytes
    raw_comp = comp.nbytes
    raw_red  = 1 - (raw_comp / raw_orig)

    # Pixel diffs (cast to int first so the uint8 subtraction cannot wrap)
    diff       = np.abs(comp.astype(int) - gray.astype(int))
    diff_min   = diff.min()
    diff_max   = diff.max()
    diff_non0  = np.count_nonzero(diff)

    # Print metrics
    print(f"--- {fn} ({h}×{w}, {size_px} px) ---")
    print(f"Time (ms):               {t_ms:.1f}")
    print(f"JPEG bytes: orig={size_orig_j}, comp={size_comp_j}, red={100*(1-size_comp_j/size_orig_j):.1f}%")
    print(f"PNG bytes:  orig={size_orig_p}, comp={size_comp_p}, red={100*(1-size_comp_p/size_orig_p):.1f}%")
    print(f"Raw bytes: orig={raw_orig}, comp={raw_comp}, red={100*raw_red:.1f}%")
    print(f"Pixel diff: min={diff_min}, max={diff_max}, changed={diff_non0}")

    # Display the JPEGs saved by THIS cell (orig_j / comp_j are the paths here).
    print("Original (JPEG) →")
    orig_j_img = cv2.imread(orig_j)
    cv2_imshow(orig_j_img)

    print("Compressed (JPEG) →")
    comp_j_img = cv2.imread(comp_j)
    cv2_imshow(comp_j_img)

    cuda_times.append(t_ms)
    cuda_sizes.append(size_px)

# Final plot
plt.figure(figsize=(8,5))
plt.plot(cuda_sizes, cuda_times, 's-', label='CUDA DCT')
plt.xlabel('Image Size (pixels)')
plt.ylabel('Execution Time (ms)')
plt.title('CUDA DCT Compression: Time vs Image Size')
plt.grid(True)
plt.tight_layout()
plt.show()


## CUDA Plot Graph

In [None]:
import matplotlib.pyplot as plt
# Re-draw the CUDA timing curve (elapsed time against pixel count)
# from the lists filled by the CUDA benchmark cell.
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(cuda_sizes, cuda_times, marker='o')
ax.set_xlabel('Image Size (pixels)')
ax.set_ylabel('Elapsed Time (ms)')
ax.set_title('Image Size vs CUDA DCT Processing Time')
ax.grid(True)
fig.tight_layout()
plt.show()

# OpenMP

In [None]:
# Install pybind11 and OpenCV for Python, plus system headers for C++
!pip install pybind11 opencv-python matplotlib
!apt-get update && apt-get install -y libopencv-dev

In [None]:
%%writefile openmp_dct.cpp
// OpenMP
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include <opencv2/opencv.hpp>
#include <cmath>
#include <cstring>    // memcpy
#include <stdexcept>
#include <omp.h>
#include <chrono>

using namespace std;
namespace py = pybind11;

const double PI = 3.14159265358979323846;

// 1) Forward 2D DCT-II of a square block, with a SIMD hint on the summation.
// The side length is taken from block.rows and elements are read as float,
// so the caller is expected to pass a square CV_32F matrix.
// Returns a new CV_32F matrix of the same size holding the coefficients.
cv::Mat dct2D(const cv::Mat& block) {
    const int n = block.rows;
    cv::Mat coeffs = cv::Mat::zeros(n, n, CV_32F);

    for (int fr = 0; fr < n; ++fr) {
        for (int fc = 0; fc < n; ++fc) {
            double acc = 0.0;
            // Ask the compiler to vectorize the accumulation over the block.
            #pragma omp simd reduction(+:acc)
            for (int r = 0; r < n; ++r) {
                for (int c = 0; c < n; ++c) {
                    acc += block.at<float>(r, c)
                         * cos(PI * (2*r + 1) * fr / (2.0 * n))
                         * cos(PI * (2*c + 1) * fc / (2.0 * n));
                }
            }
            // Scale factors: sqrt(1/n) for index 0, sqrt(2/n) otherwise.
            const double scale_r = (fr == 0) ? sqrt(1.0/n) : sqrt(2.0/n);
            const double scale_c = (fc == 0) ? sqrt(1.0/n) : sqrt(2.0/n);
            coeffs.at<float>(fr, fc) = scale_r * scale_c * acc;
        }
    }
    return coeffs;
}

// 2) Inverse 2D DCT (Type-III) of a square block, with a SIMD hint on the summation.
// The side length is taken from block.rows and elements are read as float,
// so the caller is expected to pass a square CV_32F matrix.
// Returns a new CV_32F matrix of the same size holding the reconstructed samples.
cv::Mat idct2D(const cv::Mat& block) {
    const int n = block.rows;
    cv::Mat samples = cv::Mat::zeros(n, n, CV_32F);

    for (int r = 0; r < n; ++r) {
        for (int c = 0; c < n; ++c) {
            double acc = 0.0;
            // Ask the compiler to vectorize the accumulation over the coefficients.
            #pragma omp simd reduction(+:acc)
            for (int fr = 0; fr < n; ++fr) {
                for (int fc = 0; fc < n; ++fc) {
                    // Scale factors: sqrt(1/n) for index 0, sqrt(2/n) otherwise.
                    const double scale_r = (fr == 0) ? sqrt(1.0/n) : sqrt(2.0/n);
                    const double scale_c = (fc == 0) ? sqrt(1.0/n) : sqrt(2.0/n);
                    acc += scale_r * scale_c
                         * block.at<float>(fr, fc)
                         * cos(PI * (2*r + 1) * fr / (2.0 * n))
                         * cos(PI * (2*c + 1) * fc / (2.0 * n));
                }
            }
            samples.at<float>(r, c) = acc;
        }
    }
    return samples;
}

// 3) Quantization: keep only the top-left retain×retain coefficients of the
// B×B block and set every other coefficient to zero (in place).
void quantizeBlock(cv::Mat& coeffs, int B, int retain) {
    for (int row = 0; row < B; ++row) {
        for (int col = 0; col < B; ++col) {
            const bool keep = (row < retain) && (col < retain);
            if (!keep) {
                coeffs.at<float>(row, col) = 0.0f;
            }
        }
    }
}

// 4) Process one block_size×block_size tile whose top-left pixel is (row i, column j):
// forward DCT, quantization, inverse DCT. The reconstructed tile is written
// directly into the same position of out_f; no other part of out_f is touched.
inline void processDctBlock(const cv::Mat& img,
                            cv::Mat& out_f,
                            int i, int j,
                            int block_size,
                            int retain) {
    const cv::Rect tile(j, i, block_size, block_size);

    // Float copy of the source tile.
    cv::Mat tile_f;
    img(tile).convertTo(tile_f, CV_32F);

    cv::Mat coeffs = dct2D(tile_f);
    quantizeBlock(coeffs, block_size, retain);
    cv::Mat restored = idct2D(coeffs);

    restored.copyTo(out_f(tile));
}

// 5) Parallel entry-point: block-wise DCT compression with OpenMP.
//
// Parameters:
//   input         - 2D uint8 NumPy array (any memory layout; copied if not C-contiguous)
//   block_size    - tile side length in pixels, must be > 0
//   retain        - number of low-frequency rows/columns kept per tile
//   num_threads   - OpenMP thread count, must be > 0
//   schedule_type - 0=static, 1=dynamic, 2=guided
//   chunk_size    - chunk size handed to omp_set_schedule
// Returns: tuple (image, time_seconds, threads_used). The time covers the whole
//   call from input preparation to the final copy into the result array.
//   Pixels that do not belong to a complete tile are copied through unchanged.
// Throws: runtime_error for invalid arguments or a non-2D / unconvertible input.
py::tuple compress_parallel(py::array_t<unsigned char> input,
                            int block_size   = 8,
                            int retain       = 4,
                            int num_threads  = 4,
                            int schedule_type= 1,  // 0=static,1=dynamic,2=guided
                            int chunk_size   = 16) {
    // start timing
    auto t0 = chrono::high_resolution_clock::now();

    // validate arguments before they are used as a loop step / array index
    if (block_size <= 0)
        throw runtime_error("block_size must be a positive integer");
    if (num_threads <= 0)
        throw runtime_error("num_threads must be a positive integer");
    if (schedule_type < 0 || schedule_type > 2)
        throw runtime_error("schedule_type must be 0 (static), 1 (dynamic) or 2 (guided)");

    // prepare input: cv::Mat below wraps the raw pointer assuming densely packed
    // rows, so sliced/transposed arrays are copied into C-contiguous form first
    auto contiguous = py::array_t<unsigned char,
                                  py::array::c_style | py::array::forcecast>::ensure(input);
    if (!contiguous)
        throw runtime_error("Input could not be converted to a contiguous uint8 array");
    auto buf = contiguous.request();
    if (buf.ndim != 2) throw runtime_error("Input must be 2D grayscale");
    int rows = static_cast<int>(buf.shape[0]);
    int cols = static_cast<int>(buf.shape[1]);
    cv::Mat img(rows, cols, CV_8UC1, buf.ptr);

    // Start from the original pixels so the remainder outside complete tiles
    // passes through instead of turning black.
    cv::Mat out_f;
    img.convertTo(out_f, CV_32F);

    // set up OpenMP
    omp_set_num_threads(num_threads);
    static omp_sched_t scheds[3] = {
        omp_sched_static,
        omp_sched_dynamic,
        omp_sched_guided
    };
    omp_set_schedule(scheds[schedule_type], chunk_size);
    int actual_threads = omp_get_max_threads();

    // parallel block‐wise DCT: every iteration writes only its own tile of out_f
    #pragma omp parallel for collapse(2) schedule(runtime)
    for (int i = 0; i <= rows - block_size; i += block_size) {
        for (int j = 0; j <= cols - block_size; j += block_size) {
            processDctBlock(img, out_f, i, j, block_size, retain);
        }
    }

    // finalize output (convertTo rounds and saturates to 8-bit)
    cv::Mat out;
    out_f.convertTo(out, CV_8UC1);
    auto result = py::array_t<unsigned char>({rows, cols});
    auto rbuf   = result.request();
    memcpy(rbuf.ptr, out.data, static_cast<size_t>(rows) * static_cast<size_t>(cols));

    // end timing
    auto t1 = chrono::high_resolution_clock::now();
    double elapsed = chrono::duration<double>(t1 - t0).count();

    // return (image, time_seconds, threads_used)
    return py::make_tuple(result, elapsed, actual_threads);
}

// Python module definition for the OpenMP implementation.
// Exposes compress_parallel with the same keyword names and defaults as the
// C++ signature. The module is named openmp_dct_optimized, which is also the
// name the build cell gives the shared object, although this source file is
// written as openmp_dct.cpp.
PYBIND11_MODULE(openmp_dct_optimized, m) {
    m.def("compress_parallel", &compress_parallel,
          "Optimized OpenMP DCT compression",
          py::arg("input"),
          py::arg("block_size")    = 8,
          py::arg("retain")        = 4,
          py::arg("num_threads")   = 4,
          py::arg("schedule_type") = 1,
          py::arg("chunk_size")    = 16);
}

In [None]:
%%bash
# Build openmp_dct.cpp into openmp_dct_optimized.so (the output name matches the
# module name declared in PYBIND11_MODULE). -fopenmp enables the OpenMP pragmas.
g++ -O3 -std=c++11 -fopenmp -shared -fPIC \
  $(python3 -m pybind11 --includes) \
  openmp_dct.cpp -o openmp_dct_optimized.so \
  $(pkg-config --cflags --libs opencv4)

In [None]:
import os, re, time
import cv2
import numpy as np
import matplotlib.pyplot as plt
import openmp_dct_optimized as dct_omp
from google.colab.patches import cv2_imshow

# adjust if needed
INPUT_DIR   = '/content/images'
OUTPUT_DIR  = '/content/omp_outputs'
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Pick only image<number>.<ext> files and order them by their number.
pattern = re.compile(r'^image(\d+)\.(png|jpe?g|bmp)$', re.IGNORECASE)
files = sorted(
    [(int(m.group(1)), fn) for fn in os.listdir(INPUT_DIR) if (m:=pattern.match(fn))],
    key=lambda x: x[0]
)

# Parameters. block_sizes and retain come from the prompts in the setup cell,
# so that cell must have been run first.
block_size   = block_sizes
num_threads  = 4
schedule_type= 1   # dynamic
chunk_size   = 16

# Per-image timing (ms) and pixel count, collected for the final plot.
mp_times      = []
mp_dimensions = []

def get_kb(path):
    """Return the size of the file at *path* in kilobytes (bytes / 1024)."""
    return os.path.getsize(path) / 1024

for num, fn in files:
    src_path = os.path.join(INPUT_DIR, fn)
    orig = cv2.imread(src_path)
    if orig is None:
        # cv2.imread returns None for files it cannot decode; skip them
        # rather than letting cvtColor abort the whole batch.
        print(f"⚠️ Skipped unreadable {fn}")
        continue
    gray = cv2.cvtColor(orig, cv2.COLOR_BGR2GRAY)
    h, w = gray.shape
    dims = h * w
    orig_size_bytes = os.path.getsize(src_path)

    # Run OpenMP DCT. `elapsed` is the time reported by the extension itself;
    # the wall-clock figure is taken immediately after the call so it does not
    # include the image saving and statistics below.
    t0 = time.perf_counter()
    comp_img, elapsed, used_threads = dct_omp.compress_parallel(
        gray, block_size, retain, num_threads, schedule_type, chunk_size
    )
    wall_ms = (time.perf_counter() - t0) * 1000
    t_ms = elapsed * 1000

    # to NumPy uint8
    comp_np = np.array(comp_img, dtype=np.uint8)

    # paths for saved images
    orig_jpeg_path = os.path.join(OUTPUT_DIR, f"image{num}_orig.jpg")
    comp_jpeg_path = os.path.join(OUTPUT_DIR, f"image{num}_omp.jpg")
    orig_png_path  = os.path.join(OUTPUT_DIR, f"image{num}_orig.png")
    comp_png_path  = os.path.join(OUTPUT_DIR, f"image{num}_omp.png")

    # save originals & compressed
    cv2.imwrite(orig_jpeg_path, gray, [cv2.IMWRITE_JPEG_QUALITY, 75])
    cv2.imwrite(orig_png_path,  gray)
    cv2.imwrite(comp_jpeg_path, comp_np, [cv2.IMWRITE_JPEG_QUALITY, 75])
    cv2.imwrite(comp_png_path,  comp_np)

    # measure sizes & reductions
    comp_size_j = get_kb(comp_jpeg_path)
    orig_size_j = get_kb(orig_jpeg_path)
    comp_size_p = get_kb(comp_png_path)
    orig_size_p = get_kb(orig_png_path)
    jpeg_red = 100*(1 - comp_size_j/orig_size_j) if orig_size_j>0 else 0
    png_red  = 100*(1 - comp_size_p/orig_size_p) if orig_size_p>0 else 0
    raw_red  = 100*(1 - comp_np.nbytes/gray.nbytes)

    # pixel diffs (cast to int first so the uint8 subtraction cannot wrap)
    diff = np.abs(comp_np.astype(int) - gray.astype(int))
    dmin, dmax = diff.min(), diff.max()
    dmean = diff.mean()

    # print metrics
    print(f"\n--- {fn} ({h}×{w}, {dims} px) ---")
    print(f"Threads used:            {used_threads}")
    print(f"Elapsed (reported):      {t_ms:.1f} ms")
    print(f"Elapsed (wall-clock):    {wall_ms:.1f} ms")
    print(f"JPEG sizes (KB):         orig={orig_size_j:.1f}, comp={comp_size_j:.1f}, red={jpeg_red:.1f}%")
    print(f"PNG sizes (KB):          orig={orig_size_p:.1f}, comp={comp_size_p:.1f}, red={png_red:.1f}%")
    print(f"Raw-memory reduction:    {raw_red:.1f}%")
    print(f"Pixel diff min/mean/max: {dmin}/{dmean:.1f}/{dmax}")

    # display on-disk JPEGs
    print("Original (JPEG):")
    cv2_imshow(cv2.imread(orig_jpeg_path))
    print("Compressed (JPEG):")
    cv2_imshow(cv2.imread(comp_jpeg_path))

    mp_times.append(t_ms)
    mp_dimensions.append(dims)

# final plot
plt.figure(figsize=(8,5))
plt.plot(mp_dimensions, mp_times, 'o-', label='OpenMP DCT')
plt.xlabel('Image Size (pixels)')
plt.ylabel('Execution Time (ms)')
plt.title('OpenMP DCT Compression: Time vs Image Size')
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
# Optional: re-draw the OpenMP timing curve (time against pixel count)
# from the lists filled by the OpenMP benchmark cell.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(mp_dimensions, mp_times, 'o-')
ax.set_xlabel("Image Size (pixels)")
ax.set_ylabel("Time (ms)")
ax.set_title("OpenMP DCT Compression Time per Image")
ax.grid(True)
plt.show()

# MPI

In [None]:
# Install pybind11 and OpenCV for Python, plus system headers for C++
!pip install pybind11 opencv-python matplotlib
!apt-get update && apt-get install -y libopencv-dev

In [None]:
# Install MPI and mpi4py
!apt-get update
!apt-get install -y mpich
!pip install mpi4py

In [None]:
!pip install numpy scipy pillow

In [None]:
# Update package lists and install dependencies
!apt-get update
!apt-get install -y mpich libopencv-dev python3-opencv

# Install pybind11
!pip install pybind11

In [None]:
%%writefile dct_compression_mpi.cpp
// MPI
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include <cmath>
#include <cstring>      // for memcpy
#include <stdexcept>    // for runtime_error
#include <mpi.h>
#include <vector>

namespace py = pybind11;
using namespace std;

const double PI = 3.14159265358979323846;

// Forward 2D DCT-II on a flat, row-major N×N float array.
// `input` and `output` each hold N*N elements; element (r, c) is at index r*N + c.
void dct2D(const float* input, float* output, int N) {
    const double invN = 1.0 / N;
    // Scale factors: sqrt(1/N) for frequency index 0, sqrt(2/N) otherwise.
    const double dc_scale = sqrt(invN);
    const double ac_scale = sqrt(2.0 * invN);

    for (int fr = 0; fr < N; ++fr) {
        for (int fc = 0; fc < N; ++fc) {
            // Accumulate in double; the result is narrowed to float on store.
            double acc = 0.0;
            for (int r = 0; r < N; ++r) {
                for (int c = 0; c < N; ++c) {
                    acc += input[r * N + c] *
                        cos(PI * (2 * r + 1) * fr * 0.5 * invN) *
                        cos(PI * (2 * c + 1) * fc * 0.5 * invN);
                }
            }
            const double scale_r = (fr == 0) ? dc_scale : ac_scale;
            const double scale_c = (fc == 0) ? dc_scale : ac_scale;
            output[fr * N + fc] = scale_r * scale_c * acc;
        }
    }
}

// Inverse 2D DCT on a flat, row-major N×N float array.
// `input` holds N*N coefficients and `output` receives N*N samples;
// element (r, c) is at index r*N + c.
void idct2D(const float* input, float* output, int N) {
    const double invN = 1.0 / N;
    // Scale factors: sqrt(1/N) for frequency index 0, sqrt(2/N) otherwise.
    const double dc_scale = sqrt(invN);
    const double ac_scale = sqrt(2.0 * invN);

    for (int r = 0; r < N; ++r) {
        for (int c = 0; c < N; ++c) {
            // Accumulate in double; the result is narrowed to float on store.
            double acc = 0.0;
            for (int fr = 0; fr < N; ++fr) {
                for (int fc = 0; fc < N; ++fc) {
                    const double scale_r = (fr == 0) ? dc_scale : ac_scale;
                    const double scale_c = (fc == 0) ? dc_scale : ac_scale;
                    acc += scale_r * scale_c *
                        input[fr * N + fc] *
                        cos(PI * (2 * r + 1) * fr * 0.5 * invN) *
                        cos(PI * (2 * c + 1) * fc * 0.5 * invN);
                }
            }
            output[r * N + c] = acc;
        }
    }
}

// Quantization: keep only the top-left retain×retain coefficients of the flat,
// row-major N×N block and set every other coefficient to zero (in place).
void quantizeBlock(float* block, int N, int retain) {
    for (int row = 0; row < N; ++row) {
        float* row_ptr = block + row * N;
        for (int col = 0; col < N; ++col) {
            const bool keep = (row < retain) && (col < retain);
            if (!keep) {
                row_ptr[col] = 0.0f;
            }
        }
    }
}

// MPI-parallel block-DCT compression of a 2D grayscale image.
//
// The image is cut into block_size x block_size tiles. Rank 0 packs the
// tiles into one flat float buffer, MPI_Scatterv hands each rank of
// MPI_COMM_WORLD a contiguous run of tiles, every rank applies
// dct2D -> quantizeBlock -> idct2D to its tiles, and MPI_Gatherv collects the
// reconstructed tiles on rank 0.
//
//   input      - 2D uint8 array; row/column strides are honoured
//   block_size - tile edge length in pixels, must be > 0
//   retain     - number of low-frequency rows/columns kept in each tile
//
// Returns a rows x cols uint8 array. On rank 0 it holds the reconstruction
// (rounded and clamped to 0..255); pixels that do not belong to a complete
// tile are 0. On every other rank the array is all zeros.
//
// Throws std::runtime_error for a non-2D input, a non-positive block_size,
// or when MPI cannot be started.
//
// NOTE(review): this function never calls MPI_Finalize; confirm whether the
// hosting process is expected to do so.
py::array_t<unsigned char> compress_with_dct(
    py::array_t<unsigned char> input,
    int block_size = 8,
    int retain = 4
) {
    int rank = 0, size = 1;
    MPI_Comm comm = MPI_COMM_WORLD;

    // Ask the runtime for its state rather than tracking a private flag:
    // MPI may already have been started elsewhere in this process, and
    // initialising it a second time is an error.
    int mpi_finalized = 0;
    MPI_Finalized(&mpi_finalized);
    if (mpi_finalized) {
        throw std::runtime_error("MPI has already been finalized");
    }
    int mpi_started = 0;
    MPI_Initialized(&mpi_started);
    if (!mpi_started) {
        int argc = 0;
        char **argv = nullptr;
        int provided = 0;
        int err = MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
        if (err != MPI_SUCCESS) {
            throw std::runtime_error("Failed to initialize MPI");
        }
    }

    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);

    // Input check (block_size is used as a divisor below, so reject <= 0)
    auto buf = input.request();
    if (buf.ndim != 2)
        throw std::runtime_error("Input must be a 2D grayscale array");
    if (block_size <= 0)
        throw std::runtime_error("block_size must be positive");
    int rows = static_cast<int>(buf.shape[0]);
    int cols = static_cast<int>(buf.shape[1]);
    // Strides are in bytes; elements are one byte wide, so they can be used
    // directly as element offsets.
    const auto row_stride = buf.strides[0];
    const auto col_stride = buf.strides[1];

    // Only complete tiles are processed.
    const int proc_rows = rows - (rows % block_size);
    const int proc_cols = cols - (cols % block_size);

    const int blocks_x = proc_cols / block_size;
    const int blocks_y = proc_rows / block_size;
    const int total_blocks = blocks_x * blocks_y;
    const int block_elems = block_size * block_size;

    // Split the tiles as evenly as possible; the first `remainder` ranks get
    // one extra tile. counts/displs are expressed in floats, and each
    // displacement is the number of floats owned by all lower ranks.
    const int blocks_per_proc = total_blocks / size;
    const int remainder = total_blocks % size;
    std::vector<int> counts(size), displs(size);
    int blocks_before = 0;
    for (int i = 0; i < size; ++i) {
        const int n_blocks = blocks_per_proc + ((i < remainder) ? 1 : 0);
        counts[i] = n_blocks * block_elems;
        displs[i] = blocks_before * block_elems;
        blocks_before += n_blocks;
    }
    const int local_blocks = blocks_per_proc + ((rank < remainder) ? 1 : 0);
    const int local_elems = local_blocks * block_elems;

    // Rank 0 packs the image tile by tile (row-major inside each tile).
    std::vector<float> all_blocks;
    if (rank == 0) {
        all_blocks.resize(static_cast<std::size_t>(total_blocks) * block_elems);
        const unsigned char* img_ptr = static_cast<const unsigned char*>(buf.ptr);
        for (int by = 0; by < blocks_y; ++by) {
            for (int bx = 0; bx < blocks_x; ++bx) {
                for (int i = 0; i < block_size; ++i) {
                    for (int j = 0; j < block_size; ++j) {
                        const auto src = (by * block_size + i) * row_stride
                                       + (bx * block_size + j) * col_stride;
                        const int blk_idx = (by * blocks_x + bx) * block_elems + i * block_size + j;
                        all_blocks[blk_idx] = static_cast<float>(img_ptr[src]);
                    }
                }
            }
        }
    }

    std::vector<float> local_block_data(local_elems);
    MPI_Scatterv(all_blocks.data(), counts.data(), displs.data(), MPI_FLOAT,
                 local_block_data.data(), local_elems, MPI_FLOAT,
                 0, comm);

    std::vector<float> local_result_data(local_elems);

    // === Parallel DCT compression ===
    // One scratch buffer is reused for the coefficients of every tile.
    std::vector<float> dct_tmp(block_elems);
    for (int b = 0; b < local_blocks; ++b) {
        float* block = &local_block_data[b * block_elems];
        float* tile_out = &local_result_data[b * block_elems];
        dct2D(block, dct_tmp.data(), block_size);
        quantizeBlock(dct_tmp.data(), block_size, retain);
        idct2D(dct_tmp.data(), tile_out, block_size);
    }

    std::vector<float> all_result_blocks;
    if (rank == 0) {
        all_result_blocks.resize(static_cast<std::size_t>(total_blocks) * block_elems);
    }
    MPI_Gatherv(local_result_data.data(), local_elems, MPI_FLOAT,
                all_result_blocks.data(), counts.data(), displs.data(), MPI_FLOAT,
                0, comm);

    // Start from an all-zero image so that pixels outside complete tiles
    // (and the whole array on non-root ranks) have a defined value.
    py::array_t<unsigned char> result({rows, cols});
    unsigned char* result_ptr = static_cast<unsigned char*>(result.request().ptr);
    const std::size_t result_bytes =
        static_cast<std::size_t>(rows) * static_cast<std::size_t>(cols);
    if (result_bytes > 0) {
        std::memset(result_ptr, 0, result_bytes);
    }

    if (rank == 0) {
        for (int by = 0; by < blocks_y; ++by) {
            for (int bx = 0; bx < blocks_x; ++bx) {
                for (int i = 0; i < block_size; ++i) {
                    for (int j = 0; j < block_size; ++j) {
                        const int img_idx = (by * block_size + i) * cols + (bx * block_size + j);
                        const int blk_idx = (by * blocks_x + bx) * block_elems + i * block_size + j;
                        float val = all_result_blocks[blk_idx];
                        // clamp to the uint8 range, then round to nearest
                        // (a plain cast would truncate 127.99 down to 127)
                        if (val < 0.0f) val = 0.0f;
                        else if (val > 255.0f) val = 255.0f;
                        result_ptr[img_idx] = static_cast<unsigned char>(val + 0.5f);
                    }
                }
            }
        }
    }

    return result;
}

// Python module definition: importing `dct_compression_mpi` exposes a single
// function, compress_with_dct(input, block_size=8, retain=4).
PYBIND11_MODULE(dct_compression_mpi, m) {
    m.def("compress_with_dct", &compress_with_dct,
          "MPI-parallel DCT compression (no OpenCV)",
          // Keyword names and default values as seen from Python.
          py::arg("input"),
          py::arg("block_size") = 8,
          py::arg("retain") = 4);
}


In [None]:
# Build dct_compression_mpi.cpp into an importable shared object using the
# MPI C++ compiler wrapper and the pybind11 include paths.
# NOTE(review): the OpenCV pkg-config flags look unnecessary here (the module
# describes itself as "no OpenCV") — confirm before removing them.
!mpicxx -shared -o dct_compression_mpi.so dct_compression_mpi.cpp \
    $(pkg-config --cflags --libs opencv4) \
    $(python3 -m pybind11 --includes) \
    -fPIC -O3

In [None]:
import os
import re
import time
import numpy as np
import cv2
import sys

sys.path.append(".")  # Add current directory to module search path

try:
    import dct_compression_mpi
except ImportError as e:
    print(f"Failed to import dct_compression_mpi: {e}")
    sys.exit(1)

from google.colab.patches import cv2_imshow

# === SETTINGS ===
# Images are read from INPUT_DIR; everything this cell produces is written
# to OUTPUT_DIR, which is created on demand.
INPUT_DIR, OUTPUT_DIR = '/content/images', '/content/outputs'
os.makedirs(OUTPUT_DIR, exist_ok=True)

if not os.path.exists(INPUT_DIR):
    raise FileNotFoundError(f"Input directory {INPUT_DIR} does not exist")

# === CONTROL PARAMETERS ===
# block_sizes and retain are not assigned in this cell; they must already
# exist in the notebook namespace.
block_size, retain_value = block_sizes, retain
jpeg_quality = 75

# Collect (number, filename) pairs for files named image<N>.<ext>, where ext
# is png, jpg, jpeg or bmp in any letter case, ordered by N.
pattern = re.compile(r'^image(\d+)\.(png|jpe?g|bmp)$', re.IGNORECASE)
files = sorted(
    (
        (int(hit.group(1)), hit.string)
        for hit in map(pattern.match, os.listdir(INPUT_DIR))
        if hit
    ),
    key=lambda entry: entry[0],
)

if not files:
    print("No images found matching pattern image[1-100].(png|jpg|jpeg|bmp)")
    sys.exit(1)

# === Metrics Storage ===
# One entry per processed image, appended by the main loop.
mpi_times, mpi_dimensions = [], []
orig_sizes, comp_sizes, reductions = [], [], []

def get_file_size(file_path):
    """Return the size of *file_path* in bytes, or 0 if it cannot be read.

    A filesystem failure (missing file, permission problem, ...) is reported
    on stdout and mapped to 0 so that one bad file does not abort a batch
    run. Anything that is not an ``OSError`` (for example a wrong argument
    type) is a programming error and is allowed to propagate.
    """
    try:
        return os.path.getsize(file_path)
    except OSError as e:
        print(f"Error getting file size for {file_path}: {e}")
        return 0

# === MAIN PROCESSING ===
# For every numbered input image: convert to grayscale, pad to a multiple of
# block_size, run the MPI DCT module, crop back to the original size, save
# JPEG and PNG copies of the original and of the reconstruction, print size
# and pixel-difference metrics, display both JPEGs, and record the timing
# and size data in the metric lists.
for num, fn in files:
    path = os.path.join(INPUT_DIR, fn)
    img = cv2.imread(path)
    if img is None:
        print(f"Failed to load {fn}")
        continue

    # Convert to grayscale
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    h, w = gray.shape
    size_pixels = h * w
    # Size of the source file on disk; this is the "orig" JPEG size below.
    orig_size_bytes = get_file_size(path)

    # Pad image to multiple of block_size (zeros on the bottom/right edges)
    pad_h = (block_size - h % block_size) % block_size
    pad_w = (block_size - w % block_size) % block_size
    gray_padded = np.pad(gray, ((0, pad_h), (0, pad_w)), mode='constant')

    # Run MPI-accelerated DCT → Quantize → IDCT; only this call is timed
    try:
        t0 = time.perf_counter()
        recon = dct_compression_mpi.compress_with_dct(gray_padded,
                                                      block_size=block_size,
                                                      retain=retain_value)
        t_ms = (time.perf_counter() - t0) * 1000
    except Exception as e:
        print(f"Error processing {fn} with MPI DCT: {e}")
        continue

    # Convert reconstructed image to uint8 and crop back
    recon_np = np.array(recon, dtype=np.uint8)
    recon_np = recon_np[:h, :w]

    # === SAVE OUTPUTS ===
    orig_jpeg_path = os.path.join(OUTPUT_DIR, f"image{num}_orig.jpg")
    comp_jpeg_path = os.path.join(OUTPUT_DIR, f"image{num}_comp.jpg")
    orig_png_path  = os.path.join(OUTPUT_DIR, f"image{num}_orig.png")
    comp_png_path  = os.path.join(OUTPUT_DIR, f"image{num}_comp.png")

    # Save original as JPEG and PNG
    if not cv2.imwrite(orig_jpeg_path, gray, [int(cv2.IMWRITE_JPEG_QUALITY), jpeg_quality]):
        print(f"Failed to save {orig_jpeg_path}")
    if not cv2.imwrite(orig_png_path, gray):
        print(f"Failed to save {orig_png_path}")

    # Save compressed as JPEG and PNG
    if not cv2.imwrite(comp_jpeg_path, recon_np, [int(cv2.IMWRITE_JPEG_QUALITY), jpeg_quality]):
        print(f"Failed to save {comp_jpeg_path}")
    if not cv2.imwrite(comp_png_path, recon_np):
        print(f"Failed to save {comp_png_path}")

    # === METRICS CALCULATION ===
    compressed_size_jpeg = get_file_size(comp_jpeg_path)
    orig_size_png        = get_file_size(orig_png_path)
    compressed_size_png  = get_file_size(comp_png_path)

    # Reductions (JPEG: source file vs. compressed JPEG; PNG: the two PNGs
    # written above). A zero baseline yields 0 instead of dividing by zero.
    size_reduction_jpeg = ((orig_size_bytes - compressed_size_jpeg) / orig_size_bytes) * 100 if orig_size_bytes > 0 else 0
    size_reduction_png = ((orig_size_png - compressed_size_png) / orig_size_png) * 100 if orig_size_png > 0 else 0

    # In-memory array sizes
    orig_array_size = gray.nbytes
    comp_array_size = recon_np.nbytes
    in_memory_reduction = ((orig_array_size - comp_array_size) / orig_array_size) * 100 if orig_array_size > 0 else 0

    # Pixel difference metrics (signed original - reconstruction; the mean
    # is taken over absolute differences)
    pixel_diffs = (gray.astype(np.float32) - recon_np.astype(np.float32))
    pixel_diff_max = np.max(pixel_diffs) if pixel_diffs.size > 0 else 0
    pixel_diff_min = np.min(pixel_diffs) if pixel_diffs.size > 0 else 0
    pixel_diff_mean = np.mean(np.abs(pixel_diffs)) if pixel_diffs.size > 0 else 0
    pixel_diff_std = np.std(pixel_diffs) if pixel_diffs.size > 0 else 0
    pixels_changed = np.count_nonzero(pixel_diffs)

    # === OUTPUT SUMMARY ===
    print(f"\n--- {fn} ({h}×{w}, {size_pixels} px) ---")
    print(f"Elapsed (MPI):             {t_ms:.2f} ms")
    print(f"JPEG sizes (bytes):         orig={orig_size_bytes}, comp={compressed_size_jpeg}")
    print(f"JPEG reduction:             {size_reduction_jpeg:.1f}%")
    print(f"PNG sizes (bytes):          orig={orig_size_png}, comp={compressed_size_png}")
    print(f"PNG reduction:              {size_reduction_png:.1f}%")
    print(f"In-memory raw reduction:    {in_memory_reduction:.1f}%")
    print(f"Array sizes (bytes):        orig={orig_array_size}, comp={comp_array_size}")
    print(f"Pixel diff max/min:          {pixel_diff_max:.1f}, {pixel_diff_min:.1f}")
    print(f"Pixel diff mean/std:         {pixel_diff_mean:.1f}, {pixel_diff_std:.1f}")
    print(f"Pixels changed (non-zero):   {pixels_changed}")

    # === SHOW the on-disk JPEGs ===
    # imread returns None when a file is missing or unreadable (e.g. after a
    # failed imwrite above); skip the display rather than abort the loop.
    print("\nOriginal (JPEG):")
    orig_jpeg = cv2.imread(orig_jpeg_path)
    if orig_jpeg is None:
        print(f"Failed to load {orig_jpeg_path}")
    else:
        cv2_imshow(orig_jpeg)

    print("\nCompressed (JPEG):")
    comp_jpeg = cv2.imread(comp_jpeg_path)
    if comp_jpeg is None:
        print(f"Failed to load {comp_jpeg_path}")
    else:
        cv2_imshow(comp_jpeg)

    # === COLLECT DATA FOR PLOTTING ===
    mpi_times.append(t_ms)
    mpi_dimensions.append(size_pixels)
    orig_sizes.append(orig_size_bytes)
    comp_sizes.append(compressed_size_jpeg)
    reductions.append(size_reduction_jpeg)


In [None]:
import matplotlib.pyplot as plt
# Line chart of the MPI run: compression time (ms) against image size
# (pixel count), one marker per processed image.
plt.figure(figsize=(10, 5))
plt.plot(mpi_dimensions, mpi_times, 'o-', color='blue')
plt.title('(MPI) DCT Compression Time vs Image Size')
plt.xlabel('Image Size (pixels)')
plt.ylabel('Compression Time (ms)')
plt.grid(True)
plt.show()

# Downgrade driver version for OpenCL

In [None]:
# Install pybind11 and OpenCV for Python, plus system headers for C++
!pip install pybind11 opencv-python matplotlib
!apt-get update && apt-get install -y libopencv-dev

In [None]:
!sudo apt-mark manual ocl-icd-opencl-dev opencl-c-headers opencl-clhpp-headers

In [None]:
!sudo apt install --reinstall nvidia-driver-560

In [None]:
!sudo apt install nvidia-opencl-dev

In [None]:
!sudo apt install clinfo

In [None]:
# Verify OpenCL setup
!clinfo | grep "Number of platforms"

In [None]:
# Install pybind11
!pip install pybind11

# Install OpenCV development libraries
#!sudo apt install libopencv-dev -y
!sudo apt-get remove libopencv-dev -y
!sudo apt-get autoremove -y
!sudo apt-get update
!sudo apt-get install libopencv-dev -y

# Verify installations
#!pkg-config --cflags --libs opencv4
#!python3 -m pybind11 --includes
#!find /usr -name "libOpenCL.so*" 2>/dev/null

!pkg-config --modversion opencv4 || pkg-config --modversion opencv
!pkg-config --cflags --libs opencv4 || pkg-config --cflags --libs opencv
!find /usr -name "opencv.hpp" 2>/dev/null
!find /usr -name "libOpenCL.so*" 2>/dev/null

In [None]:
!clinfo

# OpenCL

In [None]:
# OpenCL
%%writefile dct_compression_CL.cpp
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include <opencv2/opencv.hpp>
#include <cmath>
#include <cstring>
#include <stdexcept>
#include <iostream>

#define CL_TARGET_OPENCL_VERSION 120
#ifdef __cplusplus
extern "C" {
#endif
#include <CL/cl.h>
#ifdef __cplusplus
}
#endif

namespace py = pybind11;

const double PI = 3.14159265358979323846;

// Checks an OpenCL status code. Anything other than CL_SUCCESS is logged to
// stderr with file, line and the numeric code, then reported by throwing
// std::runtime_error.
// The argument is evaluated exactly once and stored in a local, so passing a
// call expression, e.g. CL_CHECK(clFinish(queue)), does not repeat the call.
#define CL_CHECK(err) do { \
    const cl_int cl_check_status_ = (err); \
    if (cl_check_status_ != CL_SUCCESS) { \
        std::cerr << "OpenCL Error at " << __FILE__ << ":" << __LINE__ << " - Error Code: " << cl_check_status_ << std::endl; \
        throw std::runtime_error("OpenCL call failed"); \
    } \
} while (0)

// OpenCL block-DCT compression of a 2D grayscale image.
//
// Every complete block_size x block_size tile goes through an orthonormal
// 2D DCT-II, coefficients whose row or column index is >= retain are zeroed,
// and the tile is rebuilt with the inverse transform. The whole image is
// processed in two kernel launches with one work-item per pixel:
//   dct_truncate : pixels -> coefficients (with truncation)
//   idct         : coefficients -> pixels
//
//   input      - 2D uint8 array; row/column strides are honoured
//   block_size - tile edge length in pixels, must be > 0
//   retain     - number of low-frequency rows/columns kept in each tile
//
// Returns a rows x cols uint8 array with the reconstruction rounded and
// clamped to 0..255; pixels that do not belong to a complete tile are 0.
//
// Throws std::runtime_error for invalid arguments, when no OpenCL platform
// or GPU device is available, or when any OpenCL call fails. All OpenCL
// handles are released on every exit path.
py::array_t<unsigned char> compress_with_dct(
    py::array_t<unsigned char> input,
    int block_size = 8,
    int retain = 4
) {
    auto buf = input.request();
    if (buf.ndim != 2)
        throw std::runtime_error("Input must be a 2D grayscale array");
    if (block_size <= 0)
        throw std::runtime_error("block_size must be positive");
    int rows = static_cast<int>(buf.shape[0]);
    int cols = static_cast<int>(buf.shape[1]);

    // Only complete tiles are transformed.
    const int proc_rows = rows - (rows % block_size);
    const int proc_cols = cols - (cols % block_size);

    // Start from an all-zero output so untouched border pixels are defined.
    py::array_t<unsigned char> result({rows, cols});
    unsigned char* out_ptr = static_cast<unsigned char*>(result.request().ptr);
    const std::size_t total_px =
        static_cast<std::size_t>(rows) * static_cast<std::size_t>(cols);
    for (std::size_t k = 0; k < total_px; ++k) {
        out_ptr[k] = 0;
    }
    if (proc_rows == 0 || proc_cols == 0) {
        return result;  // image smaller than one tile: nothing to transform
    }

    // Copy the processed region into a dense float buffer. Strides are in
    // bytes; elements are one byte wide, so they serve as element offsets.
    const unsigned char* in_ptr = static_cast<const unsigned char*>(buf.ptr);
    const auto row_stride = buf.strides[0];
    const auto col_stride = buf.strides[1];
    std::vector<float> pixels(static_cast<std::size_t>(proc_rows) * proc_cols);
    for (int r = 0; r < proc_rows; ++r) {
        for (int c = 0; c < proc_cols; ++c) {
            pixels[static_cast<std::size_t>(r) * proc_cols + c] =
                static_cast<float>(in_ptr[r * row_stride + c * col_stride]);
        }
    }

    // Owns every OpenCL handle created below; the destructor releases them
    // whether the function returns normally or throws.
    struct ClHandles {
        cl_context context = nullptr;
        cl_command_queue queue = nullptr;
        cl_program program = nullptr;
        cl_kernel forward = nullptr;
        cl_kernel inverse = nullptr;
        cl_mem pixels = nullptr;
        cl_mem coeffs = nullptr;
        ~ClHandles() {
            if (forward) clReleaseKernel(forward);
            if (inverse) clReleaseKernel(inverse);
            if (pixels) clReleaseMemObject(pixels);
            if (coeffs) clReleaseMemObject(coeffs);
            if (program) clReleaseProgram(program);
            if (queue) clReleaseCommandQueue(queue);
            if (context) clReleaseContext(context);
        }
    } gpu;

    // --- Platform / device selection: first platform, first GPU ---
    cl_int err;
    cl_uint num_platforms = 0;
    err = clGetPlatformIDs(0, nullptr, &num_platforms);
    CL_CHECK(err);
    if (num_platforms == 0) {
        throw std::runtime_error("No OpenCL platforms found.");
    }

    std::vector<cl_platform_id> platforms(num_platforms);
    err = clGetPlatformIDs(num_platforms, platforms.data(), nullptr);
    CL_CHECK(err);

    cl_platform_id platform = platforms[0];
    cl_uint num_devices = 0;
    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, nullptr, &num_devices);
    // A platform without GPUs reports CL_DEVICE_NOT_FOUND rather than a
    // zero count; give both cases the same descriptive message.
    if (err == CL_DEVICE_NOT_FOUND || (err == CL_SUCCESS && num_devices == 0)) {
        throw std::runtime_error("No OpenCL GPU devices found.");
    }
    CL_CHECK(err);

    std::vector<cl_device_id> devices(num_devices);
    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, num_devices, devices.data(), nullptr);
    CL_CHECK(err);

    cl_device_id device = devices[0];
    gpu.context = clCreateContext(nullptr, 1, &device, nullptr, nullptr, &err);
    CL_CHECK(err);

    // Default (in-order) queue: the inverse kernel runs after the forward one.
    gpu.queue = clCreateCommandQueue(gpu.context, device, 0, &err);
    CL_CHECK(err);

    // --- Kernels ---
    // Work-item (col, row) handles one element of the image. Its position
    // inside the tile is (row % n, col % n); (row0, col0) is the tile origin.
    const char* kernel_source = R"CLC(
        __kernel void dct_truncate(__global const float* src,
                                   __global float* dst,
                                   const int width,
                                   const int n,
                                   const int retain) {
            const int col = get_global_id(0);
            const int row = get_global_id(1);
            const int u = row % n;
            const int v = col % n;
            const int idx = row * width + col;

            if (u >= retain || v >= retain) {
                dst[idx] = 0.0f;
                return;
            }

            const int row0 = row - u;
            const int col0 = col - v;
            const float step = 3.14159265358979323846f / (2.0f * (float)n);

            float sum = 0.0f;
            for (int x = 0; x < n; ++x) {
                const float cx = cos((float)((2 * x + 1) * u) * step);
                for (int y = 0; y < n; ++y) {
                    const float cy = cos((float)((2 * y + 1) * v) * step);
                    sum += src[(row0 + x) * width + (col0 + y)] * cx * cy;
                }
            }

            const float inv_n = 1.0f / (float)n;
            const float cu = (u == 0) ? sqrt(inv_n) : sqrt(2.0f * inv_n);
            const float cv = (v == 0) ? sqrt(inv_n) : sqrt(2.0f * inv_n);
            dst[idx] = cu * cv * sum;
        }

        __kernel void idct(__global const float* src,
                           __global float* dst,
                           const int width,
                           const int n) {
            const int col = get_global_id(0);
            const int row = get_global_id(1);
            const int x = row % n;
            const int y = col % n;
            const int row0 = row - x;
            const int col0 = col - y;
            const float step = 3.14159265358979323846f / (2.0f * (float)n);
            const float inv_n = 1.0f / (float)n;
            const float scale_dc = sqrt(inv_n);
            const float scale_ac = sqrt(2.0f * inv_n);

            float sum = 0.0f;
            for (int u = 0; u < n; ++u) {
                const float cu = (u == 0) ? scale_dc : scale_ac;
                const float cx = cos((float)((2 * x + 1) * u) * step);
                for (int v = 0; v < n; ++v) {
                    const float cv = (v == 0) ? scale_dc : scale_ac;
                    const float cy = cos((float)((2 * y + 1) * v) * step);
                    sum += cu * cv * src[(row0 + u) * width + (col0 + v)] * cx * cy;
                }
            }
            dst[row * width + col] = sum;
        }
    )CLC";

    gpu.program = clCreateProgramWithSource(gpu.context, 1, &kernel_source, nullptr, &err);
    CL_CHECK(err);

    err = clBuildProgram(gpu.program, 1, &device, nullptr, nullptr, nullptr);
    if (err != CL_SUCCESS) {
        size_t log_size = 0;
        clGetProgramBuildInfo(gpu.program, device, CL_PROGRAM_BUILD_LOG, 0, nullptr, &log_size);
        std::vector<char> log(log_size + 1, '\0');
        clGetProgramBuildInfo(gpu.program, device, CL_PROGRAM_BUILD_LOG, log_size, log.data(), nullptr);
        std::cerr << "Error building OpenCL program: " << log.data() << std::endl;
        throw std::runtime_error("Error building OpenCL program");
    }

    gpu.forward = clCreateKernel(gpu.program, "dct_truncate", &err);
    CL_CHECK(err);
    gpu.inverse = clCreateKernel(gpu.program, "idct", &err);
    CL_CHECK(err);

    // --- Device buffers ---
    // `pixels` starts as a copy of the host image and is reused as the
    // destination of the inverse transform; `coeffs` holds the DCT output.
    const std::size_t bytes = pixels.size() * sizeof(float);
    gpu.pixels = clCreateBuffer(gpu.context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                                bytes, pixels.data(), &err);
    CL_CHECK(err);
    gpu.coeffs = clCreateBuffer(gpu.context, CL_MEM_READ_WRITE, bytes, nullptr, &err);
    CL_CHECK(err);

    const cl_int width_arg = proc_cols;
    const cl_int n_arg = block_size;
    const cl_int retain_arg = retain;
    const size_t global_size[2] = {
        static_cast<size_t>(proc_cols),
        static_cast<size_t>(proc_rows)
    };

    // --- Forward DCT + truncation: pixels -> coeffs ---
    err = clSetKernelArg(gpu.forward, 0, sizeof(cl_mem), &gpu.pixels);
    CL_CHECK(err);
    err = clSetKernelArg(gpu.forward, 1, sizeof(cl_mem), &gpu.coeffs);
    CL_CHECK(err);
    err = clSetKernelArg(gpu.forward, 2, sizeof(cl_int), &width_arg);
    CL_CHECK(err);
    err = clSetKernelArg(gpu.forward, 3, sizeof(cl_int), &n_arg);
    CL_CHECK(err);
    err = clSetKernelArg(gpu.forward, 4, sizeof(cl_int), &retain_arg);
    CL_CHECK(err);
    err = clEnqueueNDRangeKernel(gpu.queue, gpu.forward, 2, nullptr, global_size, nullptr, 0, nullptr, nullptr);
    CL_CHECK(err);

    // --- Inverse DCT: coeffs -> pixels ---
    err = clSetKernelArg(gpu.inverse, 0, sizeof(cl_mem), &gpu.coeffs);
    CL_CHECK(err);
    err = clSetKernelArg(gpu.inverse, 1, sizeof(cl_mem), &gpu.pixels);
    CL_CHECK(err);
    err = clSetKernelArg(gpu.inverse, 2, sizeof(cl_int), &width_arg);
    CL_CHECK(err);
    err = clSetKernelArg(gpu.inverse, 3, sizeof(cl_int), &n_arg);
    CL_CHECK(err);
    err = clEnqueueNDRangeKernel(gpu.queue, gpu.inverse, 2, nullptr, global_size, nullptr, 0, nullptr, nullptr);
    CL_CHECK(err);

    // Blocking read: returns once both kernels have finished.
    err = clEnqueueReadBuffer(gpu.queue, gpu.pixels, CL_TRUE, 0, bytes, pixels.data(), 0, nullptr, nullptr);
    CL_CHECK(err);

    // Clamp to the uint8 range and round to nearest.
    for (int r = 0; r < proc_rows; ++r) {
        for (int c = 0; c < proc_cols; ++c) {
            float val = pixels[static_cast<std::size_t>(r) * proc_cols + c];
            if (val < 0.0f) val = 0.0f;
            else if (val > 255.0f) val = 255.0f;
            out_ptr[static_cast<std::size_t>(r) * cols + c] =
                static_cast<unsigned char>(val + 0.5f);
        }
    }

    return result;
}

// Python module definition: importing `dct_compression_CL` exposes a single
// function, compress_with_dct(input, block_size=8, retain=4).
PYBIND11_MODULE(dct_compression_CL, m) {
    m.def("compress_with_dct", &compress_with_dct,
          "OpenCL DCT compression for grayscale images",
          // Keyword names and default values as seen from Python.
          py::arg("input"),
          py::arg("block_size") = 8,
          py::arg("retain") = 4);
}

In [None]:
%%bash
# Choose the OpenCV compile/link flags: pkg-config module "opencv4" first,
# then "opencv", and hard-coded system paths when pkg-config knows neither.
if pkg-config --modversion opencv4 >/dev/null 2>&1; then
    opencv_flags=$(pkg-config --cflags --libs opencv4)
elif pkg-config --modversion opencv >/dev/null 2>&1; then
    opencv_flags=$(pkg-config --cflags --libs opencv)
else
    echo "Error: Neither opencv4 nor opencv found by pkg-config"
    echo "Falling back to manual OpenCV paths"
    opencv_flags="-I/usr/include/opencv4 -L/usr/lib/x86_64-linux-gnu -lopencv_core -lopencv_imgproc -lopencv_highgui"
fi

# Build the pybind11 extension; the compiler's exit status is the cell's.
g++ -O3 -fPIC -shared \
    $(python3 -m pybind11 --includes) \
    dct_compression_CL.cpp \
    -o dct_compression_CL.so \
    $opencv_flags \
    -lOpenCL

In [None]:
import cv2
import numpy as np
from PIL import Image
import os
import time
import dct_compression_CL
from google.colab import files as colab_files
import io
import zipfile
import shutil
import matplotlib.pyplot as plt
import glob
import re
import sys
from google.colab.patches import cv2_imshow  # For Colab display

def get_file_size(file_path):
    """Return the on-disk size of *file_path* in kilobytes (bytes / 1024)."""
    size_in_bytes = os.stat(file_path).st_size
    return size_in_bytes / 1024

def dct_compress(image_path, output_path, base_quality=75, block_size=8, retain=4):
    """Compress one image with the OpenCL DCT module and print a summary.

    The image is loaded as grayscale, cropped to a multiple of *block_size*,
    passed through ``dct_compression_CL.compress_with_dct``, smoothed with a
    Gaussian blur and a bilateral filter, and written to *output_path* as a
    JPEG. The JPEG quality starts at *base_quality* and is lowered in steps
    of 5 (not below 10) until the file is no larger than the original
    re-encoded at *base_quality*.

    Side effects: writes *output_path* and the scratch files ``orig.jpg``,
    ``orig.png`` and ``comp.png`` in the current directory, and prints the
    statistics block.

    Args:
        image_path: Path of the image to read.
        output_path: Path of the JPEG to write.
        base_quality: Starting JPEG quality (also used for ``orig.jpg``).
        block_size: DCT tile edge length in pixels.
        retain: Number of low-frequency rows/columns kept per tile.

    Returns:
        ``(compressed, elapsed_ms, pixel_count)`` on success, where
        *elapsed_ms* covers loading, compression, filtering and JPEG
        writing. ``(None, None, 0)`` when the image is smaller than one
        block or the compression step fails.

    Raises:
        ValueError: If *image_path* cannot be loaded as an image.
    """
    # perf_counter is monotonic, so the interval cannot be distorted by
    # system clock adjustments.
    start_time = time.perf_counter()

    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        raise ValueError(f"Could not load image: {image_path}")

    # Crop to whole blocks.
    h, w = img.shape
    h = h - (h % block_size)
    w = w - (w % block_size)
    if h == 0 or w == 0:
        # Nothing is left after cropping; report it directly instead of
        # letting the filters fail on an empty array.
        print(f"OpenCL compression failed: image is smaller than one {block_size}x{block_size} block")
        return None, None, 0
    img = img[:h, :w]
    dimensions = h * w

    try:
        compressed = dct_compression_CL.compress_with_dct(img, block_size=block_size, retain=retain)
        # Apply a stronger Gaussian blur to reduce the grid and make the image fuzzier
        compressed = cv2.GaussianBlur(compressed, (15, 15), 5)
        # Apply a bilateral filter to further smooth out the grid while preserving some edges
        compressed = cv2.bilateralFilter(compressed, 9, 75, 75)
    except Exception as e:
        print(f"OpenCL compression failed: {e}")
        return None, None, 0

    # Save the original image as JPEG with base quality; its size is the
    # target the compressed JPEG has to meet.
    cv2.imwrite('orig.jpg', img, [int(cv2.IMWRITE_JPEG_QUALITY), base_quality])
    orig_j = os.path.getsize('orig.jpg')

    # Dynamically adjust JPEG quality for the compressed image
    quality = base_quality  # Start with the base quality
    cv2.imwrite(output_path, compressed, [int(cv2.IMWRITE_JPEG_QUALITY), quality])
    cmp_j = os.path.getsize(output_path)

    # Decrease quality until the compressed file size is less than or equal to the original
    while cmp_j > orig_j and quality > 10:  # Stop at a minimum quality of 10
        quality -= 5  # Decrease quality in steps
        cv2.imwrite(output_path, compressed, [int(cv2.IMWRITE_JPEG_QUALITY), quality])
        cmp_j = os.path.getsize(output_path)

    compression_time = (time.perf_counter() - start_time) * 1000

    # Save PNG versions for additional comparison
    cv2.imwrite('orig.png', img)
    cv2.imwrite('comp.png', compressed)

    orig_p = os.path.getsize('orig.png')
    cmp_p = os.path.getsize('comp.png')

    orig_raw = img.nbytes
    comp_raw = compressed.nbytes
    raw_reduction = 1 - (comp_raw / orig_raw)

    # Calculate pixel-difference statistics (absolute differences)
    diff = np.abs(compressed.astype(float) - img.astype(float))
    diff_mean = np.mean(diff)  # Mean absolute difference
    diff_std = np.std(diff)  # Standard deviation of differences

    print("\n--- Output Summary ---")
    print(f"Elapsed (OpenCL):          {compression_time:.2f} ms")
    print(f"JPEG quality used (comp):  {quality}")
    print(f"JPEG sizes (bytes):        orig={orig_j}, comp={cmp_j}")
    print(f"PNG sizes (bytes):         orig={orig_p}, comp={cmp_p}")
    print(f"JPEG reduction:            {100*(1 - cmp_j/orig_j):.1f}%")
    print(f"PNG reduction:             {100*(1 - cmp_p/orig_p):.1f}%")
    print(f"In-memory raw reduction:   {100*raw_reduction:.1f}%")
    print(f"Array sizes (bytes):       orig={orig_raw}, comp={comp_raw}")
    print(f"Pixel diff max/min:        {diff.max():.1f}, {diff.min():.1f}")
    print(f"Pixel diff mean/std:       {diff_mean:.1f}, {diff_std:.1f}")
    print(f"Pixels changed (non-zero): {np.count_nonzero(diff)}")
    print("\n----------------------")

    return compressed, compression_time, dimensions


# Directories: inputs are read from INPUT_DIR, compressed JPEGs are written
# to OUTPUT_DIR (also available as compressed_folder).
INPUT_DIR = '/content/images'
OUTPUT_DIR = '/content/outputs'
compressed_folder = OUTPUT_DIR
os.makedirs(compressed_folder, exist_ok=True)

if not os.path.exists(INPUT_DIR):
    raise FileNotFoundError(f"Input directory {INPUT_DIR} does not exist")

# Pick up files named image<N>.<png|jpg|jpeg|bmp> (any letter case) and
# order them by N.
pattern = re.compile(r'^image(\d+)\.(png|jpe?g|bmp)$', re.IGNORECASE)
numbered = sorted(
    (
        (int(hit.group(1)), hit.string)
        for hit in map(pattern.match, os.listdir(INPUT_DIR))
        if hit
    ),
    key=lambda pair: pair[0],
)

if not numbered:
    print("No images found matching the pattern image[1-100].(png|jpg|jpeg|bmp)")
    sys.exit(1)

# (display name, full path) per image; the display name is rebuilt from the
# parsed number and the file's own extension.
image_files = [
    (f"image{number}.{name.rsplit('.', 1)[-1]}", os.path.join(INPUT_DIR, name))
    for number, name in numbered
]

# Per-image results consumed by the plotting cells.
CL_dimensions = []
CL_times = []

for filename, original_path in image_files:
    img = cv2.imread(original_path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        print(f"Could not load {filename} to calculate pixel count. Skipping.")
        continue

    output_path = os.path.join(compressed_folder, f"compressed_{filename}")
    print(f"Saving {filename} to {output_path}")
    try:
        outcome = dct_compress(original_path, output_path, base_quality=75, block_size=8, retain=4)
        compressed, compression_time, dimensions = outcome
        if compressed is not None and compression_time is not None:
            # Show the untouched grayscale input next to the blurred result.
            print("Original Image:")
            cv2_imshow(img)
            print("Compressed Image:")
            cv2_imshow(compressed)

            CL_dimensions.append(dimensions)
            CL_times.append(compression_time)
    except Exception as e:
        print(f"Error compressing {filename}: {e}")

# Bundle everything in the output folder and offer it as a browser download.
zip_filename = "output_folder2.zip"
shutil.make_archive("output_folder2", 'zip', compressed_folder)
print(f"Downloading all compressed images as {zip_filename}")
colab_files.download(zip_filename)

In [None]:
# import os
# import re
# import time
# import numpy as np
# import cv2
# import sys
# import shutil
# import zipfile
# import matplotlib.pyplot as plt
# import dct_compression_CL
# from google.colab import files as colab_files
# from google.colab.patches import cv2_imshow

# # Helpers
# def get_file_size_kb(path):
#     return os.path.getsize(path) / 1024

# def dct_compress(image_path, orig_jpeg_path, comp_jpeg_path,
#                  base_quality=75, block_size=8, retain=4):
#     start = time.time()
#     img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
#     if img is None:
#         raise IOError(f"Cannot load {image_path}")

#     # Crop to multiple of block_size
#     h, w = img.shape
#     h -= h % block_size
#     w -= w % block_size
#     img = img[:h, :w]
#     dims = h * w

#     # Run DCT→quantize→IDCT
#     compressed = dct_compression_CL.compress_with_dct(img,
#                           block_size=block_size, retain=retain)
#     elapsed_ms = (time.time() - start) * 1000

#     # Save original & compressed as JPEG
#     cv2.imwrite(orig_jpeg_path, img,
#                 [int(cv2.IMWRITE_JPEG_QUALITY), base_quality])
#     # Dynamically lower quality if needed
#     quality = base_quality
#     cv2.imwrite(comp_jpeg_path, compressed,
#                 [int(cv2.IMWRITE_JPEG_QUALITY), quality])
#     orig_j = os.path.getsize(orig_jpeg_path)
#     comp_j = os.path.getsize(comp_jpeg_path)
#     while comp_j > orig_j and quality > 10:
#         quality -= 5
#         cv2.imwrite(comp_jpeg_path, compressed,
#                     [int(cv2.IMWRITE_JPEG_QUALITY), quality])
#         comp_j = os.path.getsize(comp_jpeg_path)

#     return img, compressed, elapsed_ms, dims, quality, orig_j, comp_j

# # Directories
# INPUT_DIR  = '/content/images'
# OUTPUT_DIR = '/content/outputs'
# os.makedirs(OUTPUT_DIR, exist_ok=True)

# # Find & sort images
# pattern = re.compile(r'^image(\d+)\.(png|jpe?g|bmp)$', re.IGNORECASE)
# files = sorted(
#     [(int(m.group(1)), fn)
#      for fn in os.listdir(INPUT_DIR)
#      if (m := pattern.match(fn))],
#     key=lambda x: x[0]
# )

# # Metrics lists
# CL_dimensions = []
# CL_times      = []

# # Process each
# for num, fn in files:
#     img_path = os.path.join(INPUT_DIR, fn)
#     orig_jpeg = os.path.join(OUTPUT_DIR, f"image{num}_orig.jpg")
#     comp_jpeg = os.path.join(OUTPUT_DIR, f"image{num}_comp.jpg")

#     try:
#         orig_img, comp_img, t_ms, dims, qual, size_o, size_c = \
#             dct_compress(img_path, orig_jpeg, comp_jpeg,
#                          base_quality=75, block_size=8, retain=4)
#     except Exception as e:
#         print(f"Error on {fn}: {e}")
#         continue

#     # Compute PNG sizes for comparison
#     orig_png = orig_jpeg.replace('.jpg', '.png')
#     comp_png = comp_jpeg.replace('.jpg', '.png')
#     cv2.imwrite(orig_png, orig_img)
#     cv2.imwrite(comp_png, comp_img)
#     size_p_o = os.path.getsize(orig_png)
#     size_p_c = os.path.getsize(comp_png)

#     # In-memory raw reduction
#     raw_o = orig_img.nbytes
#     raw_c = comp_img.nbytes
#     raw_red = 100 * (1 - raw_c/raw_o)

#     # Pixel-difference stats
#     diff = np.abs(comp_img.astype(float) - orig_img.astype(float))
#     d_min, d_max = diff.min(), diff.max()
#     d_mean, d_std = diff.mean(), diff.std()
#     changed = np.count_nonzero(diff)

#     # Print summary
#     print(f"\n--- {fn} ({dims} px) ---")
#     print(f"Elapsed (OpenCL):         {t_ms:.1f} ms")
#     print(f"Final JPEG quality:       {qual}")
#     print(f"JPEG sizes (bytes):       orig={size_o}, comp={size_c}, red={100*(1-size_c/size_o):.1f}%")
#     print(f"PNG sizes (bytes):        orig={size_p_o}, comp={size_p_c}, red={100*(1-size_p_c/size_p_o):.1f}%")
#     print(f"In-memory raw reduction:  {raw_red:.1f}%")
#     print(f"Pixel diff min/max:       {d_min:.1f}/{d_max:.1f}")
#     print(f"Pixel diff mean/std:      {d_mean:.1f}/{d_std:.1f}")
#     print(f"Pixels changed:           {changed}")

#     # Display the on-disk JPEGs
#     print("Original (JPEG):")
#     cv2_imshow(cv2.imread(orig_jpeg))
#     print("Compressed (JPEG):")
#     cv2_imshow(cv2.imread(comp_jpeg))

#     # Collect for plotting
#     CL_dimensions.append(dims)
#     CL_times.append(t_ms)

# # (Optional) Plot OpenCL timings vs image size
# plt.figure(figsize=(8,5))
# plt.plot(CL_dimensions, CL_times, 'o-', label='OpenCL DCT')
# plt.xlabel('Image Size (pixels)')
# plt.ylabel('Execution Time (ms)')
# plt.title('OpenCL DCT Compression: Time vs Image Size')
# plt.grid(True)
# plt.tight_layout()
# plt.show()

# # Zip & download all compressed files
# zip_name = 'opencl_outputs.zip'
# shutil.make_archive('opencl_outputs', 'zip', OUTPUT_DIR)
# print(f"Downloading {zip_name}…")
# colab_files.download(zip_name)


In [None]:
import matplotlib.pyplot as plt

# Plot the OpenCL timings against image size and publish the size-sorted
# series as OpenCL_dimensions / OpenCL_times, the names the comparison cells
# further down read.
#
# Both names start out as empty tuples so they always exist after this cell
# runs. Previously they were only assigned when data was present, so with no
# data the later cells failed with NameError inside their own
# "is there anything to plot?" checks instead of printing their message.
OpenCL_dimensions, OpenCL_times = (), ()

if not CL_dimensions or not CL_times:
    print("No data available to generate the scatter plot.")
else:
    # Sort the (dimension, time) pairs by dimension so the line is drawn
    # from the smallest image to the largest.
    sorted_data = sorted(zip(CL_dimensions, CL_times), key=lambda x: x[0])
    OpenCL_dimensions, OpenCL_times = zip(*sorted_data)

    plt.figure(figsize=(6, 4))
    plt.plot(OpenCL_dimensions, OpenCL_times, marker='o')
    plt.xlabel('Image Size (pixels)')
    plt.ylabel('Elapsed Time (ms)')
    plt.title('Image Size vs CL DCT Processing Time')
    plt.grid(True)
    plt.tight_layout()
    # Save before show(): the figure is written to the working directory.
    plt.savefig('cl_dimensions_vs_times_scatter.png')
    plt.show()

# Comparisons

In [None]:
# TODO:
# Time graph for each methodology
import matplotlib.pyplot as plt

# Overlay the timing curve of every implementation on a single set of axes.
#
# The following (sizes, times) pairs are expected to be defined by earlier
# cells of the notebook:
#   serial_sizes      / serial_times
#   cuda_sizes        / cuda_times
#   mp_dimensions     / mp_times
#   mpi_dimensions    / mpi_times
#   OpenCL_dimensions / OpenCL_times

plt.figure(figsize=(10, 5))
plt.plot(serial_sizes, serial_times, marker='o', linestyle='-', label='Serial DCT')
plt.plot(cuda_sizes, cuda_times, marker='s', linestyle='--', label='CUDA DCT')
plt.plot(mp_dimensions, mp_times, marker='D', linestyle='-.', label='OpenMP DCT')
plt.plot(mpi_dimensions, mpi_times, marker='*', linestyle=':', label='MPI DCT')
plt.plot(OpenCL_dimensions, OpenCL_times, marker='x', linestyle='-.', label='OpenCL DCT')
plt.xlabel('Image Size (pixels)')
plt.ylabel('Execution Time (ms)')
# The title names all plotted techniques; it used to read "Serial vs CUDA"
# even though five series are drawn.
plt.title('DCT Compression Time: Serial vs Parallel Techniques')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


# Graph for Speedup Percentage
# (Serial vs CUDA time improvement for each image) vs
# (Serial vs OpenCL time improvement for each image) vs
# (Serial vs OpenMP time improvement for each image) vs
# (Serial vs MPI time improvement for each image)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Speedup of each parallel implementation relative to the serial baseline,
# expressed as a percentage: (serial_time / parallel_time - 1) * 100.
# 0 % means "same speed as serial"; negative values mean slower than serial.

# Every series must be non-empty, otherwise there is nothing to compare.
if not (serial_sizes and serial_times and cuda_sizes and cuda_times and
        OpenCL_dimensions and OpenCL_times and mp_dimensions and mp_times and
        mpi_dimensions and mpi_times):
    print("No data available to generate the speedup chart.")
else:
    # Samples are paired purely by position: the i-th entry of every series
    # is assumed to belong to the same image as serial_sizes[i].
    # If the series differ in length that assumption is suspect, so say so
    # instead of crashing or silently dropping points.
    if len({len(serial_sizes), len(serial_times), len(cuda_times),
            len(OpenCL_times), len(mp_times), len(mpi_times)}) > 1:
        print("Warning: timing series differ in length; "
              "only the leading samples common to all series are plotted.")

    # Small epsilon keeps the division defined if a parallel time is 0.
    epsilon = 1e-6

    # zip() stops at the shorter series, so a parallel run with fewer samples
    # than the serial run no longer raises IndexError.
    cuda_speedup = [(serial_t / (parallel_t + epsilon) - 1) * 100
                    for serial_t, parallel_t in zip(serial_times, cuda_times)]
    opencl_speedup = [(serial_t / (parallel_t + epsilon) - 1) * 100
                      for serial_t, parallel_t in zip(serial_times, OpenCL_times)]
    openmp_speedup = [(serial_t / (parallel_t + epsilon) - 1) * 100
                      for serial_t, parallel_t in zip(serial_times, mp_times)]
    # NOTE: despite its name, this series is computed from the MPI timings.
    pthreads_speedup = [(serial_t / (parallel_t + epsilon) - 1) * 100
                        for serial_t, parallel_t in zip(serial_times, mpi_times)]

    # Sort all series together by image size so the lines run left to right.
    sorted_data = sorted(zip(serial_sizes, cuda_speedup, opencl_speedup, openmp_speedup, pthreads_speedup),
                         key=lambda x: x[0])
    sorted_sizes, sorted_cuda_speedup, sorted_opencl_speedup, sorted_openmp_speedup, sorted_pthreads_speedup = zip(*sorted_data)

    # Plot speedup percentages
    plt.figure(figsize=(10, 5))
    plt.plot(sorted_sizes, sorted_cuda_speedup, marker='s', linestyle='--', label='CUDA Speedup')
    plt.plot(sorted_sizes, sorted_opencl_speedup, marker='x', linestyle='-.', label='OpenCL Speedup')
    plt.plot(sorted_sizes, sorted_openmp_speedup, marker='D', linestyle='-.', label='OpenMP Speedup')
    # Label spelled "MPI" to match the other charts in this notebook.
    plt.plot(sorted_sizes, sorted_pthreads_speedup, marker='*', linestyle=':', label='MPI Speedup')
    plt.xlabel('Image Size (pixels)')
    plt.ylabel('Speedup Percentage (%)')
    plt.title('Speedup Percentage: Serial vs Parallel Techniques')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig('speedup_percentage_chart.png')
    plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Alternative speedup chart: percentage of the serial time saved by each
# parallel implementation, (serial_time - parallel_time) / serial_time * 100.
# 100 % is the upper bound (parallel time of 0); negative values mean the
# parallel run was slower than the serial one.

# Every series must be non-empty, otherwise there is nothing to compare.
if not (serial_sizes and serial_times and cuda_sizes and cuda_times and
        OpenCL_dimensions and OpenCL_times and mp_dimensions and mp_times and
        mpi_dimensions and mpi_times):
    print("No data available to generate the speedup chart.")
else:
    # Samples are paired purely by position: the i-th entry of every series
    # is assumed to belong to the same image as serial_sizes[i].
    # If the series differ in length that assumption is suspect, so say so
    # instead of crashing or silently dropping points.
    if len({len(serial_sizes), len(serial_times), len(cuda_times),
            len(OpenCL_times), len(mp_times), len(mpi_times)}) > 1:
        print("Warning: timing series differ in length; "
              "only the leading samples common to all series are plotted.")

    # zip() stops at the shorter series, so a parallel run with fewer samples
    # than the serial run no longer raises IndexError.
    cuda_speedup = [(serial_t - parallel_t) / serial_t * 100
                    for serial_t, parallel_t in zip(serial_times, cuda_times)]
    opencl_speedup = [(serial_t - parallel_t) / serial_t * 100
                      for serial_t, parallel_t in zip(serial_times, OpenCL_times)]
    openmp_speedup = [(serial_t - parallel_t) / serial_t * 100
                      for serial_t, parallel_t in zip(serial_times, mp_times)]
    # NOTE: despite its name, this series is computed from the MPI timings.
    pthreads_speedup = [(serial_t - parallel_t) / serial_t * 100
                        for serial_t, parallel_t in zip(serial_times, mpi_times)]

    # Sort all series together by image size so the lines run left to right.
    sorted_data = sorted(zip(serial_sizes, cuda_speedup, opencl_speedup, openmp_speedup, pthreads_speedup),
                         key=lambda x: x[0])
    sorted_sizes, sorted_cuda_speedup, sorted_opencl_speedup, sorted_openmp_speedup, sorted_pthreads_speedup = zip(*sorted_data)

    # Plot
    plt.figure(figsize=(12, 6))
    plt.plot(sorted_sizes, sorted_cuda_speedup, marker='s', linestyle='--', label='CUDA Speedup')
    plt.plot(sorted_sizes, sorted_opencl_speedup, marker='x', linestyle='-.', label='OpenCL Speedup')
    plt.plot(sorted_sizes, sorted_openmp_speedup, marker='D', linestyle='-', label='OpenMP Speedup')
    plt.plot(sorted_sizes, sorted_pthreads_speedup, marker='*', linestyle=':', label='MPI Speedup')

    # Fixed y-range: the metric cannot exceed 100 %, so 110 leaves headroom;
    # slowdowns worse than -500 % fall outside the visible area.
    plt.ylim(-500, 110)
    # x-axis starts at 0 and ends at the largest plotted image size.
    plt.xlim(0, max(sorted_sizes))

    # Labels, title, grid
    plt.xlabel('Image Size (pixels)', fontsize=12)
    plt.ylabel('Speedup Percentage (%)', fontsize=12)
    plt.title('Speedup Percentage: Serial vs Parallel Techniques', fontsize=14)
    plt.legend()
    plt.grid(True)
    plt.tight_layout()

    # Save and show
    plt.savefig('speedup_percentage_chart_adjusted.png')
    plt.show()
