In [1]:
import os, cv2, math, json, shutil, random
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

import dask, dask.bag as db, dask.dataframe as dd, dask.array as da
from dask import delayed, compute
from dask.distributed import Client, LocalCluster

In [2]:
# You can toggle processes vs. threads per workload characteristics.  :contentReference[oaicite:2]{index=2}
cluster = LocalCluster(processes=True, n_workers=4, threads_per_worker=2)
client  = Client(cluster)
print(client)  # Optional: view dashboard URL

<Client: 'tcp://127.0.0.1:63481' processes=4 threads=8, memory=15.69 GiB>


In [3]:
DATASET_ROOT = r"kvasir-dataset-v2"
OUTPUT_DIR   = Path("outputs_dask"); OUTPUT_DIR.mkdir(exist_ok=True)
(OUTPUT_DIR / "quarantine").mkdir(exist_ok=True, parents=True)
VALID_EXTS   = (".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff")
random.seed(42); np.random.seed(42)

In [4]:
def safe_imread(path: str):
    """Robust image read (works with non-ASCII paths, returns None if corrupt)."""
    try:
        with open(path, "rb") as f:
            arr = np.frombuffer(f.read(), dtype=np.uint8)
        img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
        return img
    except Exception:
        return None

def image_entropy_gray(gray: np.ndarray) -> float:
    """Shannon entropy on grayscale histogram."""
    hist = cv2.calcHist([gray], [0], None, [256], [0, 256]).ravel()
    p = hist / (gray.size + 1e-12)
    p = p[p > 0]
    return float(-(p * np.log2(p)).sum())

def hue_circular_mean(h_channel: np.ndarray) -> float:
    """Circular mean for OpenCV hue (0..179) as per color wheel."""
    angles = h_channel.astype(np.float32) * (2*np.pi / 180.0)
    C, S = np.cos(angles).mean(), np.sin(angles).mean()
    ang = math.atan2(S, C)
    if ang < 0: ang += 2*np.pi
    return float(ang * (180.0 / (2*np.pi)))

def extract_features_one(path: str, label: str):
    """
    Single-image feature extraction: brightness, contrast, saturation,
    hue (circular), entropy, edge_density + dims.
    Returns dict or None for corrupt images.
    """
    img = safe_imread(path)
    if img is None:
        # Move corrupt file for audit; keep going (distributed robustness)
        try:
            shutil.copy2(path, OUTPUT_DIR / "quarantine" / Path(path).name)
        except Exception:
            pass
        return None

    hgt, wdt = img.shape[:2]
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    hsv  = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    H, S, V = cv2.split(hsv)

    # Brightness/contrast
    brightness = float(gray.mean())
    contrast   = float(gray.std())

    # Saturation & Hue (circular mean per your plan)
    saturation = float(S.mean())
    hue_circ   = hue_circular_mean(H)

    # Entropy (texture/complexity)
    entropy    = image_entropy_gray(gray)

    # Edge density (Canny; thresholds from luminance median)
    med = float(np.median(V))
    low, high = int(max(0, 0.66*med)), int(min(255, 1.33*med))
    edges = cv2.Canny(gray, low, high, L2gradient=True)
    edge_density = float(edges.mean() / 255.0)

    return {
        "filepath": path,
        "label": label,
        "height": int(hgt),
        "width":  int(wdt),
        "brightness": brightness,
        "contrast":   contrast,
        "saturation": saturation,
        "hue_circular": hue_circ,
        "entropy": entropy,
        "edge_density": edge_density,
    }


In [5]:
def enumerate_files(root: str):
    """Collect (path,label) pairs for class-per-folder layout."""
    pairs = []
    for cls_dir in Path(root).iterdir():
        if cls_dir.is_dir():
            label = cls_dir.name
            for fp in cls_dir.rglob("*"):
                if fp.is_file() and fp.suffix.lower() in VALID_EXTS:
                    pairs.append((str(fp), label))
    return pairs

pairs = enumerate_files(DATASET_ROOT)
print(f"Found {len(pairs)} candidate files.")

# Build delayed tasks (lazy graph) ‚Äî per lecture on dask.delayed :contentReference[oaicite:3]{index=3}
delayed_rows = [delayed(extract_features_one)(p, l) for p, l in pairs]

# Compute in parallel (use processes by default; can override scheduler) :contentReference[oaicite:4]{index=4}
results = dask.compute(*delayed_rows)    # equivalent: compute(..., scheduler='processes')
rows    = [r for r in results if r is not None]

# Convert to Dask DataFrame (partitions = CPU count by default)
pdf = pd.DataFrame(rows)
if pdf.empty:
    raise SystemExit("No valid images processed. Check DATASET_ROOT.")
ddf = dd.from_pandas(pdf, npartitions=os.cpu_count())


Found 8000 candidate files.


In [6]:
# (Lectures recommend Parquet for speed; CSV kept for compatibility) :contentReference[oaicite:5]{index=5}
FEATURES = ["brightness","contrast","saturation","hue_circular","entropy","edge_density"]

# Min‚ÄìMax scaling on Dask DataFrame
mins = ddf[FEATURES].min().compute()
maxs = ddf[FEATURES].max().compute()
for c in FEATURES:
    ddf[c+"_minmax"] = (ddf[c] - mins[c]) / (maxs[c] - mins[c])

# Single-file CSV and Parquet
csv_path = OUTPUT_DIR / "kvasir_features_dask.csv"
parq_dir = OUTPUT_DIR / "kvasir_features_parquet"
ddf.to_csv(str(csv_path), single_file=True, index=False)
ddf.to_parquet(str(parq_dir), engine="pyarrow", overwrite=True)


In [7]:
# ================================================================
# Kvasir Dataset - Final EDA Visualization Script
# ================================================================

# Set improved styling
sns.set_theme(style="whitegrid", palette="Set2")
plt.rcParams.update({
    'figure.dpi': 120,
    'axes.labelsize': 12,
    'axes.titlesize': 14,
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
    'legend.fontsize': 10
})

# Load data from Dask DataFrame (compute to pandas)
df = ddf.compute()
classes = sorted(df['label'].unique())

# ----------------------------------------------------------------
# 1Ô∏è‚É£ Class Distribution (with annotations)
# ----------------------------------------------------------------
plt.figure(figsize=(10, 5))
ax = sns.countplot(x='label', data=df, order=classes, palette="viridis")
for p in ax.patches:
    ax.annotate(f'{int(p.get_height())}',
                (p.get_x()+p.get_width()/2., p.get_height()),
                ha='center', va='bottom', fontsize=9)
plt.title("Class Distribution (8 Classes)")
plt.xlabel("Class")
plt.ylabel("Image Count")
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig(OUTPUT_DIR / "plot_class_distribution.png", dpi=150, bbox_inches='tight')
plt.close()

# ----------------------------------------------------------------
# 2Ô∏è‚É£ Image Dimensions Histograms
# ----------------------------------------------------------------
for col in ["width", "height"]:
    col_vals = df[col]
    plt.figure(figsize=(10, 5))
    plt.hist(col_vals, bins=30, color='steelblue', alpha=0.7, edgecolor='black')
    plt.title(f"Image {col.capitalize()} Distribution")
    plt.xlabel(col)
    plt.ylabel("Frequency")
    plt.grid(axis='y', alpha=0.3)
    plt.tight_layout()
    plt.savefig(OUTPUT_DIR / f"hist_{col}.png", dpi=150, bbox_inches='tight')
    plt.close()

# ----------------------------------------------------------------
# 3Ô∏è‚É£ Feature Correlation Heatmap (improved)
# ----------------------------------------------------------------
corr = df[FEATURES].corr()
plt.figure(figsize=(10, 7))
sns.heatmap(corr, annot=True, cmap="RdYlBu_r", fmt=".2f", linewidths=0.3, 
            square=True, cbar_kws={"shrink": 0.8})
plt.title("Feature Correlation Heatmap", fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig(OUTPUT_DIR / "plot_feature_correlation.png", dpi=150, bbox_inches='tight')
plt.close()

# ----------------------------------------------------------------
# 4Ô∏è‚É£ Box Plots for key features
# ----------------------------------------------------------------
def box_plot(feature, ylabel):
    plt.figure(figsize=(10, 6))
    sns.boxplot(x='label', y=feature, data=df, order=classes,
                palette="pastel", showfliers=True)
    plt.title(f"{feature.capitalize()} by Class", fontsize=14, fontweight='bold')
    plt.xlabel("Class")
    plt.ylabel(ylabel)
    plt.xticks(rotation=45, ha='right')
    plt.grid(axis='y', alpha=0.3)
    plt.tight_layout()
    plt.savefig(OUTPUT_DIR / f"box_{feature}.png", dpi=150, bbox_inches='tight')
    plt.close()

for f in FEATURES:
    box_plot(f, f)

# ----------------------------------------------------------------
# 5Ô∏è‚É£ Violin plots for distributions (brightness, contrast)
# ----------------------------------------------------------------
for f in ["brightness", "contrast"]:
    plt.figure(figsize=(10, 6))
    sns.violinplot(x='label', y=f, data=df, order=classes,
                   palette="Set2", inner='box', scale='width')
    plt.title(f"{f.capitalize()} Distribution by Class (Violin Plot)", 
              fontsize=14, fontweight='bold')
    plt.xlabel("Class")
    plt.ylabel(f)
    plt.xticks(rotation=45, ha='right')
    plt.grid(axis='y', alpha=0.3)
    plt.tight_layout()
    plt.savefig(OUTPUT_DIR / f"violin_{f}.png", dpi=150, bbox_inches='tight')
    plt.close()

# ----------------------------------------------------------------
# 6Ô∏è‚É£ Hue Circular - Density plot (KDE) by class
# ----------------------------------------------------------------
plt.figure(figsize=(12, 6))
for label in classes:
    data = df[df['label'] == label]['hue_circular']
    sns.kdeplot(data=data, label=label, alpha=0.6, linewidth=2)
plt.title("Hue Circular Distribution by Class (KDE)", fontsize=14, fontweight='bold')
plt.xlabel("Hue Circular")
plt.ylabel("Density")
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=9)
plt.grid(alpha=0.3)
plt.tight_layout()
plt.savefig(OUTPUT_DIR / "kde_hue_circular.png", dpi=150, bbox_inches='tight')
plt.close()

# ----------------------------------------------------------------
# 7Ô∏è‚É£ Strip / Swarm plots for fine-grained view
# ----------------------------------------------------------------
# Entropy - Strip plot
plt.figure(figsize=(12, 6))
sns.stripplot(x='label', y='entropy', data=df.sample(frac=0.3, random_state=42),
              order=classes, alpha=0.4, jitter=0.25, s=3, color='steelblue')
plt.title("Entropy by Class (Strip Plot)", fontsize=14, fontweight='bold')
plt.xlabel("Class")
plt.ylabel("Entropy")
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.savefig(OUTPUT_DIR / "strip_entropy.png", dpi=150, bbox_inches='tight')
plt.close()

# Edge Density - Swarm plot
plt.figure(figsize=(12, 6))
sns.swarmplot(x='label', y='edge_density', data=df.sample(frac=0.2, random_state=42),
              order=classes, alpha=0.6, size=3, color='teal')
plt.title("Edge Density by Class (Swarm Plot)", fontsize=14, fontweight='bold')
plt.xlabel("Class")
plt.ylabel("Edge Density")
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.savefig(OUTPUT_DIR / "swarm_edge_density.png", dpi=150, bbox_inches='tight')
plt.close()

# ----------------------------------------------------------------
# 8Ô∏è‚É£ Mean ¬± Std Bar plot (Saturation)
# ----------------------------------------------------------------
agg = df.groupby('label')['saturation'].agg(['mean', 'std']).reset_index()
plt.figure(figsize=(12, 6))
plt.bar(agg['label'], agg['mean'], yerr=agg['std'], capsize=5,
        color='coral', edgecolor='black', alpha=0.7)
plt.title("Mean Saturation by Class (Mean ¬± SD)", fontsize=14, fontweight='bold')
plt.xlabel("Class")
plt.ylabel("Saturation (Mean ¬± SD)")
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.savefig(OUTPUT_DIR / "bar_saturation_mean_std.png", dpi=150, bbox_inches='tight')
plt.close()

# ----------------------------------------------------------------
# 9Ô∏è‚É£ Scatter / Pairwise relationship (improved)
# ----------------------------------------------------------------
sampled_df = df.sample(frac=0.25, random_state=42)
g = sns.pairplot(sampled_df[FEATURES], diag_kind="hist", corner=True,
                 plot_kws={'alpha': 0.5, 's': 10}, 
                 diag_kws={'alpha': 0.7, 'bins': 30})
g.fig.suptitle("Feature Scatter Matrix (25% Sample)", y=1.02, fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig(OUTPUT_DIR / "scatter_matrix.png", dpi=150, bbox_inches='tight')
plt.close()

# ----------------------------------------------------------------
# üîü Summary Table for report
# ----------------------------------------------------------------
summary = df.groupby('label')[FEATURES].agg(['mean', 'std']).round(2)
summary.to_csv(OUTPUT_DIR / "feature_summary_table.csv")
print("EDA visualizations & summary table exported successfully.")



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  ax = sns.countplot(x='label', data=df, order=classes, palette="viridis")



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='label', y=feature, data=df, order=classes,



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='label', y=feature, data=df, order=classes,



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='label', y=feature, data=df, order=classes,



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='label', y=feature, data=df, order=classes,



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='label', y=feature, data=df, order=classes,



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='label', y=feature, data=df, order=classes,



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.violinplot(x='label', y=f, data=df, order=classes,

The `scale` parameter has been renamed and will be removed in v0.15.0. Pass `density_norm='width'` for the same effect.
  sns.violinplot(x='label', y=f, data=df, order=classes,



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.violinplot(x='label', y=f, data=df, order=classes,

The `scale` parameter has been renamed and will be removed in v0.15.0. Pass `density_norm='width'` for the same effect.
  sns.violinplot(x='label', y=f, data=df, order=classes,


















EDA visualizations & summary table exported successfully.


In [8]:
# Compute class counts for summary
class_counts = ddf["label"].value_counts().compute()

summary = {
    "total_images_scanned": int(len(pairs)),
    "valid_images": int(ddf.shape[0].compute()),
    "classes": {k: int(v) for k,v in class_counts.to_dict().items()},
    "feature_summary": ddf[FEATURES].describe().compute().to_dict(),
}
with open(OUTPUT_DIR / "qc_summary.json", "w") as f:
    json.dump(summary, f, indent=2)

html = f"""
<html><head><meta charset="utf-8"><title>Checkpoint 3: Dask EDA (Kvasir)</title>
<style>
    body {{ font-family: Arial, sans-serif; margin: 20px; background-color: #f5f5f5; }}
    h1 {{ color: #2c3e50; }}
    h2 {{ color: #34495e; border-bottom: 2px solid #3498db; padding-bottom: 5px; }}
    h3 {{ color: #7f8c8d; margin-top: 20px; }}
    ul {{ line-height: 1.8; }}
    img {{ margin: 10px 0; border: 1px solid #ddd; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }}
</style>
</head>
<body>
<h1>Distributed EDA with Dask ‚Äî Kvasir Dataset</h1>
<p><strong>Cluster:</strong> {client}</p>
<ul>
  <li><strong>Total files scanned:</strong> {summary['total_images_scanned']}</li>
  <li><strong>Valid images processed:</strong> {summary['valid_images']}</li>
  <li><strong>Number of classes:</strong> {len(summary['classes'])}</li>
  <li><strong>Classes:</strong> {', '.join(summary['classes'].keys())}</li>
</ul>

<h2>üìä Dataset Overview</h2>
<h3>1. Class Distribution</h3>
<img src="plot_class_distribution.png" width="900" alt="Class Distribution">

<h3>2. Image Dimensions</h3>
<img src="hist_width.png" width="900" alt="Image Width Distribution">
<img src="hist_height.png" width="900" alt="Image Height Distribution">

<h2>üìà Feature Analysis by Class</h2>
<h3>3. Feature Correlation Heatmap</h3>
<img src="plot_feature_correlation.png" width="900" alt="Feature Correlation">

<h3>4. Box Plots (All Features)</h3>
<img src="box_brightness.png" width="900" alt="Brightness Box Plot">
<img src="box_contrast.png" width="900" alt="Contrast Box Plot">
<img src="box_saturation.png" width="900" alt="Saturation Box Plot">
<img src="box_hue_circular.png" width="900" alt="Hue Circular Box Plot">
<img src="box_entropy.png" width="900" alt="Entropy Box Plot">
<img src="box_edge_density.png" width="900" alt="Edge Density Box Plot">

<h3>5. Violin Plots (Distribution Shapes)</h3>
<img src="violin_brightness.png" width="900" alt="Brightness Violin Plot">
<img src="violin_contrast.png" width="900" alt="Contrast Violin Plot">

<h3>6. Statistical Summaries</h3>
<img src="bar_saturation_mean_std.png" width="900" alt="Saturation Mean and Std">
<img src="kde_hue_circular.png" width="900" alt="Hue Circular KDE">

<h3>7. Fine-Grained Visualizations</h3>
<img src="strip_entropy.png" width="900" alt="Entropy Strip Plot">
<img src="swarm_edge_density.png" width="900" alt="Edge Density Swarm Plot">

<h2>üîó Feature Relationships</h2>
<h3>8. Scatter Matrix</h3>
<img src="scatter_matrix.png" width="1000" alt="Feature Scatter Matrix">

<h2>üìã Summary</h2>
<ul>
  <li><strong>Features extracted:</strong> brightness, contrast, saturation, hue (circular), entropy, edge density</li>
  <li><strong>Visualization types:</strong> 
    <ul>
      <li>Count plot with annotations (class distribution)</li>
      <li>Histograms (dimensions)</li>
      <li>Heatmap (correlation matrix)</li>
      <li>Box plots (all features)</li>
      <li>Violin plots (brightness, contrast)</li>
      <li>Bar plots with error bars (saturation)</li>
      <li>KDE density plots (hue)</li>
      <li>Strip plots (entropy)</li>
      <li>Swarm plots (edge density)</li>
      <li>Pair plot (scatter matrix)</li>
    </ul>
  </li>
  <li><strong>Data exports:</strong>
    <ul>
      <li>CSV: kvasir_features_dask.csv</li>
      <li>Parquet: kvasir_features_parquet/ (8 partitions)</li>
      <li>Summary table: feature_summary_table.csv</li>
      <li>Quality control: qc_summary.json</li>
    </ul>
  </li>
  <li><strong>Min‚ÄìMax scaled features:</strong> All features have *_minmax scaled versions for dashboards</li>
</ul>

<p><em>Report generated using Dask for distributed processing</em></p>
</body></html>
"""
(OUTPUT_DIR / "eda_report.html").write_text(html, encoding="utf-8")

print("\n" + "="*60)
print("DONE. All outputs exported to:", OUTPUT_DIR.resolve())
print("="*60)
print("\nüìÅ Data Files:")
print("  - kvasir_features_dask.csv")
print("  - kvasir_features_parquet/")
print("  - feature_summary_table.csv")
print("  - qc_summary.json")
print("\nüìä Visualizations:")
print("  - plot_class_distribution.png")
print("  - hist_width.png, hist_height.png")
print("  - plot_feature_correlation.png")
print("  - box_*.png (6 files)")
print("  - violin_brightness.png, violin_contrast.png")
print("  - bar_saturation_mean_std.png")
print("  - kde_hue_circular.png")
print("  - strip_entropy.png")
print("  - swarm_edge_density.png")
print("  - scatter_matrix.png")
print("\nüìÑ Reports:")
print("  - eda_report.html")
print("\n" + "="*60)



DONE. All outputs exported to: C:\Users\User\Desktop\lums\3rd semester\Data Science Visualize\Project\outputs_dask

üìÅ Data Files:
  - kvasir_features_dask.csv
  - kvasir_features_parquet/
  - feature_summary_table.csv
  - qc_summary.json

üìä Visualizations:
  - plot_class_distribution.png
  - hist_width.png, hist_height.png
  - plot_feature_correlation.png
  - box_*.png (6 files)
  - violin_brightness.png, violin_contrast.png
  - bar_saturation_mean_std.png
  - kde_hue_circular.png
  - strip_entropy.png
  - swarm_edge_density.png
  - scatter_matrix.png

üìÑ Reports:
  - eda_report.html

