## **Imports**


In [None]:
from sklearn.neural_network import MLPClassifier  # MLP is an NN
from sklearn import svm
import numpy as np
import argparse
#import imutils  # If you are unable to install this library, ask the TA; we only need this in extract_hsv_histogram.
import cv2
import os
import re
import random
import pytesseract
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from skimage.util import random_noise
from PIL import Image, ImageDraw, ImageFont
from arabic_reshaper import reshape
from bidi.algorithm import get_display
import pandas as pd
from openpyxl.utils import get_column_letter  # Add this line
from commonfunctions import *
import numpy as np
import unittest

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

import cv2
import numpy as np
import matplotlib.pyplot as plt

# Path to the Tesseract executable that pytesseract will launch; adjust per machine.
# NOTE(review): inside a raw string `\\` stays as two backslashes, so this path
# contains doubled separators — confirm this is intended on the target system.
pytesseract.pytesseract.tesseract_cmd = r'C:\\Program Files\\Tesseract-OCR\\tesseract.exe' # (Windows Example)


# Nominal (width, height) for classifier inputs.
target_img_size = (32, 32) # fix image size because classification algorithms THAT WE WILL USE HERE expect that
# We are going to fix the random seed to make our experiments reproducible 
# since some algorithms use pseudorandom generators
random_seed = 42  
# Seed both Python's and NumPy's global generators with the same value.
random.seed(random_seed)
np.random.seed(random_seed)

## **Main Functions Overview**

- **Image Alignment**
  - Detects SIFT keypoints and descriptors in the input and reference images.
  - Matches them using the ratio test.
  - Uses RANSAC to estimate a homography.
  - Applies the homography to warp the input image so it lines up with the reference.
  - Returns the aligned image (or the original if not enough matches are found).

- **Extract Details**
  - Uses (x, y, w, h) coordinates to crop the aligned card into:
    - The name region
    - The code (ID) region
  - Returns these sub-images for downstream OCR or digit processing.

- **Save Student Name**
  - Ensures the output folder exists.
  - Writes the cropped name image to disk with a filename that includes the student ID.
  - Creates a persistent record usable for manual review or OCR.

- **Split and Save Digits**
  - Converts the code region to grayscale and applies Otsu thresholding.
  - Finds contours and filters out small noise.
  - Selects the largest seven contours (by area) and sorts them left-to-right.
  - Saves each detected digit crop into a per-student folder as individual image files.

- **save_split_digits**
  - Takes a list of digit images for a student.
  - Ensures a folder exists for each student (named by their ID).
  - Saves each digit image as `digit_0.jpg`, `digit_1.jpg`, ..., `digit_6.jpg` inside the student’s folder.
  - Used for batch saving when all digit crops are already extracted.

## **Noise Detection and Treatment**
- **Impulsive Noise (Median Filter)**
- **Random Noise (Gaussian Filter)**

In [None]:
def is_random_noise(img, threshold=0.01):
    """
    Flag an image as carrying random (Gaussian-like) noise and smooth it if so.

    The decision is based on the standard deviation of all pixel intensities,
    divided by 255. Colour (3-dimensional) input is converted to grayscale
    first, so the returned image is always the grayscale version.

    Args:
        img: Image array; 3-dimensional input is treated as BGR.
        threshold: Normalised standard deviation at or above which the image
            is blurred.

    Returns:
        (image, treated): the grayscale image, blurred with a 7x7 Gaussian
        kernel when treated is True, otherwise unfiltered.
    """
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) if len(img.shape) == 3 else img

    # Global intensity spread scaled to [0, 1].
    # NOTE(review): this also grows with ordinary image contrast, not only noise.
    spread = np.std(gray) / 255.0

    needs_blur = not (spread < threshold)
    if needs_blur:
        return cv2.GaussianBlur(gray, (7, 7), 0), True

    # Spread below threshold: hand back the unfiltered grayscale image.
    return gray, False

# gray_img = cv2.imread('ykismail_College-ID-Scanner_main_images/ID10.jpg', cv2.IMREAD_GRAYSCALE)
# noisy_img = random_noise(gray_img, mode='gaussian', mean=0.5)
# noisy_img = (noisy_img * 255).astype('uint8')  # Convert back to uint8 for OpenCV

# TreatedImg, noise_detected = is_random_noise(noisy_img)
# if noise_detected:
#     print("Random noise detected and treated!")
# else:
#     print("No random noise detected.")




In [None]:
def is_impulsive_noise(img, threshold=0.1, black_range=(0, 9), white_range=(246, 255)):
    """
    Detect impulsive (salt-and-pepper) noise and, if present, median-filter it.

    The noise measure is the fraction of pixels whose value lies inside
    black_range or white_range (both inclusive). Colour (3-dimensional) input
    is converted to grayscale first, so the returned image is grayscale.

    Args:
        img: Image array; 3-dimensional input is treated as BGR.
        threshold: Fraction of extreme pixels at or above which the image is
            filtered.
        black_range: Inclusive (low, high) intensity range counted as "pepper".
        white_range: Inclusive (low, high) intensity range counted as "salt".

    Returns:
        (image, treated): the grayscale image, median-filtered when treated is
        True, otherwise unfiltered. An empty image is returned untouched with
        treated False.
    """
    # An empty image has no pixels to classify; without this guard the
    # proportion below would be 0/0 and the kernel-size cast would fail.
    if img.size == 0:
        return img, False

    if len(img.shape) == 3:
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    in_black = (img >= black_range[0]) & (img <= black_range[1])
    in_white = (img >= white_range[0]) & (img <= white_range[1])
    prop = (np.count_nonzero(in_black) + np.count_nonzero(in_white)) / img.size

    if prop < threshold:
        return img, False  # No significant noise, return the unfiltered image

    # Kernel grows with noise severity; medianBlur needs an odd size, and the
    # result is clamped to the range [3, 9].
    k = int(3 + prop * 10)
    if k % 2 == 0:
        k += 1
    k = min(max(k, 3), 9)

    filtered_img = cv2.medianBlur(img, k)
    return filtered_img, True

# gray_img = cv2.imread('ykismail_College-ID-Scanner_main_images/ID10.jpg', cv2.IMREAD_GRAYSCALE)
# noisy_img = random_noise(gray_img, mode='s&p', amount=0.6)
# noisy_img = (noisy_img * 255).astype('uint8')  # Convert back to uint8 for OpenCV

# TreatedImg, noise_detected = is_impulsive_noise(noisy_img)
# if noise_detected:
#     print("Impulsive noise detected and treated!")
# else:
#     print("No impulsive noise detected.")


In [None]:
def align_images_sift(img_to_align, reference_path):
    """
    Warp an image onto a reference template using SIFT matches and a homography.

    Keypoints are matched with a brute-force matcher and filtered with the
    0.75 ratio test; the homography is estimated with RANSAC (reprojection
    threshold 5.0).

    Args:
        img_to_align: Image array to align (grayscale or BGR).
        reference_path: Path of the reference (template) image on disk.

    Returns:
        The input warped to the reference image's width and height, or the
        unmodified input when there are not more than 10 good matches or no
        homography could be estimated.

    Raises:
        FileNotFoundError: If the reference image cannot be read.
    """
    reference = cv2.imread(reference_path)
    if reference is None:
        # cv2.imread signals failure by returning None rather than raising.
        raise FileNotFoundError(f"Reference image could not be read: {reference_path}")

    def to_gray(image):
        # SIFT runs on single-channel images; leave grayscale input as is.
        if len(image.shape) == 3:
            return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        return image

    sift = cv2.SIFT_create()
    kp_src, des_src = sift.detectAndCompute(to_gray(img_to_align), None)
    kp_ref, des_ref = sift.detectAndCompute(to_gray(reference), None)

    good_matches = []
    # Descriptors are None when no keypoints were found, and k=2 matching
    # needs at least two reference descriptors to produce a pair.
    if des_src is not None and des_ref is not None and len(des_ref) >= 2:
        for pair in cv2.BFMatcher().knnMatch(des_src, des_ref, k=2):
            if len(pair) < 2:
                continue
            best, runner_up = pair
            if best.distance < 0.75 * runner_up.distance:
                good_matches.append(best)

    if len(good_matches) <= 10:
        print(f"Not enough matches found: {len(good_matches)}/10")
        return img_to_align

    src_pts = np.float32([kp_src[m.queryIdx].pt for m in good_matches]).reshape(-1, 1, 2)
    dst_pts = np.float32([kp_ref[m.trainIdx].pt for m in good_matches]).reshape(-1, 1, 2)

    M, _ = cv2.findHomography(src_pts, dst_pts, cv2.RANSAC, 5.0)
    if M is None:
        # RANSAC can fail to find a consistent model even with enough matches.
        print("Homography estimation failed; returning the unaligned image")
        return img_to_align

    h, w = reference.shape[:2]
    # Warp the original input (whether gray or colour) into the reference frame.
    return cv2.warpPerspective(img_to_align, M, (w, h))
    


def extract_details(aligned_image):
    """
    Crop the name and code regions out of an aligned ID card image.

    The regions are fixed (x, y, w, h) boxes in the coordinate frame of the
    aligned card.

    Args:
        aligned_image: Array indexed as [row, column, ...].

    Returns:
        (name_region, code_region) as slices of aligned_image.
    """
    boxes = {
        "name": (100, 205, 1200, 150),
        "code": (640, 404, 335, 110),
    }

    def crop(box):
        left, top, width, height = box
        return aligned_image[top:top + height, left:left + width]

    return crop(boxes["name"]), crop(boxes["code"])


def save_student_name(student_id, name_img, output_folder="extracted_names"):
    """
    Write a student's cropped name image to disk.

    The file is saved as "<output_folder>/<student_id>_name.jpg"; the folder
    is created if it does not exist.

    Args:
        student_id: Identifier used as the filename prefix.
        name_img: Image array holding the cropped name region.
        output_folder: Destination directory.

    Returns:
        None. An empty or missing image, or a failed write, is reported with a
        printed message instead of raising.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)

    # Construct filename: extracted_names/ID1_name.jpg
    filename = f"{output_folder}/{student_id}_name.jpg"

    # cv2.imwrite raises on an empty image, so report and skip it instead.
    if name_img is None or name_img.size == 0:
        print(f"Skipping empty name image for {student_id}")
        return

    # cv2.imwrite reports failure through its return value, not an exception.
    if not cv2.imwrite(filename, name_img):
        print(f"Failed to write name image: {filename}")
    

def split_and_save_digits(student_id, code_roi, output_folder="extracted_digits"):
    """
    Segment the digits of a code region and save each one as an image file.

    The region is binarised with inverted Otsu thresholding, external contours
    are boxed, boxes that are too small are dropped, the seven largest boxes
    (by bounding-box area) are kept and written left-to-right as
    "<output_folder>/ID<student_id>/digit_<index>.jpg".

    Args:
        student_id: Identifier appended to "ID" to name the per-student folder.
        code_roi: Image array of the code region (grayscale or BGR).
        output_folder: Root directory for the per-student folders.

    Returns:
        None. An empty region or a failed write is reported with a printed
        message instead of raising.
    """
    save_path = f"{output_folder}/ID{student_id}"
    os.makedirs(save_path, exist_ok=True)

    # Thresholding an empty array fails inside OpenCV; nothing to segment anyway.
    if code_roi is None or code_roi.size == 0:
        print(f"Skipping empty code region for {student_id}")
        return

    # Accept grayscale input too: BGR2GRAY on a single-channel image raises.
    if len(code_roi.shape) == 3:
        gray = cv2.cvtColor(code_roi, cv2.COLOR_BGR2GRAY)
    else:
        gray = code_roi
    _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # A. Collect every bounding box that is large enough to be a digit.
    candidates = []
    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)
        if h > 15 and w > 5:
            candidates.append((x, y, w, h, w * h))

    # B. Keep the seven largest boxes; this drops specks and punctuation
    #    that survived the size filter.
    largest = sorted(candidates, key=lambda c: c[4], reverse=True)[:7]

    # C. Restore reading order (left -> right).
    final_digits = sorted(largest, key=lambda c: c[0])

    # D. Save each crop; imwrite signals failure through its return value.
    for index, (x, y, w, h, _area) in enumerate(final_digits):
        filename = f"{save_path}/digit_{index}.jpg"
        if not cv2.imwrite(filename, code_roi[y:y + h, x:x + w]):
            print(f"Failed to write digit image: {filename}")

      
import cv2
import numpy as np

def extract_name_and_digits(aligned_image):
    """
    Crop the name region and segment the digits of the code and daf3 regions.

    All regions are fixed (x, y, w, h) boxes in the coordinate frame of the
    aligned card.

    Args:
        aligned_image: Aligned ID card image (grayscale or BGR).

    Returns:
        (name_img, code_digits, daf3_digits):
          - name_img: crop of the name region.
          - code_digits: list of up to 7 digit crops, ordered left to right.
          - daf3_digits: list of up to 14 digit crops, ordered left to right.
        A digit list is empty when its region falls outside the image.
    """
    # --- 1. Hardcoded coordinates: x, y, w, h ---
    name_coords = (100, 205, 1200, 150)
    code_coords = (640, 404, 335, 110)
    daf3_coords = (350, 500, 620, 110)

    def crop(coords):
        x, y, w, h = coords
        return aligned_image[y:y + h, x:x + w]

    def process_roi_digits(roi_img, digit_limit):
        """
        Binarise a region, box its contours, split boxes that look like two
        merged digits, keep the digit_limit largest boxes and return their
        crops ordered left to right.
        """
        # A region outside the image is an empty slice; OpenCV cannot
        # threshold it, and there is nothing to segment.
        if roi_img.size == 0:
            return []

        if len(roi_img.shape) == 3:
            gray = cv2.cvtColor(roi_img, cv2.COLOR_BGR2GRAY)
        else:
            gray = roi_img

        # Inverted binary + Otsu: dark ink becomes the foreground.
        _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
        contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

        candidates = []
        for cnt in contours:
            x, y, w, h = cv2.boundingRect(cnt)

            # Filter tiny noise
            if h <= 15 or w <= 5:
                continue

            if w > 0.8 * h:
                # Unusually wide box: treat it as two digits stuck together.
                # The right half takes the remainder so that an odd width
                # does not lose its last pixel column.
                left_w = w // 2
                right_w = w - left_w
                candidates.append((x, y, left_w, h, left_w * h))
                candidates.append((x + left_w, y, right_w, h, right_w * h))
            else:
                candidates.append((x, y, w, h, w * h))

        # Keep only the largest boxes (drops leftover noise) ...
        largest = sorted(candidates, key=lambda c: c[4], reverse=True)[:digit_limit]
        # ... then restore reading order (left -> right).
        ordered = sorted(largest, key=lambda c: c[0])

        return [roi_img[y:y + h, x:x + w] for (x, y, w, h, _area) in ordered]

    # --- 2. Extract and process regions ---
    name_img = crop(name_coords)
    code_digits = process_roi_digits(crop(code_coords), digit_limit=7)
    daf3_digits = process_roi_digits(crop(daf3_coords), digit_limit=14)

    return name_img, code_digits, daf3_digits

def save_split_digits(student_id, digit_imgs, output_folder="extracted_digits"):
    """
    Save already-cropped digit images into a per-student folder.

    Each image is written as "<output_folder>/<student_id>/digit_<index>.jpg",
    where index is the image's position in digit_imgs. The folder is created
    if it does not exist.

    Args:
        student_id: Name of the per-student folder.
        digit_imgs: Iterable of digit image arrays, in reading order.
        output_folder: Root directory for the per-student folders.

    Returns:
        None. An empty crop or a failed write is reported with a printed
        message instead of raising.
    """
    save_path = f"{output_folder}/{student_id}"
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(save_path, exist_ok=True)

    for index, digit_img in enumerate(digit_imgs):
        filename = f"{save_path}/digit_{index}.jpg"

        # cv2.imwrite raises on an empty image; skip it but keep the index so
        # the remaining files stay position-based.
        if digit_img is None or digit_img.size == 0:
            print(f"Skipping empty digit image: {filename}")
            continue

        # cv2.imwrite reports failure through its return value.
        if not cv2.imwrite(filename, digit_img):
            print(f"Failed to write digit image: {filename}")



## **SVM English Number Classifier**

In [None]:
path_to_train_dataset = r"train_digits"  # Training set

def train_SVM_robust():
    """
    Train a digit classifier (StandardScaler -> LinearSVC) on HOG features.

    Training images are read from path_to_train_dataset. Only .jpg/.png files
    are used, and the first letter of each filename ('a'..'j') gives the digit
    label ('0'..'9'). 20% of the samples are held out to print a validation
    accuracy.

    Returns:
        The fitted sklearn Pipeline.

    Raises:
        ValueError: If no usable training image is found.
    """
    # 1. Map the filename prefixes to actual digits
    label_map = {
        'a': '0', 'b': '1', 'c': '2', 'd': '3', 'e': '4',
        'f': '5', 'g': '6', 'h': '7', 'i': '8', 'j': '9'
    }

    features = []
    labels = []

    # os.listdir returns entries in arbitrary order; sorting keeps the seeded
    # train/validation split identical from one machine to the next.
    img_filenames = sorted(os.listdir(path_to_train_dataset))
    print(f"Loading {len(img_filenames)} training images...")

    for fn in img_filenames:
        if not fn.lower().endswith(('.jpg', '.png')):
            continue

        # The first letter (a, b, c...) encodes the digit.
        label = label_map.get(fn[0].lower())
        if label is None:
            continue

        img = cv2.imread(os.path.join(path_to_train_dataset, fn))
        if img is None:
            # cv2.imread returns None for unreadable files; skipping here
            # keeps features and labels the same length.
            print(f"Skipping unreadable training image: {fn}")
            continue

        # Same feature extraction as used at prediction time.
        features.append(extract_hog_features(img))
        labels.append(label)

    if not features:
        raise ValueError(f"No usable training images found in '{path_to_train_dataset}'")

    # 2. Pipeline: scale features, then train the linear SVM.
    clf = Pipeline([
        ('scaler', StandardScaler()),
        ('svc', LinearSVC(random_state=random_seed, max_iter=5000, dual=False))
    ])

    # 3. Train/validation split for an internal accuracy check.
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=random_seed
    )

    clf.fit(X_train, y_train)
    accuracy = clf.score(X_test, y_test)
    print(f"Training Complete. Validation Accuracy: {accuracy*100:.2f}%")

    return clf

def extract_hog_features(img):
    """
    Compute a flat HOG feature vector for one digit image.

    The image is converted to grayscale if needed, binarised with inverted
    Otsu thresholding, resized to 32x32 and described with a HOG descriptor
    (16x16 blocks, 8x8 stride, 8x8 cells, 9 orientation bins).

    Args:
        img: Digit image array; 3-dimensional input is treated as BGR.

    Returns:
        1-D array of HOG features.
    """
    gray = img if len(img.shape) != 3 else cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Binarising training and test crops the same way reduces the effect of
    # lighting and shadows on the features.
    binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
    patch = cv2.resize(binary, (32, 32))

    descriptor = cv2.HOGDescriptor(
        (32, 32),  # detection window
        (16, 16),  # block size
        (8, 8),    # block stride
        (8, 8),    # cell size
        9,         # orientation bins
    )
    return descriptor.compute(patch).flatten()

# **Tesseract Arabic OCR**

## **Current Situation**

The project uses Tesseract OCR to extract Arabic names from scanned images. Initially, the extraction pipeline achieved only a **70% success rate**. This meant that about 30% of the images failed to yield any valid Arabic text, even though the images were visually clear and contained readable names.

## **Why Was the Success Rate Only 70%?**

- **Overprocessing:** The original code applied several preprocessing steps (scaling, thresholding, blurring, etc.) before running OCR. While these steps can help with noisy or low-contrast images, they often **destroy clean, high-contrast text**—especially for Arabic, where fine details matter.
- **Order of Operations:** The pipeline tried processed versions first, so if the original image was already optimal, it was never used for OCR.
- **PSM/OEM Settings:** The code tried a limited set of Tesseract Page Segmentation Modes (PSM) and OCR Engine Modes (OEM), which may not have been optimal for all images.
- **Text Cleaning:** The cleaning function was aggressive, but if Tesseract output was empty or too short, the result was discarded.

## **What Was Changed to Achieve 100% Success**

1. **Prioritize the Original Image:**  
   The new code always tries the original, unprocessed grayscale image first, with several PSM settings. This ensures that clean images are not degraded by unnecessary processing.

2. **Expanded Preprocessing (Non-Destructive Only):**  
   In addition to the original image, the code also tries a padded version and a 2× scaled-and-padded version, but it never applies destructive thresholding or blurring.

3. **Multiple PSM and OEM Combinations:**  
   For each image variant, the code tries several PSM (6, 7, 3, 13) and both OEM (3, 1) settings, maximizing the chance that Tesseract will interpret the layout correctly.

4. **Result Selection:**  
   All non-empty results are collected, and the **longest valid extraction** is chosen, which is usually the correct full name.

5. **Diagnostics:**  
   Additional debug and diagnostic code was used to confirm that the original image, with minimal processing, consistently yields the best results for this dataset.

# Reference

The old (70%) code is left in the notebook for comparison. The new approach, as described above, achieves **100% extraction success** on the current dataset by respecting the quality of the input images and leveraging Tesseract's flexibility.

In [None]:
def extractname(img_path):
    """
    Extract an Arabic name from an image file with Tesseract OCR.

    The image is loaded as grayscale and OCR is run on three variants
    (original, padded, 2x scaled and padded) with several page-segmentation
    (--psm) and engine (--oem) modes. Each output is reduced to Arabic letters
    and single spaces, and the longest cleaned result with more than two
    characters wins.

    Args:
        img_path: Path of the image file to read.

    Returns:
        The extracted name, or "" if the image cannot be read or no attempt
        produced usable text.
    """

    # --- HELPER: TEXT CLEANER ---
    def clean_text(raw_text):
        if not raw_text:
            return ""
        # Keep Arabic letters (0621-064A) and whitespace only
        cleaned = re.sub(r'[^\u0621-\u064A\s]', '', raw_text)
        # Collapse every whitespace run (including newlines) to one space
        return re.sub(r'\s+', ' ', cleaned).strip()

    def pad(image):
        # White 40px margin on every side.
        return cv2.copyMakeBorder(image, 40, 40, 40, 40, cv2.BORDER_CONSTANT, value=255)

    # --- LOAD IMAGE AS GRAYSCALE DIRECTLY ---
    img_gray = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
    if img_gray is None:
        return ""

    h, w = img_gray.shape
    scaled = cv2.resize(img_gray, (w * 2, h * 2), interpolation=cv2.INTER_CUBIC)

    # Original first, then the padded and scaled+padded variants.
    variants = [img_gray, pad(img_gray), pad(scaled)]
    psm_modes = (6, 7, 3, 13)  # 13 = raw line
    oem_modes = (3, 1)         # 3 = engine default, 1 = LSTM only

    candidates = []
    last_error = None

    for image in variants:
        for psm in psm_modes:
            for oem in oem_modes:
                config = f"--oem {oem} --psm {psm}"
                try:
                    text = pytesseract.image_to_string(image, lang='ara', config=config)
                except Exception as exc:
                    # Best effort: one failing mode must not abort the others.
                    # Unlike a bare except, this lets KeyboardInterrupt and
                    # SystemExit through.
                    last_error = exc
                    continue

                cleaned = clean_text(text)
                if len(cleaned) > 2:
                    candidates.append(cleaned)

    # Return the longest valid result (the earliest one wins ties).
    if candidates:
        return max(candidates, key=len)

    # Nothing usable: surface the last OCR error so a misconfigured Tesseract
    # is not mistaken for an unreadable image.
    if last_error is not None:
        print(f"OCR failed for {img_path}: {last_error}")

    return ""

# folder_path = 'test_arabic_names_full'
# data = []
# if os.path.exists(folder_path):
#     print(f"Processing images in: {folder_path}...\n")
    
#     for filename in os.listdir(folder_path):
#         # Check if the file is an image
#         if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.tiff', '.bmp')):
#             full_path = os.path.join(folder_path, filename)
#             extracted_text = extractname(full_path)    
#             clean_text_result = extracted_text.strip()
#             data.append({'Filename': filename, 'Extracted Name': clean_text_result})
    
#     df = pd.DataFrame(data)
    
#     # ===== SUCCESS RATE CALCULATION =====
#     total_images = len(df)
#     successful_extractions = len(df[df['Extracted Name'] != ''])
#     failed_extractions = total_images - successful_extractions
#     success_rate = (successful_extractions / total_images) * 100 if total_images > 0 else 0
    
#     print(f"{'='*50}")
#     print(f"OCR EXTRACTION RESULTS")
#     print(f"{'='*50}")
#     print(f"Total Images Processed: {total_images}")
#     print(f"Successful Extractions: {successful_extractions}")
#     print(f"Failed Extractions:     {failed_extractions}")
#     print(f"Success Rate:           {success_rate:.2f}%")
#     print(f"{'='*50}\n")
    
#     # Show failed images
#     if failed_extractions > 0:
#         failed_df = df[df['Extracted Name'] == '']
#         print("Failed to extract text from:")
#         for idx, row in failed_df.iterrows():
#             print(f"  - {row['Filename']}")
#         print()
    
#     display(df.head(50))
    
# else:
#     print(f"the folder '{folder_path}' was not found")

# **Main Pipeline** 

In [None]:
import os
import pandas as pd
import cv2
# specific import for Jupyter Notebooks
from IPython.display import display 

def main_pipeline():
    """
    Run the whole ID-card extraction pipeline over the "Raw_IDs" folder.

    For every image in <cwd>/Raw_IDs the card is aligned to the reference
    image ID14.jpg, noise-treated, and split into name, code and daf3 regions.
    Crops are saved to disk, digits are classified with the trained SVM, and
    the name is read with OCR. The results are displayed as a table and
    written to "Extracted_Results.xlsx".

    Returns:
        None. Prints a message and returns early if the dataset folder is
        missing or nothing could be processed.
    """
    base_dir = os.getcwd()
    path_to_dataset = os.path.join(base_dir, 'Raw_IDs')
    reference_image_path = os.path.join(path_to_dataset, 'ID14.jpg')

    # Check the input folder before spending time on training.
    if not os.path.exists(path_to_dataset):
        print(f"Directory not found: {path_to_dataset}")
        return

    SVMclassifier = train_SVM_robust()

    def predict_digits(images):
        # One batched predict call per region; an empty list yields "".
        if not images:
            return ""
        feats = [extract_hog_features(image) for image in images]
        return ''.join(str(pred) for pred in SVMclassifier.predict(feats))

    data_for_excel = []

    for i in os.listdir(path_to_dataset):
        if not i.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tiff')):
            continue

        img_path = os.path.join(path_to_dataset, i)

        # --- Processing ---
        raw_img = cv2.imread(img_path)
        if raw_img is None:
            # cv2.imread returns None for unreadable files; skip instead of
            # crashing the whole run.
            print(f"Could not read image, skipping: {img_path}")
            continue

        aligned_img = align_images_sift(raw_img, reference_image_path)
        clean_img, is_impulsive = is_impulsive_noise(aligned_img)
        clean_img, is_random = is_random_noise(clean_img)
        name_img, digit_imgs, daf3_digits = extract_name_and_digits(clean_img)

        student_id = os.path.splitext(i)[0]

        # Save images
        save_student_name(student_id, name_img)
        save_split_digits(student_id, digit_imgs)
        save_split_digits(f"{student_id}_daf3", daf3_digits, output_folder="extracted_daf3_digits")

        # Predict Code and Daf3
        code_str = predict_digits(digit_imgs)
        daf3_str = predict_digits(daf3_digits)

        # Extract Name from the crop that was just saved to disk
        name_text = extractname(f'./extracted_names/{student_id}_name.jpg')

        data_for_excel.append({
            "Student ID": student_id,
            "Name": name_text,
            "Code": code_str,
            "Daf3": daf3_str,
        })

    # --- OUTPUT SECTION ---
    if not data_for_excel:
        print("No data processed.")
        return

    df = pd.DataFrame(data_for_excel)
    # Sort by the first number in the ID. to_numeric(errors='coerce') turns
    # IDs without digits into NaN (sorted last) instead of raising.
    df = df.sort_values(
        by="Student ID",
        key=lambda x: pd.to_numeric(x.str.extract(r'(\d+)').iloc[:, 0], errors='coerce'),
    )

    # 1. DISPLAY TABLE IN JUPYTER
    print("Processing Complete. Results:")
    display(df[['Student ID', 'Name', 'Code', 'Daf3']])

    # 2. SAVE TO EXCEL
    output_file = "Extracted_Results.xlsx"
    with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
        df.to_excel(writer, index=False, sheet_name='Sheet1')

        # Auto-resize columns to the longest cell (or the header) plus padding
        worksheet = writer.sheets['Sheet1']
        for col_number, column in enumerate(df.columns, start=1):
            column_length = max(df[column].astype(str).map(len).max(), len(column))
            # get_column_letter is 1-based and also handles columns past "Z".
            col_letter = get_column_letter(col_number)
            worksheet.column_dimensions[col_letter].width = column_length + 2

    print(f"Excel file saved to: {output_file}")

# Run the whole pipeline when this cell executes.
main_pipeline()