In [13]:
# Import the required libraries
from PIL import Image
import os
import pandas as pd
import cv2
import numpy as np

# Metadata for Class 11 and Class 12

In [11]:
def basic_metadata_analysis(image_folder) -> list:
    """Collect basic metadata for every JPEG/PNG file in a folder.

    Parameters
    ----------
    image_folder : str or path-like
        Directory whose direct entries are scanned. Only names ending in
        ``.jpg``, ``.jpeg`` or ``.png`` (case-insensitive) are considered.

    Returns
    -------
    list of dict
        One record per matching file with the keys ``filename``, ``width``,
        ``height`` (as reported by PIL's ``Image.size``) and ``file_size_kb``
        (size on disk divided by 1024).
    """
    data = []
    for img_file in os.listdir(image_folder):
        if img_file.lower().endswith(('.jpg', '.jpeg', '.png')):
            img_path = os.path.join(image_folder, img_file)
            # Open inside a context manager so the file handle is released
            # right away instead of lingering until garbage collection.
            with Image.open(img_path) as img:
                width, height = img.size
            file_size = os.path.getsize(img_path) / 1024  # in KB
            data.append({'filename': img_file, 'width': width, 'height': height, 'file_size_kb': file_size})
    return data

In [3]:
# Image folders for the two datasets. Each default can be overridden through an
# environment variable (CLASS_11_IMAGE_DIR / CLASS_12_IMAGE_DIR) so the notebook
# is not tied to a single machine's directory layout.
Class_11_image_folder = os.environ.get(
    "CLASS_11_IMAGE_DIR",
    r"C:\Users\student\Desktop\202418003\Minor Project\Class 11\Dataset\Images",
)
Class_12_image_folder = os.environ.get(
    "CLASS_12_IMAGE_DIR",
    r"C:\Users\student\Desktop\202418003\Minor Project\Class 12\dataset\images",
)

In [None]:


# Gather the per-image metadata records for each dataset folder.
Class_11_metadata = basic_metadata_analysis(Class_11_image_folder)
Class_12_metadata = basic_metadata_analysis(Class_12_image_folder)

In [17]:
# Turn the Class 11 metadata records into a table and write it to CSV.
pd.DataFrame(data=Class_11_metadata).to_csv(path_or_buf="Class_11_metadata.csv")

In [18]:
# Turn the Class 12 metadata records into a table and write it to CSV.
pd.DataFrame(data=Class_12_metadata).to_csv(path_or_buf="Class_12_metadata.csv")

# Text Presence & Density

In [24]:
!pip install pytesseract

Defaulting to user installation because normal site-packages is not writeable


In [4]:
import os
from PIL import Image
import pytesseract
# Location of the Tesseract executable that pytesseract invokes. The default is
# the Windows path this notebook was written against; set TESSERACT_CMD to
# point at a different install without editing the notebook.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    'TESSERACT_CMD', r'C:\Program Files\Tesseract-OCR\tesseract.exe'
)

import pandas as pd

In [9]:
def extract_text_features_from_images(image_dir):
    """Run OCR over every image in a folder and count the extracted text.

    Parameters
    ----------
    image_dir : str or path-like
        Directory whose direct entries are scanned. Only names ending in
        ``.png``, ``.jpg``, ``.jpeg``, ``.bmp`` or ``.tiff`` (case-insensitive)
        are processed.

    Returns
    -------
    pandas.DataFrame
        One row per successfully processed image with the columns
        ``filename``, ``word_count`` (whitespace-separated tokens in the OCR
        text) and ``char_count`` (length of the raw OCR text). The columns are
        present even when no image was processed.

    Notes
    -----
    A file that raises during opening or OCR is reported on stdout and left
    out of the result; processing continues with the next file.
    """
    columns = ['filename', 'word_count', 'char_count']
    data = []

    for filename in os.listdir(image_dir):
        if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tiff')):
            file_path = os.path.join(image_dir, filename)
            try:
                # The context manager closes the image file as soon as OCR is
                # finished, so handles do not accumulate over a large folder.
                with Image.open(file_path) as img:
                    text = pytesseract.image_to_string(img)
                word_count = len(text.split())
                char_count = len(text)
                data.append({
                    'filename': filename,
                    'word_count': word_count,
                    'char_count': char_count
                })
                print(f"{file_path} done")
            except Exception as e:
                # Best effort: report the failure and move on to the next file.
                print(f"Error processing {filename}: {e}")
                continue

    # Passing the columns explicitly keeps the schema stable for an empty result.
    df = pd.DataFrame(data, columns=columns)
    return df


In [10]:
# Run OCR over both dataset folders; each call returns a DataFrame of text counts.
Class_11_text_presence = extract_text_features_from_images(Class_11_image_folder)
Class_12_text_presence = extract_text_features_from_images(Class_12_image_folder)

C:\Users\student\Desktop\202418003\Minor Project\Class 11\Dataset\Images\Figure 1.1.png done
C:\Users\student\Desktop\202418003\Minor Project\Class 11\Dataset\Images\Figure 10.1.png done
C:\Users\student\Desktop\202418003\Minor Project\Class 11\Dataset\Images\Figure 10.2a.png done
C:\Users\student\Desktop\202418003\Minor Project\Class 11\Dataset\Images\Figure 10.2b.png done
C:\Users\student\Desktop\202418003\Minor Project\Class 11\Dataset\Images\Figure 10.2c.png done
C:\Users\student\Desktop\202418003\Minor Project\Class 11\Dataset\Images\Figure 10.2d.png done
C:\Users\student\Desktop\202418003\Minor Project\Class 11\Dataset\Images\Figure 10.2e.png done
C:\Users\student\Desktop\202418003\Minor Project\Class 11\Dataset\Images\Figure 10.3.png done
C:\Users\student\Desktop\202418003\Minor Project\Class 11\Dataset\Images\Figure 10.4.png done
C:\Users\student\Desktop\202418003\Minor Project\Class 11\Dataset\Images\Figure 11.10.png done
C:\Users\student\Desktop\202418003\Minor Project\Class 

In [None]:
# The OCR result is already a DataFrame, so it can be written to CSV directly.
Class_11_text_presence.to_csv("Class_11_text_presence.csv")

In [12]:
# The OCR result is already a DataFrame, so it can be written to CSV directly.
Class_12_text_presence.to_csv("Class_12_text_presence.csv")

In [14]:
def is_grayscale(img):
    """Tell whether an image array carries no colour information.

    Parameters
    ----------
    img : numpy.ndarray
        Image as a 2-D array or a 3-D array with the channels on the last axis.

    Returns
    -------
    bool
        ``True`` when the array is 2-D, has fewer than three channels, or its
        first three channels are element-wise identical; ``False`` otherwise.
        Any channel beyond the third (e.g. alpha) is ignored.
    """
    # A 2-D array or one with fewer than three channels has no colour planes
    # to compare against each other.
    if len(img.shape) < 3 or img.shape[2] < 3:
        return True
    # Slice the channels instead of unpacking cv2.split(), which raises when
    # the image does not have exactly three channels (e.g. BGRA).
    first, second, third = img[..., 0], img[..., 1], img[..., 2]
    return bool(np.array_equal(first, second) and np.array_equal(first, third))

def calculate_blurriness(img_gray):
    """Return the variance of the Laplacian of a grayscale image.

    A higher variance indicates a sharper image.
    """
    laplacian = cv2.Laplacian(img_gray, cv2.CV_64F)
    return laplacian.var()

def calculate_brightness(img_gray):
    """Return the mean pixel value of a grayscale image."""
    mean_intensity = np.mean(img_gray)
    return mean_intensity

def calculate_contrast(img_gray):
    """Return the standard deviation of the pixel values of a grayscale image."""
    spread = img_gray.std()
    return spread

def image_quality_analysis(image_dir):
    """Compute simple quality metrics for every image in a folder.

    Parameters
    ----------
    image_dir : str or path-like
        Directory whose direct entries are scanned. Only names ending in
        ``.png``, ``.jpg``, ``.jpeg``, ``.bmp`` or ``.tiff`` (case-insensitive)
        are processed.

    Returns
    -------
    pandas.DataFrame
        One row per readable image with the columns ``filename``,
        ``laplacian_variance``, ``brightness``, ``contrast`` (each rounded to
        two decimals) and ``is_grayscale``. The columns are present even when
        no image was processed.

    Notes
    -----
    Files that OpenCV cannot decode, and files that raise during analysis, are
    reported on stdout and left out of the result.
    """
    columns = ['filename', 'laplacian_variance', 'brightness', 'contrast', 'is_grayscale']
    data = []

    for filename in os.listdir(image_dir):
        if not filename.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tiff')):
            continue

        file_path = os.path.join(image_dir, filename)
        try:
            img = cv2.imread(file_path)
            if img is None:
                # cv2.imread returns None rather than raising when it cannot
                # decode a file; say so instead of dropping the file silently.
                print(f"Skipping {filename}: could not be read as an image")
                continue

            img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

            blur_score = calculate_blurriness(img_gray)
            brightness = calculate_brightness(img_gray)
            contrast = calculate_contrast(img_gray)
            gray_flag = is_grayscale(img)

            data.append({
                'filename': filename,
                'laplacian_variance': round(blur_score, 2),
                'brightness': round(brightness, 2),
                'contrast': round(contrast, 2),
                'is_grayscale': gray_flag
            })

        except Exception as e:
            # Best effort: report the failure and move on to the next file.
            print(f"Error processing {filename}: {e}")

    # Passing the columns explicitly keeps the schema stable for an empty result.
    return pd.DataFrame(data, columns=columns)

In [16]:
# Compute the quality metrics for the Class 11 images and write them to CSV.
image_quality_analysis(image_dir=Class_11_image_folder).to_csv(path_or_buf="Class_11_Image_Quality_Checks.csv")

In [17]:
# Compute the quality metrics for the Class 12 images and write them to CSV.
image_quality_analysis(image_dir=Class_12_image_folder).to_csv(path_or_buf="Class_12_Image_Quality_Checks.csv")

In [1]:
import os
import cv2
import numpy as np
import pandas as pd
from PIL import Image
import pytesseract

def _bgr_is_grayscale(img):
    """Return True when an image array has no colour information."""
    # A 2-D array or one with fewer than three channels has no colour planes
    # to compare against each other.
    if len(img.shape) < 3 or img.shape[2] < 3:
        return True
    first, second, third = img[..., 0], img[..., 1], img[..., 2]
    return bool(np.array_equal(first, second) and np.array_equal(first, third))


def analyze_images(image_dir):
    """Collect metadata, OCR text counts and quality metrics for a folder of images.

    Parameters
    ----------
    image_dir : str or path-like
        Directory whose direct entries are scanned. Only names ending in
        ``.png``, ``.jpg``, ``.jpeg``, ``.bmp`` or ``.tiff`` (case-insensitive)
        are processed.

    Returns
    -------
    pandas.DataFrame
        One row per image that OpenCV could read, with the columns
        ``filename``, ``width``, ``height``, ``file_size_kb``, ``word_count``,
        ``char_count``, ``laplacian_variance``, ``brightness``, ``contrast``
        and ``is_grayscale``. Float metrics are rounded to two decimals. The
        columns are present even when no image was processed.

    Notes
    -----
    If OCR fails for a file, both text counts are recorded as 0 and the
    failure is reported on stdout. Files that OpenCV cannot decode, and files
    that raise elsewhere during analysis, are reported and left out.
    """
    columns = [
        'filename', 'width', 'height', 'file_size_kb',
        'word_count', 'char_count',
        'laplacian_variance', 'brightness', 'contrast', 'is_grayscale',
    ]
    data = []

    for filename in os.listdir(image_dir):
        if not filename.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tiff')):
            continue

        file_path = os.path.join(image_dir, filename)
        try:
            # Basic metadata and OCR share one PIL handle; the context manager
            # closes the file once both are done.
            with Image.open(file_path) as pil_img:
                width, height = pil_img.size

                # Text features using OCR
                try:
                    text = pytesseract.image_to_string(pil_img)
                    word_count = len(text.split())
                    char_count = len(text)
                except Exception as ocr_error:
                    # Keep the row, but make the failure visible: a 0 here is
                    # otherwise indistinguishable from an image without text.
                    print(f"OCR failed for {filename}: {ocr_error}")
                    word_count = 0
                    char_count = 0

            file_size_kb = os.path.getsize(file_path) / 1024  # in KB

            # Image quality metrics using OpenCV
            img = cv2.imread(file_path)
            if img is None:
                # cv2.imread returns None rather than raising when it cannot
                # decode a file; say so instead of dropping the file silently.
                print(f"Skipping {filename}: could not be read as an image")
                continue

            img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

            laplacian_variance = cv2.Laplacian(img_gray, cv2.CV_64F).var()
            brightness = np.mean(img_gray)
            contrast = img_gray.std()
            grayscale = _bgr_is_grayscale(img)

            # Append all info
            data.append({
                'filename': filename,
                'width': width,
                'height': height,
                'file_size_kb': round(file_size_kb, 2),
                'word_count': word_count,
                'char_count': char_count,
                'laplacian_variance': round(laplacian_variance, 2),
                'brightness': round(brightness, 2),
                'contrast': round(contrast, 2),
                'is_grayscale': grayscale
            })

            print(f"Processed: {filename}")

        except Exception as e:
            # Best effort: report the failure and move on to the next file.
            print(f"Error processing {filename}: {e}")

    # Passing the columns explicitly keeps the schema stable for an empty result.
    return pd.DataFrame(data, columns=columns)


In [5]:
# Run the combined analysis on the Class 11 images and write the table to CSV.
df = analyze_images(image_dir=Class_11_image_folder)
df.to_csv(path_or_buf="Class_11_Image_Data.csv")

Processed: Class_11_Figure 1.1.png
Processed: Class_11_Figure 10.1.png
Processed: Class_11_Figure 10.2a.png
Processed: Class_11_Figure 10.2b.png
Processed: Class_11_Figure 10.2c.png
Processed: Class_11_Figure 10.2d.png
Processed: Class_11_Figure 10.2e.png
Processed: Class_11_Figure 10.3.png
Processed: Class_11_Figure 10.4.png
Processed: Class_11_Figure 11.10.png
Processed: Class_11_Figure 11.1a,b,c,d.png
Processed: Class_11_Figure 11.2.png
Processed: Class_11_Figure 11.3a.png
Processed: Class_11_Figure 11.3b.png
Processed: Class_11_Figure 11.3c.png
Processed: Class_11_Figure 11.4.png
Processed: Class_11_Figure 11.5.png
Processed: Class_11_Figure 11.6.png
Processed: Class_11_Figure 11.7.png
Processed: Class_11_Figure 11.8.png
Processed: Class_11_Figure 11.9.png
Processed: Class_11_Figure 12.1.png
Processed: Class_11_Figure 12.2.png
Processed: Class_11_Figure 12.3.png
Processed: Class_11_Figure 12.4.png
Processed: Class_11_Figure 12.5.png
Processed: Class_11_Figure 12.6.png
Processed: Cl