In [1]:
import os
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt

In [2]:
data_dir = "D:\MED_LEAF_ID\data"

In [3]:
data = []

In [4]:
for folder_name in os.listdir(data_dir):
    folder_path = os.path.join(data_dir, folder_name)
    if os.path.isdir(folder_path):  # Ensure it's a folder
        for image_name in os.listdir(folder_path):
            image_path = os.path.join(folder_path, image_name)
            data.append({"image_path": image_path, "label": folder_name})


In [5]:
df = pd.DataFrame(data)
print("Dataset Overview:")
print(df.head())
print(f"Total Images: {len(df)}")
print(f"Total Classes: {df['label'].nunique()}")

Dataset Overview:
                           image_path label
0   D:\MED_LEAF_ID\data\cnn\augmented   cnn
1    D:\MED_LEAF_ID\data\cnn\original   cnn
2  D:\MED_LEAF_ID\data\glcm\augmented  glcm
3   D:\MED_LEAF_ID\data\glcm\original  glcm
Total Images: 4
Total Classes: 2


In [6]:
print("\nClass Distribution:")
print(df['label'].value_counts())


Class Distribution:
label
cnn     2
glcm    2
Name: count, dtype: int64


In [None]:
import random

# Display random images
for i in range(5):
    random_row = df.sample(1).iloc[0]
    img = Image.open(random_row['image_path'])
    plt.imshow(img)
    plt.title(f"Class: {random_row['label']}")
    plt.axis("off")
    plt.show()

In [None]:
# Collect image properties
dimensions = []
for img_path in df['image_path']:
    with Image.open(img_path) as img:
        dimensions.append(img.size)

# Analyze properties
dim_df = pd.DataFrame(dimensions, columns=["Width", "Height"])
print("Image Properties Summary:")
print(dim_df.describe())


In [None]:
# Check for class imbalance
import seaborn as sns

sns.countplot(data=df, y="label", order=df['label'].value_counts().index)
plt.title("Class Distribution")
plt.show()

# Check for corrupt images
corrupt_count = 0
for img_path in df['image_path']:
    try:
        with Image.open(img_path) as img:
            img.verify()  # Verify if the image is valid
    except:
        corrupt_count += 1
print(f"Corrupt images: {corrupt_count}")


In [None]:
import pandas as pd

# Assuming 'df' is your DataFrame with 'image_path' and 'label' columns
# Example: df = pd.DataFrame({'image_path': [image1, image2, ...], 'label': [label1, label2, ...]})

# Count the number of samples for each class
class_distribution = df['label'].value_counts()

# Convert the distribution to string for better formatting
class_distribution_str = class_distribution.to_string()

# Print the entire distribution
print(class_distribution_str)

