# Exploratory Analysis: Data Wrangling

### 1. Load Packages

In [4]:
import numpy as np
import cv2 
import pandas as pd
import re
import os
import openpyxl
import time
from datetime import datetime
from PIL import Image
from sklearn.cluster import KMeans


### 2. Generate a Pandas DataFrame from the File Names and Folder Structure, Containing the Following Information
- file_path
- file_name
- class
- plant
- disease
- uuid
- source_code
- image_id
- augmentation

In [15]:

# Path
# NOTE: absolute, machine-specific location; adjust it when running elsewhere.
base_path = "/Users/felix/Documents/Data Science/06_Offical_project_DS/may25_bds_plants/05_data/data/2.1.1 New Plant Diseases/New Plant Diseases Dataset(Augmented)/New Plant Diseases Dataset(Augmented)/train"

# Column order of the resulting DataFrame. Passing it explicitly keeps the
# schema stable even when no image is found (an empty list of records would
# otherwise produce a DataFrame without any columns).
DF_COLUMNS = [
    "file_path",
    "file_name",
    "class",
    "plant",
    "disease",
    "uuid",
    "source_code",
    "image_id",
    "augmentation",
]

# Split file name with regex:
#   <uuid>___<SOURCE> <id>[_<augmentation>].<jpg|jpeg|png>
pattern = re.compile(r"(?P<uuid>[a-f0-9\-]+)___(?P<source>[A-Z_]+)\s(?P<id>\d+)_?(?P<aug>.*)?\.(jpg|jpeg|png)", re.IGNORECASE)


def split_class_folder(class_folder):
    """Split a class folder name into ``(plant, disease)``.

    The name is split at the FIRST ``"___"`` only, so a disease part that
    itself contains the separator no longer breaks tuple unpacking.

    Args:
        class_folder (str): Name of the first-level folder below ``base_path``.

    Returns:
        tuple[str, str]: ``(plant, disease)``; ``disease`` is ``"unknown"``
        when the folder name contains no ``"___"`` separator.
    """
    plant, separator, disease = class_folder.partition("___")
    if not separator:
        return class_folder, "unknown"
    return plant, disease


def parse_file_name(file_name):
    """Extract uuid, source code, image id and augmentation from a file name.

    Args:
        file_name (str): Bare file name (no directory part).

    Returns:
        dict: Keys ``uuid``, ``source_code``, ``image_id`` and ``augmentation``.
        All four are ``"unknown"`` when the name does not match ``pattern``;
        ``augmentation`` is ``"original"`` when the name matches but carries no
        suffix after the image id. ``augmentation`` is always lower-cased.
    """
    match = pattern.match(file_name)
    if match is None:
        return {
            "uuid": "unknown",
            "source_code": "unknown",
            "image_id": "unknown",
            "augmentation": "unknown",
        }
    return {
        "uuid": match.group("uuid"),
        "source_code": match.group("source"),
        "image_id": match.group("id"),
        "augmentation": (match.group("aug") or "original").lower(),
    }


# os.walk yields nothing for a missing directory, so make that case visible
# instead of silently producing an empty DataFrame.
if not os.path.isdir(base_path):
    print(f"Warning: base path not found, no images will be listed: {base_path}")

# List to hold the data (one dict per image file)
data = []

# Run through all files in the base path
for root, dirs, files in os.walk(base_path):
    dirs.sort()  # sorted in place so os.walk descends in a reproducible order
    for file in sorted(files):
        if not file.lower().endswith(('.jpg', '.jpeg', '.png')):
            continue

        full_path = os.path.join(root, file)
        rel_path = os.path.relpath(full_path, base_path)
        # First path component below base_path is treated as the class folder
        class_folder = rel_path.split(os.sep)[0]

        # Determine plant and disease from the class folder name
        plant, disease = split_class_folder(class_folder)

        record = {
            "file_path": full_path,
            "file_name": file,
            "class": class_folder,
            "plant": plant,
            "disease": disease,
        }
        # Analyse the file name with regex
        record.update(parse_file_name(file))
        data.append(record)

# Save the data to a DataFrame
df = pd.DataFrame(data, columns=DF_COLUMNS)



In [None]:
#df.head()
#df.value_counts()
#df.describe()
#df.dtypes
#df.info()  # note the parentheses: a bare `df.info` only displays the bound method



<bound method DataFrame.info of                                                file_path  \
0      /Users/felix/Documents/Data Science/06_Offical...   
1      /Users/felix/Documents/Data Science/06_Offical...   
2      /Users/felix/Documents/Data Science/06_Offical...   
3      /Users/felix/Documents/Data Science/06_Offical...   
4      /Users/felix/Documents/Data Science/06_Offical...   
...                                                  ...   
70290  /Users/felix/Documents/Data Science/06_Offical...   
70291  /Users/felix/Documents/Data Science/06_Offical...   
70292  /Users/felix/Documents/Data Science/06_Offical...   
70293  /Users/felix/Documents/Data Science/06_Offical...   
70294  /Users/felix/Documents/Data Science/06_Offical...   

                                               file_name  \
0      0370bc9b-c0c8-49b5-b999-c44323c45216___RS_HL 2...   
1      741e834f-a63a-4efd-b961-d5f7e047abdc___RS_HL 2...   
2      b8e9ed27-8e37-4214-9206-f8c0ef21cf4d___RS_HL 4...   
3      

### 3.1 Calculation of Grayscale Mean Brightness

In [17]:
def get_brightness(file_path):
    """
    Calculate the mean grayscale brightness of an image.

    Args:
        file_path (str): Path to the image file

    Returns:
        float: Mean of the grayscale image (0-255 for 8-bit images), or
        np.nan when the file cannot be read as an image.
    """
    img = cv2.imread(file_path)
    if img is None:
        # cv2.imread signals an unreadable/missing file by returning None
        # instead of raising; without this guard cvtColor would raise and
        # abort the whole DataFrame.apply run over every image.
        print(f"Error processing {file_path}: could not read image")
        return np.nan
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)  # Convert to grayscale
    return np.mean(gray)  # Average brightness

# One brightness value per image path (reads every image from disk once).
df['grayscale_brightness'] = df['file_path'].map(get_brightness)

### 3.2 Calculation of Perceptual Brightness (Human-Vision Weighted)
A more human-aligned metric that weights RGB channels differently (similar to how our eyes perceive brightness):

In [18]:

def get_perceptual_brightness(file_path):
    """
    Mean luma of an image: per-pixel 0.299*R + 0.587*G + 0.114*B, averaged
    over all pixels.

    Any failure (including an unreadable file) is reported on stdout and
    turned into a ``None`` result rather than an exception.

    NOTE(review): cv2.COLOR_BGR2GRAY is documented to use these same
    weights, so this value presumably tracks the plain grayscale mean very
    closely - confirm it adds information as a separate column.

    Args:
        file_path (str): Path to the image file

    Returns:
        float | None: Mean weighted brightness, or None on error
    """
    # Channel weights in R, G, B order
    luma_weights = [0.299, 0.587, 0.114]

    try:
        bgr = cv2.imread(file_path)
        if bgr is None:
            raise ValueError(f"Could not read image: {file_path}")

        # OpenCV loads images as BGR; reorder so the weights line up with RGB
        rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)

        # Weighted sum over the channel axis, then average over all pixels
        return np.dot(rgb, luma_weights).mean()

    except Exception as err:
        print(f"Error processing {file_path}: {str(err)}")
        return None

# Apply to DataFrame
# One perceptual-brightness value per image path (None where reading failed)
df['perceptual_brightness'] = df['file_path'].map(get_perceptual_brightness)

### 4. Add Timestamp for Creation and Modification

In [19]:


def get_creation_time(file_path):
    """Return the creation time of a file as a local-time ``datetime``.

    ``os.path.getctime`` is NOT the creation time on Unix systems (there it
    is the time of the last metadata change), so the true birth time
    (``st_birthtime``) is used wherever the platform exposes it. On platforms
    without ``st_birthtime`` this falls back to ``st_ctime``, i.e. the
    previous behaviour.

    Args:
        file_path (str): Path to the file

    Returns:
        datetime: Creation time (or ``st_ctime`` as a fallback)
    """
    stat_result = os.stat(file_path)
    birth_time = getattr(stat_result, "st_birthtime", None)
    if birth_time is None:
        birth_time = stat_result.st_ctime
    return datetime.fromtimestamp(birth_time)


df['creation_date'] = df['file_path'].apply(get_creation_time)
df['modification_date'] = df['file_path'].apply(lambda x: datetime.fromtimestamp(os.path.getmtime(x)))
# Extract Year
df['modification_year'] = df['modification_date'].dt.year
# Extract Year-Month (as string, e.g. "2023-07")
df['modification_year_month'] = df['modification_date'].dt.strftime('%Y-%m')

### 5. Add Size of the Picture

In [20]:
def get_image_size(file_path):
    """Return ``(width, height)`` of an image, or ``(None, None)`` on failure.

    Args:
        file_path (str): Path to the image file

    Returns:
        tuple: ``img.size`` as reported by PIL, or ``(None, None)`` when the
        file cannot be opened as an image.
    """
    try:
        with Image.open(file_path) as img:
            return img.size  # (width, height)
    except Exception:
        # Deliberately best-effort: one bad file must not abort the whole run.
        # `except Exception` (instead of a bare `except:`) still lets
        # KeyboardInterrupt / SystemExit through, so the long-running loop
        # over all images can be interrupted.
        return (None, None)  # or (np.nan, np.nan)

# Add dimensions (takes ~0.1s per image)
# Add dimensions: one (width, height) tuple per image, expanded into two columns.
# The columns are named explicitly so the frame always has exactly two columns,
# even for an empty df (an unnamed empty frame has zero columns and the
# two-column assignment below would fail with "Columns must be same length as key").
image_sizes = pd.DataFrame(
    df['file_path'].apply(get_image_size).tolist(),
    columns=['width', 'height'],
    index=df.index
)
df[['width', 'height']] = image_sizes

# Pixel count, and the same value in megapixels rounded to two decimals
df['total_pixels'] = df['width'] * df['height']
df['megapixels'] = (df['total_pixels'] / 1e6).round(2)

### 6. Add the Dominant Colors to the DataFrame

In [None]:
# TBD
    


### 7. Save the DataFrame

In [21]:
# Save as XLSX
# NOTE: absolute, machine-specific location; adjust it when running elsewhere.
output_path = "/Users/felix/Documents/Data Science/06_Offical_project_DS/may25_bds_plants/05_data/computed_data/plant_disease_dataset_analysis.xlsx"

# Create the target folder first: to_excel does not create missing directories,
# and failing here would discard the results of the long-running cells above.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_excel(output_path, index=False, engine='openpyxl')

print(f"Datei gespeichert unter: {output_path}")


Datei gespeichert unter: /Users/felix/Documents/Data Science/06_Offical_project_DS/may25_bds_plants/05_data/computed_data/plant_disease_dataset_analysis.xlsx
