Phase 1: Image segmentation and feature extraction

In [1]:
# == imports ==
import os
from glob import glob
import cv2
import numpy as np
import pandas as pd
from skimage import measure
from tqdm import tqdm
import matplotlib.pyplot as plt

# Display settings (optional): render matplotlib figures inline in the notebook
# and set the default figure size (in inches) for cells that do not pass their
# own `figsize`.
%matplotlib inline
plt.rcParams['figure.figsize'] = (8, 6)


In [2]:
def read_image(path, resize=None):
    """Load an image from disk with OpenCV, optionally resizing it.

    Args:
        path: Path of the image file, passed to ``cv2.imread``.
        resize: Optional target size forwarded as the second argument of
            ``cv2.resize``. ``None`` (default) keeps the loaded size.

    Returns:
        The array returned by ``cv2.imread`` (resized when ``resize`` is given).

    Raises:
        ValueError: If ``cv2.imread`` returns ``None`` for ``path``.
    """
    image = cv2.imread(path)
    if image is None:
        raise ValueError(f"Can't read {path}")
    return image if resize is None else cv2.resize(image, resize)


def segment_fruit_rgb(img):
    """Build a binary mask of the largest saturated, non-dark region.

    The image is converted from BGR to HSV and pixels whose saturation and
    value are both above 30 are kept. The result is cleaned with a
    morphological close (2 iterations) followed by an open (1 iteration) using
    a 5x5 elliptical kernel, and only the external contour with the largest
    area is retained, drawn filled.

    Args:
        img: Image accepted by ``cv2.cvtColor(img, cv2.COLOR_BGR2HSV)``.

    Returns:
        A ``uint8`` mask holding 0/255. When no contour is found, the cleaned
        threshold mask is returned unchanged.
    """
    hsv_img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    saturation, value = hsv_img[:, :, 1], hsv_img[:, :, 2]

    # Keep pixels that are both colourful and bright enough.
    candidate = np.where((saturation > 30) & (value > 30), 255, 0).astype('uint8')

    # Close small holes first, then remove small specks.
    ellipse = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (5, 5))
    candidate = cv2.morphologyEx(candidate, cv2.MORPH_CLOSE, ellipse, iterations=2)
    candidate = cv2.morphologyEx(candidate, cv2.MORPH_OPEN, ellipse, iterations=1)

    found, _ = cv2.findContours(candidate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if len(found) == 0:
        return candidate

    # Retain only the biggest blob and fill it (thickness=-1).
    largest = max(found, key=cv2.contourArea)
    filled = np.zeros_like(candidate)
    cv2.drawContours(filled, [largest], -1, 255, -1)
    return filled

def extract_features_from_image(img, mask):
    """Compute shape, colour and blemish features for a segmented fruit.

    Args:
        img: BGR image (it is converted with ``cv2.COLOR_BGR2HSV`` and
            ``cv2.COLOR_BGR2GRAY``).
        mask: 2-D single-channel mask aligned with ``img``; non-zero pixels
            mark the fruit. It is passed to ``cv2.findContours``, so it is
            expected to be an 8-bit mask such as the one returned by
            ``segment_fruit_rgb``.

    Returns:
        dict with the keys:
            area          -- number of non-zero mask pixels.
            perimeter     -- arc length of the largest external contour.
            circularity   -- ``4*pi*area / perimeter**2`` (0 if perimeter is 0).
            mean_h/s/v    -- mean of each HSV channel inside the mask.
            std_h/s/v     -- standard deviation of each HSV channel in the mask.
            dark_ratio    -- fraction of mask pixels with V < 50.
            blemish_ratio -- fraction of bounding-box pixels whose grey level
                             is below the Otsu threshold of the bounding box.
            bbox_area     -- area of the mask's axis-aligned bounding box.
        For an empty mask the shape features are 0 and all others are NaN.
    """
    mask_bool = mask.astype(bool)
    area = int(mask_bool.sum())

    if area == 0:
        return {
            'area': 0, 'perimeter': 0, 'circularity': 0,
            'mean_h': np.nan, 'mean_s': np.nan, 'mean_v': np.nan,
            'std_h': np.nan, 'std_s': np.nan, 'std_v': np.nan,
            'dark_ratio': np.nan, 'blemish_ratio': np.nan,
            'bbox_area': np.nan
        }

    # Bounding box of the mask. Indices returned by np.where are always inside
    # the array, so no clipping is needed.
    ys, xs = np.where(mask_bool)
    minx, maxx = int(xs.min()), int(xs.max())
    miny, maxy = int(ys.min()), int(ys.max())
    bbox_area = (maxx - minx + 1) * (maxy - miny + 1)

    # Use the largest external contour for the perimeter: the order in which
    # findContours returns contours is not meaningful, so contours[0] could be
    # a tiny speck when the mask has several components.
    contours, _ = cv2.findContours(mask.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if len(contours) > 0:
        perim = cv2.arcLength(max(contours, key=cv2.contourArea), True)
    else:
        perim = 0
    circularity = (4 * np.pi * area / (perim**2)) if perim > 0 else 0

    # Colour statistics restricted to the fruit pixels.
    # NOTE(review): hue is an angular quantity, so a plain mean/std is only an
    # approximation for colours that wrap around 0 -- left unchanged here.
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    h, s, v = hsv[:,:,0], hsv[:,:,1], hsv[:,:,2]
    mean_h, mean_s, mean_v = h[mask_bool].mean(), s[mask_bool].mean(), v[mask_bool].mean()
    std_h, std_s, std_v = h[mask_bool].std(), s[mask_bool].std(), v[mask_bool].std()
    dark_ratio = (v[mask_bool] < 50).sum() / area

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    fruit_gray = gray[miny:maxy+1, minx:maxx+1]

    # cv2.threshold returns (threshold_value, thresholded_image). With
    # THRESH_OTSU the first value is the computed Otsu threshold, which is what
    # the grey levels must be compared against (not the mean of the binarised
    # image).
    otsu_value, _ = cv2.threshold(fruit_gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    blemish_ratio = (fruit_gray < otsu_value).sum() / fruit_gray.size

    return {
        'area': int(area),
        'perimeter': float(perim),
        'circularity': float(circularity),
        'mean_h': float(mean_h), 'mean_s': float(mean_s), 'mean_v': float(mean_v),
        'std_h': float(std_h), 'std_s': float(std_s), 'std_v': float(std_v),
        'dark_ratio': float(dark_ratio),
        'blemish_ratio': float(blemish_ratio),
        'bbox_area': int(bbox_area)
    }


In [None]:
base_dir = "../data/Fruit-Images-Dataset/Training/"
# Fail fast with an actionable message: continuing without the dataset would
# only crash a few lines below in os.listdir with a less helpful error.
if not os.path.exists(base_dir):
    raise FileNotFoundError(
        f"Dataset not found at {base_dir}. Please download it first (see README.md)."
    )
print("Dataset found at:", base_dir)

rows = []

# Every entry of base_dir is treated as one class folder and used as the label.
classes = sorted(os.listdir(base_dir))

for cls in classes:
    # glob order depends on the filesystem; sort so the row order of `df`
    # (and therefore the seeded model trained on it) is reproducible.
    paths = sorted(glob(os.path.join(base_dir, cls, "*.jpg")))

    for p in tqdm(paths, desc=f"Processing {cls}", leave=False):
        try:
            img = read_image(p, resize=(200,200))
            mask = segment_fruit_rgb(img)
            feats = extract_features_from_image(img, mask)
            feats.update({'path': p, 'class': cls})
            rows.append(feats)
        except Exception as e:
            # Best effort: report the failing file and keep processing the rest.
            print("Error:", p, e)

df = pd.DataFrame(rows)
print("Extracted features for", len(df), "images")
df.head()


Phase 2: Exploratory data analysis

In [5]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Quick overview of the feature table: dimensions, column names, summary stats.
table_shape = df.shape
column_names = list(df.columns)
print("Shape:", table_shape)
print("\nColumns:\n", column_names)

df.describe()


In [None]:
# One horizontal bar per fruit class, ordered by number of images (descending).
class_order = df['class'].value_counts().index
fig, ax = plt.subplots(figsize=(20, 25))
sns.countplot(data=df, y="class", order=class_order, ax=ax)
ax.set_title("Number of Images per Fruit Type")
plt.show()


In [None]:
# Classes ordered by their average hue, lowest first.
hue_order = df.groupby('class')["mean_h"].mean().sort_values().index

# `plt.figure` (lowercase) creates the figure AND makes it pyplot's current
# figure. The `plt.Figure` class constructor only builds a detached object, so
# the requested size would be ignored and seaborn would draw on a default-size
# figure instead.
plt.figure(figsize=(15, 30))
sns.barplot(data=df, x="mean_h", y="class", order=hue_order)
plt.title("Average Hue (Color Tone) per Fruit Type")
plt.xlabel("Mean Hue")
plt.ylabel("Fruit Class")
plt.show()

In [None]:
# Very wide canvas: one box per fruit class along the x axis.
fig, ax = plt.subplots(figsize=(50, 6))
sns.boxplot(x="class", y="dark_ratio", data=df, ax=ax)
plt.xticks(rotation=90)  # vertical class names so they do not overlap
ax.set_title("Distribution of Dark Ratio per Fruit Type")
plt.show()

In [None]:
# Pairwise correlation of the extracted features (bookkeeping columns removed).
feature_corr = df.drop(columns=['path', 'class']).corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(feature_corr, annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Feature Correlation Heatmap")
plt.show()

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Feature matrix: every extracted feature column. NaNs (emitted by
# extract_features_from_image for images whose mask is empty) become 0.
X = df.drop(columns=['path','class'])
X = X.fillna(0)
X_scaled = StandardScaler().fit_transform(X)

# Seed the decomposition: with the default svd_solver='auto' scikit-learn may
# select the randomized SVD, which is only reproducible with a fixed
# random_state (42 matches the seed used for the classifier in this notebook).
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_scaled)

# NOTE(review): "tab10" provides 10 distinct colours; if the dataset has more
# classes the colours repeat -- confirm this is acceptable for the plot.
plt.figure(figsize=(15,15))
sns.scatterplot(x=X_pca[:,0], y=X_pca[:,1], hue=df['class'], palette="tab10", s=50)
plt.title("PCA Visualization of Fruits based on Extracted Features")
plt.show()

Phase 3: Random forest classification and evaluation

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
test_dir = "../data/Fruit-Images-Dataset/Test/"

def build_feature_df(base_dir, resize=(200, 200)):
    """Extract one feature row per ``*.jpg`` image found under ``base_dir``.

    Every entry of ``base_dir`` is treated as a class folder whose name becomes
    the ``class`` label of the images inside it.

    Args:
        base_dir: Directory containing one sub-folder per class.
        resize: Size every image is resized to before segmentation, forwarded
            to ``read_image``. Defaults to ``(200, 200)``.

    Returns:
        pandas.DataFrame with the features returned by
        ``extract_features_from_image`` plus ``path`` and ``class`` columns.
        Images that raise during processing are reported and skipped.
    """
    rows = []
    for cls in sorted(os.listdir(base_dir)):
        # glob order depends on the filesystem; sort for a reproducible row order.
        paths = sorted(glob(os.path.join(base_dir, cls, "*.jpg")))
        for p in tqdm(paths, desc=f"Processing {cls}", leave=False):
            try:
                img = read_image(p, resize=resize)
                mask = segment_fruit_rgb(img)
                feats = extract_features_from_image(img, mask)
                feats.update({'path': p, 'class': cls})
                rows.append(feats)
            except Exception as e:
                # Best effort: report the failing file and keep going.
                print("Error", p, e)
    return pd.DataFrame(rows)

df_test = build_feature_df(test_dir)

                                                                                  

In [None]:
# Feature matrices: drop the bookkeeping columns and replace NaNs with 0.
meta_columns = ['path', 'class']
X_train = df.drop(columns=meta_columns).fillna(0)
X_test = df_test.drop(columns=meta_columns).fillna(0)
y_train, y_test = df['class'], df_test['class']

# Encode class names as integers; the encoder is fitted on training labels only.
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)

# Standardise with statistics learned from the training set only.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Seeded forest of 200 trees, then predictions for the held-out test images.
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train_scaled, y_train_encoded)
y_pred = rf.predict(X_test_scaled)

In [None]:
# Map the integer predictions back to class names.
y_pred_labels = le.inverse_transform(y_pred)


acc = accuracy_score(y_test, y_pred_labels)
print("Accuracy:", round(acc*100, 2), "%\n")


print("Classification Report:")
# Pin `labels` to the encoder's classes. Without it scikit-learn infers the
# label set from y_test and the predictions, and raises when that set is
# smaller than `target_names` (i.e. when a training class never shows up in the
# test data or the predictions). This also matches the confusion-matrix cell.
print(classification_report(
    y_test,
    y_pred_labels,
    labels=le.classes_,
    target_names=le.classes_,
))

In [None]:
# Rows are the actual classes, columns the predicted ones, both in encoder order.
class_names = le.classes_
cm = confusion_matrix(y_test, y_pred_labels, labels=class_names)

fig, ax = plt.subplots(figsize=(40, 30))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
# Label the importances with the columns of the frame the forest was trained
# on (X_train, scaled into X_train_scaled) rather than `X`, which only exists
# if the PCA cell of the exploration phase has been run.
feat_imp = pd.Series(rf.feature_importances_, index=X_train.columns)
feat_imp = feat_imp.sort_values(ascending=False)
top_features = feat_imp.head(10)

plt.figure(figsize=(10,6))
sns.barplot(x=top_features, y=top_features.index)
plt.title("Top 10 Most Important Features for Fruit Classification")
plt.show()