# 🍈 MusangKing ENHANCED Training
## High-Confidence Classification for 3 Durian Varieties

**Focus:** Musang King (D197), Black Thorn (D200), Udang Merah (D175)

**Improvements:**
- 🎯 Data Augmentation (more training samples)
- 📊 Feature Scaling & Normalization
- 🔄 Cross-Validation for robust accuracy
- 🤖 Optimized Ensemble Model
- ⚖️ Better Class Balancing (SMOTE + Class Weights)

## Step 1: Install Dependencies

In [None]:
# Install the third-party packages this notebook relies on; `-q` keeps pip's output short.
# NOTE(review): the confirmation below is printed unconditionally, so it does not prove
# that pip succeeded - check the pip output above it.
!pip install roboflow opencv-python-headless scikit-learn imbalanced-learn xgboost tqdm albumentations -q
print("‚úÖ Dependencies installed!")

## Step 2: Download Datasets

In [None]:
import os
from getpass import getpass

from roboflow import Roboflow

# SECURITY: the Roboflow API key used to be hard-coded in this cell, which leaks it to
# everyone who can read the notebook. It is now taken from the ROBOFLOW_API_KEY
# environment variable, with an interactive (non-echoing) prompt as a fallback.
# The previously committed key should be treated as compromised and revoked.
API_KEY = os.environ.get("ROBOFLOW_API_KEY") or getpass("Roboflow API key: ")
if not API_KEY:
    raise ValueError(
        "No Roboflow API key supplied: set ROBOFLOW_API_KEY or enter it at the prompt."
    )
rf = Roboflow(api_key=API_KEY)

# Both datasets are exported in the "yolov5pytorch" format; the later loading step
# reads the data.yaml and <split>/images + <split>/labels folders of each download.
print("üì• Downloading Dataset 1...")
project1 = rf.workspace("durian-cf87w").project("durian_own")
dataset1 = project1.version(1).download("yolov5pytorch")
print("‚úÖ Dataset 1 downloaded!\n")

print("üì• Downloading Dataset 2...")
project2 = rf.workspace("carl-bwzge").project("durian-thesis")
dataset2 = project2.version(3).download("yolov5pytorch")
print("‚úÖ Dataset 2 downloaded!")

## Step 3: Import Libraries

In [None]:
import cv2
import numpy as np
import pandas as pd
import os
import yaml
from tqdm import tqdm
import joblib
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import xgboost as xgb

print("‚úÖ Libraries imported!")

## Step 4: K-Means Segmentation (SAME AS APP.PY)

In [None]:
def get_mask_lab_method(image):
    """Segment the object covering the image centre using K-Means in Lab space.

    The author states these settings (Gamma=0.6, K=3) mirror app.py, so they
    are kept exactly as they are.

    Args:
        image: BGR image as accepted by ``cv2.resize`` / ``cv2.cvtColor``.

    Returns:
        A 512x512 single-channel mask. When at least one contour is found the
        mask is 255 inside the largest contour and 0 elsewhere; otherwise the
        morphologically closed cluster mask is returned unchanged.
    """
    side = 512
    gamma = 0.6
    resized = cv2.resize(image, (side, side))

    # Intensity lookup table: normalised value raised to 1/gamma (about 1.67 here).
    exponent = 1.0 / gamma
    lut = np.array(
        [((level / 255.0) ** exponent) * 255 for level in range(256)]
    ).astype("uint8")
    lab = cv2.cvtColor(cv2.LUT(resized, lut), cv2.COLOR_BGR2Lab)

    # Cluster every pixel into 3 Lab colours. Centres are initialised randomly,
    # so the clustering may differ slightly from one call to the next.
    pixels = lab.reshape((-1, 3)).astype(np.float32)
    stop_rule = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 10, 1.0)
    _, labels, centers = cv2.kmeans(
        pixels, 3, None, stop_rule, 10, cv2.KMEANS_RANDOM_CENTERS
    )
    quantized = np.uint8(centers)[labels.flatten()].reshape((lab.shape))

    # Keep only the cluster that the centre pixel belongs to.
    rows, cols = quantized.shape[:2]
    seed_color = quantized[rows // 2, cols // 2]
    mask = cv2.inRange(quantized, seed_color, seed_color)

    # Close small gaps, then reduce the mask to its largest connected outline.
    mask = cv2.morphologyEx(
        mask, cv2.MORPH_CLOSE, np.ones((5, 5), np.uint8), iterations=2
    )
    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not contours:
        return mask

    largest = max(contours, key=cv2.contourArea)
    filled = np.zeros_like(mask)
    cv2.drawContours(filled, [largest], -1, 255, -1)
    return filled

print("‚úÖ K-Means function (Gamma=0.6, K=3)")

## Step 5: ENHANCED Feature Extraction

In [None]:
def extract_features(image, mask):
    """Compute 12 shape and colour descriptors for the masked region.

    Args:
        image: BGR image; it is resized to 512x512 before colour statistics
            are taken.
        mask: Single-channel mask used both for contour detection and as the
            region for the colour means (it must match the 512x512 resized
            image for ``cv2.mean`` to accept it).

    Returns:
        A dict of feature name -> value, or ``None`` when the mask has no
        contour or its largest contour has zero area or zero perimeter.
    """
    resized = cv2.resize(image, (512, 512))

    outlines, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not outlines:
        return None

    outline = max(outlines, key=cv2.contourArea)
    area = cv2.contourArea(outline)
    perimeter = cv2.arcLength(outline, True)
    if area == 0 or perimeter == 0:
        return None

    # --- Shape descriptors -------------------------------------------------
    perimeter_sq = perimeter ** 2
    polygon = cv2.approxPolyDP(outline, 0.01 * perimeter, True)
    _, _, box_w, box_h = cv2.boundingRect(outline)
    box_area = box_w * box_h
    hull_area = cv2.contourArea(cv2.convexHull(outline))

    if box_h > 0:
        aspect_ratio = float(box_w) / box_h
    else:
        aspect_ratio = 0

    if box_area > 0:
        rectangularity = area / box_area
    else:
        rectangularity = 0

    if hull_area > 0:
        solidity = area / hull_area
    else:
        solidity = 0

    if perimeter > 0:
        circularity = (4 * np.pi * area) / perimeter_sq
    else:
        circularity = 0

    # --- Colour descriptors (means taken inside the mask only) --------------
    mean_blue, mean_green, mean_red = cv2.mean(resized, mask=mask)[:3]
    mean_hue, mean_sat, mean_val = cv2.mean(
        cv2.cvtColor(resized, cv2.COLOR_BGR2HSV), mask=mask
    )[:3]

    # Channel ratios; fall back to 0 when the denominator channel is empty.
    rg_ratio = mean_red / mean_green if mean_green > 0 else 0
    rb_ratio = mean_red / mean_blue if mean_blue > 0 else 0

    features = {}
    # Original features (for app.py compatibility)
    features['Compactness'] = perimeter_sq / area
    features['Smoothness'] = len(polygon) / perimeter
    features['Aspect_Ratio'] = aspect_ratio
    features['Rectangularity'] = rectangularity
    features['Mean_Red'] = mean_red
    features['Mean_Hue'] = mean_hue
    # Enhanced features
    features['Solidity'] = solidity
    features['Circularity'] = circularity
    features['Mean_Saturation'] = mean_sat
    features['Mean_Value'] = mean_val
    features['RG_Ratio'] = rg_ratio
    features['RB_Ratio'] = rb_ratio
    return features

print("‚úÖ Enhanced feature extraction (12 features)")

## Step 6: Data Augmentation

In [None]:
def augment_image(image):
    """Return the input image followed by five simple augmented variants.

    Order of the returned list: original, horizontal flip, brightness x0.8,
    brightness x1.2, rotation by -10 degrees, rotation by +10 degrees.

    Args:
        image: BGR image (uint8) as read by OpenCV.

    Returns:
        A list of 6 images, the first being the very object passed in.
    """
    variants = [image, cv2.flip(image, 1)]

    # Brightness: scale only the V channel in HSV, clipped to the 0-255 range.
    base_hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV).astype(np.float32)
    for gain in (0.8, 1.2):
        adjusted = base_hsv.copy()
        adjusted[:, :, 2] = np.clip(base_hsv[:, :, 2] * gain, 0, 255)
        variants.append(cv2.cvtColor(adjusted.astype(np.uint8), cv2.COLOR_HSV2BGR))

    # Small rotations about the image centre, keeping the original canvas size.
    height, width = image.shape[:2]
    pivot = (width / 2, height / 2)
    variants.extend(
        cv2.warpAffine(image, cv2.getRotationMatrix2D(pivot, degrees, 1), (width, height))
        for degrees in (-10, 10)
    )

    return variants

print("‚úÖ Augmentation function (6x data)")

## Step 7: Load and Process Dataset with Augmentation

In [None]:
# Variety mapping - STRICT 3 classes only.
# Keys are lowercase fragments searched for inside a class label; the first key
# (in insertion order) found in the label decides the variety.
VARIETY_MAP = {
    'musang': 'D197_MusangKing',
    'd197': 'D197_MusangKing',
    'king': 'D197_MusangKing',
    'black': 'D200_BlackThorn',
    'd200': 'D200_BlackThorn',
    'thorn': 'D200_BlackThorn',
    'udang': 'D175_UdangMerah',
    'd175': 'D175_UdangMerah',
    'merah': 'D175_UdangMerah',
    'prawn': 'D175_UdangMerah'
}

def detect_variety(label_str):
    """Map a free-form class label to one of the three supported varieties.

    Matching is a case-insensitive substring search against the keys of
    ``VARIETY_MAP``.

    Args:
        label_str: Class label. Usually a string, but ``None`` and non-string
            values (for example numeric class names from a data.yaml) are
            accepted instead of raising ``AttributeError``.

    Returns:
        The variety identifier (e.g. ``'D197_MusangKing'``), or ``None`` when
        the label is missing or matches no known variety (caller should skip).
    """
    if label_str is None:
        return None
    label_lower = str(label_str).lower()
    return next(
        (variety for key, variety in VARIETY_MAP.items() if key in label_lower),
        None,  # Unknown - skip
    )

def _read_class_names(yaml_path):
    """Return {class_id: class_name} from a YOLOv5 data.yaml ({} if absent/malformed)."""
    with open(yaml_path, 'r') as f:
        data = yaml.safe_load(f)
    names = data.get('names') if isinstance(data, dict) else None
    if isinstance(names, dict):
        return names
    if isinstance(names, (list, tuple)):
        return dict(enumerate(names))
    return {}


def _read_first_class_id(label_path):
    """Return the class id on the first non-blank line of a label file, or None."""
    if not os.path.exists(label_path):
        return None
    with open(label_path, 'r') as f:
        for line in f:
            fields = line.split()
            if fields:
                return int(fields[0])
    return None


def load_yolov5_dataset_with_augmentation(dataset_paths, use_augmentation=True):
    """Build a feature table from YOLOv5-format dataset folders.

    For every image in the ``train``/``valid``/``test`` splits, the class id on
    the first non-blank line of its label file is mapped to a class name via
    ``data.yaml`` and then to a variety with ``detect_variety``. Images that
    map to none of the three varieties are skipped. Each kept image (and its
    augmented variants when enabled) is segmented and turned into one feature
    row.

    Files are processed in sorted order so the row order - and therefore any
    seeded split done on the result - is reproducible across machines.

    NOTE(review): all augmented variants of one photo become separate rows in
    the same table; a later random train/test split can place variants of the
    same photo on both sides.

    Args:
        dataset_paths: Iterable of dataset root directories.
        use_augmentation: When True, every image is expanded with
            ``augment_image`` before feature extraction.

    Returns:
        ``pandas.DataFrame`` with one row per processed image variant: the
        feature columns plus a ``'Variety'`` column. Empty (no columns) when
        nothing could be processed.
    """
    all_data = []
    variety_counts = {'D197_MusangKing': 0, 'D200_BlackThorn': 0, 'D175_UdangMerah': 0}
    failures = []  # (image path, exception) for images that raised while processing

    for dataset_path in dataset_paths:
        print(f"\nüìÇ Processing: {dataset_path}")

        # Read class names from data.yaml
        yaml_path = os.path.join(dataset_path, "data.yaml")
        class_names = {}
        if os.path.exists(yaml_path):
            class_names = _read_class_names(yaml_path)
            print(f"  Classes: {class_names}")

        for split in ['train', 'valid', 'test']:
            images_dir = os.path.join(dataset_path, split, 'images')
            labels_dir = os.path.join(dataset_path, split, 'labels')

            if not os.path.exists(images_dir):
                continue

            # Sorted: os.listdir returns entries in arbitrary order.
            images = sorted(
                f for f in os.listdir(images_dir)
                if f.lower().endswith(('.jpg', '.jpeg', '.png'))
            )
            print(f"  üìÅ {split}: {len(images)} images")

            for img_file in tqdm(images, desc=f"{split}", leave=False):
                img_path = os.path.join(images_dir, img_file)
                try:
                    image = cv2.imread(img_path)
                    if image is None:
                        continue

                    # Get class from label file
                    label_file = os.path.splitext(img_file)[0] + '.txt'
                    class_id = _read_first_class_id(os.path.join(labels_dir, label_file))
                    if class_id is None:
                        continue

                    # str(): class names read from YAML are not guaranteed to be strings.
                    variety = detect_variety(str(class_names.get(class_id, '')))

                    # Skip if not one of our 3 varieties
                    if variety is None:
                        continue

                    # Apply augmentation or use original
                    images_to_process = augment_image(image) if use_augmentation else [image]

                    for aug_img in images_to_process:
                        mask = get_mask_lab_method(aug_img)
                        # Ignore masks covering less than 1% of the 512x512 frame.
                        if cv2.countNonZero(mask) < (512 * 512 * 0.01):
                            continue

                        features = extract_features(aug_img, mask)
                        if features:
                            features['Variety'] = variety
                            all_data.append(features)
                            variety_counts[variety] = variety_counts.get(variety, 0) + 1

                except Exception as exc:
                    # Best effort: one bad file must not abort the run, but it is
                    # recorded so the problem is reported instead of hidden.
                    failures.append((img_path, exc))
                    continue

    if failures:
        first_path, first_exc = failures[0]
        print(f"\nWARNING: skipped {len(failures)} image(s) because of errors; "
              f"first: {first_path}: {first_exc!r}")

    print(f"\nüìä Variety Distribution:")
    for v, c in variety_counts.items():
        print(f"   {v}: {c}")

    return pd.DataFrame(all_data)

# Find dataset paths: every directory in the working directory whose name contains
# "durian". Sorted because os.listdir order is arbitrary and the row order of the
# resulting table feeds the seeded train/test split.
DATASET_PATHS = sorted(
    d for d in os.listdir('.') if 'durian' in d.lower() and os.path.isdir(d)
)
print(f"Found datasets: {DATASET_PATHS}")

if not DATASET_PATHS:
    raise FileNotFoundError(
        "No directory containing 'durian' found in the working directory - "
        "run the dataset download step first."
    )

# Load with augmentation
df = load_yolov5_dataset_with_augmentation(DATASET_PATHS, use_augmentation=True)
print(f"\n‚úÖ Total samples: {len(df)}")

# An empty table has no 'Variety' column; fail with a clear message instead of a KeyError.
if df.empty:
    raise ValueError(
        "No usable samples were extracted from the datasets - check the class "
        "names in data.yaml and the label files."
    )
print(df['Variety'].value_counts())

## Step 8: Train HIGH-CONFIDENCE Variety Model

In [None]:
print("="*60)
print("TRAINING HIGH-CONFIDENCE VARIETY MODEL")
print("="*60)

# scikit-learn is already a dependency of this notebook; imported here so the
# cell is self-contained.
from sklearn.pipeline import Pipeline as SkPipeline

# Use original app.py compatible features
VARIETY_FEATURES = ['Compactness', 'Smoothness', 'Aspect_Ratio', 'Rectangularity', 'Mean_Red']

# Plain float array (the estimators were previously fitted on arrays as well).
X = df[VARIETY_FEATURES].to_numpy(dtype=float)
y = df['Variety']

variety_encoder = LabelEncoder()
y_encoded = variety_encoder.fit_transform(y)
print(f"\nClasses: {variety_encoder.classes_}")
print(f"Samples per class: {pd.Series(y_encoded).value_counts().to_dict()}")

# Split FIRST, on raw features, so nothing about the test rows influences
# the scaler or the oversampler.
# NOTE(review): df holds one row per augmented variant, so variants of the same
# photo can still land on both sides of this split; the reported test accuracy
# is therefore likely optimistic. A group-aware split needs a per-image id
# from the loading step.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# Feature Scaling - fitted on the training rows only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# SMOTE - applied to the (scaled) training rows only
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)
print(f"After SMOTE: {pd.Series(y_train_smote).value_counts().to_dict()}")

# Optimized Ensemble
print("\nüîß Training Optimized Ensemble...")

# Individual models with optimized parameters
svm = SVC(
    probability=True,  # required for soft voting
    kernel='rbf',
    C=10.0,  # Higher C = weaker regularization (fits the training data more tightly)
    gamma='scale',
    random_state=42
)

# Named rf_clf so it does not overwrite the Roboflow client `rf` from the download step.
rf_clf = RandomForestClassifier(
    n_estimators=200,  # More trees
    max_depth=15,
    min_samples_split=5,
    class_weight='balanced',
    random_state=42
)

xgb_clf = xgb.XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric='mlogloss'
)

gb = GradientBoostingClassifier(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    random_state=42
)

ensemble = VotingClassifier(
    estimators=[
        ('svm', svm),
        ('rf', rf_clf),
        ('xgb', xgb_clf),
        ('gb', gb)
    ],
    voting='soft',
    weights=[1, 1.5, 1.5, 1]  # Weight RF and XGB slightly higher
)

ensemble.fit(X_train_smote, y_train_smote)

# The saved model bundles the fitted scaler with the fitted ensemble, so it
# accepts RAW feature values. Previously the scaler was fitted here but never
# written to disk, leaving the pickled model dependent on scaling it did not carry.
# NOTE(review): confirm app.py feeds unscaled features in VARIETY_FEATURES order.
variety_model = SkPipeline([('scaler', scaler), ('ensemble', ensemble)])

# Cross-validation: scaling and SMOTE are redone inside every fold (the imblearn
# pipeline resamples only while fitting), so validation folds contain real,
# untouched samples. cross_val_score clones the pipeline, leaving the fitted
# `ensemble` above unchanged.
print("\nüìä Cross-Validation (5-fold):")
cv_pipeline = ImbPipeline([
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=42)),
    ('ensemble', ensemble)
])
cv_scores = cross_val_score(cv_pipeline, X_train, y_train, cv=5, scoring='accuracy')
print(f"   CV Scores: {cv_scores}")
print(f"   Mean CV Accuracy: {cv_scores.mean()*100:.2f}% (+/- {cv_scores.std()*2*100:.2f}%)")

# Test set evaluation (raw features; the pipeline applies the training-set scaler)
y_pred = variety_model.predict(X_test)
y_pred_proba = variety_model.predict_proba(X_test)

print(f"\nüéØ Test Set Accuracy: {accuracy_score(y_test, y_pred)*100:.2f}%")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=variety_encoder.classes_))

# Confidence analysis: highest class probability per test sample
confidences = np.max(y_pred_proba, axis=1)
print(f"\nüìà Confidence Statistics:")
print(f"   Mean: {confidences.mean()*100:.1f}%")
print(f"   Min: {confidences.min()*100:.1f}%")
print(f"   Max: {confidences.max()*100:.1f}%")
print(f"   Samples > 80%: {(confidences > 0.8).sum()} / {len(confidences)}")

## Step 9: Train Ripeness Model

In [None]:
print("="*60)
print("TRAINING RIPENESS MODEL")
print("="*60)

# scikit-learn is already a dependency of this notebook; imported here so the
# cell is self-contained.
from sklearn.pipeline import Pipeline as SkPipeline

# For ripeness, we'll use a default since Roboflow doesn't have ripeness labels
# We'll train on simulated data based on color features

RIPENESS_FEATURES = ['Mean_Hue', 'Compactness', 'Smoothness']

# Synthetic ripeness labels (simple heuristic, NOT ground truth):
#   'defective' when mean saturation < 50 OR compactness > 80, otherwise 'mature'.
# NOTE(review): 'Compactness' is both a model input and part of the labelling
# rule, so the accuracy printed below mostly measures how well the rule is
# re-learned, not real ripeness prediction.
is_defective = (df['Mean_Saturation'] < 50) | (df['Compactness'] > 80)
df['Ripeness'] = np.where(is_defective, 'defective', 'mature')

# Stratified splitting, SMOTE and the classifiers all need both classes present.
if df['Ripeness'].nunique() < 2:
    raise ValueError(
        "Ripeness heuristic produced a single class "
        f"({df['Ripeness'].iloc[0]!r}); cannot train a ripeness classifier."
    )

X_ripe = df[RIPENESS_FEATURES]
y_ripe = df['Ripeness']

ripeness_encoder = LabelEncoder()
y_ripe_encoded = ripeness_encoder.fit_transform(y_ripe)
print(f"Classes: {ripeness_encoder.classes_}")
print(f"Distribution: {pd.Series(y_ripe_encoded).value_counts().to_dict()}")

X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    X_ripe, y_ripe_encoded, test_size=0.2, random_state=42, stratify=y_ripe_encoded
)

smote_r = SMOTE(random_state=42)
X_train_r_smote, y_train_r_smote = smote_r.fit_resample(X_train_r, y_train_r)

ripeness_model = VotingClassifier(
    estimators=[
        # The RBF SVC is distance based, so it gets its own scaler; the model as a
        # whole still takes raw feature values.
        ('svm', SkPipeline([
            ('scaler', StandardScaler()),
            ('svc', SVC(probability=True, random_state=42))
        ])),
        ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
        # Two classes -> binary log loss (mlogloss is the multi-class metric).
        ('xgb', xgb.XGBClassifier(n_estimators=100, random_state=42, eval_metric='logloss'))
    ],
    voting='soft'
)

ripeness_model.fit(X_train_r_smote, y_train_r_smote)

y_pred_r = ripeness_model.predict(X_test_r)
print(f"\nüéØ Ripeness Accuracy: {accuracy_score(y_test_r, y_pred_r)*100:.2f}%")
print(classification_report(y_test_r, y_pred_r, target_names=ripeness_encoder.classes_))

## Step 10: Save Models

In [None]:
OUTPUT = "TRAINING_MODEL"
os.makedirs(OUTPUT, exist_ok=True)

joblib.dump(variety_model, f"{OUTPUT}/variety_model.pkl")
joblib.dump(variety_encoder, f"{OUTPUT}/variety_model_encoder.pkl")
joblib.dump(ripeness_model, f"{OUTPUT}/ripeness_model.pkl")
joblib.dump(ripeness_encoder, f"{OUTPUT}/ripeness_model_encoder.pkl")
df.to_csv(f"{OUTPUT}/features.csv", index=False)

print("‚úÖ Models saved!")
!ls -la {OUTPUT}

In [None]:
!zip -r trained_models_v2.zip TRAINING_MODEL/

from google.colab import files
files.download('trained_models_v2.zip')

print("\n" + "="*60)
print("üéâ ENHANCED TRAINING COMPLETE!")
print("="*60)
print("\nImprovements in this version:")
print("‚úÖ Data augmentation (6x more samples)")
print("‚úÖ 4-model ensemble (SVM, RF, XGB, GB)")
print("‚úÖ Optimized hyperparameters")
print("‚úÖ Cross-validation verified")
print("\nNext: Extract ZIP, copy .pkl to 'TRAINING MODEL', restart Flask")