# 🍈 MusangKing Aligned Training Pipeline
## Using K-Means Segmentation (Same as app.py)

Downloads **BOTH** Roboflow datasets for maximum training data!

## Step 1: Install Dependencies

In [None]:
# Install the third-party packages this notebook uses through the shell; `-q` keeps
# pip's output quiet. The confirmation below is printed unconditionally, i.e. it
# does not check whether pip actually succeeded.
!pip install roboflow opencv-python-headless scikit-learn imbalanced-learn xgboost tqdm -q
print("‚úÖ Dependencies installed!")

## Step 2: Download BOTH Datasets from Roboflow

In [None]:
import os
from getpass import getpass

from roboflow import Roboflow

# Credentials must not live in the notebook source. Read the key from the
# ROBOFLOW_API_KEY environment variable and fall back to a non-echoing prompt.
# NOTE(review): the key that was previously hard-coded in this cell has been
# exposed in the notebook history and should be revoked/rotated in Roboflow.
API_KEY = os.environ.get("ROBOFLOW_API_KEY") or getpass("Roboflow API key: ")
rf = Roboflow(api_key=API_KEY)

# Download Dataset 1: Durian Own (workspace "durian-cf87w", version 1, "folder" export)
print("üì• Downloading Dataset 1: Durian Own...")
project1 = rf.workspace("durian-cf87w").project("durian_own")
dataset1 = project1.version(1).download("folder")
print("‚úÖ Dataset 1 downloaded!\n")

# Download Dataset 2: Durian Thesis (workspace "carl-bwzge", version 3, "folder" export)
print("üì• Downloading Dataset 2: Durian Thesis...")
project2 = rf.workspace("carl-bwzge").project("durian-thesis")
dataset2 = project2.version(3).download("folder")
print("‚úÖ Dataset 2 downloaded!")

print("\nüéâ Both datasets ready!")

In [None]:
# Check downloaded folders
# Lists the working directory first, then the contents of the folders matching the
# two expected dataset name patterns. When a pattern matches nothing, `ls` fails
# (its stderr is discarded) and the fallback message is echoed instead.
!echo "=== Downloaded Folders ==="
!ls -la
!echo ""
!echo "=== Dataset 1 Structure ==="
!ls -la durian_own*/ 2>/dev/null || echo "Not found with this name"
!echo ""
!echo "=== Dataset 2 Structure ==="
!ls -la durian-thesis*/ 2>/dev/null || echo "Not found with this name"

## Step 3: Set Dataset Paths

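**REVIEW THESE**: the cell below auto-detects every folder in the working directory whose name contains "durian". Compare the detected list with the folder names shown above, and edit `DATASET_PATHS` by hand if a dataset is missing or an unrelated folder was picked up.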

In [None]:
import os

# Auto-detect dataset paths: every directory in the working directory whose name
# contains "durian" (case-insensitive) is treated as a dataset root.
DATASET_PATHS = []

# os.listdir() returns entries in arbitrary order; sorting keeps the order in which
# datasets are processed (and hence the row order fed to the seeded train/test
# split) the same from run to run.
for folder in sorted(os.listdir('.')):
    if 'durian' in folder.lower() and os.path.isdir(folder):
        DATASET_PATHS.append(folder)
        print(f"‚úÖ Found: {folder}")

print(f"\nTotal datasets found: {len(DATASET_PATHS)}")

## Step 4: Import Libraries

In [None]:
import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm
import joblib
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
import xgboost as xgb

print("‚úÖ Libraries imported!")

## Step 5: K-Means Segmentation (SAME AS APP.PY)

In [None]:
def get_mask_lab_method(image):
    """K-Means Segmentation - IDENTICAL TO APP.PY (Gamma=0.6, K=3)

    Pipeline: resize to 512x512, apply a gamma lookup table, convert to Lab,
    cluster the pixels into 3 groups, keep the cluster colour found at the image
    centre, close small holes, and finally keep only the largest connected region.

    Note that the lookup table raises the normalised intensity to the power
    ``1 / gamma``; with ``gamma = 0.6`` that exponent is greater than 1.

    Args:
        image: input picture; converted with ``COLOR_BGR2Lab``, so it is assumed
            to be a 3-channel BGR array (e.g. the result of ``cv2.imread``).

    Returns:
        A 512x512 single-channel mask (255 inside the selected region, 0 elsewhere).
        K-Means starts from random centres, so results may vary between calls.
    """
    resized = cv2.resize(image, (512, 512))

    # Gamma lookup table: out = 255 * (in / 255) ** (1 / gamma), truncated to uint8.
    gamma = 0.6
    exponent = 1.0 / gamma
    lut = np.array([((level / 255.0) ** exponent) * 255 for level in range(256)]).astype("uint8")
    lab = cv2.cvtColor(cv2.LUT(resized, lut), cv2.COLOR_BGR2Lab)

    # Cluster every pixel (as a float32 Lab triple) into K=3 groups, 10 attempts.
    pixels = lab.reshape((-1, 3)).astype(np.float32)
    stop_criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 10, 1.0)
    _, labels, centers = cv2.kmeans(pixels, 3, None, stop_criteria, 10, cv2.KMEANS_RANDOM_CENTERS)

    # Paint each pixel with its (uint8) cluster centre.
    quantized = np.uint8(centers)[labels.flatten()].reshape(lab.shape)

    # The cluster colour at the centre pixel is taken to be the object.
    rows, cols = quantized.shape[:2]
    seed_color = quantized[rows // 2, cols // 2]
    mask = cv2.inRange(quantized, seed_color, seed_color)
    mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, np.ones((5, 5), np.uint8), iterations=2)

    outlines, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not outlines:
        return mask

    # Keep only the largest external contour, drawn filled.
    largest = max(outlines, key=cv2.contourArea)
    filled = np.zeros_like(mask)
    cv2.drawContours(filled, [largest], -1, 255, -1)
    return filled

print("‚úÖ K-Means function (Gamma=0.6, K=3)")

## Step 6: Feature Extraction (SAME AS APP.PY)

In [None]:
def extract_features(image, mask):
    """Compute shape and colour descriptors of the largest region in ``mask``.

    Args:
        image: source picture; it is resized to 512x512 before the colour
            statistics are taken (assumed BGR, since ``COLOR_BGR2HSV`` is applied).
        mask: single-channel mask used both for contour detection and as the
            region over which the colour means are computed.

    Returns:
        A dict with the keys ``Compactness``, ``Smoothness``, ``Aspect_Ratio``,
        ``Rectangularity``, ``Mean_Red`` and ``Mean_Hue``, or ``None`` when the
        mask has no contour or the largest contour has zero area or perimeter.
    """
    resized = cv2.resize(image, (512, 512))

    outlines, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not outlines:
        return None

    largest = max(outlines, key=cv2.contourArea)
    area = cv2.contourArea(largest)
    perimeter = cv2.arcLength(largest, True)
    if area == 0 or perimeter == 0:
        return None

    # Polygon approximation with a tolerance of 1% of the perimeter.
    polygon = cv2.approxPolyDP(largest, 0.01 * perimeter, True)
    _, _, box_w, box_h = cv2.boundingRect(largest)
    box_area = box_w * box_h

    hsv = cv2.cvtColor(resized, cv2.COLOR_BGR2HSV)

    features = {}
    features['Compactness'] = (perimeter ** 2) / area
    features['Smoothness'] = len(polygon) / perimeter
    features['Aspect_Ratio'] = float(box_w) / box_h if box_h > 0 else 0
    features['Rectangularity'] = area / box_area if box_area > 0 else 0
    features['Mean_Red'] = cv2.mean(resized, mask=mask)[2]  # channel 2 of a BGR image
    features['Mean_Hue'] = cv2.mean(hsv, mask=mask)[0]      # channel 0 of an HSV image
    return features

print("‚úÖ Feature extraction function")

## Step 7: Process ALL Datasets

In [None]:
_IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')

# Column order of the returned frame; also used when no image could be processed.
_FEATURE_COLUMNS = ['Compactness', 'Smoothness', 'Aspect_Ratio', 'Rectangularity',
                    'Mean_Red', 'Mean_Hue', 'Variety', 'Ripeness']


def _labels_from_path(path_parts):
    """Infer ``(variety, ripeness)`` from folder names; 'Unknown' when nothing matches."""
    variety = 'Unknown'
    ripeness = 'Unknown'

    for part in path_parts:
        part_lower = part.lower()

        # Detect variety (a deeper folder overrides a shallower one)
        if 'musang' in part_lower or 'd197' in part_lower:
            variety = 'D197_MusangKing'
        elif 'black' in part_lower or 'd200' in part_lower:
            variety = 'D200_BlackThorn'
        elif 'udang' in part_lower or 'd175' in part_lower or 'merah' in part_lower:
            variety = 'D175_UdangMerah'

        # Detect ripeness. 'immature' contains 'mature', so test the longer word
        # first; 'defect' also covers 'defective'.
        if 'immature' in part_lower:
            ripeness = 'immature'
        elif 'mature' in part_lower:
            ripeness = 'mature'
        elif 'defect' in part_lower:
            ripeness = 'defective'

    return variety, ripeness


def load_all_datasets(dataset_paths):
    """Walk the dataset folders and build one feature row per usable image.

    Labels are inferred from the folder names *below* each dataset root. Folders
    whose variety cannot be determined are skipped entirely. An image is skipped
    when it cannot be read, when its mask covers less than 1% of the 512x512
    frame, when no features can be extracted, or when processing raises an error.

    Args:
        dataset_paths: iterable of dataset root directories.

    Returns:
        pandas.DataFrame with the feature columns plus 'Variety' and 'Ripeness'.
        The columns are present even when no image was processed.
    """
    all_data = []

    for dataset_path in dataset_paths:
        print(f"\nüìÇ Processing: {dataset_path}")

        # Walk through all subdirectories
        for root, dirs, files in os.walk(dataset_path):
            # Sorting in place makes os.walk descend in a deterministic order, so the
            # resulting row order (and any seeded split of it) is reproducible.
            dirs.sort()
            images = sorted(f for f in files if f.lower().endswith(_IMAGE_EXTENSIONS))
            if not images:
                continue

            # Only the path below the dataset root takes part in label detection.
            rel_root = os.path.relpath(root, dataset_path)
            variety, ripeness = _labels_from_path(rel_root.split(os.sep))

            if variety == 'Unknown':
                continue  # Skip if can't determine variety

            print(f"  üìÅ {variety}/{ripeness}: {len(images)} images")

            failures = 0
            first_error = None
            for img_file in tqdm(images, desc=f"{variety}/{ripeness}", leave=False):
                try:
                    image = cv2.imread(os.path.join(root, img_file))
                    if image is None:
                        continue

                    mask = get_mask_lab_method(image)
                    # Reject masks covering less than 1% of the 512x512 frame.
                    if cv2.countNonZero(mask) < (512 * 512 * 0.01):
                        continue

                    features = extract_features(image, mask)
                except Exception as exc:
                    # Best effort: one bad image must not abort the run, but do not
                    # swallow KeyboardInterrupt/SystemExit and do not hide the cause.
                    failures += 1
                    if first_error is None:
                        first_error = f"{img_file}: {exc!r}"
                    continue

                if features:
                    features['Variety'] = variety
                    features['Ripeness'] = ripeness
                    all_data.append(features)

            if failures:
                print(f"  Skipped {failures} image(s) that raised errors (first: {first_error})")

    return pd.DataFrame(all_data, columns=_FEATURE_COLUMNS)

print("Loading from:", DATASET_PATHS)
df = load_all_datasets(DATASET_PATHS)
print(f"\n‚úÖ Total processed: {len(df)} images!")

# Stop here with an actionable message when nothing was loaded; otherwise the
# summary below (or the training cells) fails with a far less obvious error.
if df.empty:
    raise RuntimeError(
        "No images were processed. Check that DATASET_PATHS points at the downloaded "
        "datasets and that their folder names contain a recognised variety keyword."
    )

print("\nSummary:")
print(df[['Variety', 'Ripeness']].value_counts())

## Step 8: Train Variety Model

In [None]:
print("TRAINING VARIETY MODEL")

# Shape descriptors plus mean red are the inputs of the variety classifier.
X = df[['Compactness', 'Smoothness', 'Aspect_Ratio', 'Rectangularity', 'Mean_Red']]
y = df['Variety']

variety_encoder = LabelEncoder()
y_encoded = variety_encoder.fit_transform(y)
print(f"Classes: {variety_encoder.classes_}")

# Stratified 80/20 split so every variety is represented in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

# SMOTE interpolates between a sample and its k nearest same-class neighbours, so k
# must be smaller than the smallest class. Keep the default of 5 whenever possible,
# shrink it for tiny classes, and skip oversampling when even k=1 is impossible.
smallest_class = int(np.bincount(y_train).min())
k_neighbors = min(5, smallest_class - 1)
if k_neighbors >= 1:
    smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
    X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
else:
    print("Smallest class has a single training sample - training without SMOTE")
    X_train_smote, y_train_smote = X_train, y_train

# Soft-voting ensemble: class probabilities of the three models are averaged.
# NOTE(review): the SVC receives the raw, unscaled features - confirm this is intended.
variety_model = VotingClassifier(
    estimators=[('svm', SVC(probability=True, random_state=42)),
                ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
                ('xgb', xgb.XGBClassifier(n_estimators=100, random_state=42, eval_metric='mlogloss'))],
    voting='soft')
variety_model.fit(X_train_smote, y_train_smote)

# Predict once and reuse the result for both reports.
y_pred = variety_model.predict(X_test)
print(f"\nüéØ Accuracy: {accuracy_score(y_test, y_pred)*100:.2f}%")
print(classification_report(y_test, y_pred, target_names=variety_encoder.classes_))

## Step 9: Train Ripeness Model

In [None]:
# Filter out 'Unknown' ripeness
df_ripe = df[df['Ripeness'] != 'Unknown']
print(f"Training ripeness on {len(df_ripe)} samples")

if len(df_ripe) == 0:
    print("‚ö†Ô∏è No ripeness labels found - skipping ripeness model")
elif df_ripe['Ripeness'].nunique() < 2:
    # A classifier needs at least two classes; with one label there is nothing to learn.
    print(f"Only one ripeness class found ({df_ripe['Ripeness'].iloc[0]}) - skipping ripeness model")
else:
    # Hue plus two shape descriptors are the inputs of the ripeness classifier.
    X = df_ripe[['Mean_Hue', 'Compactness', 'Smoothness']]
    y = df_ripe['Ripeness']

    ripeness_encoder = LabelEncoder()
    y_encoded = ripeness_encoder.fit_transform(y)
    print(f"Classes: {ripeness_encoder.classes_}")

    # Stratified 80/20 split so every ripeness class is represented in both parts.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

    # SMOTE needs k < size of the smallest class: keep the default of 5 whenever
    # possible, shrink it for tiny classes, skip oversampling when k=1 is impossible.
    smallest_class = int(np.bincount(y_train).min())
    k_neighbors = min(5, smallest_class - 1)
    if k_neighbors >= 1:
        smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
        X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
    else:
        print("Smallest class has a single training sample - training without SMOTE")
        X_train_smote, y_train_smote = X_train, y_train

    # Soft-voting ensemble: class probabilities of the three models are averaged.
    ripeness_model = VotingClassifier(
        estimators=[('svm', SVC(probability=True, random_state=42)),
                    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
                    ('xgb', xgb.XGBClassifier(n_estimators=100, random_state=42, eval_metric='mlogloss'))],
        voting='soft')
    ripeness_model.fit(X_train_smote, y_train_smote)

    # Predict once and reuse the result for both reports.
    y_pred = ripeness_model.predict(X_test)
    print(f"\nüéØ Accuracy: {accuracy_score(y_test, y_pred)*100:.2f}%")
    print(classification_report(y_test, y_pred, target_names=ripeness_encoder.classes_))

## Step 10: Save & Download

In [None]:
OUTPUT = "TRAINING_MODEL"
os.makedirs(OUTPUT, exist_ok=True)

joblib.dump(variety_model, f"{OUTPUT}/variety_model.pkl")
joblib.dump(variety_encoder, f"{OUTPUT}/variety_model_encoder.pkl")
if 'ripeness_model' in dir():
    joblib.dump(ripeness_model, f"{OUTPUT}/ripeness_model.pkl")
    joblib.dump(ripeness_encoder, f"{OUTPUT}/ripeness_model_encoder.pkl")
df.to_csv(f"{OUTPUT}/features.csv", index=False)

print("‚úÖ Models saved!")
!ls -la {OUTPUT}

In [None]:
# Bundle the saved artifacts into one archive. The folder name is spelled out here
# rather than taken from OUTPUT, so the two must be kept in sync by hand.
!zip -r trained_models.zip TRAINING_MODEL/

# Hand the archive to the user through the google.colab file helper.
from google.colab import files
files.download('trained_models.zip')

# Follow-up instructions for the user.
# NOTE(review): step 2 names the folder 'TRAINING MODEL' (with a space) while this
# notebook writes to 'TRAINING_MODEL' (underscore) - confirm which one the app expects.
print("\nüéâ DONE!")
print("1. Extract trained_models.zip")
print("2. Copy .pkl files to 'TRAINING MODEL' folder")
print("3. Restart Flask app")