In [2]:
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# --- Configuration ---
MODEL_FILE = 'roti_spoiler_pipeline.joblib'
N_SAMPLES = 3500
SEED = 42
np.random.seed(SEED)

print(f"--- Roti Spoilage Detector ML Training Pipeline ---")
print(f"Target model file: {MODEL_FILE}")
print(f"Dataset size: {N_SAMPLES} rows")
print("\n" + "="*50 + "\n")

# ==============================================================================
# 1. DATA GENERATION (Embedding Custom Spoilage Logic)
# ==============================================================================

def generate_synthetic_data(n_samples):
    """
    Generates a synthetic dataset for Roti spoilage detection, embedding the logical constraints.
    roti_state: 1 = Spoiled, 0 = Fresh/Safe
    """
    # Define categories based on typical scenarios
    storage_locations = ['Room Temperature', 'Refrigerator', 'Freezer', 'Open Counter', 'Lunchbox']
    storage_containers = ['Airtight Box', 'Aluminium Foil Wrap', 'Cloth/Basket', 'Ziploc Bag', 'Open Plate']
    fat_contents = ['Low (0-5%)', 'Medium (5-10%)', 'High (>10%)']
    ambient_seasons = ['Warm & Humid', 'Cool & Dry', 'Neutral', 'Monsoon (Very Humid)']
    observed_textures = ['Soft & Pliable', 'Slightly Hardened', 'Dry & Brittle', 'Slimy/Sticky', 'Fuzzy/Mold']
    observed_appearances = ['Golden Brown', 'Lightly Spotted', 'Dark Patches', 'Oil Separation/Condensation', 'Visible Fuzz/Growth']

    # Generate features
    df = pd.DataFrame({
        'time_since_cooking_hr': np.random.uniform(0.5, 72, n_samples).round(1),
        'storage_location': np.random.choice(storage_locations, n_samples, p=[0.4, 0.3, 0.1, 0.1, 0.1]),
        'storage_container': np.random.choice(storage_containers, n_samples, p=[0.3, 0.2, 0.2, 0.1, 0.2]),
        'fat_content': np.random.choice(fat_contents, n_samples, p=[0.4, 0.4, 0.2]),
        'ambient_season': np.random.choice(ambient_seasons, n_samples, p=[0.3, 0.3, 0.3, 0.1]),
        'observed_texture': np.random.choice(observed_textures, n_samples, p=[0.6, 0.2, 0.1, 0.05, 0.05]),
        'observed_appearance': np.random.choice(observed_appearances, n_samples, p=[0.6, 0.2, 0.1, 0.05, 0.05]),
    })

    df['roti_state'] = 0 # Initialize as fresh

    # --- Apply Core Spoilage Logic (Training the model to understand context) ---

    # 1. HARD SPOILAGE RULE (Time/Temp): >24 hours at Room Temp
    rule_1 = (df['time_since_cooking_hr'] > 24) & (df['storage_location'] == 'Room Temperature')
    df.loc[rule_1, 'roti_state'] = 1

    # 2. SENSORY INDICATORS: Strong visual/texture signs = Spoiled
    rule_2 = (df['observed_texture'].isin(['Slimy/Sticky', 'Fuzzy/Mold'])) | \
             (df['observed_appearance'].isin(['Visible Fuzz/Growth', 'Dark Patches']))
    df.loc[rule_2, 'roti_state'] = 1

    # 3. MOISTURE RISK: Condensation + Medium time = Higher Spoilage Chance
    rule_3 = (df['observed_appearance'] == 'Oil Separation/Condensation') & \
             (df['time_since_cooking_hr'] > 6) & (df['time_since_cooking_hr'] <= 48) & \
             (df['roti_state'] == 0)
    df.loc[rule_3, 'roti_state'] = np.random.choice([0, 1], size=rule_3.sum(), p=[0.2, 0.8])

    # 4. LOW RISK: Refrigerated/Frozen (Extremely low spoilage probability)
    rule_4 = df['storage_location'].isin(['Refrigerator', 'Freezer']) & (df['roti_state'] == 0)
    df.loc[rule_4, 'roti_state'] = np.random.choice([0, 1], size=rule_4.sum(), p=[0.99, 0.01])

    # 5. AMBIENT RISK: Warm & Humid / Monsoon + Moderate Time
    rule_5 = df['ambient_season'].isin(['Warm & Humid', 'Monsoon (Very Humid)']) & \
             (df['time_since_cooking_hr'] > 8) & (df['roti_state'] == 0)
    df.loc[rule_5, 'roti_state'] = np.random.choice([0, 1], size=rule_5.sum(), p=[0.7, 0.3])

    # 6. Global Time Max: Anything over 60 hours is spoiled unless frozen/refrigerated
    rule_6 = (df['time_since_cooking_hr'] > 60) & \
             ~df['storage_location'].isin(['Refrigerator', 'Freezer'])
    df.loc[rule_6, 'roti_state'] = 1

    return df.drop_duplicates().reset_index(drop=True)

df = generate_synthetic_data(N_SAMPLES)
print(f"Data Generated successfully. Shape: {df.shape}")
print(f"Spoilage Distribution:\n{df['roti_state'].value_counts(normalize=True)}")
print(df.head())
print("\n" + "="*50 + "\n")

# ==============================================================================
# 2. FEATURE ENGINEERING AND PREPROCESSING
# ==============================================================================

X = df.drop('roti_state', axis=1)
y = df['roti_state']

# Define feature types
numerical_features = ['time_since_cooking_hr']
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

print(f"Numerical Features: {numerical_features}")
print(f"Categorical Features: {categorical_features}")

# Create the preprocessing pipelines
numerical_pipeline = Pipeline([
    ('scaler', StandardScaler())
])

categorical_pipeline = Pipeline([
    # handle_unknown='ignore' ensures robustness when unseen categories appear (though unlikely here)
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features)
    ],
    remainder='passthrough'
)

print("ColumnTransformer set up for scaling numerical features and OHE categorical features.")
print("\n" + "="*50 + "\n")

# ==============================================================================
# 3. MODEL TRAINING AND SELECTION
# ==============================================================================

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=SEED)
print(f"Data split into Train ({X_train.shape[0]}) and Test ({X_test.shape[0]}) sets.")

# Define candidate models
models = {
    'Logistic Regression': LogisticRegression(solver='liblinear', random_state=SEED),
    'Random Forest': RandomForestClassifier(n_estimators=100, max_depth=10, random_state=SEED),
    'Gradient Boosting (XGB)': GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=SEED)
}

best_model_name = None
best_accuracy = 0
best_pipeline = None

for name, model in models.items():
    print(f"\nTraining {name}...")
    # Create full pipeline (preprocessor + model)
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])

    # Train
    pipeline.fit(X_train, y_train)

    # Predict
    y_pred = pipeline.predict(X_test)

    # Evaluate
    accuracy = accuracy_score(y_test, y_pred)
    print(f"  -> Test Accuracy: {accuracy:.4f}")

    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model_name = name
        best_pipeline = pipeline

print("\n" + "="*50)
print(f"BEST MODEL: {best_model_name} with Accuracy: {best_accuracy:.4f}")
print("Classification Report for Best Model:")
y_pred_best = best_pipeline.predict(X_test)
print(classification_report(y_test, y_pred_best, target_names=['Fresh', 'Spoiled']))
print("="*50 + "\n")


# ==============================================================================
# 4. MODEL PERSISTENCE (Saving the best pipeline)
# ==============================================================================

if best_pipeline:
    # Save the entire pipeline, including preprocessing steps
    joblib.dump(best_pipeline, MODEL_FILE)
    print(f"SUCCESS: The complete ML pipeline has been saved to '{MODEL_FILE}'.")
else:
    print("ERROR: No model pipeline was selected or saved.")

--- Roti Spoilage Detector ML Training Pipeline ---
Target model file: roti_spoiler_pipeline.joblib
Dataset size: 3500 rows


Data Generated successfully. Shape: (3493, 8)
Spoilage Distribution:
roti_state
1    0.542227
0    0.457773
Name: proportion, dtype: float64
   time_since_cooking_hr  storage_location    storage_container  \
0                   27.3  Room Temperature         Airtight Box   
1                   68.5      Open Counter         Airtight Box   
2                   52.8          Lunchbox           Open Plate   
3                   43.3  Room Temperature           Ziploc Bag   
4                   11.7      Open Counter  Aluminium Foil Wrap   

      fat_content ambient_season   observed_texture observed_appearance  \
0      Low (0-5%)   Warm & Humid     Soft & Pliable     Lightly Spotted   
1      Low (0-5%)   Warm & Humid     Soft & Pliable        Golden Brown   
2  Medium (5-10%)        Neutral       Slimy/Sticky     Lightly Spotted   
3      Low (0-5%)     Cool & D