In [28]:
import pandas as pd
import numpy as np
import joblib
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# --- 1. Data Synthesis based on Dal Spoilage Logic ---
def create_synthetic_dal_data(n_rows=5000):
    np.random.seed(42)
    
    # Define categories
    storage_places = ['Room Temperature', 'Refrigerator', 'Freezer']
    acidity_sources = ['Low/Normal', 'Moderate', 'High']
    consistencies = ['Normal', 'Slightly Thickened', 'Watery', 'Slimy']
    container_types = ['Steel/Metal', 'Plastic', 'Ceramic/Glass']
    smells = ['Normal', 'Slightly Sour', 'Very Sour', 'Musty', 'Foul']

    data = {
        'Time_since_preparation_hours': np.random.uniform(0, 120, n_rows),
        'Storage_place': np.random.choice(storage_places, n_rows, p=[0.4, 0.55, 0.05]),
        'Acidity_source': np.random.choice(acidity_sources, n_rows, p=[0.6, 0.3, 0.1]),
        'Consistency': np.random.choice(consistencies, n_rows, p=[0.7, 0.2, 0.08, 0.02]),
        'Container_type': np.random.choice(container_types, n_rows, p=[0.5, 0.3, 0.2]),
        'Smell': np.random.choice(smells, n_rows, p=[0.65, 0.25, 0.05, 0.03, 0.02]),
        'Oil_separation': np.random.uniform(0.0, 1.0, n_rows)
    }

    df = pd.DataFrame(data)

    # Apply real-world spoilage rules to generate 'Spoiled_flag' (Target)
    def determine_spoilage(row):
        # Rule 1: Long time at Room Temp (>24hrs)
        if row['Storage_place'] == 'Room Temperature' and row['Time_since_preparation_hours'] > 24:
            return 'Spoiled'
        
        # Rule 2: Moderate time at Room Temp + High Acidity/Bad Smell (>8hrs)
        if row['Storage_place'] == 'Room Temperature' and row['Time_since_preparation_hours'] > 8 and \
           (row['Acidity_source'] == 'High' or row['Smell'] in ['Very Sour', 'Foul']):
            return 'Spoiled'
        
        # Rule 3: Extreme indicators (Slimy or Foul Smell)
        if row['Consistency'] == 'Slimy' or row['Smell'] == 'Foul':
            return 'Spoiled'

        # Rule 4: Very long time in Refrigerator + High Acidity (>72hrs)
        if row['Storage_place'] == 'Refrigerator' and row['Time_since_preparation_hours'] > 72 and \
           row['Acidity_source'] == 'High':
            return 'Spoiled'

        # Baseline: Use a probabilistic model based on time and oil separation
        time_factor = row['Time_since_preparation_hours'] / 120.0
        oil_factor = row['Oil_separation']
        
        # Higher probability of spoilage if time is long and oil separation is high
        spoil_prob = time_factor * 0.4 + oil_factor * 0.3
        
        # Adjust probability based on storage
        if row['Storage_place'] == 'Refrigerator':
            spoil_prob *= 0.5
        elif row['Storage_place'] == 'Freezer':
            spoil_prob *= 0.1
        
        if np.random.rand() < spoil_prob:
            return 'Spoiled'
        
        return 'Not Spoiled'

    df['Spoiled_flag'] = df.apply(determine_spoilage, axis=1)
    
    return df

# Create the data or load your existing data (uncomment the line below if you use your 5000 rows)
# df = pd.read_csv('dal_spoilage_data.csv')
df = create_synthetic_dal_data(n_rows=5000)

print(f"Generated data shape: {df.shape}")
print("Spoilage distribution:")
print(df['Spoiled_flag'].value_counts())

Generated data shape: (5000, 8)
Spoilage distribution:
Spoiled_flag
Not Spoiled    2624
Spoiled        2376
Name: count, dtype: int64


In [33]:
# --- 2. CORRECTED Preprocessing and Splitting ---

# Define columns by type
numerical_cols = ['Time_since_preparation_hours', 'Oil_separation']
categorical_cols = ['Storage_place', 'Acidity_source', 'Consistency', 'Container_type', 'Smell']

# Ensure categorical columns are explicitly of type 'category'.
df[categorical_cols] = df[categorical_cols].astype('category')

# Create the Column Transformer
# Removed 'sparse_output=False' for older scikit-learn version compatibility.
preprocessor = ColumnTransformer(
    transformers=[
        # Use StandardScaler for numerical features
        ('num', StandardScaler(), numerical_cols),
        # Use OneHotEncoder for categorical features. 
        # handle_unknown='ignore' prevents errors during transformation if a new category appears.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ],
    remainder='drop'
)

# Encode the target variable
le = LabelEncoder()
df['Spoiled_flag_Encoded'] = le.fit_transform(df['Spoiled_flag'])

X = df.drop(['Spoiled_flag', 'Spoiled_flag_Encoded'], axis=1)
y = df['Spoiled_flag_Encoded']

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the preprocessor to the training data and transform all data
try:
    X_train_processed = preprocessor.fit_transform(X_train)
    X_test_processed = preprocessor.transform(X_test)
    print("Preprocessing successful.")
    
    # Get feature names after one-hot encoding for the final model (used for saving)
    # Note: 'get_feature_names_out' might also require a newer scikit-learn version.
    # We will try to get feature names for robust saving.
    try:
        feature_names = preprocessor.get_feature_names_out()
    except AttributeError:
        # Fallback if get_feature_names_out is not available in your version
        print("Warning: Could not use preprocessor.get_feature_names_out(). Feature name saving may be affected.")
        feature_names = None
        
except Exception as e:
    print(f"ERROR during Preprocessing: {e}")
    print("Please check data types and ensure all required libraries are imported (StandardScaler, OneHotEncoder).")
    raise # Re-raise the error for debugging

# IMPORTANT: Ensure you have imported OneHotEncoder at the top of your script/notebook:
# from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder

Preprocessing successful.


In [34]:
# --- 3. Tuned XGBoost Training with Grid Search ---

# Define the model parameters to search
param_grid = {
    'n_estimators': [150, 200, 300],
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 3],
}

# Initialize XGBoost model
xgb_clf = xgb.XGBClassifier(
    objective='binary:logistic',
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
    # scale_pos_weight is helpful for class imbalance, but we assume it's relatively balanced for now
)

# Initialize GridSearchCV
grid_search = GridSearchCV(
    estimator=xgb_clf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5, # 5-fold Cross-Validation
    verbose=1,
    n_jobs=-1
)

print("Starting GridSearchCV for XGBoost...")
grid_search.fit(X_train_processed, y_train)

# Get the best model
best_xgb_model = grid_search.best_estimator_

# Final Evaluation
y_pred_tuned_xgb = best_xgb_model.predict(X_test_processed)
accuracy_tuned_xgb = accuracy_score(y_test, y_pred_tuned_xgb)

print("\n--- Tuned XGBoost Results ---")
print(f"Best Hyperparameters found: {grid_search.best_params_}")
print(f"Tuned XGBoost Test Accuracy: **{accuracy_tuned_xgb:.4f}**")
print("\nClassification Report (0=Not Spoiled, 1=Spoiled):")
print(classification_report(y_test, y_pred_tuned_xgb, target_names=le.classes_))

Starting GridSearchCV for XGBoost...
Fitting 5 folds for each of 36 candidates, totalling 180 fits

--- Tuned XGBoost Results ---
Best Hyperparameters found: {'learning_rate': 0.05, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 200}
Tuned XGBoost Test Accuracy: **0.9050**

Classification Report (0=Not Spoiled, 1=Spoiled):
              precision    recall  f1-score   support

 Not Spoiled       0.85      1.00      0.92       525
     Spoiled       0.99      0.80      0.89       475

    accuracy                           0.91      1000
   macro avg       0.92      0.90      0.90      1000
weighted avg       0.92      0.91      0.90      1000



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [35]:
# --- 4. Saving All Components for Deployment ---

# Save the Tuned XGBoost model
joblib.dump(best_xgb_model, 'dal_spoilage_final_model.joblib')

# Save the entire preprocessor object (contains scaling and one-hot encoding logic)
joblib.dump(preprocessor, 'dal_spoilage_preprocessor.joblib')

# Save the LabelEncoder
joblib.dump(le, 'dal_spoilage_label_encoder.joblib')

# Save the feature names (needed if you need to inspect features, but less critical than the preprocessor)
joblib.dump(feature_names, 'dal_spoilage_feature_names.joblib')

print("\nModel training and saving complete. Files: 'dal_spoilage_final_model.joblib', 'dal_spoilage_preprocessor.joblib', etc. are ready.")


Model training and saving complete. Files: 'dal_spoilage_final_model.joblib', 'dal_spoilage_preprocessor.joblib', etc. are ready.


In [None]:
import joblib
import json

# Load the preprocessor you already trained and saved
# preprocessor = joblib.load("ML/dal/dal_spoilage_preprocessor.joblib")

# Extract feature names
columns = list(preprocessor.get_feature_names_out())

# Save them to JSON
with open("ML/dal/dal_model_columns.json", "w") as f:
    json.dump(columns, f, indent=4)

print("âœ… dal_model_columns.json created successfully.")


FileNotFoundError: [Errno 2] No such file or directory: 'ML/dal/dal_spoilage_preprocessor.joblib'