# Crop Spoilage Prediction - Machine Learning Pipeline

This notebook develops a machine learning pipeline to predict crop spoilage conditions in Ghana using environmental data.

## Project Overview
- **Regression Task**: Predict spoilage time in days
- **Classification Task**: Assess spoilage risk level (low, medium, high)
- **Data**: ~5,000 samples with environmental variables for common Ghanaian crops
- **Models**: Random Forest Regressor and Classifier with hyperparameter tuning

---

## 1. Import Libraries and Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import root_mean_squared_error
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
import joblib
from scipy.stats import pearsonr
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation and library warnings
# are hidden as well.
warnings.filterwarnings(action='ignore')

# Plot appearance: seaborn-like style sheet and a 12x8 default figure size
plt.style.use('seaborn-v0_8')
plt.rcParams.update({'figure.figsize': (12, 8)})

print("Libraries imported successfully!")

## 2. Load and Explore Dataset

In [None]:
# Read the spoilage dataset from a CSV file in the working directory
df = pd.read_csv("ghana_crop_spoilage_dataset.csv")

# Report the frame's dimensions and its in-memory footprint in megabytes
print("Dataset loaded successfully!")
print(f"Shape: {df.shape}")
memory_mb = df.memory_usage(deep=True).sum() / 1024**2
print(f"Memory usage: {memory_mb:.2f} MB")

In [None]:
# Dataset overview: column dtypes / non-null counts, then a preview of the rows
print("Column Information:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
df.info()
print("\nFirst 5 rows:")
# Last expression of the cell, so the notebook renders it as a rich table
df.head()

## 3. Basic Statistics and Data Quality

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) of the dataset
print("Descriptive Statistics:")
summary_stats = df.describe()
# Left as the cell's final expression so the table is rendered by the notebook
summary_stats

In [None]:
# Count the nulls in each column and list only the columns that have any
print("Missing Values:")
missing_vals = df.isnull().sum()
if missing_vals.sum() > 0:
    print(missing_vals[missing_vals > 0])
else:
    print("No missing values found!")

In [None]:
# Frequency tables for the two categorical columns
crop_counts = df['crop'].value_counts()
risk_counts = df['spoilage_risk'].value_counts()

print("Crop Types Distribution:")
print(crop_counts)
print("\nSpoilage Risk Distribution:")
print(risk_counts)

## 4. Data Preprocessing

In [None]:
# Turn the crop names into integer codes, stored in a new "crop_encoded" column
le = LabelEncoder()
df["crop_encoded"] = le.fit_transform(df["crop"])

# Persist the fitted encoder so the same name -> code mapping can be reloaded
joblib.dump(le, "crop_label_encoder_trial.pkl")

# Show which integer code each crop name received
print("Crop Encoding Mapping:")
crop_mapping = {
    crop_name: crop_code
    for crop_name, crop_code in zip(le.classes_, le.transform(le.classes_))
}
for crop, code in crop_mapping.items():
    print(f"{crop}: {code}")

In [None]:
# Model inputs: three environmental readings plus the encoded crop type
features = ["temperature", "humidity", "moisture", "crop_encoded"]
X = df[features]

# Two targets: a numeric one for regression, a categorical one for classification
y_reg, y_clf = df["spoilage_days"], df["spoilage_risk"]

print(f"Features: {features}")
print("Regression target: spoilage_days")
print("Classification target: spoilage_risk")

## 5. Correlation Analysis

In [None]:
# Pairwise correlations between the model features and the regression target
correlation_matrix = df[features + ["spoilage_days"]].corr()

# The heading names the real target column ("spoilage_days"); the previous
# label referred to "spoilage_time_days", which is not a column used here.
print("Correlation with spoilage_days:")

# Keep only the target's column of the matrix, strongest positive value first
# (the target's correlation with itself, 1.0, therefore comes first).
spoilage_time_corr = correlation_matrix["spoilage_days"].sort_values(ascending=False)
print(spoilage_time_corr)

In [None]:
# Pearson correlation of each model feature with spoilage_days, with a star
# rating derived from the p-value.
# NOTE(review): crop_encoded holds integer codes produced by LabelEncoder, so
# its r value reflects the code ordering, not a real quantity — read with care.
print("Correlation significance tests:")
print("*** p<0.001, ** p<0.01, * p<0.05")
print("-" * 40)
for feature in features:
    corr_coef, p_value = pearsonr(df[feature], df["spoilage_days"])

    # Map the p-value onto the legend printed above
    if p_value < 0.001:
        significance = "***"
    elif p_value < 0.01:
        significance = "**"
    elif p_value < 0.05:
        significance = "*"
    else:
        significance = ""

    # .3g keeps very small p-values visible (e.g. 1.2e-45) instead of
    # rounding them to a misleading "0.000".
    print(f"{feature:15}: r={corr_coef:6.3f}, p={p_value:.3g} {significance}")

## 6. Data Visualization

In [None]:
# Six-panel overview of the dataset on a 3x2 grid of axes
fig, axes = plt.subplots(3, 2, figsize=(15, 18))
fig.suptitle('Crop Spoilage Dataset Analysis', fontsize=16, fontweight='bold')

# 1. Distribution of spoilage time, with the mean marked by a dashed line
axes[0, 0].hist(df['spoilage_days'], bins=50, alpha=0.7, color='skyblue', edgecolor='black')
axes[0, 0].set_title('Distribution of Spoilage Time (Days)')
axes[0, 0].set_xlabel('Days')
axes[0, 0].set_ylabel('Frequency')
axes[0, 0].axvline(df['spoilage_days'].mean(), color='red', linestyle='--',
                   label=f'Mean: {df["spoilage_days"].mean():.1f}')
axes[0, 0].legend()

# 2. Spoilage risk distribution
spoilage_counts = df['spoilage_risk'].value_counts()
# value_counts() orders the labels by frequency, so a positional color list
# can paint e.g. the "high" bar green. Look the color up per label instead.
# NOTE(review): assumes the labels are low / medium / high (any capitalisation),
# as listed in the project overview; unknown labels fall back to gray — confirm.
risk_palette = {'low': 'green', 'medium': 'orange', 'high': 'red'}
colors = [risk_palette.get(str(level).lower(), 'gray') for level in spoilage_counts.index]
bars = axes[0, 1].bar(spoilage_counts.index, spoilage_counts.values, color=colors)
axes[0, 1].set_title('Spoilage Risk Distribution')
axes[0, 1].set_ylabel('Count')
# Write each bar's count just above its top edge
for bar, count in zip(bars, spoilage_counts.values):
    axes[0, 1].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 20,
                    str(count), ha='center', va='bottom')

# 3. Correlation heatmap of the model features and the regression target
corr_matrix = df[features + ["spoilage_days"]].corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0,
            square=True, ax=axes[1, 0], cbar_kws={'label': 'Correlation Coefficient'})
axes[1, 0].set_title('Feature Correlation Heatmap')

# 4. Spoilage time by crop type
df.boxplot(column='spoilage_days', by='crop', ax=axes[1, 1])
# pandas' grouped boxplot replaces the figure-level title with
# "Boxplot grouped by crop"; put the intended figure title back.
fig.suptitle('Crop Spoilage Dataset Analysis', fontsize=16, fontweight='bold')
axes[1, 1].set_title('Spoilage Time by Crop Type')
axes[1, 1].set_xlabel('Crop Type')
axes[1, 1].set_ylabel('Spoilage Time (Days)')
plt.setp(axes[1, 1].xaxis.get_majorticklabels(), rotation=45)

# 5. Temperature vs Spoilage Time (colored by humidity)
scatter = axes[2, 0].scatter(df['temperature'], df['spoilage_days'],
                             c=df['humidity'], cmap='viridis', alpha=0.6)
axes[2, 0].set_title('Temperature vs Spoilage Time (colored by humidity)')
axes[2, 0].set_xlabel('Temperature (°C)')
axes[2, 0].set_ylabel('Spoilage Time (Days)')
cbar = plt.colorbar(scatter, ax=axes[2, 0])
cbar.set_label('Humidity (%)')

# 6. Deviation score impact
axes[2, 1].scatter(df['deviation_score'], df['spoilage_days'], alpha=0.6, color='purple')
axes[2, 1].set_title('Deviation Score vs Spoilage Time')
axes[2, 1].set_xlabel('Deviation Score')
axes[2, 1].set_ylabel('Spoilage Time (Days)')

plt.tight_layout()
plt.show()

## 7. Train/Test Split

In [None]:
# Regression split: random 80/20 hold-out with a fixed seed
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X, y_reg, test_size=0.2, random_state=42)

# Classification split: stratified on the risk label so the train and test
# sets keep the same class proportions as the full dataset. Without this, a
# rare risk level can end up under-represented in the test set and skew the
# accuracy and per-class report.
# NOTE: stratification needs at least two samples of every class.
X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(
    X, y_clf, test_size=0.2, random_state=42, stratify=y_clf)

print(f"Training set size (regression): {X_train_reg.shape[0]}")
print(f"Test set size (regression): {X_test_reg.shape[0]}")
print(f"Training set size (classification): {X_train_clf.shape[0]}")
print(f"Test set size (classification): {X_test_clf.shape[0]}")

## 8. Regression Model Training

In [None]:
# Base regressor and the hyperparameter grid that the search will sweep
regressor = RandomForestRegressor(random_state=42)
reg_params = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

# Size of the grid = product of the number of candidates per hyperparameter
n_reg_combinations = 1
for candidate_values in reg_params.values():
    n_reg_combinations *= len(candidate_values)

print("Starting regression model training with GridSearchCV...")
print(f"Parameter combinations to test: {n_reg_combinations}")

In [None]:
# 5-fold grid search over reg_params, using all available cores
grid_reg = GridSearchCV(
    estimator=regressor,
    param_grid=reg_params,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_reg.fit(X_train_reg, y_train_reg)
best_reg = grid_reg.best_estimator_

# The scorer is the negated MSE, so flip the sign to report a positive error
best_cv_error = -grid_reg.best_score_

print("Regression model training completed!")
print(f"Best parameters: {grid_reg.best_params_}")
print(f"Best CV score: {best_cv_error:.3f}")

## 9. Regression Model Evaluation

In [None]:
# Score the tuned regressor on the held-out test set
y_pred_reg = best_reg.predict(X_test_reg)
mae = mean_absolute_error(y_test_reg, y_pred_reg)
rmse = root_mean_squared_error(y_test_reg, y_pred_reg)
r2 = r2_score(y_test_reg, y_pred_reg)

# Emit the report line by line
for report_line in (
    "=" * 50,
    "REGRESSION MODEL RESULTS",
    "=" * 50,
    f"Mean Absolute Error (MAE): {mae:.3f} days",
    f"Root Mean Square Error (RMSE): {rmse:.3f} days",
    f"R² Score: {r2:.3f}",
    f"Model explains {r2*100:.1f}% of the variance in spoilage time",
):
    print(report_line)

In [None]:
# Pair each feature name with the regressor's importance score, largest first
feature_importance_reg = (
    pd.DataFrame(
        {'feature': features, 'importance': best_reg.feature_importances_}
    )
    .sort_values(by='importance', ascending=False)
)

print("Feature Importance (Regression):")
print(feature_importance_reg)

# Horizontal bar chart of the ranked importances
plt.figure(figsize=(10, 6))
plt.barh(
    y=feature_importance_reg['feature'],
    width=feature_importance_reg['importance'],
)
plt.title('Feature Importance - Regression Model')
plt.xlabel('Importance')
plt.tight_layout()
plt.show()

## 10. Classification Model Training

In [None]:
# Base classifier and the hyperparameter grid that the search will sweep
classifier = RandomForestClassifier(random_state=42)
clf_params = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

# Size of the grid = product of the number of candidates per hyperparameter
n_clf_combinations = 1
for candidate_values in clf_params.values():
    n_clf_combinations *= len(candidate_values)

print("Starting classification model training with GridSearchCV...")
print(f"Parameter combinations to test: {n_clf_combinations}")

In [None]:
# 5-fold grid search over clf_params, scored by accuracy, using all cores
grid_clf = GridSearchCV(
    estimator=classifier,
    param_grid=clf_params,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_clf.fit(X_train_clf, y_train_clf)
best_clf = grid_clf.best_estimator_

# Mean cross-validated accuracy of the winning parameter combination
best_cv_accuracy = grid_clf.best_score_

print("Classification model training completed!")
print(f"Best parameters: {grid_clf.best_params_}")
print(f"Best CV score: {best_cv_accuracy:.3f}")

## 11. Classification Model Evaluation

In [None]:
# Score the tuned classifier on the held-out test set
y_pred_clf = best_clf.predict(X_test_clf)
accuracy = accuracy_score(y_test_clf, y_pred_clf)

# Headline accuracy first, then the per-class precision / recall / F1 table
for report_line in (
    "=" * 50,
    "CLASSIFICATION MODEL RESULTS",
    "=" * 50,
    f"Accuracy: {accuracy:.3f} ({accuracy*100:.1f}% correct predictions)",
    "\nDetailed Classification Report:",
):
    print(report_line)
print(classification_report(y_test_clf, y_pred_clf))

In [None]:
# Pair each feature name with the classifier's importance score, largest first
feature_importance_clf = (
    pd.DataFrame(
        {'feature': features, 'importance': best_clf.feature_importances_}
    )
    .sort_values(by='importance', ascending=False)
)

print("Feature Importance (Classification):")
print(feature_importance_clf)

# Horizontal bar chart of the ranked importances
plt.figure(figsize=(10, 6))
plt.barh(
    y=feature_importance_clf['feature'],
    width=feature_importance_clf['importance'],
)
plt.title('Feature Importance - Classification Model')
plt.xlabel('Importance')
plt.tight_layout()
plt.show()

## 12. Model Persistence

In [None]:
# Output file name -> fitted object to serialize with joblib
artifacts = {
    "spoilage_time_regressor_enhanced.pkl": best_reg,
    "spoilage_risk_classifier_enhanced.pkl": best_clf,
    "crop_label_encoder.pkl": le,
}
for filename, fitted_object in artifacts.items():
    joblib.dump(fitted_object, filename)

# List what was written
print("Models saved successfully!")
print("Files created:")
for filename in artifacts:
    print(f"- {filename}")

## 13. Prediction Example

In [None]:
# Example prediction for a single new observation
sample_data = {
    'temperature': 28.5,
    'humidity': 65.0,
    'moisture': 12.0,
    'crop_encoded': 0
}

# Decode the integer crop code through the fitted encoder so the printout
# says which crop the sample represents instead of a bare number.
sample_crop = le.classes_[sample_data['crop_encoded']]

# Select the columns in the training feature order explicitly, rather than
# relying on the dict's key order happening to match it.
sample_df = pd.DataFrame([sample_data])[features]
predicted_days = best_reg.predict(sample_df)[0]
predicted_risk = best_clf.predict(sample_df)[0]
predicted_risk_proba = best_clf.predict_proba(sample_df)[0]

# Convert labels and probabilities to plain Python values (rounded) so the
# dict prints as {'high': 0.12, ...} rather than as NumPy scalar reprs.
risk_probabilities = {
    label: round(prob, 3)
    for label, prob in zip(best_clf.classes_.tolist(), predicted_risk_proba.tolist())
}

print("=" * 50)
print("PREDICTION EXAMPLE")
print("=" * 50)
print("Input conditions:")
for key, value in sample_data.items():
    print(f"  {key}: {value}")
print(f"  crop (decoded): {sample_crop}")
print("\nPredictions:")
print(f"  Spoilage time: {predicted_days:.1f} days")
print(f"  Risk level: {predicted_risk}")
print(f"  Risk probabilities: {risk_probabilities}")

## 14. Summary and Insights

In [None]:
# Gather the headline numbers once, then print the report section by section
n_samples = len(df)
n_crop_types = len(df['crop'].unique())
days_col, temp_col, humidity_col = df['spoilage_days'], df['temperature'], df['humidity']
top_reg_feature = feature_importance_reg.iloc[0]['feature']
top_clf_feature = feature_importance_clf.iloc[0]['feature']

print("=" * 60)
print("SUMMARY INSIGHTS")
print("=" * 60)

# Dataset size and value ranges
print("Dataset Overview:")
print(f"  - {n_samples:,} samples across {n_crop_types} crop types")
print(f"  - Average spoilage time: {days_col.mean():.1f} ± {days_col.std():.1f} days")
print(f"  - Temperature range: {temp_col.min():.1f}°C - {temp_col.max():.1f}°C")
print(f"  - Humidity range: {humidity_col.min():.1f}% - {humidity_col.max():.1f}%")

# Top-ranked feature of each model
print("\nKey Predictors:")
print(f"  - Strongest regression predictor: {top_reg_feature}")
print(f"  - Strongest classification predictor: {top_clf_feature}")

# Test-set metrics computed in the evaluation cells
print("\nModel Performance:")
print(f"  - Regression R²: {r2:.3f} (explains {r2*100:.1f}% of variance)")
print(f"  - Classification Accuracy: {accuracy:.3f} ({accuracy*100:.1f}% correct predictions)")

print("\nBusiness Impact:")
print(f"  - Average prediction error: ±{mae:.1f} days")
print(f"  - System can classify risk levels with {accuracy*100:.1f}% accuracy")
print("  - Models ready for API integration and real-time monitoring")