#### Random Forest

In [1]:
#import necessary libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import json

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy.stats import randint
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor



import warnings
warnings.filterwarnings("ignore")

print("Libraries imported successfully")


Libraries imported successfully


#### Data Import and Inspection

The dataset was loaded and explored to understand its structure, data types, and completeness. Missing values and feature distributions were checked to guide preprocessing.

In [2]:
# Load dataset

df_rf = pd.read_csv('commodity_data_4_modeling.csv', index_col=0)

print("DataFrame shape:", df_rf.shape)
print("\nFirst 5 rows:")
print(df_rf.head())
print("\nColumn names:")
print(df_rf.columns.tolist())
print("\nData types:")
print(df_rf.dtypes)
print("\nMissing values:")
print(df_rf.isnull().sum())


DataFrame shape: (2473, 8)

First 5 rows:
                   commodity_name        date units_of_measure  \
0                           Sugar  2012-11-01             1 Kg   
1           Wheat Flour - General  2012-11-01             2 Kg   
2                 Cooking Oil/Fat  2012-11-01             1 KG   
3  Maize Flour - Sifted/Fortified  2012-11-01             2 Kg   
4                        Tomatoes  2012-11-01             1 Kg   

   average_price_previous_year  average_price_previous_month  \
0                       173.40                        119.00   
1                       149.91                        133.62   
2                       241.15                        229.05   
3                       117.33                        113.62   
4                        69.06                         58.52   

   current_average_price  %_monthly_change  %_yearly_change  
0                 119.73               0.6            -30.9  
1                 134.91               1.0          

#### Data Preprocessing and Feature Engineering

The date column was converted to datetime, and new time-based features (year, month, quarter) were extracted.
Categorical columns (commodity name, unit of measure) were label-encoded.
Seasonal flags (rainy and dry) were added, and missing values were filled with zeros.
Irrelevant columns were dropped to retain only meaningful predictive features.

In [3]:
# Convert date column to datetime format
df_rf['date'] = pd.to_datetime(df_rf['date'], errors='coerce')

# Extract useful time-based features
df_rf['year'] = df_rf['date'].dt.year
df_rf['month'] = df_rf['date'].dt.month
df_rf['quarter'] = df_rf['date'].dt.quarter

# Encode categorical variables using LabelEncoder
commodity_encoder = LabelEncoder()
df_rf['commodity_code'] = commodity_encoder.fit_transform(df_rf['commodity_name'])

unit_encoder = LabelEncoder()
df_rf['unit_code'] = unit_encoder.fit_transform(df_rf['units_of_measure'])

# Add seasonal indicators
rainy_months = [3, 4, 5, 10, 11]
df_rf['rainy_season_flag'] = df_rf['month'].isin(rainy_months).astype(int)
df_rf['dry_season_flag'] = (~df_rf['month'].isin(rainy_months)).astype(int)

# Handle missing data
df_rf.fillna(0, inplace=True)

# Select feature columns (exclude target and raw categorical variables)
excluded_cols = ['current_average_price', 'date', 'commodity_name', 'units_of_measure']
features = [col for col in df_rf.columns if col not in excluded_cols]

# Split into input features and target variable
X = df_rf[features]
y = df_rf['current_average_price']

# Partition the dataset for training and evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"Training set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")


Training set size: (1978, 11)
Test set size: (495, 11)


In [4]:
#### Model Training and Evaluation

In [5]:
rf_model = RandomForestRegressor(
    n_estimators=200,
    max_depth=10,
    random_state=42,
    n_jobs=-1
)

print("\nTraining Random Forest model...")
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)

# Evaluate
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("RANDOM FOREST PERFORMANCE\n")
print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R²: {r2:.4f}")



Training Random Forest model...
RANDOM FOREST PERFORMANCE

MAE: 17.44
RMSE: 53.23
R²: 0.8987


The Random Forest model demonstrates strong predictive performance based on the evaluation metrics. An R² score of 0.898 indicates that the model explains about 89.8% of the variance in commodity prices, suggesting a very good fit. The Mean Absolute Error (MAE) of 17.44 shows that, on average, the model’s predictions deviate from the actual prices by around 17 units, which is reasonably low given the price range. The Root Mean Squared Error (RMSE) of 53.23 is slightly higher, implying that while most predictions are close to the true values, a few larger errors exist. Overall, the model performs well and captures the underlying patterns in the data effectively.

#### Data Leakage Prevention

In [6]:
# Define which features leak future info 
leaky_features = [
    'average_price_previous_year',
    'average_price_previous_month',
    '%_monthly_change',
    '%_yearly_change'
]

# Make sure feature_columns is defined
excluded_cols = ['current_average_price', 'date', 'commodity_name', 'units_of_measure']
feature_columns = [col for col in df_rf.columns if col not in excluded_cols]

# Drop leaky ones
feature_columns_safe = [col for col in feature_columns if col not in leaky_features]

# Prepare data
X_safe = df_rf[feature_columns_safe]
y = df_rf['current_average_price']

# Split
X_train_safe, X_test_safe, y_train_safe, y_test_safe = train_test_split(
    X_safe, y, test_size=0.2, random_state=42
)

# Train model
rf_safe = RandomForestRegressor(
    random_state=42,
    n_estimators=200,
    max_depth=10,
    n_jobs=-1
)
rf_safe.fit(X_train_safe, y_train_safe)

# Predict
y_pred_safe = rf_safe.predict(X_test_safe)

# Evaluate
r2_safe = r2_score(y_test_safe, y_pred_safe)
mae_safe = mean_absolute_error(y_test_safe, y_pred_safe)
rmse_safe = np.sqrt(mean_squared_error(y_test_safe, y_pred_safe))

print("\n" + "="*50)
print("HONEST RANDOM FOREST PERFORMANCE (NO DATA LEAKAGE)")
print("="*50)
print(f"R²: {r2_safe:.4f}")
print(f"MAE: {mae_safe:.2f}")
print(f"RMSE: {rmse_safe:.2f}")

print("Feature columns safe:", feature_columns_safe)
print("Number of columns:", len(feature_columns_safe))




HONEST RANDOM FOREST PERFORMANCE (NO DATA LEAKAGE)
R²: 0.8831
MAE: 28.80
RMSE: 57.20
Feature columns safe: ['year', 'month', 'quarter', 'commodity_code', 'unit_code', 'rainy_season_flag', 'dry_season_flag']
Number of columns: 7


#### Feature Importance Analysis
Feature importance scores were computed and visualized, showing which variables contributed most to price prediction — mainly time features and commodity categories.Feature importance scores were computed and visualized, showing which variables contributed most to price prediction — mainly time features and commodity categories.

importance_df = pd.DataFrame({
    'feature': feature_columns_safe,
    'importance': rf_safe.feature_importances_
}).sort_values('importance', ascending=False)

print("\nFeature Importance:")
print(importance_df)

plt.figure(figsize=(10,6))
sns.barplot(x='importance', y='feature', data=importance_df)
plt.title('Random Forest Feature Importance')
plt.tight_layout()
plt.show()


#### Model Optimization
Grid search with cross-validation fine-tuned hyperparameters for better performance and generalization.
The best model was selected and evaluated on the test set.

In [None]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [6, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(
    RandomForestRegressor(random_state=42, n_jobs=-1),
    param_grid,
    scoring='r2',
    cv=3,
    verbose=1,
    n_jobs=-1
)

grid_search.fit(X_train_safe, y_train_safe)

print(f"\nBest R²: {grid_search.best_score_:.4f}")
print(f"Best parameters: {grid_search.best_params_}")


Fitting 3 folds for each of 81 candidates, totalling 243 fits


In [None]:
best_rf = grid_search.best_estimator_
y_pred_final = best_rf.predict(X_test_safe)

r2_final = r2_score(y_test_safe, y_pred_final)
mae_final = mean_absolute_error(y_test_safe, y_pred_final)
rmse_final = np.sqrt(mean_squared_error(y_test_safe, y_pred_final))

print("\n" + "="*50)
print("FINAL RANDOM FOREST PERFORMANCE")
print("="*50)
print(f"R²: {r2_final:.4f}")
print(f"MAE: {mae_final:.2f}")
print(f"RMSE: {rmse_final:.2f}")


In [None]:
import joblib
joblib.dump(best_rf, 'random_forest_commodity_price_predictor.pkl')
print("\nModel saved as 'random_forest_commodity_price_predictor.pkl'")


In [None]:
plt.figure(figsize=(8,6))
plt.scatter(y_test_safe, y_pred_final, alpha=0.6)
plt.plot([y_test_safe.min(), y_test_safe.max()], [y_test_safe.min(), y_test_safe.max()], 'r--')
plt.xlabel('Actual Prices')
plt.ylabel('Predicted Prices')
plt.title('Random Forest: Actual vs Predicted')
plt.show()
