In [8]:
# 📦 Importing Required Libraries
import os

import joblib
import numpy as np  # for numerical data
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler

# 📂 Load the Dataset
df = pd.read_excel("SupplyChainEmissionFactorsforUSIndustriesCommodities.xlsx", sheet_name="2010_Summary_Commodity")

# 🧹 Data Cleaning - Drop non-numeric and unnecessary columns, then drop
# rows with missing values. Built from `df` in a single expression so that
# re-running the cell always starts from the raw frame.
df_cleaned = df.drop(columns=[
    'Commodity Code', 'Commodity Name', 'Substance', 'Unit', 'Unnamed: 7'
]).dropna()

# 🎯 Define Features and Target
# NOTE(review): the linear baseline in this notebook reports R² ≈ 1.0, which
# suggests the remaining feature columns may determine the target almost
# exactly (e.g. component columns that add up to it) — TODO confirm against
# the spreadsheet schema before trusting the metrics.
X = df_cleaned.drop(columns=['Supply Chain Emission Factors with Margins'])
y = df_cleaned['Supply Chain Emission Factors with Margins']

# ✂️ Train-Test Split
# Split BEFORE scaling so that no statistics from the test rows influence
# preprocessing. Same test_size / random_state as before, so the rows that
# land in each split are unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 🔄 Standardize Features
# Fit the scaler on the training rows only, then apply the learned
# mean/std to the test rows. This is also the scaler persisted later.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full scaled feature matrix, kept for any later cell that still refers to
# `X_scaled`; it now uses the train-fitted scaler.
X_scaled = scaler.transform(X)

# 🧠 Baseline: ordinary least-squares regression on the scaled training split
# (`fit` returns the estimator itself, so construction and fitting chain).
model = LinearRegression().fit(X_train, y_train)

# 📈 Predict on the held-out split
y_pred = model.predict(X_test)

# 🧮 Score the baseline against the true test targets
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report header followed by one "label value" line per metric
print("✅ Model Performance:")
for label, value in (
    ("Mean Squared Error (MSE):", mse),
    ("R-squared (R²):", r2),
):
    print(label, value)


✅ Model Performance:
Mean Squared Error (MSE): 5.958720375450042e-08
R-squared (R²): 0.9999989488410446


In [9]:

# Hyper-parameter candidates for the random forest (2 x 3 x 2 = 12 combinations).
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
)

# Exhaustive search with 3-fold cross-validation on the training split,
# using all available CPU cores. The forest is seeded for reproducibility.
grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Keep the estimator refit with the best-scoring combination.
best_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)


Best Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}


In [10]:
# Score the tuned random forest on the held-out test split.
y_pred_best = best_model.predict(X_test)

# RMSE is the square root of the MSE, i.e. expressed in the target's own units.
final_rmse = np.sqrt(mean_squared_error(y_test, y_pred_best))
final_r2 = r2_score(y_test, y_pred_best)

print("Final RMSE:", final_rmse)
print("Final R²:", final_r2)

Final RMSE: 0.03833279820605802
Final R²: 0.9740787205134163


In [13]:
# Create the output directory for the saved artifacts.
# Done in Python rather than via a shell command so the cell behaves the same
# on every platform and can be re-run: exist_ok=True makes it a no-op when the
# directory is already there instead of reporting an error.
os.makedirs("models", exist_ok=True)

In [14]:
# Save the tuned model and the fitted feature scaler.
# The model was trained on scaled features, so the same scaler has to be
# loaded and applied to any new input before calling the model.
# Requires `joblib` and `os` to be imported in the notebook's import cell.
os.makedirs('models', exist_ok=True)  # joblib.dump does not create missing parent directories
joblib.dump(best_model, 'models/final_model.pkl')
joblib.dump(scaler, 'models/scaler.pkl')


['models/scaler.pkl']