# In [None]:

# GHG Emissions Prediction using Random Forest

import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import logging

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import joblib

# Configure the root logger with an INFO threshold
logging.basicConfig(level=logging.INFO)

# Workbook holding the yearly emission-factor sheets. The path is relative,
# so it is resolved against the current working directory.
excel_file = 'SupplyChainEmissionFactorsforUSIndustriesCommodities.xlsx'  # Make sure to place this file correctly

# Fail fast with a readable message when the workbook is missing
workbook_found = os.path.exists(excel_file)
if not workbook_found:
    missing_message = f"The file '{excel_file}' was not found. Please check the path or upload the file."
    raise FileNotFoundError(missing_message)

# Load data across multiple years.
# One sheet per year, named '<year>_Detail_Commodity'. Each sheet is tagged
# with its year and collected in `all_data`.
years = range(2010, 2017)
all_data = []

# Open the workbook a single time and reuse the handle for every sheet;
# passing the file path to read_excel inside the loop would re-open and
# re-parse the workbook once per year. A failure to open the workbook itself
# propagates with its original error instead of being reported per sheet.
with pd.ExcelFile(excel_file) as workbook:
    for year in years:
        sheet_name = f'{year}_Detail_Commodity'
        try:
            yearly_df = pd.read_excel(workbook, sheet_name=sheet_name)
        except Exception as e:
            # Best effort: a missing or unreadable sheet is reported and
            # skipped so the remaining years can still be loaded.
            logging.warning("Error processing year %s: %s", year, e)
            continue
        yearly_df['Year'] = year
        all_data.append(yearly_df)
        logging.info("Loaded data for year %s", year)

# Concatenate all data
if not all_data:
    raise ValueError("No data was loaded. Please check the Excel sheets.")

# ignore_index=True gives the combined frame a fresh 0..n-1 index
df = pd.concat(all_data, ignore_index=True)
logging.info("All yearly data loaded and concatenated.")

# Drop unnecessary columns and handle missing values.
# Column selection happens first: filtering rows before it would discard
# otherwise complete rows because of gaps in text columns that are removed
# anyway.
df = df.select_dtypes(include=[np.number])  # Keep only numeric columns

# A column without a single value would make every row count as incomplete
# and empty the dataset in the row filter below, so remove such columns first.
df = df.dropna(axis=1, how='all')

# Keep only rows that have a value in every remaining column
df = df.dropna(axis=0, how='any')

if df.empty:
    raise ValueError("No complete numeric rows remain after removing missing values.")

# Correlation heatmap: pairwise correlations of all columns in df,
# drawn on a 12x8 figure with the 'coolwarm' colour map and no cell labels
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 8))
correlation_matrix = df.corr()
sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm', ax=heatmap_ax)
heatmap_ax.set_title("Correlation Matrix")
plt.show()

# Column to predict (example target: 'Total Emissions'; replace as appropriate)
target_column = 'Total Emissions'  # Change this to the actual target column name in your dataset

# Stop early when the chosen target is absent from the dataset
target_is_present = target_column in df.columns
if not target_is_present:
    raise ValueError(f"'{target_column}' column not found in the dataset.")

# Target vector first, then every remaining column as a feature
y = df[target_column]
X = df.drop(target_column, axis=1)

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both the training and the test features
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit a random forest regressor (fixed seed) on the scaled training features
rf_model = RandomForestRegressor(random_state=42).fit(X_train_scaled, y_train)

# Predict the target for the scaled test features
y_pred = rf_model.predict(X_test_scaled)

# Evaluation: print each regression metric for the test-set predictions,
# in the order MSE, MAE, R²
metric_reports = (
    ("Mean Squared Error (MSE):", mean_squared_error),
    ("Mean Absolute Error (MAE):", mean_absolute_error),
    ("R-squared (R²):", r2_score),
)
for metric_label, metric_fn in metric_reports:
    print(metric_label, metric_fn(y_test, y_pred))

# Write the fitted model and the scaler to disk with joblib
# (dict order is insertion order, so the model is written first)
artifacts = {
    'random_forest_model.pkl': rf_model,
    'scaler.pkl': scaler,
}
for artifact_path, artifact in artifacts.items():
    joblib.dump(artifact, artifact_path)
print("Model and scaler saved.")
