# In [None]:

# Greenhouse Gas Emissions Prediction from Supply Chain Data

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
import joblib

# -------------------------------
# Load Data Function
# -------------------------------
def load_data(file_path):
    """Read an Excel workbook into a DataFrame.

    Args:
        file_path: Location of the workbook, passed straight to
            ``pd.read_excel``.

    Returns:
        The loaded DataFrame, or ``None`` when the file does not exist.
        A status message is printed in either case.
    """
    try:
        frame = pd.read_excel(file_path)
    except FileNotFoundError:
        # A missing file is reported rather than raised so the caller can
        # simply check for None.
        print("File not found.")
        return None
    else:
        print("Data loaded successfully.")
        return frame

# -------------------------------
# Preprocess Data Function
# -------------------------------
def preprocess_data(df):
    """Drop incomplete rows and one-hot encode the remaining frame.

    Args:
        df: Raw DataFrame. It is not modified; a new frame is returned.

    Returns:
        A DataFrame with every row containing a missing value removed and
        ``pd.get_dummies(..., drop_first=True)`` applied to the result.
        The shapes before and after processing are printed.
    """
    print("Initial shape:", df.shape)

    # Remove rows with any missing value first, then expand categorical
    # columns into indicator columns (the first level of each is dropped).
    encoded = pd.get_dummies(df.dropna(), drop_first=True)

    print("Processed shape:", encoded.shape)
    return encoded

# -------------------------------
# Train and Evaluate Model
# -------------------------------
def _plot_actual_vs_predicted(y_true, y_pred):
    """Scatter predictions against true values with a y = x reference line."""
    lowest, highest = y_true.min(), y_true.max()

    plt.figure(figsize=(6, 6))
    plt.scatter(y_true, y_pred, alpha=0.6)
    plt.xlabel("Actual Emissions")
    plt.ylabel("Predicted Emissions")
    plt.title("Actual vs Predicted Emissions")
    # Points on this diagonal are perfect predictions.
    plt.plot([lowest, highest], [lowest, highest], '--r')
    plt.grid(True)
    plt.tight_layout()
    plt.show()


def _plot_feature_importances(importances, feature_names):
    """Draw a horizontal bar chart of importances, largest at the top."""
    order = np.argsort(importances)[::-1]
    positions = range(len(importances))

    plt.figure(figsize=(10, 6))
    plt.barh(positions, importances[order], align='center')
    plt.yticks(positions, np.array(feature_names)[order])
    # barh places position 0 at the bottom; flip the axis so the descending
    # sort reads top-to-bottom instead of bottom-to-top.
    plt.gca().invert_yaxis()
    plt.xlabel("Feature Importance")
    plt.title("Feature Importances from Random Forest")
    plt.tight_layout()
    plt.show()


def train_model(X, y, *, model_path="ghg_emission_model.pkl",
                test_size=0.2, random_state=42):
    """Tune, evaluate, save and visualise a random-forest regression pipeline.

    The data is split into train and test sets, a ``StandardScaler`` +
    ``RandomForestRegressor`` pipeline is tuned with a 5-fold grid search
    scored on R², and the best pipeline is evaluated on the held-out test
    set. The best parameters, R² and MSE are printed, the best pipeline is
    written to ``model_path`` with ``joblib.dump``, and two figures are
    shown: actual vs. predicted values and the forest's feature importances.

    Args:
        X: Feature DataFrame; its ``columns`` label the importance chart.
        y: Target values aligned with ``X``.
        model_path: Where the fitted pipeline is saved.
        test_size: Fraction of rows held out for evaluation.
        random_state: Seed used for both the split and the forest.

    Returns:
        None.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Build pipeline
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('regressor', RandomForestRegressor(random_state=random_state)),
    ])

    # Hyper-parameters explored by the grid search; the "regressor__" prefix
    # routes each entry to the pipeline step of that name.
    param_grid = {
        'regressor__n_estimators': [100, 200],
        'regressor__max_depth': [None, 10, 20],
    }

    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='r2', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Evaluation on the held-out split (predict uses the refit best pipeline).
    y_pred = grid_search.predict(X_test)
    print("Best Parameters:", grid_search.best_params_)
    print("R² Score:", r2_score(y_test, y_pred))
    print("MSE:", mean_squared_error(y_test, y_pred))

    # Save model
    best_pipeline = grid_search.best_estimator_
    joblib.dump(best_pipeline, model_path)

    _plot_actual_vs_predicted(y_test, y_pred)

    forest = best_pipeline.named_steps['regressor']
    _plot_feature_importances(forest.feature_importances_, X.columns)

# -------------------------------
# Main Execution
# -------------------------------
file_path = 'SupplyChainEmissionFactorsforUSIndustriesCommodities.xlsx'
df = load_data(file_path)

# load_data returns None when the workbook is missing; nothing to do then.
if df is not None:
    df = preprocess_data(df)

    # Column to predict. This name is an assumption about the dataset;
    # adjust it if the workbook labels the target differently.
    target_col = 'Supply Chain Emission Factor with Margin'

    if target_col not in df.columns:
        print(f"Target column '{target_col}' not found in dataset.")
    else:
        # Every column other than the target is used as a feature.
        X = df.drop(columns=[target_col])
        y = df[target_col]
        train_model(X, y)
