### **MODEL TRAINING**

#### 1.1 *Import Libraries*

In [1]:
### Basic Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Modelling 
from sklearn.metrics import r2_score,mean_absolute_error,root_mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.svm import SVR
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor
import warnings

#### *Import CSV as Pandas DataFrame*

In [2]:
# Load the cleaned dataset from the project's data directory and preview
# the first five rows.
df = pd.read_csv(filepath_or_buffer="../data/Cleaned_Carbon_Emission.csv")
df.head(n=5)

Unnamed: 0,Body Type,Sex,Diet,How Often Shower,Heating Energy Source,Transport,Vehicle Type,Social Activity,Monthly Grocery Bill,Frequency of Traveling by Air,Vehicle Monthly Distance Km,Waste Bag Size,Waste Bag Weekly Count,How Long TV PC Daily Hour,How Many New Clothes Monthly,How Long Internet Daily Hour,Energy efficiency,Recycling,Cooking_With,CarbonEmission
0,overweight,female,pescatarian,daily,coal,public,lpg,often,230,frequently,210,large,4,7,26,1,No,['Metal'],"['Stove', 'Oven']",2238
1,obese,female,vegetarian,less frequently,natural gas,walk/bicycle,lpg,often,114,rarely,9,extra large,3,9,38,5,No,['Metal'],"['Stove', 'Microwave']",1892
2,overweight,male,omnivore,more frequently,wood,private,petrol,never,138,never,2472,small,1,14,47,6,Sometimes,['Metal'],"['Oven', 'Microwave']",2595
3,overweight,male,omnivore,twice a day,wood,walk/bicycle,lpg,sometimes,157,rarely,74,medium,3,20,5,7,Sometimes,"['Paper', 'Plastic', 'Glass', 'Metal']","['Microwave', 'Grill', 'Airfryer']",1074
4,obese,female,vegetarian,daily,coal,private,diesel,often,266,very frequently,8457,large,1,3,5,6,Yes,['Paper'],['Oven'],4743


### Prepare X and y Variable


In [3]:
# Feature frame: every column of df except the 'CarbonEmission' target.
X = df.drop(columns="CarbonEmission")
X

Unnamed: 0,Body Type,Sex,Diet,How Often Shower,Heating Energy Source,Transport,Vehicle Type,Social Activity,Monthly Grocery Bill,Frequency of Traveling by Air,Vehicle Monthly Distance Km,Waste Bag Size,Waste Bag Weekly Count,How Long TV PC Daily Hour,How Many New Clothes Monthly,How Long Internet Daily Hour,Energy efficiency,Recycling,Cooking_With
0,overweight,female,pescatarian,daily,coal,public,lpg,often,230,frequently,210,large,4,7,26,1,No,['Metal'],"['Stove', 'Oven']"
1,obese,female,vegetarian,less frequently,natural gas,walk/bicycle,lpg,often,114,rarely,9,extra large,3,9,38,5,No,['Metal'],"['Stove', 'Microwave']"
2,overweight,male,omnivore,more frequently,wood,private,petrol,never,138,never,2472,small,1,14,47,6,Sometimes,['Metal'],"['Oven', 'Microwave']"
3,overweight,male,omnivore,twice a day,wood,walk/bicycle,lpg,sometimes,157,rarely,74,medium,3,20,5,7,Sometimes,"['Paper', 'Plastic', 'Glass', 'Metal']","['Microwave', 'Grill', 'Airfryer']"
4,obese,female,vegetarian,daily,coal,private,diesel,often,266,very frequently,8457,large,1,3,5,6,Yes,['Paper'],['Oven']
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,obese,male,omnivore,twice a day,coal,private,hybrid,sometimes,230,never,268,medium,5,12,27,9,Yes,[],['Microwave']
9996,normal,female,vegan,twice a day,coal,private,lpg,never,234,frequently,5316,extra large,3,14,8,24,Sometimes,"['Paper', 'Plastic']","['Stove', 'Microwave']"
9997,overweight,female,vegetarian,daily,electricity,walk/bicycle,lpg,sometimes,298,very frequently,96,extra large,5,11,5,24,Yes,"['Paper', 'Plastic', 'Metal']","['Microwave', 'Grill', 'Airfryer']"
9998,underweight,male,vegan,more frequently,coal,private,petrol,often,179,rarely,8688,medium,5,19,14,5,Sometimes,"['Paper', 'Metal']","['Stove', 'Microwave', 'Grill', 'Airfryer']"


In [4]:
# Regression target: the 'CarbonEmission' column as a Series.
y = df.loc[:, "CarbonEmission"]
y

0       2238
1       1892
2       2595
3       1074
4       4743
        ... 
9995    2408
9996    3084
9997    2377
9998    4574
9999     826
Name: CarbonEmission, Length: 10000, dtype: int64

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Partition the feature columns by dtype: object-typed columns are handled as
# categorical, all remaining columns as numeric.
cat_features = X.select_dtypes(include="object").columns
num_features = X.select_dtypes(exclude="object").columns

# Two transformers, one per column group: one-hot encoding for the
# categorical columns and standardisation for the numeric columns.
oh_transformer = OneHotEncoder()
numeric_transformer = StandardScaler()

# Route each column group to its transformer; the outputs are concatenated
# in the order listed here (encoded categoricals first, scaled numerics second).
preprocessor = ColumnTransformer(
    transformers=[
        ("OneHotEncoder", oh_transformer, cat_features),
        ("StandardScaler", numeric_transformer, num_features),
    ]
)

In [6]:
X=preprocessor.fit_transform(X)

Splitting The Dataset Into Training Data And Testing Data

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

Create an Evaluate Function to give all metrics after model Training


In [8]:
def evaluate_model(true,predicted):
    """Score a set of predictions against the ground-truth values.

    Parameters
    ----------
    true : array-like
        Observed target values.
    predicted : array-like
        Model predictions aligned with ``true``.

    Returns
    -------
    tuple
        ``(MAE, RMSE, R2)`` in that order, each computed with the
        corresponding scikit-learn metric function.
    """
    return (
        mean_absolute_error(y_true=true, y_pred=predicted),
        root_mean_squared_error(y_true=true, y_pred=predicted),
        r2_score(y_true=true, y_pred=predicted),
    )

In [9]:
# Seed for the estimators that draw random numbers while fitting, so the
# comparison below gives the same scores on every fresh run.
RANDOM_STATE = 42

# Candidate regressors, all with default hyper-parameters. Only the
# tree/ensemble models from scikit-learn take the seed; the XGBoost model is
# left exactly as it is constructed in the other cells of this notebook.
models = {
    "Linear Regression": LinearRegression(),
    "Lasso Regression": Lasso(),
    "Ridge Regression": Ridge(),
    "Decision Tree": DecisionTreeRegressor(random_state=RANDOM_STATE),
    "Random Forest": RandomForestRegressor(random_state=RANDOM_STATE),
    "K Nearest Neighbors": KNeighborsRegressor(),
    "Xgboost": XGBRegressor(),
    "AdaBoost": AdaBoostRegressor(random_state=RANDOM_STATE),
}

# Parallel lists consumed by the summary table: model name and its test R2.
model_list = []
r2_list = []

# Iterate over name/estimator pairs directly instead of rebuilding
# list(models.keys()) / list(models.values()) on every pass.
for model_name, model in models.items():
    model.fit(X_train, y_train)  # Train model

    # Predict on both splits so train and test scores can be compared
    # (a large gap between them indicates overfitting).
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # evaluate_model returns (MAE, RMSE, R2) in that order.
    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    print(model_name)
    model_list.append(model_name)

    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')

    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)

    print('='*35)
    print('\n')

Linear Regression
Model performance for Training set
- Root Mean Squared Error: 261.5525
- Mean Absolute Error: 175.2865
- R2 Score: 0.9339
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 261.6141
- Mean Absolute Error: 175.6267
- R2 Score: 0.9342


Lasso Regression
Model performance for Training set
- Root Mean Squared Error: 262.7144
- Mean Absolute Error: 175.9518
- R2 Score: 0.9333
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 261.5303
- Mean Absolute Error: 175.3532
- R2 Score: 0.9342


Ridge Regression
Model performance for Training set
- Root Mean Squared Error: 261.5539
- Mean Absolute Error: 175.2605
- R2 Score: 0.9339
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 261.5917
- Mean Absolute Error: 175.5946
- R2 Score: 0.9342


Decision Tree
Model performance for Training set
- Root Mean Squared Error: 0.0000
- Mean Absolute Error: 0.0000
- R2

In [10]:
# Summary table of test-set R2 per model, best model first.
(
    pd.DataFrame(
        [(name, score) for name, score in zip(model_list, r2_list)],
        columns=["Model Name", "R2_Score"],
    )
    .sort_values(by="R2_Score", ascending=False)
)

Unnamed: 0,Model Name,R2_Score
6,Xgboost,0.974657
1,Lasso Regression,0.934214
2,Ridge Regression,0.934183
0,Linear Regression,0.934172
4,Random Forest,0.923352
3,Decision Tree,0.80151
7,AdaBoost,0.797441
5,K Nearest Neighbors,0.720301


### Interpretation & Insights
#### Linear, Lasso, Ridge
- Similar performance indicates your data has linear relationships.
- Great baseline models.
- Lasso slightly improved test performance → suggests feature selection matters.

#### Decision Tree
- R² = 1.0 on training, but low on test → classic overfitting.
- Tree memorized data instead of learning patterns.

#### Random Forest
- Less overfitting than Decision Tree.
- High training score but good generalization.
- Good balance, could be improved with tuning.

#### KNN
Performs poorly → dataset probably:
- has irrelevant distance patterns
- may not be normalized properly
- not suited for simple instance-based learning

#### XGBoost
- Best model: R² = 0.97, low RMSE and MAE.
- Handles feature interactions, non-linear patterns, outliers well.
- Excellent generalization, no heavy overfitting.

#### Saving the best model

In [11]:
# Refit a fresh default XGBoost regressor on the training split and report
# its test-set (MAE, RMSE, R2) as the cell output.
model = XGBRegressor()
model.fit(X_train, y_train)
y_test_pred = model.predict(X_test)
evaluate_model(true=y_test, predicted=y_test_pred)

(120.47949981689453, 162.32574462890625, 0.9746567010879517)

In [14]:
from sklearn.pipeline import Pipeline

# Chain the column preprocessor and a default XGBoost regressor into a single
# estimator. The pipeline is only constructed here, not fitted.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", XGBRegressor()),
    ]
)

In [17]:
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBRegressor
import pandas as pd
import joblib

# Self-contained training run: reload the raw data, build an unfitted
# preprocessing + model pipeline, fit it on the training split only, and
# persist the fitted pipeline as a single artifact.
df = pd.read_csv("../data/Cleaned_Carbon_Emission.csv")

X = df.drop(columns=["CarbonEmission"])
y = df["CarbonEmission"]

# Identify columns: object dtype -> categorical, everything else -> numeric
num_cols = X.select_dtypes(exclude="object").columns.tolist()
cat_cols = X.select_dtypes(include="object").columns.tolist()

# BUILD preprocessor (do NOT apply it directly; the pipeline fits it on the
# training rows so nothing is learned from the test rows)
preprocessor = ColumnTransformer(
    [
        # Categories not seen during fit are ignored rather than raising at predict time.
        ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=True), cat_cols),
        # with_mean=False scales without centering, which keeps sparse input sparse.
        ("scaler", StandardScaler(with_mean=False), num_cols),
    ],
    remainder="drop"
)

# CREATE pipeline
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", XGBRegressor())
])

# SPLIT (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 👉 FIT ENTIRE PIPELINE 🚀
pipeline.fit(X_train, y_train)

# 👉 SAVE
# joblib.dump does not create missing parent directories, so make sure the
# artifacts directory exists before writing (no-op when it already does).
MODEL_PATH = "../artifacts/model.pkl"
Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(pipeline, MODEL_PATH)

print("🎉 Model saved successfully!")


🎉 Model saved successfully!
