# Modelling

## Regression 

#### Importing libraries and loading datasets

In [1]:
import pandas as pd

In [2]:
X_train_reg = pd.read_csv('X_train_reg')
X_test_reg = pd.read_csv('X_test_reg')
X_valid_reg = pd.read_csv('X_valid_reg')
X_train_reg_scaled = pd.read_csv('X_train_reg_scaled')
X_test_reg_scaled = pd.read_csv('X_test_reg_scaled')
X_valid_reg_scaled = pd.read_csv('X_valid_reg_scaled')

In [3]:
y_test_reg = pd.read_csv('y_test_reg')
y_train_reg = pd.read_csv('y_train_reg')
y_valid_reg = pd.read_csv('y_valid_reg')

In [4]:
# Libraries for Regression models 

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor

# Metric for Regression Models 

from sklearn.metrics import r2_score,mean_absolute_error,root_mean_squared_error

#### Linear Regression 

In [5]:
model_lr_reg = LinearRegression()

model_lr_reg.fit(X_train_reg_scaled,y_train_reg)

y_pred_valid_lr_reg = model_lr_reg.predict(X_valid_reg_scaled)

In [6]:
rmse_lr = root_mean_squared_error(y_valid_reg,y_pred_valid_lr_reg)
mae_lr = mean_absolute_error(y_valid_reg, y_pred_valid_lr_reg)
r2_lr = r2_score(y_valid_reg, y_pred_valid_lr_reg)

print(f"RMSE: {rmse_lr}")
print(f"MAE: {mae_lr}")
print(f"R2 Score: {r2_lr}")

RMSE: 4147.67540920938
MAE: 2969.537739431776
R2 Score: 0.715019650734272


#### Random Forest

In [7]:
model_rf_reg = RandomForestRegressor(n_estimators=50,n_jobs=-1,max_depth=15)

In [8]:
model_rf_reg.fit(X_train_reg,y_train_reg.values.ravel())

In [9]:
y_pred_valid_rf_reg = model_rf_reg.predict(X_valid_reg)

In [10]:
rmse_rf = root_mean_squared_error(y_valid_reg,y_pred_valid_rf_reg)
mae_rf = mean_absolute_error(y_valid_reg, y_pred_valid_rf_reg)
r2_rf = r2_score(y_valid_reg, y_pred_valid_rf_reg)

print(f"RMSE: {rmse_rf}")
print(f"MAE: {mae_rf}")
print(f"R2 Score: {r2_rf}")

RMSE: 1039.5903289653916
MAE: 426.81905267237454
R2 Score: 0.9820968334355876


#### XGBoost

In [11]:
# Now Trying XGBoost Regressor 
from xgboost import XGBRegressor

reg_model_xgb = XGBRegressor()

reg_model_xgb.fit(X_train_reg,y_train_reg)

y_pred_xgb_valid = reg_model_xgb.predict(X_valid_reg)

In [12]:
rmse_xgb = root_mean_squared_error(y_valid_reg,y_pred_xgb_valid)
mae_xgb = mean_absolute_error(y_valid_reg, y_pred_xgb_valid)
r2_xgb = r2_score(y_valid_reg, y_pred_xgb_valid)

print("Results for XGBoost : \n")
print(f"RMSE: {rmse_xgb}")
print(f"MAE: {mae_xgb}")
print(f"R2 Score: {r2_xgb}")

Results for XGBoost : 

RMSE: 792.5864868164062
MAE: 393.68853759765625
R2 Score: 0.9895936250686646


In [13]:
# Its a perfect score and seems that the data trained well, and its variance is introduced well on train as well as valid data.
# Let's try it on Test data

y_pred_test_xgb = reg_model_xgb.predict(X_test_reg)

In [14]:
rmse_xgb_test = root_mean_squared_error(y_test_reg,y_pred_test_xgb)
mae_xgb_test = mean_absolute_error(y_test_reg,y_pred_test_xgb)
r2_xgb_test = r2_score(y_test_reg,y_pred_test_xgb)

In [15]:
print("XGBoost result on Test data : \n")
print(f"RMSE: {rmse_xgb_test}")
print(f"MAE: {mae_xgb_test}")
print(f"R2 Score: {r2_xgb_test}")

XGBoost result on Test data : 

RMSE: 783.3439331054688
MAE: 392.4131774902344
R2 Score: 0.9898343682289124


#### Gradient Boosting 

In [16]:
reg_model_gb = GradientBoostingRegressor()

reg_model_gb.fit(X_train_reg,y_train_reg.values.ravel())

In [17]:
y_pred_gb_valid = reg_model_gb.predict(X_valid_reg)

In [18]:
rmse_gb = root_mean_squared_error(y_valid_reg,y_pred_gb_valid)
mae_gb = mean_absolute_error(y_valid_reg, y_pred_gb_valid)
r2_gb = r2_score(y_valid_reg, y_pred_gb_valid)


print("GradientBoost result on : \n")
print(f"RMSE: {rmse_gb}")
print(f"MAE: {mae_gb}")
print(f"R2 Score: {r2_gb}")

GradientBoost result on : 

RMSE: 1459.3781932465872
MAE: 800.7644404735481
R2 Score: 0.9647189829741739


In [19]:
# Its a perfect score and seems that the data trained well, and its variance is introduced well on train as well as valid data.
# Let's try it on Test data

y_pred_test_gb = reg_model_gb.predict(X_test_reg)


rmse_gb_test = root_mean_squared_error(y_test_reg,y_pred_test_gb)
mae_gb_test = mean_absolute_error(y_test_reg,y_pred_test_gb)
r2_gb_test = r2_score(y_test_reg,y_pred_test_gb)


print("radientGBoosting result on Test data : \n")
print(f"RMSE: {rmse_gb_test}")
print(f"MAE: {mae_gb_test}")
print(f"R2 Score: {r2_gb_test}")

radientGBoosting result on Test data : 

RMSE: 1439.3584458982705
MAE: 797.2546916964192
R2 Score: 0.9656784076292373


In [20]:
# Saving all the results of regression in dataframe 

results_reg = pd.DataFrame(columns=['Model','RMSE','MAE'])


new_row = pd.DataFrame([{
    'Model':'Linear Regression',
    'RMSE':rmse_lr,
    'MAE':mae_lr,
    'R2_Score':r2_lr
}])

results_reg = pd.concat([results_reg,new_row],ignore_index=True)

  results_reg = pd.concat([results_reg,new_row],ignore_index=True)


In [21]:
new_row_1 = pd.DataFrame([{
    'Model':'Random Forest',
    'RMSE':rmse_rf,
    'MAE':mae_rf,
    'R2_Score':r2_rf
}])

new_row_2 = pd.DataFrame([{
    'Model':'XGBoost',
    'RMSE':rmse_xgb,
    'MAE':mae_xgb,
    'R2_Score':r2_xgb
}])

new_row_3 = pd.DataFrame([{
    'Model':'XGBoost test_split',
    'RMSE':rmse_xgb_test,
    'MAE':mae_xgb_test,
    'R2_Score':r2_xgb_test
}])

new_row_4 = pd.DataFrame([{
    'Model':'Gradient Boosting',
    'RMSE':rmse_gb,
    'MAE':mae_gb,
    'R2_Score':r2_gb
}])


results_reg = pd.concat([results_reg,new_row_1,new_row_2,new_row_3,new_row_4],ignore_index=True)

In [22]:
new_row_5 = pd.DataFrame([{
    'Model':'Gradient Boosting on test',
    'RMSE':rmse_gb_test,
    'MAE':mae_gb_test,
    'R2_Score':r2_gb_test
}])

results_reg = pd.concat([results_reg,new_row_5],ignore_index=True)

In [23]:
results_reg

Unnamed: 0,Model,RMSE,MAE,R2_Score
0,Linear Regression,4147.675409,2969.537739,0.71502
1,Random Forest,1039.590329,426.819053,0.982097
2,XGBoost,792.586487,393.688538,0.989594
3,XGBoost test_split,783.343933,392.413177,0.989834
4,Gradient Boosting,1459.378193,800.76444,0.964719
5,Gradient Boosting on test,1439.358446,797.254692,0.965678


## Classification

#### Importing Libraries and datasets 

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [25]:
X_train_class = pd.read_csv('X_train_class')
X_test_class = pd.read_csv('X_test_class')
X_valid_class = pd.read_csv('X_valid_class')
X_train_class_scaled = pd.read_csv('X_train_class_scaled')
X_test_class_scaled = pd.read_csv('X_test_class_scaled')
X_valid_class_scaled = pd.read_csv('X_valid_class_scaled')

In [26]:
y_test_class = pd.read_csv('y_test_class')
y_train_class = pd.read_csv('y_train_class')
y_valid_class = pd.read_csv('y_valid_class')

#### Logistics Regression

In [27]:
class_model_lr = LogisticRegression(class_weight='balanced')

class_model_lr.fit(X_train_class_scaled,y_train_class)

  y = column_or_1d(y, warn=True)


In [28]:
y_pred_lr_valid_class = class_model_lr.predict(X_valid_class_scaled)

In [29]:
from sklearn.metrics import classification_report

print('Logistics Regression results : \n')
print(classification_report(y_valid_class,y_pred_lr_valid_class))

Logistics Regression results : 

              precision    recall  f1-score   support

    Eligible       0.82      0.73      0.78     11168
   High_Risk       0.14      0.66      0.23      2557
Not_Eligible       0.98      0.80      0.89     46731

    accuracy                           0.79     60456
   macro avg       0.65      0.73      0.63     60456
weighted avg       0.92      0.79      0.84     60456



#### Random Forest

In [30]:
class_model_rf = RandomForestClassifier(n_estimators=50,n_jobs=-1,max_depth=15,class_weight='balanced')

class_model_rf.fit(X_train_class,y_train_class)

  return fit_method(estimator, *args, **kwargs)


In [31]:
y_pred_rf_valid_class = class_model_rf.predict(X_valid_class)

In [32]:
print('Random Forest results : \n')
print(classification_report(y_valid_class,y_pred_rf_valid_class))

Random Forest results : 

              precision    recall  f1-score   support

    Eligible       0.86      0.85      0.86     11168
   High_Risk       0.22      0.52      0.31      2557
Not_Eligible       0.98      0.91      0.94     46731

    accuracy                           0.88     60456
   macro avg       0.69      0.76      0.70     60456
weighted avg       0.93      0.88      0.90     60456



#### XGBoost

In [33]:
from sklearn.preprocessing import LabelEncoder

# Initialize the encoder
le = LabelEncoder()

# Fit on all data (train + valid + test) to ensure consistent mapping
le.fit(y_train_class)

# Transform the labels
y_train_class_enc = le.transform(y_train_class)
y_valid_class_enc = le.transform(y_valid_class)
y_test_class_enc  = le.transform(y_test_class)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


In [34]:
class_model_xgb = XGBClassifier()

class_model_xgb.fit(X_train_class,y_train_class_enc)

In [35]:
y_pred_xgb_valid_class = class_model_xgb.predict(X_valid_class)

In [36]:
print('XGBoost results : \n')
print(classification_report(y_valid_class_enc,y_pred_xgb_valid_class))

XGBoost results : 

              precision    recall  f1-score   support

           0       0.93      0.98      0.95     11168
           1       0.79      0.31      0.45      2557
           2       0.98      1.00      0.99     46731

    accuracy                           0.96     60456
   macro avg       0.90      0.76      0.80     60456
weighted avg       0.96      0.96      0.96     60456



In [37]:
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score

In [38]:
acc_lr = accuracy_score(y_valid_class,y_pred_lr_valid_class)
pre_lr = precision_score(y_valid_class,y_pred_lr_valid_class,average='macro')
rec_lr = recall_score(y_valid_class,y_pred_lr_valid_class,average='macro')
f1_lr = f1_score(y_valid_class,y_pred_lr_valid_class,average='macro')

In [39]:
results_class = pd.DataFrame(columns=['Accuracy','Precision','Recall','F1'])

new_row = pd.DataFrame([{
    'Model':'Logistic Regression',
    'Accuracy':acc_lr,
    'Precision':pre_lr,
    'Recall':rec_lr,
    'F1':f1_lr
}])

results_class = pd.concat([results_class,new_row],ignore_index=True)

  results_class = pd.concat([results_class,new_row],ignore_index=True)


In [40]:
results_class

Unnamed: 0,Accuracy,Precision,Recall,F1,Model
0,0.785596,0.648205,0.733686,0.629772,Logistic Regression


In [41]:
acc_rf = accuracy_score(y_valid_class,y_pred_rf_valid_class)
pre_rf = precision_score(y_valid_class,y_pred_rf_valid_class,average='macro')
rec_rf = recall_score(y_valid_class,y_pred_rf_valid_class,average='macro')
f1_rf = f1_score(y_valid_class,y_pred_rf_valid_class,average='macro')

In [42]:
acc_xgb = accuracy_score(y_valid_class_enc,y_pred_xgb_valid_class)
pre_xgb = precision_score(y_valid_class_enc,y_pred_xgb_valid_class,average='macro')
rec_xgb = recall_score(y_valid_class_enc,y_pred_xgb_valid_class,average='macro')
f1_xgb = f1_score(y_valid_class_enc,y_pred_xgb_valid_class,average='macro')

In [43]:
new_row_1 = pd.DataFrame([{
    'Model':'Random Forest',
    'Accuracy':acc_rf,
    'Precision':pre_rf,
    'Recall':rec_rf,
    'F1':f1_rf
}])

new_row_2 = pd.DataFrame([{
    'Model':'XGBoost',
    'Accuracy':acc_xgb,
    'Precision':pre_xgb,
    'Recall':rec_xgb,
    'F1':f1_xgb
}])

results_class = pd.concat([results_class,new_row_1,new_row_2],ignore_index=True)

In [44]:
results_class

Unnamed: 0,Accuracy,Precision,Recall,F1,Model
0,0.785596,0.648205,0.733686,0.629772,Logistic Regression
1,0.883899,0.686989,0.760025,0.703918,Random Forest
2,0.964387,0.89949,0.762274,0.796113,XGBoost


In [45]:
# trying on test data for for xgboost and random forest 

y_pred_rf_test_class = class_model_rf.predict(X_test_class)

In [46]:
acc_rf_test = accuracy_score(y_test_class,y_pred_rf_test_class)
pre_rf_test = precision_score(y_test_class,y_pred_rf_test_class,average='macro')
rec_rf_test = recall_score(y_test_class,y_pred_rf_test_class,average='macro')
f1_rf_test = f1_score(y_test_class,y_pred_rf_test_class,average='macro')

In [47]:
print(acc_rf_test)
print(pre_rf_test)
print(rec_rf_test)
print(f1_rf_test)

0.8844484928784365
0.6856031393095866
0.7617506067091117
0.7031579025663698


In [48]:
# For XGBoost

y_pred_xgb_test_class = class_model_xgb.predict(X_test_class)

acc_xgb_test = accuracy_score(y_test_class_enc,y_pred_xgb_test_class)
pre_xgb_test = precision_score(y_test_class_enc,y_pred_xgb_test_class,average='macro')
rec_xgb_test = recall_score(y_test_class_enc,y_pred_xgb_test_class,average='macro')
f1_xgb_test = f1_score(y_test_class_enc,y_pred_xgb_test_class,average='macro')

In [49]:
new_row_3 = pd.DataFrame([{
    'Model':'Random Forest (test split)',
    'Accuracy':acc_rf_test,
    'Precision':pre_rf_test,
    'Recall':rec_rf_test,
    'F1':f1_rf_test
}])

new_row_4 = pd.DataFrame([{
    'Model':'XGBoost (test split)',
    'Accuracy':acc_xgb_test,
    'Precision':pre_xgb_test,
    'Recall':rec_xgb_test,
    'F1':f1_xgb_test
}])


results_class = pd.concat([results_class,new_row_3,new_row_4],ignore_index=True)

In [50]:
results_class

Unnamed: 0,Accuracy,Precision,Recall,F1,Model
0,0.785596,0.648205,0.733686,0.629772,Logistic Regression
1,0.883899,0.686989,0.760025,0.703918,Random Forest
2,0.964387,0.89949,0.762274,0.796113,XGBoost
3,0.884448,0.685603,0.761751,0.703158,Random Forest (test split)
4,0.963746,0.88413,0.751122,0.781796,XGBoost (test split)


## MLFlow 

### Regression 

In [51]:
import mlflow
import mlflow.sklearn

In [56]:
mlflow.set_experiment('EMI_Predictions')
mlflow.set_tracking_uri('http://127.0.0.1:5000/')

2025/11/11 17:27:30 INFO mlflow.tracking.fluent: Experiment with name 'EMI_Predictions' does not exist. Creating a new experiment.


In [57]:
with mlflow.start_run(run_name="Linear_Regression"):
    mlflow.log_param("model_type","LinearRegression")
    mlflow.log_metric("RMSE",rmse_lr)
    mlflow.log_metric("MAE",mae_lr)
    mlflow.log_metric('R2_score',r2_lr)

    mlflow.sklearn.log_model(model_lr_reg,artifact_path="model")

print("Linear Regression model logged successfully")



🏃 View run Linear_Regression at: http://127.0.0.1:5000/#/experiments/957128874882554580/runs/6a7a978ee1ca4ee5b481aca14f64208a
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/957128874882554580
Linear Regression model logged successfully


In [58]:
with mlflow.start_run(run_name="Random_Forest_Regressor"):
    mlflow.log_param("model_type","RandomForestRegressor")
    mlflow.log_metric("RMSE",rmse_rf)
    mlflow.log_metric("MAE",mae_rf)
    mlflow.log_metric("R2_score",r2_rf)

    mlflow.sklearn.log_model(model_rf_reg,artifact_path='model')

print('Random Forest Regressor model logged successfully')



🏃 View run Random_Forest_Regressor at: http://127.0.0.1:5000/#/experiments/957128874882554580/runs/35ce2954dc3e49c0b6b0995ec56e99a9
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/957128874882554580
Random Forest Regressor model logged successfully


In [59]:
with mlflow.start_run(run_name="Gradient_Boost"):
    mlflow.log_param('model_type','GradientBoostingRegressor')

    # Validation Metrics
    mlflow.log_metric("RMSE",rmse_gb)
    mlflow.log_metric("MAE",mae_gb)
    mlflow.log_metric("R2_score",r2_gb)

    # Test Metrics
    mlflow.log_metric("RMSE_test",rmse_gb_test)
    mlflow.log_metric("MAE_test",mae_gb_test)
    mlflow.log_metric("R2_score_test",r2_gb_test)

    mlflow.sklearn.log_model(reg_model_gb,artifact_path='model')

print('Random Forest Regressor model logged successfully')



🏃 View run Gradient_Boost at: http://127.0.0.1:5000/#/experiments/957128874882554580/runs/e85cc11351e240b5a89020137d63c1d4
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/957128874882554580
Random Forest Regressor model logged successfully


In [60]:
with mlflow.start_run(run_name="XGBoost"):
    mlflow.log_param('model_type','XGBRegressor')
   
    # Validation metrics
    mlflow.log_metric("RMSE",rmse_xgb)
    mlflow.log_metric("MAE",mae_xgb)
    mlflow.log_metric("R2_score",r2_xgb)

    # Test Metrics 
    mlflow.log_metric("RMSE_test",rmse_xgb_test)
    mlflow.log_metric("MAE_test",mae_xgb_test)
    mlflow.log_metric("R2_score_test",r2_xgb_test)

    mlflow.sklearn.log_model(reg_model_xgb,artifact_path='model')

print('Random Forest Regressor model logged successfully')



🏃 View run XGBoost at: http://127.0.0.1:5000/#/experiments/957128874882554580/runs/a18c79abd4c140dfa92b7a930250623d
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/957128874882554580
Random Forest Regressor model logged successfully


### Classification

In [61]:
with mlflow.start_run(run_name="Logistic_Regression"):
    mlflow.log_param("model_type","LogisticRegression")
    mlflow.log_metric("Accuracy",acc_lr)
    mlflow.log_metric("Precision",pre_lr)
    mlflow.log_metric('Recall',rec_lr)
    mlflow.log_metric('F1_score',f1_lr)

    mlflow.sklearn.log_model(class_model_lr,artifact_path="model")

print("Logistic Regression model logged successfully")



🏃 View run Logistic_Regression at: http://127.0.0.1:5000/#/experiments/957128874882554580/runs/43f59875a8c145289f49ef59daed41a4
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/957128874882554580
Logistic Regression model logged successfully


In [62]:
with mlflow.start_run(run_name="XGBoost_Classifier"):
    mlflow.log_param("model_type","XGBClassifier")

    # Validation matrics
    mlflow.log_metric("Accuracy",acc_xgb)
    mlflow.log_metric("Precision",pre_xgb)
    mlflow.log_metric('Recall',rec_xgb)
    mlflow.log_metric('F1_score',f1_xgb)

    # Test metrics
    mlflow.log_metric("Accuracy_test",acc_xgb_test)
    mlflow.log_metric("Precision_test",pre_xgb_test)
    mlflow.log_metric('Recall_test',rec_xgb_test)
    mlflow.log_metric('F1_score_test',f1_xgb_test)

    mlflow.sklearn.log_model(class_model_xgb,artifact_path="model")

print("XGBoost Classifier model logged successfully")



🏃 View run XGBoost_Classifier at: http://127.0.0.1:5000/#/experiments/957128874882554580/runs/1622af243ac04064b89e79989ab1c57d
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/957128874882554580
XGBoost Classifier model logged successfully


In [63]:
with mlflow.start_run(run_name="Random_Forest_Classifier"):
    mlflow.log_param("model_type","RandomForestClassifier")

    # Validation matrics
    mlflow.log_metric("Accuracy",acc_rf)
    mlflow.log_metric("Precision",pre_rf)
    mlflow.log_metric('Recall',rec_rf)
    mlflow.log_metric('F1_score',f1_rf)

    # Test metrics
    mlflow.log_metric("Accuracy_test",acc_rf_test)
    mlflow.log_metric("Precision_test",pre_rf_test)
    mlflow.log_metric('Recall_test',rec_rf_test)
    mlflow.log_metric('F1_score_test',f1_rf_test)

    mlflow.sklearn.log_model(class_model_rf,artifact_path="model")

print("XGBoost Classifier model logged successfully")



🏃 View run Random_Forest_Classifier at: http://127.0.0.1:5000/#/experiments/957128874882554580/runs/797284e9399d4b1a8a75d5d9d2dfa3c9
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/957128874882554580
XGBoost Classifier model logged successfully
