In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
X_train = pd.read_csv(r"../data/X_train.csv")
X_test = pd.read_csv(r"../data/X_test.csv")
y_train = pd.read_csv(r"../data/y_train.csv")
y_test = pd.read_csv(r"../data/y_test.csv")

In [3]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((59954, 92), (59954, 1), (14989, 92), (14989, 1))

In [4]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.linear_model import LinearRegression, Ridge,Lasso,ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [5]:
##Create a Function to Evaluate Model
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [6]:
## Beginning Model Training
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "Adaboost Regressor":AdaBoostRegressor(),
   # "Graident BoostRegressor":GradientBoostingRegressor(),
    #"Xgboost Regressor":XGBRegressor()
   
}

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate Train and Test dataset
    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)

    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    
    print(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    
    print('='*35)
    print('\n')

Linear Regression
Model performance for Training set
- Root Mean Squared Error: 28.9276
- Mean Absolute Error: 14.0904
- R2 Score: 0.8905
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 27.8526
- Mean Absolute Error: 14.1743
- R2 Score: 0.8984


Lasso
Model performance for Training set
- Root Mean Squared Error: 44.7685
- Mean Absolute Error: 30.7339
- R2 Score: 0.7378
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 45.0154
- Mean Absolute Error: 31.0201
- R2 Score: 0.7346


Ridge
Model performance for Training set
- Root Mean Squared Error: 43.6847
- Mean Absolute Error: 30.5221
- R2 Score: 0.7504
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 44.1744
- Mean Absolute Error: 30.8423
- R2 Score: 0.7444


K-Neighbors Regressor
Model performance for Training set
- Root Mean Squared Error: 49.1697
- Mean Absolute Error: 28.5445
- R2 Score: 0.6838
---------

  model.fit(X_train, y_train) # Train model


Random Forest Regressor
Model performance for Training set
- Root Mean Squared Error: 2.1256
- Mean Absolute Error: 1.1124
- R2 Score: 0.9994
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 5.5368
- Mean Absolute Error: 2.9691
- R2 Score: 0.9960




  y = column_or_1d(y, warn=True)


Adaboost Regressor
Model performance for Training set
- Root Mean Squared Error: 27.6869
- Mean Absolute Error: 25.0297
- R2 Score: 0.8997
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 27.8403
- Mean Absolute Error: 25.1405
- R2 Score: 0.8985




In [7]:
#Initialize few parameter for Hyperparamter tuning
knn_params = {"n_neighbors": [2, 3, 10, 20, 40, 50]}

In [8]:
# Models list for Hyperparameter tuning
randomcv_models = [('KNN', KNeighborsRegressor(), knn_params)]

In [9]:
# ##Hyperparameter Tuning
# from sklearn.model_selection import RandomizedSearchCV

# model_param = {}
# for name, model, params in randomcv_models:
#     random = RandomizedSearchCV(estimator=model,
#                                    param_distributions=params,
#                                    n_iter=100,
#                                    cv=3,
#                                    verbose=2,
#                                    n_jobs=-1)
#     random.fit(X_train, y_train)
#     model_param[name] = random.best_params_

# for model_name in model_param:
#     print(f"---------------- Best Params for {model_name} -------------------")
#     print(model_param[model_name])
    
    
    
from sklearn.model_selection import RandomizedSearchCV
import time

model_param = {}
n_iter = 100

for name, model, params in randomcv_models:
    print(f"\n🔍 Starting RandomizedSearchCV for: {name}")
    
    random = RandomizedSearchCV(
        estimator=model,
        param_distributions=params,
        n_iter=n_iter,
        cv=3,
        verbose=2,   # Set to 0 to manage output ourselves
        n_jobs=-1
    )

    # Track iterations manually using loop + Timer
    start = time.time()
    for i, (param_set) in enumerate(randomcv_models[0][2].items(), start=1):
        if i % 5 == 0:
            print(f"✅ {i} iterations completed for {name}...")

    random.fit(X_train, y_train)
    end = time.time()

    print(f"⏱️ Total time taken for {name}: {round(end - start, 2)} seconds")

    model_param[name] = random.best_params_

# Display best parameters
for model_name in model_param:
    print(f"\n---------------- Best Params for {model_name} -------------------")
    print(model_param[model_name])



🔍 Starting RandomizedSearchCV for: KNN
Fitting 3 folds for each of 6 candidates, totalling 18 fits




⏱️ Total time taken for KNN: 39.21 seconds

---------------- Best Params for KNN -------------------
{'n_neighbors': 10}


In [10]:
## Retraining the models with best parameters
models = {
    # "Random Forest Regressor": RandomForestRegressor(n_estimators=100, min_samples_split=2, max_features='auto', max_depth=None,  n_jobs=-1),
    "K-Neighbors Regressor": KNeighborsRegressor(n_neighbors=10, n_jobs=-1),
    #  "Adaboost":AdaBoostRegressor(n_estimators=60,loss='linear')
    
}
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)

    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)
    
    print(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    
    print('='*35)
    print('\n')

K-Neighbors Regressor
Model performance for Training set
- Root Mean Squared Error: 55.5129
- Mean Absolute Error: 32.6342
- R2 Score: 0.5969
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 61.4278
- Mean Absolute Error: 36.6144
- R2 Score: 0.5058




In [11]:
#Initialize few parameter for Hyperparamter tuning
knn_params = {"n_neighbors": [2, 3,7, 10,15, 20, 30,40, 50]}

In [12]:
# Models list for Hyperparameter tuning
grid_search_cv_models = [
                    ('KNN', KNeighborsRegressor(), knn_params),
                #    ("RF", RandomForestRegressor(), rf_params),
                #    ("Adaboost",AdaBoostRegressor(),ada_params)
                   
                   ]

In [14]:
# ##Hyperparameter Tuning
# from sklearn.model_selection import GridSearchCV

# model_param = {}
# for name, model, params in grid_search_cv_models:
#     grid = GridSearchCV(estimator=model,
#                                    param_grid=params,
#                                    cv=3,
#                                    verbose=2,
#                                    n_jobs=-1)
#     grid.fit(X_train, y_train)
#     model_param[name] = grid.best_params_

# for model_name in model_param:
#     print(f"---------------- Best Params for {model_name} -------------------")
#     print(model_param[model_name])
    
from sklearn.model_selection import GridSearchCV
import time

model_param = {}
# n_iter = 30

for name, model, params in randomcv_models:
    print(f"\n🔍 Starting RandomizedSearchCV for: {name}")
    
    random = GridSearchCV(
        estimator=model,
        param_grid=params,
        # n_iter=n_iter,
        cv=3,
        verbose=2,   # Set to 0 to manage output ourselves
        n_jobs=-1
    )

    # Track iterations manually using loop + Timer
    start = time.time()
    for i, (param_set) in enumerate(randomcv_models[0][2].items(), start=1):
        if i % 5 == 0:
            print(f"✅ {i} iterations completed for {name}...")

    random.fit(X_train, y_train)
    end = time.time()

    print(f"⏱️ Total time taken for {name}: {round(end - start, 2)} seconds")

    model_param[name] = random.best_params_

# Display best parameters
for model_name in model_param:
    print(f"\n---------------- Best Params for {model_name} -------------------")
    print(model_param[model_name])



🔍 Starting RandomizedSearchCV for: KNN
Fitting 3 folds for each of 6 candidates, totalling 18 fits
⏱️ Total time taken for KNN: 45.31 seconds

---------------- Best Params for KNN -------------------
{'n_neighbors': 10}


In [15]:
## Retraining the models with best parameters
models = {
    # "Random Forest Regressor": RandomForestRegressor(n_estimators=100, min_samples_split=2, max_features='auto', max_depth=None,  n_jobs=-1),
    "K-Neighbors Regressor": KNeighborsRegressor(n_neighbors=10, n_jobs=-1),
    #  "Adaboost":AdaBoostRegressor(n_estimators=60,loss='linear')
    
}
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)

    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)
    
    print(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    
    print('='*35)
    print('\n')

K-Neighbors Regressor
Model performance for Training set
- Root Mean Squared Error: 55.5129
- Mean Absolute Error: 32.6342
- R2 Score: 0.5969
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 61.4278
- Mean Absolute Error: 36.6144
- R2 Score: 0.5058




In [16]:

from sklearn.ensemble import AdaBoostRegressor
import pickle

# Create and train AdaBoost model
knn_model = KNeighborsRegressor()
knn_model.fit(X_train, y_train.values.ravel())  # Flatten y if warning arises

# Save to a .pkl file
with open(r"../Saving The Model Results/knn_regressor_model.pkl", "wb") as file:
    pickle.dump(knn_model, file)


In [12]:
# from sklearn.neighbors import KNeighborsRegressor

In [13]:
# regressor=KNeighborsRegressor(n_neighbors=6,algorithm='auto')
# regressor.fit(X_train,y_train)

In [14]:
# y_pred=regressor.predict(X_test)

In [15]:
# from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [16]:
# print(r2_score(y_test,y_pred))
# print(mean_absolute_error(y_test,y_pred))
# print(mean_squared_error(y_test,y_pred))