# Experimentation with All Models using Processed Data

In [1]:
# Import target.npy and train.npy
# Train a model using train.npy
# Test the model using target.npy

import numpy as np


# Import the data
train = np.load('dataset/train.npy')
target = np.load('dataset/target.npy')

## Linear Regression with KFold Cross Validation

In [2]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Define the number of folds for cross-validation
n_splits = 5

# Create an instance of KFold with the specified number of folds
kf = KFold(n_splits=n_splits)

# Initialize an empty list to store the root mean squared errors (RMSE) for each fold
rmse_scores = []

# Perform cross-validation
for train_index, test_index in kf.split(train):
    # Split the data into training and testing sets for the current fold
    X_train, X_test = train[train_index], train[test_index]
    y_train, y_test = target[train_index], target[test_index]
    
    # Create an instance of LinearRegression
    model = LinearRegression()
    
    # Fit the model to the training data
    model.fit(X_train, y_train)
    
    # Make predictions on the testing data
    y_pred = model.predict(X_test)
    
    # Calculate the RMSE for the current fold
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    
    # Print the RMSE for the current fold
    print("RMSE:", rmse)
    
    # Append the RMSE to the list of scores
    rmse_scores.append(rmse)

# Calculate the average RMSE across all folds
average_rmse = np.mean(rmse_scores)

# Print the average RMSE
print("Average RMSE:", average_rmse)

RMSE: 1.9063245190622917
RMSE: 1.8114579320050195
RMSE: 1.856984817994204
RMSE: 1.8469065603446553
RMSE: 1.8518672416355286
Average RMSE: 1.8547082142083398


# XGBoost with KFold

In [3]:
# Import necessary modules
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

rmse_scores_xgboost = []

for train_index, test_index in kf.split(train):
    # Split the data into training and testing sets for the current fold
    X_train, X_test = train[train_index], train[test_index]
    y_train, y_test = target[train_index], target[test_index]
    
    # Create an instance of xgb.DMatrix for the training and testing sets
    xgb_train = xgb.DMatrix(X_train, y_train)
    xgb_test = xgb.DMatrix(X_test, y_test)

    # Define the parameters for the XGBoost model
    params = {
        'colsample_bytree': 0.8,                 
        'learning_rate': 0.1,
        'max_depth': 5,
        'subsample': 0.8,
        'objective': 'reg:squarederror',
    }

    # Train the XGBoost model
    model = xgb.train(params, xgb_train, num_boost_round=100)

    # Make predictions on the testing data
    y_pred = model.predict(xgb_test)

    # Calculate the RMSE for the current fold
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    # Print the RMSE for the current fold
    print("RMSE:", rmse)

    # Append the RMSE to the list of scores
    rmse_scores_xgboost.append(rmse)


# Calculate the average RMSE across all folds
average_rmse = np.mean(rmse_scores_xgboost)

# Print the average RMSE
print("Average RMSE:", average_rmse)

RMSE: 1.4121623
RMSE: 1.3773588
RMSE: 1.4391705
RMSE: 1.4573854
RMSE: 1.3527637
Average RMSE: 1.4077681


## LightGBM with KFold

In [4]:
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

rmse_scores_lightgbm = []

for train_index, test_index in kf.split(train):
    # Split the data into training and testing sets for the current fold
    X_train, X_test = train[train_index], train[test_index]
    y_train, y_test = target[train_index], target[test_index]
    
    # Create an instance of lgb.Dataset for the training and testing sets
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_test = lgb.Dataset(X_test, y_test)

#     # Define the parameters for the LightGBM model
#     params = {
#     'objective': 'regression',
#     'metric': 'rmse',
#     'num_leaves': 100,
#     'learning_rate': 0.09841079471843048,
#     'feature_fraction': 0.6146295376710438,
#     'bagging_fraction': 0.6360723189013848,
#     'bagging_freq': 7,
#     'verbose': 0
# }
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'num_leaves': 31,
        'learning_rate': 0.05,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'verbose': 0
    }

    # Train the LightGBM model
    model = lgb.train(params, lgb_train, num_boost_round=100)

    # Make predictions on the testing data
    y_pred = model.predict(X_test)

    # Calculate the RMSE for the current fold
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    # Print the RMSE for the current fold
    print("RMSE:", rmse)

    # Append the RMSE to the list of scores
    rmse_scores_lightgbm.append(rmse)


# Calculate the average RMSE across all folds
average_rmse = np.mean(rmse_scores_lightgbm)

# Print the average RMSE
print("Average RMSE:", average_rmse)

RMSE: 1.4360894297773417
RMSE: 1.3978521913589237
RMSE: 1.4722209435886482
RMSE: 1.4877702199875382
RMSE: 1.3510469637861724
Average RMSE: 1.4289959496997249


## Ridge Regression with k fold 

In [5]:
from sklearn.linear_model import Ridge

# Define the number of folds for cross-validation
n_splits = 5

# Create an instance of KFold with the specified number of folds
kf = KFold(n_splits=n_splits)

# Initialize an empty list to store the RMSE for each fold
ridge_rmse_scores = []

# Specify the regularization strength for Ridge regression
alpha_ridge = 1.0

# Perform cross-validation
for train_index, test_index in kf.split(train):
    # Split the data
    X_train, X_test = train[train_index], train[test_index]
    y_train, y_test = target[train_index], target[test_index]
    
    # Create an instance of Ridge regression
    ridge_model = Ridge(alpha=alpha_ridge)
    
    # Fit the model
    ridge_model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = ridge_model.predict(X_test)
    
    # Calculate RMSE and append to the list
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print("Ridge RMSE:", rmse)
    ridge_rmse_scores.append(rmse)

# Calculate the average RMSE
average_ridge_rmse = np.mean(ridge_rmse_scores)
print("Average Ridge RMSE:", average_ridge_rmse)

Ridge RMSE: 1.9063042351818684
Ridge RMSE: 1.8112543823578529
Ridge RMSE: 1.8569727958410456
Ridge RMSE: 1.846936281576231
Ridge RMSE: 1.8518775569484234
Average Ridge RMSE: 1.8546690503810843


## Lasso Regression with K-fold 

In [6]:
from sklearn.linear_model import Lasso

# Initialize an empty list for Lasso RMSE scores
lasso_rmse_scores = []

# Specify the regularization strength for Lasso regression
alpha_lasso = 1.0

# Perform cross-validation
for train_index, test_index in kf.split(train):
    # Split the data
    X_train, X_test = train[train_index], train[test_index]
    y_train, y_test = target[train_index], target[test_index]
    
    # Create an instance of Lasso regression
    lasso_model = Lasso(alpha=alpha_lasso)
    
    # Fit the model
    lasso_model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = lasso_model.predict(X_test)
    
    # Calculate RMSE and append to the list
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print("Lasso RMSE:", rmse)
    lasso_rmse_scores.append(rmse)

# Calculate the average RMSE
average_lasso_rmse = np.mean(lasso_rmse_scores)
print("Average Lasso RMSE:", average_lasso_rmse)

Lasso RMSE: 2.1332342216645404
Lasso RMSE: 2.055400695762536
Lasso RMSE: 2.0796900553177444
Lasso RMSE: 2.0921591642184496
Lasso RMSE: 2.080921801239417
Average Lasso RMSE: 2.0882811876405376


## Desicion tree with K-fold

In [7]:
from sklearn.tree import DecisionTreeRegressor

# Define the number of folds for cross-validation
n_splits = 5

# Create an instance of KFold with the specified number of folds
kf = KFold(n_splits=n_splits)

# Initialize an empty list to store the RMSE for each fold
dt_rmse_scores = []

# Perform cross-validation
for train_index, test_index in kf.split(train):
    # Split the data
    X_train, X_test = train[train_index], train[test_index]
    y_train, y_test = target[train_index], target[test_index]
    
    # Create an instance of DecisionTreeRegressor
    # Adjust max_depth, min_samples_split, and/or min_samples_leaf as needed
    dt_model = DecisionTreeRegressor(max_depth=None) # Use default values or adjust as necessary
    
    # Fit the model
    dt_model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = dt_model.predict(X_test)
    
    # Calculate RMSE and append to the list
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print("Decision Tree RMSE:", rmse)
    dt_rmse_scores.append(rmse)

# Calculate the average RMSE
average_dt_rmse = np.mean(dt_rmse_scores)
print("Average Decision Tree RMSE:", average_dt_rmse)

Decision Tree RMSE: 1.345699841219152
Decision Tree RMSE: 1.4083761384690157
Decision Tree RMSE: 1.3400278448508927
Decision Tree RMSE: 1.4701276760132027
Decision Tree RMSE: 1.389838093577145
Average Decision Tree RMSE: 1.3908139188258817


## Random forest with k Fold -- TOO long to run. 30 min+

In [8]:
# from sklearn.model_selection import KFold
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.metrics import mean_squared_error
# import numpy as np

# # Define the number of folds for cross-validation
# n_splits = 5

# # Create an instance of KFold with the specified number of folds
# kf = KFold(n_splits=n_splits)

# # Initialize an empty list to store the RMSE for each fold
# random_forest_rmse_scores = []

# # Perform cross-validation
# for train_index, test_index in kf.split(train):
#     # Split the data into training and testing sets for the current fold
#     X_train, X_test = train[train_index], train[test_index]
#     y_train, y_test = target[train_index], target[test_index]
    
#     # Create an instance of the RandomForestRegressor
#     random_forest_model = RandomForestRegressor(n_estimators=100)  # You can adjust the number of trees
    
#     # Fit the model to the training data
#     random_forest_model.fit(X_train, y_train)
    
#     # Make predictions on the testing data
#     y_pred = random_forest_model.predict(X_test)
    
#     # Calculate the RMSE for the current fold
#     rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    
#     # Print the RMSE for the current fold
#     print("Random Forest RMSE:", rmse)
    
#     # Append the RMSE to the list of scores
#     random_forest_rmse_scores.append(rmse)# Calculate the average RMSE across all folds
# average_random_forest_rmse = np.mean(random_forest_rmse_scores)

# # Print the average RMSE
# print("Average Random Forest RMSE:", average_random_forest_rmse)

## KNN

In [9]:
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Define the number of folds for cross-validation
n_splits = 5

# Create an instance of KFold with the specified number of folds
kf = KFold(n_splits=n_splits)

# Initialize an empty list to store the root mean squared errors (RMSE) for each fold
rmse_scores_knn = []

# Perform cross-validation for k-Nearest Neighbors (KNN)
for train_index, test_index in kf.split(train):
    # Split the data into training and testing sets for the current fold
    X_train, X_test = train[train_index], train[test_index]
    y_train, y_test = target[train_index], target[test_index]
    
    # Create an instance of k-Nearest Neighbors (KNN) Regressor
    knn_model = KNeighborsRegressor(n_neighbors=5)  # You can change the number of neighbors as needed
    
    # Fit the model to the training data
    knn_model.fit(X_train, y_train)
    
    # Make predictions on the testing data
    y_pred_knn = knn_model.predict(X_test)
    
    # Calculate the RMSE for the current fold for KNN
    rmse_knn = np.sqrt(mean_squared_error(y_test, y_pred_knn))
    
    # Print the RMSE for the current fold for KNN
    print("KNN RMSE:", rmse_knn)
    
    # Append the RMSE to the list of scores for KNN
    rmse_scores_knn.append(rmse_knn)

# Calculate the average RMSE across all folds for KNN
average_rmse_knn = np.mean(rmse_scores_knn)

# Print the average RMSE for KNN
print("Average KNN RMSE:", average_rmse_knn)

KNN RMSE: 1.8216407
KNN RMSE: 1.806594
KNN RMSE: 1.8060709
KNN RMSE: 1.8160338
KNN RMSE: 1.7953608
Average KNN RMSE: 1.80914


## CatBoost

In [10]:
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import numpy as np

# Define the number of folds for cross-validation
n_splits = 5

# Create an instance of KFold with the specified number of folds
kf = KFold(n_splits=n_splits)

# Initialize an empty list to store the RMSE for each fold
catboost_rmse_scores = []

# Specify the parameters for the CatBoost model
params = {
    'iterations': 100,
    'learning_rate': 0.1,
    'depth': 10,
    'loss_function': 'RMSE'
}

# Perform cross-validation
for train_index, test_index in kf.split(train):
    # Split the data
    X_train, X_test = train[train_index], train[test_index]
    y_train, y_test = target[train_index], target[test_index]
    
    # Create an instance of CatBoostRegressor
    model = CatBoostRegressor(**params)
    
    # Fit the model
    model.fit(X_train, y_train, eval_set=(X_test, y_test), verbose=100)
    
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Calculate RMSE and append to the list
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print("CatBoost RMSE:", rmse)
    catboost_rmse_scores.append(rmse)

# Calculate the average RMSE
average_catboost_rmse = np.mean(catboost_rmse_scores)
print("Average CatBoost RMSE:", average_catboost_rmse)


0:	learn: 2.0003244	test: 2.0472585	best: 2.0472585 (0)	total: 2.44s	remaining: 4m 1s
99:	learn: 1.2659847	test: 1.3011109	best: 1.3011109 (99)	total: 3m	remaining: 0us

bestTest = 1.301110861
bestIteration = 99

CatBoost RMSE: 1.3011108608183484
0:	learn: 2.0152328	test: 1.9746628	best: 1.9746628 (0)	total: 1.52s	remaining: 2m 31s
99:	learn: 1.2670117	test: 1.2884159	best: 1.2884159 (99)	total: 3m 11s	remaining: 0us

bestTest = 1.288415886
bestIteration = 99

CatBoost RMSE: 1.2884158860782338
0:	learn: 2.0067580	test: 2.0178050	best: 2.0178050 (0)	total: 1.53s	remaining: 2m 31s
99:	learn: 1.2596054	test: 1.3373782	best: 1.3373782 (99)	total: 3m 22s	remaining: 0us

bestTest = 1.337378205
bestIteration = 99

CatBoost RMSE: 1.3373782047030436
0:	learn: 2.0040895	test: 2.0263057	best: 2.0263057 (0)	total: 2.11s	remaining: 3m 29s
99:	learn: 1.2389128	test: 1.3779281	best: 1.3776739 (98)	total: 3m 13s	remaining: 0us

bestTest = 1.377673873
bestIteration = 98

Shrink model to first 99 iterat

## RNN Neural Network

In [2]:
import tensorflow.keras.backend as K
import tensorflow as tf
from tensorflow import keras
import tensorflow.keras.layers as layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout, Activation
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping
from keras.utils import plot_model
from keras.models import load_model
import tensorflow.keras.backend as K
from graphviz import Digraph
from sklearn.model_selection import train_test_split

def NN_RMSLE(y_actual, y_pred):
    return K.sqrt(K.mean(K.square(y_pred - y_actual))) 

train_xx, val_xx, train_yy, val_yy = train_test_split(train, target, test_size=0.2, random_state=42)

model = Sequential()

earlyStop= EarlyStopping(monitor='val_loss', mode='min', patience=3)

model.add(layers.Dense(288, activation='relu',input_shape=(train_xx.shape[1],)))
model.add(Dropout(0))
model.add(layers.Dense(224, activation='relu'))
model.add(Dropout(0.2))
model.add(layers.Dense(96, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(0  ,activation='linear'))

model.compile(optimizer='adam', loss = NN_RMSLE)

model.fit(train_xx, train_yy, epochs = 15, batch_size = 2048, validation_data=(val_xx,val_yy),callbacks = earlyStop)
print('Neural Network Training RMSLE = ', model.evaluate(train, target, verbose=0))
model.save('basic_3layer.keras')
model.summary()


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/15
[1m1966/7755[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m1:02[0m 11ms/step - loss: 2.0661

KeyboardInterrupt: 

In [None]:
def NN_RMSLE(y_actual, y_pred):
    return K.sqrt(K.mean(K.square(y_pred - y_actual))) 
model_nn = load_model('basic_3layer.keras',custom_objects={'NN_RMSLE' : NN_RMSLE })
plot_model(model_nn, to_file='model.png', show_shapes=True)
