In [1]:
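# Load train.npy (feature matrix) and target.npy (regression targets).
# Fit models on the features from train.npy.
# Score predictions against the values from target.npy using K-fold cross-validation.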

import numpy as np


# Load the feature matrix and the matching regression targets from disk.
train = np.load('dataset/train.npy')
target = np.load('dataset/target.npy')

# The cross-validation cells index both arrays with the same row indices, so
# the two arrays must hold the same number of samples. Fail here with a clear
# message instead of partway through a fold.
if len(train) != len(target):
    raise ValueError(
        f"train has {len(train)} rows but target has {len(target)}; "
        "the two arrays must be aligned sample-for-sample"
    )



## Linear Regression with KFold Cross Validation

In [2]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Cross-validate a plain LinearRegression baseline over 5 folds.
n_splits = 5

# Only the fold count is configured; every other KFold option keeps its default.
kf = KFold(n_splits=n_splits)

# Collects one root-mean-squared-error value per fold.
rmse_scores = []

for train_index, test_index in kf.split(train):
    # Rows picked by train_index are used for fitting, rows picked by
    # test_index are held out for scoring.
    X_train, y_train = train[train_index], target[train_index]
    X_test, y_test = train[test_index], target[test_index]

    # A fresh estimator is fitted for every fold.
    model = LinearRegression()
    model.fit(X_train, y_train)

    # RMSE is the square root of the mean squared error on the held-out rows.
    y_pred = model.predict(X_test)
    fold_mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(fold_mse)

    # Record the fold score, then report it.
    rmse_scores.append(rmse)
    print("RMSE:", rmse)

# Summarise the folds with their arithmetic mean.
average_rmse = np.mean(rmse_scores)
print("Average RMSE:", average_rmse)


RMSE: 1.906324519062298
RMSE: 1.811457932005012
RMSE: 1.856984817994169
RMSE: 1.8469065603446984
RMSE: 1.8518672416355342
Average RMSE: 1.8547082142083422


## XGBoost with KFold

In [3]:
# Import necessary modules
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

# Booster settings are identical for every fold, so they are declared once.
params = {
    'colsample_bytree': 0.8,
    'learning_rate': 0.1,
    'max_depth': 5,
    'subsample': 0.8,
    'objective': 'reg:squarederror',
}

# Collects one RMSE value per fold.
rmse_scores_xgboost = []

# `kf` is not created in this cell; it is the splitter left in the kernel by
# an earlier cell.
for train_index, test_index in kf.split(train):
    # Partition features and targets for the current fold.
    X_train, y_train = train[train_index], target[train_index]
    X_test, y_test = train[test_index], target[test_index]

    # Wrap each partition, together with its labels, in a DMatrix.
    xgb_train = xgb.DMatrix(X_train, label=y_train)
    xgb_test = xgb.DMatrix(X_test, label=y_test)

    # Boost for 100 rounds on the training partition only.
    model = xgb.train(params, xgb_train, num_boost_round=100)

    # Score the held-out partition.
    y_pred = model.predict(xgb_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    # Record the fold score, then report it.
    rmse_scores_xgboost.append(rmse)
    print("RMSE:", rmse)

# Mean RMSE over all folds.
average_rmse = np.mean(rmse_scores_xgboost)
print("Average RMSE:", average_rmse)


ModuleNotFoundError: No module named 'xgboost'

## LightGBM with KFold

In [4]:
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

# LightGBM settings are the same for every fold, so they are defined once
# instead of being rebuilt inside the loop.
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

# Collects one RMSE value per fold.
rmse_scores_lightgbm = []

# `kf` is not created in this cell; it is the splitter left in the kernel by
# an earlier cell.
for train_index, test_index in kf.split(train):
    # Split the data into training and testing sets for the current fold
    X_train, X_test = train[train_index], train[test_index]
    y_train, y_test = target[train_index], target[test_index]

    # Only the training partition needs an lgb.Dataset. Prediction below runs
    # on the raw X_test array, so no Dataset is built for the held-out rows.
    lgb_train = lgb.Dataset(X_train, y_train)

    # Train the LightGBM model for 100 boosting rounds
    model = lgb.train(params, lgb_train, num_boost_round=100)

    # Make predictions on the testing data
    y_pred = model.predict(X_test)

    # Calculate the RMSE for the current fold
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    # Print the RMSE for the current fold
    print("RMSE:", rmse)

    # Append the RMSE to the list of scores
    rmse_scores_lightgbm.append(rmse)


# Calculate the average RMSE across all folds
average_rmse = np.mean(rmse_scores_lightgbm)

# Print the average RMSE
print("Average RMSE:", average_rmse)


ModuleNotFoundError: No module named 'lightgbm'

## Ridge Regression with KFold

In [6]:
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
import numpy as np

# Cross-validate Ridge regression over 5 folds.
n_splits = 5

# Splitter configured with the fold count only.
kf = KFold(n_splits=n_splits)

# L2 regularization strength passed to Ridge.
alpha_ridge = 1.0

# Collects one RMSE value per fold.
ridge_rmse_scores = []

for train_index, test_index in kf.split(train):
    # Partition features and targets for the current fold.
    X_train, y_train = train[train_index], target[train_index]
    X_test, y_test = train[test_index], target[test_index]

    # Build a new Ridge estimator for this fold and fit it on the training rows.
    ridge_model = Ridge(alpha=alpha_ridge)
    ridge_model.fit(X_train, y_train)

    # Predict the held-out rows and convert the mean squared error to RMSE.
    y_pred = ridge_model.predict(X_test)
    fold_mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(fold_mse)

    # Record the fold score, then report it.
    ridge_rmse_scores.append(rmse)
    print("Ridge RMSE:", rmse)

# Mean RMSE over all folds.
average_ridge_rmse = np.mean(ridge_rmse_scores)
print("Average Ridge RMSE:", average_ridge_rmse)


Ridge RMSE: 1.9063042351818682
Ridge RMSE: 1.8112543823578533
Ridge RMSE: 1.8569727958410416
Ridge RMSE: 1.8469362815762222
Ridge RMSE: 1.8518775569484203
Average Ridge RMSE: 1.8546690503810812


In [None]:
from sklearn.linear_model import Lasso

# L1 regularization strength passed to Lasso.
alpha_lasso = 1.0

# Collects one RMSE value per fold.
lasso_rmse_scores = []

# `kf`, `np` and `mean_squared_error` are not defined in this cell; they come
# from cells executed earlier in the notebook.
for train_index, test_index in kf.split(train):
    # Partition features and targets for the current fold.
    X_train, y_train = train[train_index], target[train_index]
    X_test, y_test = train[test_index], target[test_index]

    # Build a new Lasso estimator for this fold and fit it on the training rows.
    lasso_model = Lasso(alpha=alpha_lasso)
    lasso_model.fit(X_train, y_train)

    # Predict the held-out rows and convert the mean squared error to RMSE.
    y_pred = lasso_model.predict(X_test)
    fold_mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(fold_mse)

    # Record the fold score, then report it.
    lasso_rmse_scores.append(rmse)
    print("Lasso RMSE:", rmse)

# Mean RMSE over all folds.
average_lasso_rmse = np.mean(lasso_rmse_scores)
print("Average Lasso RMSE:", average_lasso_rmse)


Lasso RMSE: 2.1332342216645404
Lasso RMSE: 2.055400695762536
Lasso RMSE: 2.0796900553177444
