In [1]:
from sklearn.linear_model import Ridge
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV

In [2]:
param_grid = {
    'alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
}

# Predict the World Bank Dataset without PCA

In [3]:
X_train = pd.read_csv('Datasets/World Bank Transformed/X_train without PCA.csv').values
y_train = pd.read_csv('Datasets/World Bank Transformed/y_train.csv').values
X_test = pd.read_csv('Datasets/World Bank Transformed/X_test without PCA.csv').values
y_test = pd.read_csv('Datasets/World Bank Transformed/y_test.csv').values

ridge_model = Ridge()
grid_search = GridSearchCV(ridge_model, param_grid, cv=5, scoring='r2')
grid_search.fit(X_train, y_train)

In [4]:
print("Best parameters:", grid_search.best_params_)

Best parameters: {'alpha': 1}


In [6]:
y_pred = grid_search.best_estimator_.predict(X_test)
WB_MAE = metrics.mean_absolute_error(y_test, y_pred)
WB_MSE = metrics.mean_squared_error(y_test, y_pred)
print('Mean Absolute Error:', WB_MAE)
print('\nMean Squared Error:', WB_MSE) 

WB_R2 = grid_search.best_score_
print('\nR^2 Score:', WB_R2, '\n')

Mean Absolute Error: 0.7307271265252194

Mean Squared Error: 0.9029615120071685

R^2 Score: 0.9881735440097525 



# Predict the World Bank Dataset with PCA

In [7]:
X_train = pd.read_csv('Datasets/World Bank Transformed/X_train with PCA.csv').values
y_train = pd.read_csv('Datasets/World Bank Transformed/y_train.csv').values
X_test = pd.read_csv('Datasets/World Bank Transformed/X_test with PCA.csv').values
y_test = pd.read_csv('Datasets/World Bank Transformed/y_test.csv').values

ridge_model = Ridge()
grid_search = GridSearchCV(ridge_model, param_grid, cv=5, scoring='r2')
grid_search.fit(X_train, y_train)

In [8]:
print("Best parameters:", grid_search.best_params_)

Best parameters: {'alpha': 10}


In [9]:
y_pred = grid_search.best_estimator_.predict(X_test)
WB_PCA_MAE = metrics.mean_absolute_error(y_test, y_pred)
WB_PCA_MSE = metrics.mean_squared_error(y_test, y_pred)
print('Mean Absolute Error:', WB_PCA_MAE)
print('\nMean Squared Error:', WB_PCA_MSE) 

WB_PCA_R2 = grid_search.best_score_
print('\nR^2 Score:', WB_PCA_R2, '\n')

Mean Absolute Error: 1.7240190357544427

Mean Squared Error: 5.1752607646487245

R^2 Score: 0.9413900555955308 



# Predict the Kaggle Dataset without PCA

In [10]:
X_train = pd.read_csv('Datasets/Kaggle Transformed/X_train without PCA.csv').values
y_train = pd.read_csv('Datasets/Kaggle Transformed/y_train.csv').values
X_test = pd.read_csv('Datasets/Kaggle Transformed/X_test without PCA.csv').values
y_test = pd.read_csv('Datasets/Kaggle Transformed/y_test.csv').values

ridge_model = Ridge()
grid_search = GridSearchCV(ridge_model, param_grid, cv=5, scoring='r2')
grid_search.fit(X_train, y_train)

In [11]:
print("Best parameters:", grid_search.best_params_)

Best parameters: {'alpha': 10}


In [12]:
y_pred = grid_search.best_estimator_.predict(X_test)
KG_MAE = metrics.mean_absolute_error(y_test, y_pred)
KG_MSE = metrics.mean_squared_error(y_test, y_pred)
print('Mean Absolute Error:', KG_MAE)
print('\nMean Squared Error:', KG_MSE) 

KG_R2 = grid_search.best_score_
print('\nR^2 Score:', KG_R2, '\n')

Mean Absolute Error: 2.5266687678587343

Mean Squared Error: 11.723982680522212

R^2 Score: 0.8454026928032714 



# Predict the Kaggle Dataset with PCA

In [13]:
X_train = pd.read_csv('Datasets/Kaggle Transformed/X_train with PCA.csv').values
y_train = pd.read_csv('Datasets/Kaggle Transformed/y_train.csv').values
X_test = pd.read_csv('Datasets/Kaggle Transformed/X_test with PCA.csv').values
y_test = pd.read_csv('Datasets/Kaggle Transformed/y_test.csv').values

ridge_model = Ridge()
grid_search = GridSearchCV(ridge_model, param_grid, cv=5, scoring='r2')
grid_search.fit(X_train, y_train)

In [14]:
print("Best parameters:", grid_search.best_params_)

Best parameters: {'alpha': 10}


In [15]:
y_pred = grid_search.best_estimator_.predict(X_test)
KG_PCA_MAE = metrics.mean_absolute_error(y_test, y_pred)
KG_PCA_MSE = metrics.mean_squared_error(y_test, y_pred)
print('Mean Absolute Error:', KG_PCA_MAE)
print('\nMean Squared Error:', KG_PCA_MSE) 

KG_PCA_R2 = grid_search.best_score_
print('\nR^2 Score:', KG_PCA_R2, '\n')

Mean Absolute Error: 2.639406216451256

Mean Squared Error: 12.566200767801613

R^2 Score: 0.8275556504677853 



In [16]:
data = {
    'World Bank without PCA': [WB_MSE, WB_MAE, WB_R2],
    'World Bank with PCA': [WB_PCA_MSE, WB_PCA_MAE, WB_PCA_R2],
    'Kaggle without PCA': [KG_MSE, KG_MAE, KG_R2],
    'Kaggle with PCA': [KG_PCA_MSE, KG_PCA_MAE, KG_PCA_R2]
}

index_labels = ['Mean Squared Error', 'Mean Absolute Error', 'R^2 Score']

df = pd.DataFrame(data, index=index_labels)

df

Unnamed: 0,World Bank without PCA,World Bank with PCA,Kaggle without PCA,Kaggle with PCA
Mean Squared Error,0.902962,5.175261,11.723983,12.566201
Mean Absolute Error,0.730727,1.724019,2.526669,2.639406
R^2 Score,0.988174,0.94139,0.845403,0.827556
