In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import matplotlib.pyplot as plt

# Predict the World Bank Dataset without PCA

In [2]:
# Load the World Bank train/test splits (feature files saved without PCA)
# as NumPy arrays, in the order: train features, train target, test features,
# test target.
X_train, y_train, X_test, y_test = (
    pd.read_csv(csv_path).values
    for csv_path in (
        'Datasets/World Bank Transformed/X_train without PCA.csv',
        'Datasets/World Bank Transformed/y_train.csv',
        'Datasets/World Bank Transformed/X_test without PCA.csv',
        'Datasets/World Bank Transformed/y_test.csv',
    )
)

# Fit an ordinary least-squares model on the training split, then predict
# the held-out test split (fit() hands back the fitted estimator).
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

In [3]:
# Evaluate the World Bank (without PCA) model on the test split. The three
# values are stored under WB_* names because the summary table reads them later.
WB_MAE, WB_MSE, WB_R2 = (
    metrics.mean_absolute_error(y_test, y_pred),
    metrics.mean_squared_error(y_test, y_pred),
    model.score(X_test, y_test),  # value reported below as the R^2 score
)

print('Mean Absolute Error:', WB_MAE)
print('\nMean Squared Error:', WB_MSE)
print('\nR^2 Score:', WB_R2, '\n')


Mean Absolute Error: 0.7253896974050467

Mean Squared Error: 0.8601419872544585

R^2 Score: 0.9895329187638151 



# Predict the World Bank Dataset with PCA

In [4]:
# Load the World Bank train/test splits (feature files saved with PCA)
# as NumPy arrays, in the order: train features, train target, test features,
# test target. The target files are the same ones used for the non-PCA run.
X_train, y_train, X_test, y_test = (
    pd.read_csv(csv_path).values
    for csv_path in (
        'Datasets/World Bank Transformed/X_train with PCA.csv',
        'Datasets/World Bank Transformed/y_train.csv',
        'Datasets/World Bank Transformed/X_test with PCA.csv',
        'Datasets/World Bank Transformed/y_test.csv',
    )
)

# Fit an ordinary least-squares model on the training split, then predict
# the held-out test split (fit() hands back the fitted estimator).
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

In [5]:
# Evaluate the World Bank (with PCA) model on the test split. The three values
# are stored under WB_PCA_* names because the summary table reads them later.
WB_PCA_MAE, WB_PCA_MSE, WB_PCA_R2 = (
    metrics.mean_absolute_error(y_test, y_pred),
    metrics.mean_squared_error(y_test, y_pred),
    model.score(X_test, y_test),  # value reported below as the R^2 score
)

print('Mean Absolute Error:', WB_PCA_MAE)
print('\nMean Squared Error:', WB_PCA_MSE)
print('\nR^2 Score:', WB_PCA_R2, '\n')



Mean Absolute Error: 1.7087409715254052

Mean Squared Error: 4.870400269335707

R^2 Score: 0.9407320232853693 



# Predict the Kaggle Dataset without PCA

In [6]:
# Load the Kaggle train/test splits (feature files saved without PCA)
# as NumPy arrays, in the order: train features, train target, test features,
# test target.
X_train, y_train, X_test, y_test = (
    pd.read_csv(csv_path).values
    for csv_path in (
        'Datasets/Kaggle Transformed/X_train without PCA.csv',
        'Datasets/Kaggle Transformed/y_train.csv',
        'Datasets/Kaggle Transformed/X_test without PCA.csv',
        'Datasets/Kaggle Transformed/y_test.csv',
    )
)

# Fit an ordinary least-squares model on the training split, then predict
# the held-out test split (fit() hands back the fitted estimator).
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

In [7]:
# Evaluate the Kaggle (without PCA) model on the test split. The three values
# are stored under KG_* names because the summary table reads them later.
KG_MAE, KG_MSE, KG_R2 = (
    metrics.mean_absolute_error(y_test, y_pred),
    metrics.mean_squared_error(y_test, y_pred),
    model.score(X_test, y_test),  # value reported below as the R^2 score
)

print('Mean Absolute Error:', KG_MAE)
print('\nMean Squared Error:', KG_MSE)
print('\nR^2 Score:', KG_R2, '\n')



Mean Absolute Error: 2.521698829435405

Mean Squared Error: 11.775440235272256

R^2 Score: 0.8678234113603497 



# Predict the Kaggle Dataset with PCA

In [8]:
# Load the Kaggle train/test splits (feature files saved with PCA)
# as NumPy arrays, in the order: train features, train target, test features,
# test target. The target files are the same ones used for the non-PCA run.
X_train, y_train, X_test, y_test = (
    pd.read_csv(csv_path).values
    for csv_path in (
        'Datasets/Kaggle Transformed/X_train with PCA.csv',
        'Datasets/Kaggle Transformed/y_train.csv',
        'Datasets/Kaggle Transformed/X_test with PCA.csv',
        'Datasets/Kaggle Transformed/y_test.csv',
    )
)

# Fit an ordinary least-squares model on the training split, then predict
# the held-out test split (fit() hands back the fitted estimator).
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

In [9]:
# Evaluate the Kaggle (with PCA) model on the test split. The three values
# are stored under KG_PCA_* names because the summary table reads them later.
KG_PCA_MAE, KG_PCA_MSE, KG_PCA_R2 = (
    metrics.mean_absolute_error(y_test, y_pred),
    metrics.mean_squared_error(y_test, y_pred),
    model.score(X_test, y_test),  # value reported below as the R^2 score
)

print('Mean Absolute Error:', KG_PCA_MAE)
print('\nMean Squared Error:', KG_PCA_MSE)
print('\nR^2 Score:', KG_PCA_R2, '\n')



Mean Absolute Error: 2.6868164763871825

Mean Squared Error: 13.50945894997814

R^2 Score: 0.848359453005689 



# Conclusion

In [10]:
# Summary table: one row per metric, one column per experiment.
# Each column lists its values in the same order as the row labels.
index_labels = ['Mean Squared Error', 'Mean Absolute Error', 'R^2 Score']

data = dict([
    ('World Bank without PCA', [WB_MSE, WB_MAE, WB_R2]),
    ('World Bank with PCA', [WB_PCA_MSE, WB_PCA_MAE, WB_PCA_R2]),
    ('Kaggle without PCA', [KG_MSE, KG_MAE, KG_R2]),
    ('Kaggle with PCA', [KG_PCA_MSE, KG_PCA_MAE, KG_PCA_R2]),
])

df = pd.DataFrame(data=data, index=index_labels)

# Bare trailing expression so the notebook renders the table.
df

Unnamed: 0,World Bank without PCA,World Bank with PCA,Kaggle without PCA,Kaggle with PCA
Mean Squared Error,0.860142,4.8704,11.77544,13.509459
Mean Absolute Error,0.72539,1.708741,2.521699,2.686816
R^2 Score,0.989533,0.940732,0.867823,0.848359
