In [1]:
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

In [11]:
# Read the regression example data from the working directory.
df = pd.read_csv(filepath_or_buffer='regressionexample.csv')

# Report (rows, columns), then end on the summary statistics so the
# notebook renders them as the cell output.
n_rows, n_cols = df.shape
print((n_rows, n_cols))
df.describe()

(645, 5)


Unnamed: 0,pce,pop,psavert,uempmed,unemploy
count,645.0,645.0,645.0,645.0,645.0
mean,3793.782481,225068.87907,9.014109,8.851473,5543.578295
std,3330.472532,38990.264662,2.934286,4.056325,2384.676941
min,306.1,156309.0,2.2,1.9,1596.0
25%,761.6,194087.0,6.7,6.2,3510.0
50%,2794.7,223477.0,9.4,8.0,5523.0
75%,6196.2,255992.0,11.4,9.5,7423.0
max,11061.5,296707.0,17.3,25.2,12051.0


In [12]:
# Separate the regression target from the predictor features.
target_column = ['unemploy']

# Keep predictors in DataFrame column order. A set difference yields an
# arbitrary order that can differ between kernel sessions (string hashing is
# randomised), which would silently permute the columns of the feature matrix.
predictors = [col for col in df.columns if col not in target_column]

# Divide every predictor by its own column maximum so each column's maximum
# becomes 1.0; the target column is left in its original units.
# NOTE(review): the maxima are taken over the whole frame, i.e. before the
# train/test split, so the scaling sees the held-out rows -- confirm this is
# acceptable for the analysis.
df[predictors] = df[predictors] / df[predictors].max()
df.describe()

Unnamed: 0,pce,pop,psavert,uempmed,unemploy
count,645.0,645.0,645.0,645.0,645.0
mean,0.342972,0.758556,0.521047,0.351249,5543.578295
std,0.301087,0.13141,0.169612,0.160965,2384.676941
min,0.027673,0.526813,0.127168,0.075397,1596.0
25%,0.068851,0.654137,0.387283,0.246032,3510.0
50%,0.252651,0.753191,0.543353,0.31746,5523.0
75%,0.560159,0.862777,0.65896,0.376984,7423.0
max,1.0,1.0,1.0,1.0,12051.0


In [13]:
# Extract plain NumPy arrays. `target_column` is a list, so `y` is
# two-dimensional with a single column.
y = df[target_column].values
X = df[predictors].values

# Hold out 30% of the rows for evaluation; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=40,
)
print(X_train.shape, X_test.shape, sep='\n')

(451, 4)
(194, 4)


In [14]:
# Ordinary least squares with default settings, fitted on the training split.
# The fit call is left as the last expression so the estimator is displayed.
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [15]:
# Display the estimator fitted in the previous cell. This cell used to hold a
# pasted copy of the printed repr, which constructed and discarded a new
# unfitted model and passed `normalize`, an argument scikit-learn removed from
# LinearRegression in version 1.2 (it raises TypeError on current releases).
lr

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [16]:
# Linear regression, training split: RMSE followed by R^2.
pred_train_lr = lr.predict(X_train)
rmse_train_lr = np.sqrt(mean_squared_error(y_train, pred_train_lr))
r2_train_lr = r2_score(y_train, pred_train_lr)
print(rmse_train_lr)
print(r2_train_lr)

# Linear regression, held-out split: the same two metrics in the same order.
pred_test_lr = lr.predict(X_test)
rmse_test_lr = np.sqrt(mean_squared_error(y_test, pred_test_lr))
r2_test_lr = r2_score(y_test, pred_test_lr)
print(rmse_test_lr)
print(r2_test_lr)

1352.9721051200022
0.6877129658301824
1327.376180839287
0.6577103447635149


In [17]:
# Ridge regression (L2 penalty, alpha=0.01) fitted on the training split.
rr = Ridge(alpha=0.01).fit(X_train, y_train)

# Predict both splits up front, then report RMSE and R^2 for train, then test.
pred_train_rr = rr.predict(X_train)
pred_test_rr = rr.predict(X_test)
for _actual, _fitted in ((y_train, pred_train_rr), (y_test, pred_test_rr)):
    print(np.sqrt(mean_squared_error(_actual, _fitted)))
    print(r2_score(_actual, _fitted))

1353.10301022716
0.6876525330316612
1326.8733383346748
0.6579696309987448


In [18]:
# Lasso regression (L1 penalty, alpha=0.01) fitted on the training split.
model_lasso = Lasso(alpha=0.01)
model_lasso.fit(X_train, y_train)

# Training-split metrics: RMSE on the first line, R^2 on the second.
pred_train_lasso = model_lasso.predict(X_train)
print(
    np.sqrt(mean_squared_error(y_train, pred_train_lasso)),
    r2_score(y_train, pred_train_lasso),
    sep='\n',
)

# Held-out metrics in the same layout.
pred_test_lasso = model_lasso.predict(X_test)
print(
    np.sqrt(mean_squared_error(y_test, pred_test_lasso)),
    r2_score(y_test, pred_test_lasso),
    sep='\n',
)

1352.9721841498701
0.6877129293475326
1327.3563045794724
0.6577205956438302


In [19]:
# Elastic Net: combined L1/L2 penalty with alpha=0.01 and the library's
# default mixing ratio, fitted on the training split.
model_enet = ElasticNet(alpha=0.01)
model_enet.fit(X_train, y_train)

# Generate predictions for both splits before scoring.
pred_train_enet = model_enet.predict(X_train)
pred_test_enet = model_enet.predict(X_test)

# Training RMSE and R^2.
rmse_train_enet = np.sqrt(mean_squared_error(y_train, pred_train_enet))
print(rmse_train_enet)
print(r2_score(y_train, pred_train_enet))

# Held-out RMSE and R^2.
rmse_test_enet = np.sqrt(mean_squared_error(y_test, pred_test_enet))
print(rmse_test_enet)
print(r2_score(y_test, pred_test_enet))

1537.8523702442437
0.5965352839950191
1508.598961085909
0.5578665813838257
