In [4]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline

In [5]:
# Load the Boston dataset from the sibling "Datasets" directory.
# Forward slashes are used because they resolve on Windows, Linux and macOS,
# whereas a backslash-separated relative path only works on Windows.
df = pd.read_csv("../Datasets/Boston.csv")

# Target: the 'medv' column.
# (Alternative kept from the original note: y = df.iloc[:, -1], which is only
# equivalent if 'medv' is the last column -- TODO confirm before using it.)
y = df['medv']  # Dependent variable

# Features: every column except the target.
X = df.drop('medv', axis=1)  # Independent variables

In [6]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=24, test_size=0.3
)

# Baseline model: degree-1 polynomial expansion feeding a linear regression.
poly = PolynomialFeatures(degree=1)
poly.set_output(transform="pandas")  # set_output configures the transformer in place
lr = LinearRegression()
pipe = Pipeline(steps=[('POLY', poly), ('LR', lr)])

# Fit on the training split, then score the held-out split.
y_pred = pipe.fit(X_train, y_train).predict(X_test)
print("R^2 Score: ", r2_score(y_test, y_pred))

# 5-fold shuffled cross-validation over the full data set as a second estimate.
kfold = KFold(n_splits=5, shuffle=True, random_state=24)
results = cross_val_score(pipe, X, y, cv=kfold, scoring='r2')
print('Cross Validation Score: ', results.mean())

R^2 Score:  0.7133431144123453
Cross Validation Score:  0.713960346536948


In [7]:
# Same 70/30 split as before (identical seed, so identical rows).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=24, test_size=0.3
)

# Degree-2 polynomial expansion feeding a linear regression.
poly = PolynomialFeatures(degree=2)
poly.set_output(transform="pandas")  # set_output configures the transformer in place
lr = LinearRegression()
pipe = Pipeline(steps=[('POLY', poly), ('LR', lr)])

# The pipeline replaces these manual steps:
#   X_poly_train = poly.fit_transform(X_train)
#   lr.fit(X_poly_train, y_train)
#   X_poly_test = poly.transform(X_test)
#   y_pred = lr.predict(X_poly_test)
y_pred = pipe.fit(X_train, y_train).predict(X_test)
print("R^2 Score: ", r2_score(y_test, y_pred))

# 5-fold shuffled cross-validation over the full data set.
kfold = KFold(n_splits=5, shuffle=True, random_state=24)
results = cross_val_score(pipe, X, y, cv=kfold, scoring='r2')
print('Cross Validation Score: ', results.mean())

R^2 Score:  0.6881556660859667
Cross Validation Score:  0.6786672922610152


In [8]:
# Degree-3 polynomial expansion feeding a linear regression.
# This cell does not create its own split: X_train / X_test / y_train / y_test
# must already exist in the kernel namespace.
# NOTE(review): the recorded score for this cell is strongly negative, which
# presumably reflects overfitting and/or unscaled high-order terms -- confirm.
poly = PolynomialFeatures(degree=3)
poly.set_output(transform="pandas")  # set_output configures the transformer in place
lr = LinearRegression()
pipe = Pipeline(steps=[('POLY', poly), ('LR', lr)])

# Fit on the training split, then score the held-out split.
y_pred = pipe.fit(X_train, y_train).predict(X_test)
print("R^2 Score: ", r2_score(y_test, y_pred))

# 5-fold shuffled cross-validation over the full data set.
kfold = KFold(n_splits=5, shuffle=True, random_state=24)
results = cross_val_score(pipe, X, y, cv=kfold, scoring='r2')
print('Cross Validation Score: ', results.mean())

R^2 Score:  -8442.445302259555
Cross Validation Score:  -3414.016057048734


In [9]:
# Degree-4 polynomial expansion feeding a linear regression.
# This cell does not create its own split: X_train / X_test / y_train / y_test
# must already exist in the kernel namespace.
poly = PolynomialFeatures(degree=4)
poly.set_output(transform="pandas")  # set_output configures the transformer in place
lr = LinearRegression()
pipe = Pipeline(steps=[('POLY', poly), ('LR', lr)])

# Fit on the training split, then score the held-out split.
y_pred = pipe.fit(X_train, y_train).predict(X_test)
print("R^2 Score: ", r2_score(y_test, y_pred))

# 5-fold shuffled cross-validation over the full data set.
kfold = KFold(n_splits=5, shuffle=True, random_state=24)
results = cross_val_score(pipe, X, y, cv=kfold, scoring='r2')
print('Cross Validation Score: ', results.mean())

R^2 Score:  -1370.9553199808956
Cross Validation Score:  -6311.193149577735


In [10]:
# Degree-5 polynomial expansion feeding a linear regression.
# This cell does not create its own split: X_train / X_test / y_train / y_test
# must already exist in the kernel namespace.
poly = PolynomialFeatures(degree=5)
poly.set_output(transform="pandas")  # set_output configures the transformer in place
lr = LinearRegression()
pipe = Pipeline(steps=[('POLY', poly), ('LR', lr)])

# Fit on the training split, then score the held-out split.
y_pred = pipe.fit(X_train, y_train).predict(X_test)
print("R^2 Score: ", r2_score(y_test, y_pred))

# 5-fold shuffled cross-validation over the full data set.
kfold = KFold(n_splits=5, shuffle=True, random_state=24)
results = cross_val_score(pipe, X, y, cv=kfold, scoring='r2')
print('Cross Validation Score: ', results.mean())

R^2 Score:  -5679.908753656253
Cross Validation Score:  -5738.846659257908


In [11]:
# Sweep polynomial degrees 1..5 and keep the mean cross-validated score of each.
kfold = KFold(n_splits=5, shuffle=True, random_state=24)
lr = LinearRegression()
degrees = list(range(1, 6))
scores = []

# A plain loop is used deliberately: `poly` and `pipe` stay bound to the last
# (highest-degree) pipeline once the loop finishes.
for i in degrees:
    poly = PolynomialFeatures(degree=i)
    pipe = Pipeline(steps=[('POLY', poly), ('LR', lr)])
    # No explicit `scoring`: cross_val_score falls back to the estimator's own score.
    results = cross_val_score(pipe, X, y, cv=kfold)
    scores.append(results.mean())

# Position of the highest mean score, mapped back to its degree.
i_max = np.argmax(scores)
print("Best Degree: ", degrees[i_max])
print("Best Cross Val Score: ", scores[i_max])

Best Degree:  1
Best Cross Val Score:  0.713960346536948


In [12]:
from sklearn.model_selection import GridSearchCV

# Show the tunable parameter names of the pipeline currently bound to `pipe`
# (it must already exist in the kernel namespace, as must `kfold`).
print(pipe.get_params())

# '<step name>__<parameter>' addresses a parameter of a pipeline step.
params = dict(POLY__degree=[1, 2, 3, 4, 5])

# With no `scoring` argument, GridSearchCV uses the estimator's own score
# method, which is R^2 for a regression pipeline.
gcv = GridSearchCV(estimator=pipe, param_grid=params, cv=kfold)
gcv.fit(X, y)

print(gcv.best_params_)
print(gcv.best_score_)

{'memory': None, 'steps': [('POLY', PolynomialFeatures(degree=5)), ('LR', LinearRegression())], 'verbose': False, 'POLY': PolynomialFeatures(degree=5), 'LR': LinearRegression(), 'POLY__degree': 5, 'POLY__include_bias': True, 'POLY__interaction_only': False, 'POLY__order': 'C', 'LR__copy_X': True, 'LR__fit_intercept': True, 'LR__n_jobs': None, 'LR__positive': False}
{'POLY__degree': 1}
0.713960346536948
