In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

In [None]:
# Load the Boston housing dataset and build a single DataFrame that holds
# the feature columns plus the regression target under the name 'MEDV'.
# NOTE(review): load_boston appears to have been deprecated in scikit-learn 1.0
# and removed in 1.2 -- confirm the pinned scikit-learn version before re-running.
data = load_boston()
feature_names = data.feature_names
df = pd.DataFrame(data.data, columns=feature_names).assign(MEDV=data.target)

In [None]:
# 'MEDV' is the regression target: df_Y holds it, and df_X holds every
# remaining column (df itself is left unmodified).
df_Y = df['MEDV']
df_X = df.drop(columns='MEDV')

In [None]:
# Model selection over the polynomial degree
########################################################
#
# rms_array collects the validation RMSE of each candidate degree, in order,
# so the best degree is the one at the position of the smallest entry.
rms_array = []

# Candidate degrees are 1 through 7 (range(1, 8) excludes 8).
# Every candidate is a plain LinearRegression fitted on polynomial features.
# Split: 60% train, then the held-out 40% is halved into validation and test;
# only the validation half is scored here.
for degree in range(1, 8):
    expanded_X = PolynomialFeatures(degree=degree, include_bias=False).fit_transform(df_X)
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        expanded_X, df_Y, test_size=0.4, random_state=1
    )
    X_valid, _X_unused, y_valid, _y_unused = train_test_split(
        X_holdout, y_holdout, test_size=0.5, random_state=1
    )
    valid_pred = LinearRegression().fit(X_fit, y_fit).predict(X_valid)
    rms_array.append(np.sqrt(mean_squared_error(y_valid, valid_pred)))

In [None]:
# Validation RMSE per polynomial degree; entry k corresponds to degree k + 1
# because the search loop starts at degree 1.
print(rms_array)

[5.456971927684733, 5.707016121069094, 75.68075671685065, 259.70721960205583, 291.3397813754658, 221.44143413912627, 226.15136714697434]


In [None]:
# Final check on the held-out test split, using the polynomial degree that
# achieved the lowest validation RMSE, to confirm that the test error is not
# far from the validation error.
# rms_array[k] holds the RMSE for degree k + 1 (the search loop starts at
# degree 1), hence the + 1 when converting the argmin index into a degree.
best_degree = int(np.argmin(rms_array)) + 1
poly = PolynomialFeatures(degree=best_degree, include_bias=False)
poly_features = poly.fit_transform(df_X)
# Same test_size / random_state values as the degree search, so the
# train / validation / test rows are the same ones used there.
X_train, X_test, y_train, y_test = train_test_split(poly_features, df_Y, test_size=0.4, random_state=1)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=1)
Reg_type = LinearRegression()
Reg_type.fit(X_train, y_train)
poly_reg_y_predicted = Reg_type.predict(X_test)
# RMSE on the test split; left as the last expression so the cell displays it.
Reg_err = np.sqrt(mean_squared_error(y_test, poly_reg_y_predicted))
Reg_err

4.547289065278924

In [None]:
# Applying regularization (Lasso, i.e. an L1 penalty on the coefficients)
# The penalty shrinks the coefficients (theta) towards zero and, unlike an
# L2/ridge penalty, can drive some of them to exactly zero.
# alpha is the intensity of the penalty.

from sklearn.linear_model import Lasso

alpha_arr = [0.00001, 0.0001, 0.001, 0.01, 0.1]
for alpha in alpha_arr:
    # max_iter must be an integer: the original 10e5 is the float 1000000.0,
    # which newer scikit-learn parameter validation rejects.
    lasso = Lasso(alpha=alpha, max_iter=1_000_000)
    lasso.fit(X_train, y_train)
    # score() returns the R^2 coefficient of determination.
    train_score = lasso.score(X_train, y_train)
    # NOTE(review): alpha is compared on the test split here; X_val / y_val
    # would keep the test split untouched for the final estimate -- confirm intent.
    test_score = lasso.score(X_test, y_test)
    print ('Training score for alpha= ' + str(alpha) + ' :' , train_score)
    print ('Test score for alpha = ' + str(alpha) + ' :' , test_score)


training score for alpha= 1e-05 : 0.7468316515844864
test score for alpha = 1e-05 : 0.7366333554323465
training score for alpha= 0.0001 : 0.746831609061166
test score for alpha = 0.0001 : 0.7366212347000438
training score for alpha= 0.001 : 0.7468273562228122
test score for alpha = 0.001 : 0.7364961612391743
training score for alpha= 0.01 : 0.7464028354764276
test score for alpha = 0.01 : 0.7348628113960634
training score for alpha= 0.1 : 0.7324473082335632
test score for alpha = 0.1 : 0.7156255257568283
