In [1]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm

Loading the Advertising Data

In [2]:
df = pd.read_csv("ISL_DataSets\Advertising.csv")
print(df.shape)
print(df.columns)

(200, 5)
Index(['S_No', 'TV', 'radio', 'newspaper', 'sales'], dtype='object')


### Single Variable Regression

without splitting into train and test

In [3]:
#X = df[['TV','radio','newspaper']]
X = df[['TV']]
y = df[['sales']]
reg = LinearRegression().fit(X, y)
print(reg.score(X, y))
print(reg.coef_)
print(reg.intercept_)

0.611875050850071
[[0.04753664]]
[7.03259355]


Printing values of evaluation metrics

In [5]:
y_pred = reg.predict(X)
print("Mean squared error: %.3f" % mean_squared_error(y, y_pred))
print("Coefficient of determination: %.5f" % r2_score(y, y_pred))

Mean squared error: 10.513
Coefficient of determination: 0.61188


Splitting the data into train and test

In [13]:
#X = df[['TV','radio','newspaper']]
X = df[['radio']]
y = df[['sales']]
# repeat this activity multiple times and record the R2 and MSE for each run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)
reg2 = LinearRegression().fit(X_train, y_train)
print(reg2.score(X, y))
print(reg2.coef_)
print(reg2.intercept_)
y_pred_test = reg2.predict(X_test)
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred_test))
print("Coefficient of determination: %.3f" % r2_score(y_test, y_pred_test))

0.331407721956714
[[0.20434528]]
[9.14144567]
Mean squared error: 19.93
Coefficient of determination: 0.319


In [None]:
#repeat the activity for radio and newspaper as well

### Multiple Linear Regression

without splitting into train and test

In [16]:
# 2 variables
X = df[['TV','radio']]
y = df[['sales']]
reg = LinearRegression().fit(X, y)
print(reg.score(X, y))
print(reg.coef_)
print(reg.intercept_)
y_pred = reg.predict(X)
print("Mean squared error: %.2f" % mean_squared_error(y, y_pred))
print("Coefficient of determination: %.3f" % r2_score(y, y_pred))

0.8971942610828956
[[0.04575482 0.18799423]]
[2.92109991]
Mean squared error: 2.78
Coefficient of determination: 0.897


2 variables

In [21]:
# train test split
X = df[['TV','radio']]
y = df[['sales']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)
reg2 = LinearRegression().fit(X_train, y_train)
print(reg2.score(X_train, y_train))
print(reg2.coef_)
print(reg2.intercept_)
y_pred_test = reg2.predict(X_test)
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred_test))
print("Coefficient of determination: %.3f" % r2_score(y_test, y_pred_test))

0.8952493331492486
[[0.04611838 0.18986515]]
[2.8008971]
Mean squared error: 3.03
Coefficient of determination: 0.901


3 variables

In [22]:
X = df[['TV','radio','newspaper']]
y = df[['sales']]
reg = LinearRegression().fit(X, y)
print(reg.score(X, y))
print(reg.coef_)
print(reg.intercept_)
y_pred = reg.predict(X)
print("Mean squared error: %.2f" % mean_squared_error(y, y_pred))
print("Coefficient of determination: %.3f" % r2_score(y, y_pred))

0.8972106381789522
[[ 0.04576465  0.18853002 -0.00103749]]
[2.93888937]
Mean squared error: 2.78
Coefficient of determination: 0.897


splitting into train and test data

In [25]:
# train test split - 3 variables
X = df[['TV','radio','newspaper']]
#X = df[['TV','radio']]
y = df[['sales']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)
reg2 = LinearRegression().fit(X_train, y_train)
print(reg2.score(X_train, y_train))
print(reg2.coef_)
print(reg2.intercept_)
y_pred_test = reg2.predict(X_test)
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred_test))
print("Coefficient of determination: %.3f" % r2_score(y_test, y_pred_test))

0.9085340493755486
[[0.04627265 0.17946534 0.00045695]]
[3.15291007]
Mean squared error: 3.30
Coefficient of determination: 0.861


In [26]:
#compute correlation matrix
X2 = df.drop(columns=['S_No'])
X2.corr()

Unnamed: 0,TV,radio,newspaper,sales
TV,1.0,0.054809,0.056648,0.782224
radio,0.054809,1.0,0.354104,0.576223
newspaper,0.056648,0.354104,1.0,0.228299
sales,0.782224,0.576223,0.228299,1.0


Using Statsmodel Library

In [27]:
X = sm.add_constant(X)
mod = sm.OLS(y, X)
res = mod.fit()
print(res.summary())


                            OLS Regression Results                            
Dep. Variable:                  sales   R-squared:                       0.897
Model:                            OLS   Adj. R-squared:                  0.896
Method:                 Least Squares   F-statistic:                     570.3
Date:                Mon, 23 Jan 2023   Prob (F-statistic):           1.58e-96
Time:                        10:28:59   Log-Likelihood:                -386.18
No. Observations:                 200   AIC:                             780.4
Df Residuals:                     196   BIC:                             793.6
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          2.9389      0.312      9.422      0.0

Interaction Model (manually)

In [28]:
X = df[['TV','radio']]
X['TV_Radio'] = df['TV']*df['radio']
print(X.shape)
X.head()

(200, 3)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['TV_Radio'] = df['TV']*df['radio']


Unnamed: 0,TV,radio,TV_Radio
0,230.1,37.8,8697.78
1,44.5,39.3,1748.85
2,17.2,45.9,789.48
3,151.5,41.3,6256.95
4,180.8,10.8,1952.64


In [29]:
y = df[['sales']]
reg = LinearRegression().fit(X, y)
print(reg.score(X, y))
print(reg.coef_)
print(reg.intercept_)
y_pred = reg.predict(X)
print("Mean squared error: %.2f" % mean_squared_error(y, y_pred))
print("Coefficient of determination: %.3f" % r2_score(y, y_pred))

0.9677905498482523
[[0.01910107 0.02886034 0.00108649]]
[6.7502202]
Mean squared error: 0.87
Coefficient of determination: 0.968


In [30]:
X = sm.add_constant(X)
mod = sm.OLS(y, X)
res = mod.fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                  sales   R-squared:                       0.968
Model:                            OLS   Adj. R-squared:                  0.967
Method:                 Least Squares   F-statistic:                     1963.
Date:                Mon, 23 Jan 2023   Prob (F-statistic):          6.68e-146
Time:                        10:34:30   Log-Likelihood:                -270.14
No. Observations:                 200   AIC:                             548.3
Df Residuals:                     196   BIC:                             561.5
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          6.7502      0.248     27.233      0.0

In [32]:
from sklearn.preprocessing import PolynomialFeatures

In [33]:
poly = PolynomialFeatures(2,interaction_only=True)
X = X = df[['TV','radio']]
y = df[['sales']]
X2 = poly.fit_transform(X)
print(X2.shape)
print(poly.get_feature_names_out())
X2 = sm.add_constant(X2)
mod = sm.OLS(y, X2)
res = mod.fit()
print(res.summary())

(200, 4)
['1' 'TV' 'radio' 'TV radio']
                            OLS Regression Results                            
Dep. Variable:                  sales   R-squared:                       0.968
Model:                            OLS   Adj. R-squared:                  0.967
Method:                 Least Squares   F-statistic:                     1963.
Date:                Mon, 23 Jan 2023   Prob (F-statistic):          6.68e-146
Time:                        10:35:24   Log-Likelihood:                -270.14
No. Observations:                 200   AIC:                             548.3
Df Residuals:                     196   BIC:                             561.5
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        

In [34]:
poly = PolynomialFeatures(2) #,interaction_only=True)
X = df[['TV','radio']]
y = df[['sales']]
X2 = poly.fit_transform(X)
print(X2.shape)
print(poly.get_feature_names_out())
X2 = sm.add_constant(X2)
mod = sm.OLS(y, X2)
res = mod.fit()
print(res.summary())

(200, 6)
['1' 'TV' 'radio' 'TV^2' 'TV radio' 'radio^2']
                            OLS Regression Results                            
Dep. Variable:                  sales   R-squared:                       0.986
Model:                            OLS   Adj. R-squared:                  0.986
Method:                 Least Squares   F-statistic:                     2740.
Date:                Mon, 23 Jan 2023   Prob (F-statistic):          8.17e-178
Time:                        10:36:35   Log-Likelihood:                -186.54
No. Observations:                 200   AIC:                             385.1
Df Residuals:                     194   BIC:                             404.9
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------

Feature Selection (Forward and Backward)

In [35]:
from sklearn.feature_selection import SequentialFeatureSelector

Multiple Regression and Feature Selection using Boston Data

In [36]:
df = pd.read_csv("ISL_DataSets\Boston.csv")
print(df.shape)
print(df.columns)

(506, 14)
Index(['s_no', 'crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad',
       'tax', 'ptratio', 'lstat', 'medv'],
      dtype='object')


In [37]:
X = df.drop(columns=['s_no','medv'])
print(X.shape)
y = df[['medv']]

(506, 12)


In [38]:
X = sm.add_constant(X)
mod = sm.OLS(y, X)
res = mod.fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                   medv   R-squared:                       0.734
Model:                            OLS   Adj. R-squared:                  0.728
Method:                 Least Squares   F-statistic:                     113.5
Date:                Mon, 23 Jan 2023   Prob (F-statistic):          2.23e-133
Time:                        11:05:55   Log-Likelihood:                -1504.9
No. Observations:                 506   AIC:                             3036.
Df Residuals:                     493   BIC:                             3091.
Df Model:                          12                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         41.6173      4.936      8.431      0.0

In [39]:
ols_reg = LinearRegression()
sfs = SequentialFeatureSelector(ols_reg, direction='forward',n_features_to_select=5)
sfs.fit(X, y)
print(sfs.get_feature_names_out())

['crim' 'chas' 'rm' 'ptratio' 'lstat']


In [40]:
ols_reg = LinearRegression()
sfs = SequentialFeatureSelector(ols_reg, direction='backward',n_features_to_select=5)
sfs.fit(X, y)
print(sfs.get_feature_names_out())

['zn' 'nox' 'dis' 'ptratio' 'lstat']


Automating Polynomial Regression and Interaction

In [42]:
poly = PolynomialFeatures(2)
X = df.drop(columns=['s_no','medv'])
y = df[['medv']]
X2 = poly.fit_transform(X)
print(X2.shape)
print(poly.get_feature_names_out())

#X2 = sm.add_constant(X2)
#mod = sm.OLS(y, X2)
#res = mod.fit()
#print(res.summary())
ols_reg = LinearRegression()
print("Forward Selection")
sfs = SequentialFeatureSelector(ols_reg, direction='forward',n_features_to_select=5)
sfs.fit(X2, y)
print(sfs.get_feature_names_out())

print("Backward Selection")
sfs = SequentialFeatureSelector(ols_reg, direction='backward',n_features_to_select=5)
sfs.fit(X2, y)
print(sfs.get_feature_names_out())

(506, 91)
['1' 'crim' 'zn' 'indus' 'chas' 'nox' 'rm' 'age' 'dis' 'rad' 'tax'
 'ptratio' 'lstat' 'crim^2' 'crim zn' 'crim indus' 'crim chas' 'crim nox'
 'crim rm' 'crim age' 'crim dis' 'crim rad' 'crim tax' 'crim ptratio'
 'crim lstat' 'zn^2' 'zn indus' 'zn chas' 'zn nox' 'zn rm' 'zn age'
 'zn dis' 'zn rad' 'zn tax' 'zn ptratio' 'zn lstat' 'indus^2' 'indus chas'
 'indus nox' 'indus rm' 'indus age' 'indus dis' 'indus rad' 'indus tax'
 'indus ptratio' 'indus lstat' 'chas^2' 'chas nox' 'chas rm' 'chas age'
 'chas dis' 'chas rad' 'chas tax' 'chas ptratio' 'chas lstat' 'nox^2'
 'nox rm' 'nox age' 'nox dis' 'nox rad' 'nox tax' 'nox ptratio'
 'nox lstat' 'rm^2' 'rm age' 'rm dis' 'rm rad' 'rm tax' 'rm ptratio'
 'rm lstat' 'age^2' 'age dis' 'age rad' 'age tax' 'age ptratio'
 'age lstat' 'dis^2' 'dis rad' 'dis tax' 'dis ptratio' 'dis lstat' 'rad^2'
 'rad tax' 'rad ptratio' 'rad lstat' 'tax^2' 'tax ptratio' 'tax lstat'
 'ptratio^2' 'ptratio lstat' 'lstat^2']
Forward Selection
['x6' 'x17' 'x63' 'x8

In [None]:
poly = PolynomialFeatures(2,interaction_only=True)
X = df.drop(columns=['s_no','medv'])
y = df[['medv']]
X2 = poly.fit_transform(X)
print(X2.shape)
print(poly.get_feature_names_out())

#X2 = sm.add_constant(X2)
#mod = sm.OLS(y, X2)
#res = mod.fit()
#print(res.summary())

ols_reg = LinearRegression()
print("Forward Selection")
sfs = SequentialFeatureSelector(ols_reg, direction='forward',n_features_to_select=5)
sfs.fit(X2, y)
print(sfs.get_feature_names_out())

print("Backward Selection")
sfs = SequentialFeatureSelector(ols_reg, direction='backward',n_features_to_select=5)
sfs.fit(X2, y)
print(sfs.get_feature_names_out())

kNN Regression Method

In [None]:
from sklearn.neighbors import KNeighborsRegressor

In [None]:
df = pd.read_csv("ISL_DataSets\Advertising.csv")
neigh = KNeighborsRegressor(n_neighbors=5)
X = df[['TV','radio']]
y = df[['sales']]
knn_reg = neigh.fit(X, y)
print(knn_reg.score(X, y))
#print(reg.coef_)
#print(reg.intercept_)
y_pred = knn_reg.predict(X)
print("Mean squared error: %.2f" % mean_squared_error(y, y_pred))
print("Coefficient of determination: %.3f" % r2_score(y, y_pred))

In [None]:
neigh = KNeighborsRegressor(n_neighbors=3)
X = df[['TV','radio']]
y = df[['sales']]
knn_reg = neigh.fit(X, y)
print(knn_reg.score(X, y))
#print(reg.coef_)
#print(reg.intercept_)
y_pred = knn_reg.predict(X)
print("Mean squared error: %.2f" % mean_squared_error(y, y_pred))
print("Coefficient of determination: %.3f" % r2_score(y, y_pred))

In [None]:
sfs = SequentialFeatureSelector(neigh, n_features_to_select=1)
sfs.fit(X, y)
print(sfs.get_support())
print(sfs.get_feature_names_out())

In [None]:
df = pd.read_csv("ISL_DataSets\Boston.csv")
X = df.drop(columns=['s_no','medv'])
y = df[['medv']]
sfs = SequentialFeatureSelector(neigh, direction='forward',n_features_to_select=5)
sfs.fit(X, y)
print(sfs.get_feature_names_out())

In [None]:
sfs = SequentialFeatureSelector(neigh, direction='backward',n_features_to_select=5)
sfs.fit(X, y)
print(sfs.get_feature_names_out())