In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [2]:
# Load the supershop dataset from a CSV in the working directory and
# preview the first five rows.
df = pd.read_csv('supershops.csv')
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [3]:
# df.dropna()  # returns a copy with every row that contains a NaN removed

In [4]:
# Drop every row that contains a missing value. Rebinding `df` (instead of
# mutating it with inplace=True) keeps the cell idempotent and avoids silently
# changing the object an earlier cell displayed.
df = df.dropna()

In [5]:
# Feature matrix: everything except the target ('Profit') and the
# text-valued 'Area' column.
x = df.drop(columns=['Profit', 'Area'])
x.head()

Unnamed: 0,Marketing Spend,Administration,Transport
0,114523.61,136897.8,471784.1
1,162597.7,151377.59,443898.53
2,153441.51,101145.55,407934.54
3,144372.41,118671.85,383199.62
4,142107.34,91391.77,366168.42


In [6]:
# Target: 'Profit'. The double brackets keep it as a one-column DataFrame
# (2-D) rather than a Series.
y = df[['Profit']]

In [7]:
# Hold out 25% of the rows for testing. random_state fixes the shuffle so the
# split is reproducible; any fixed integer would do.
xtrain, xtest, ytrain, ytest = train_test_split(
    x, y, test_size=0.25, random_state=0
)

In [8]:
# Sanity check: (rows, columns) of the training features.
xtrain.shape

(36, 3)

In [9]:
# Preview the training features; the index shows which original rows were
# sampled into the training set.
xtrain.head()

Unnamed: 0,Marketing Spend,Administration,Transport
28,66051.52,182645.56,118148.2
49,0.0,116983.8,45173.06
2,153441.51,101145.55,407934.54
47,0.0,135426.92,0.0
18,91749.16,114175.79,294919.57


In [10]:
# Preview the training targets; the index should line up with xtrain.
ytrain.head()

Unnamed: 0,Profit
28,103282.38
49,14681.4
2,191050.39
47,42559.73
18,124266.9


In [11]:
# df.corr(numeric_only=True)  # pairwise correlations of the numeric columns ('Area' is text)

# OLS (Ordinary Least Squares) Linear Regression

In [12]:
# Baseline: ordinary least-squares linear regression on the raw features.
model = LinearRegression()
model.fit(xtrain,ytrain)

In [13]:
# Predict profit for the held-out test rows.
y_pred = model.predict(xtest)

In [14]:
# Show the predictions (one row per test sample, a single column).
y_pred

array([[ 90707.18524202],
       [166377.24276987],
       [124018.59727829],
       [ 93252.51801345],
       [ 97588.41924098],
       [ 68948.39245538],
       [ 89037.14295216],
       [ 73472.98068535],
       [159657.23912108],
       [129149.84623501],
       [128674.69774365],
       [ 88409.42998689],
       [ 96436.18820079]])

In [15]:
# Mean squared error of the OLS baseline on the test set.
mse = mean_squared_error(y_true=ytest, y_pred=y_pred)
print('MSE:', mse)

MSE: 101360809.28512499


In [16]:
# R^2 of the OLS baseline on the test set (the default metric of .score()
# for scikit-learn regressors).
model.score(xtest,ytest)

0.8744319145336102

# Polynomial Regression

In [17]:
from sklearn.preprocessing import PolynomialFeatures

In [18]:
# Reminder of the raw features that will be expanded into polynomial terms.
x.head()

Unnamed: 0,Marketing Spend,Administration,Transport
0,114523.61,136897.8,471784.1
1,162597.7,151377.59,443898.53
2,153441.51,101145.55,407934.54
3,144372.41,118671.85,383199.62
4,142107.34,91391.77,366168.42


# Degree 2

In [19]:
# Expand the raw features into every monomial up to degree 2
# (constant term, linear terms, squares and pairwise products).
poly = PolynomialFeatures(degree=2)
poly.fit(x)
x_poly = poly.transform(x)

In [20]:
# x_poly is a plain array; wrap it in a DataFrame just to preview it.
# Column 0 is the constant (bias) term added by PolynomialFeatures.
pd.DataFrame(x_poly).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1.0,114523.61,136897.8,471784.1,13115660000.0,15678030000.0,54030420000.0,18741010000.0,64586210000.0,222580200000.0
1,1.0,162597.7,151377.59,443898.53,26438010000.0,24613650000.0,72176880000.0,22915170000.0,67196290000.0,197045900000.0
2,1.0,153441.51,101145.55,407934.54,23544300000.0,15519930000.0,62594090000.0,10230420000.0,41260760000.0,166410600000.0
3,1.0,144372.41,118671.85,383199.62,20843390000.0,17132940000.0,55323450000.0,14083010000.0,45475010000.0,146841900000.0
4,1.0,142107.34,91391.77,366168.42,20194500000.0,12987440000.0,52035220000.0,8352456000.0,33464780000.0,134079300000.0


In [21]:
# Re-split using the degree-2 features, with the same test_size and
# random_state as the first split. This rebinds xtrain/xtest/ytrain/ytest.
xtrain, xtest, ytrain, ytest = train_test_split(
    x_poly, y, test_size=0.25, random_state=0
)

In [22]:
# Sanity check: same number of training rows as before, now with the
# expanded polynomial feature columns.
xtrain.shape

(36, 10)

In [23]:
# Polynomial regression = ordinary linear regression fitted on the
# degree-2 expanded features.
model2 = LinearRegression()
model2.fit(xtrain,ytrain)

In [24]:
# Predict the test rows with the degree-2 model and display the result.
y_pred2 = model2.predict(xtest)
y_pred2

array([[101606.42692695],
       [164653.7682662 ],
       [131119.04611802],
       [104593.86990522],
       [ 99609.91567389],
       [ 68898.23379901],
       [ 92076.37087923],
       [ 72287.07989249],
       [154325.98721721],
       [128236.4189747 ],
       [128001.27841582],
       [110500.67878648],
       [ 76394.97789304]])

In [25]:
# Test-set MSE of the degree-2 model. It is stored under its own name
# (matching model2 / y_pred2) so the OLS baseline value held in `mse` is not
# overwritten and the two can still be compared later.
mse2 = mean_squared_error(ytest, y_pred2)
print('MSE:', mse2)

MSE: 162866427.11400354


In [26]:
# R^2 of the degree-2 model on the held-out test set.
model2.score(xtest,ytest) 
# testing score

0.7982373504740954

In [27]:
# R^2 of the degree-2 model on the data it was trained on. A training score
# well above the testing score is a sign of overfitting.
model2.score(xtrain,ytrain)
# training score

0.9505396347868063

# Degree 3

In [28]:
# Degree-3 expansion of the raw features. This rebinds `poly`, replacing the
# degree-2 transformer.
poly = PolynomialFeatures(degree=3)
poly.fit(x)
X_poly_deg3 = poly.transform(x)

In [29]:
# (rows, columns) of the full degree-3 feature matrix, before splitting.
X_poly_deg3.shape

(49, 20)

In [30]:
# Re-split using the degree-3 features, with the same test_size and
# random_state as the earlier splits. This rebinds xtrain/xtest/ytrain/ytest.
xtrain, xtest, ytrain, ytest = train_test_split(
    X_poly_deg3,
    y,
    test_size=0.25,
    random_state=0,
)

In [31]:
# Linear regression fitted on the degree-3 polynomial features.
model3 = LinearRegression()
model3.fit(xtrain, ytrain)

In [32]:
# R^2 of the degree-3 model on the held-out test set.
model3.score(xtest, ytest) # testing

0.6155763575572593

In [33]:
# R^2 of the degree-3 model on its own training data. Compare it with the
# testing score: a large gap between the two indicates overfitting.
model3.score(xtrain, ytrain) # training

0.9650079988239769

# Let's talk about Regularization

In [34]:
# Rebuild the raw (non-polynomial) features and the target for the
# regularisation experiments.
y = df[['Profit']]
x = df.drop(columns=['Profit', 'Area'])

In [35]:
# Split the raw features again. This replaces the polynomial
# xtrain/xtest/ytrain/ytest left over from the previous section.
xtrain, xtest, ytrain, ytest = train_test_split(
    x, y, test_size=0.25, random_state=0
)

In [36]:
# Confirm the training features are the raw columns again.
xtrain.head()

Unnamed: 0,Marketing Spend,Administration,Transport
28,66051.52,182645.56,118148.2
49,0.0,116983.8,45173.06
2,153441.51,101145.55,407934.54
47,0.0,135426.92,0.0
18,91749.16,114175.79,294919.57


# L1 Regularization with LR - Lasso

In [37]:
# Lasso = linear regression with an L1 penalty, built here with scikit-learn's
# default settings.
# NOTE(review): the features are unscaled and the penalty strength is left at
# its default; the recorded test metrics are almost identical to plain OLS, so
# the penalty seems to have almost no effect. Consider standardising the
# features and tuning alpha.
lasso_model = Lasso()

lasso_model.fit(xtrain,ytrain)

# L2 Regularization with LR - Ridge

In [39]:
# Ridge = linear regression with an L2 penalty, built here with scikit-learn's
# default settings.
# NOTE(review): as with Lasso, the features are unscaled and alpha is left at
# its default; the recorded test metrics are almost identical to plain OLS.
# Consider standardising the features and tuning alpha.
ridge_model = Ridge()
ridge_model.fit(xtrain, ytrain)

# Performance

In [43]:
# Evaluate both regularised models on the held-out test set.
lasso_predictions = lasso_model.predict(xtest)
ridge_predictions = ridge_model.predict(xtest)

# Test-set mean squared error for each model.
lasso_mse = mean_squared_error(y_true=ytest, y_pred=lasso_predictions)
ridge_mse = mean_squared_error(y_true=ytest, y_pred=ridge_predictions)

print('Lasso MSE:', lasso_mse)
print('Ridge MSE:', ridge_mse)

Lasso MSE: 101360808.3578472
Ridge MSE: 101360809.28872347


In [44]:
# R^2 of the Lasso model on the test set.
lasso_model.score(xtest,ytest)

0.8744319156823431

In [45]:
# R^2 of the Ridge model on the test set.
ridge_model.score(xtest,ytest)

0.8744319145291524