# Machine Learning Tutorial 2 - Multiple Linear Regression Model

In [1]:
import pandas as pd

In [3]:
# Dataset location; a bare filename resolves against the notebook's working directory.
DATA_PATH = '50_Startups.csv'
df = pd.read_csv(DATA_PATH)

In [5]:
# Preview the first rows to sanity-check the columns and value ranges.
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [7]:
# Summarize dtypes and non-null counts per column; used here to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   49 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [9]:
# (rows, columns) of the raw frame.
df.shape

(50, 5)

In [11]:
# Count missing values per column before any imputation.
df.isna().sum()

R&D Spend          0
Administration     1
Marketing Spend    0
State              0
Profit             0
dtype: int64

In [13]:
# Distribution of the one column that has a missing value, to judge whether
# its mean is a reasonable fill value.
df['Administration'].describe()

count        49.000000
mean     120851.059592
std       28087.658281
min       51283.140000
25%      103057.490000
50%      122616.840000
75%      144135.980000
max      182645.560000
Name: Administration, dtype: float64

In [15]:
# Fill the missing 'Administration' entry with the column mean.
# NOTE(review): the mean is taken over every row, before the train/test split
# performed later in the notebook, so held-out rows influence the fill value.
admin_mean = df['Administration'].mean()
df['Administration'] = df['Administration'].fillna(admin_mean)

In [17]:
# Re-count missing values to confirm the imputation left none behind.
df.isna().sum()

R&D Spend          0
Administration     0
Marketing Spend    0
State              0
Profit             0
dtype: int64

In [20]:
# One-hot encode the categorical 'State' column.
# drop_first=True omits the first category so the remaining indicator columns
# are not perfectly collinear with the intercept the regression fits.
# dtype=int pins the indicators to 0/1 integers: newer pandas releases default
# to booleans, which would render as True/False instead of 0/1.
State_dummy = pd.get_dummies(df['State'], drop_first=True, dtype=int)
# Preview the first ten rows rather than rendering the whole frame.
State_dummy.head(10)

Unnamed: 0,Florida,New York
0,0,1
1,0,0
2,1,0
3,0,1
4,1,0
5,0,1
6,0,0
7,1,0
8,0,1
9,0,0


In [22]:
# Attach the state indicator columns to the original frame; both share the
# same row index, so rows line up one-to-one.
df1 = df.join(State_dummy)

In [24]:
# Check that the indicator columns were appended next to the original ones.
df1.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit,Florida,New York
0,165349.2,136897.8,471784.1,New York,192261.83,0,1
1,162597.7,151377.59,443898.53,California,191792.06,0,0
2,153441.51,101145.55,407934.54,Florida,191050.39,1,0
3,144372.41,118671.85,383199.62,New York,182901.99,0,1
4,142107.34,91391.77,366168.42,Florida,166187.94,1,0


In [26]:
# The text 'State' column is now redundant with its indicator columns; remove it.
df2 = df1.drop(columns=['State'])

In [28]:
# Confirm 'State' is gone and only numeric columns remain.
df2.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,Florida,New York
0,165349.2,136897.8,471784.1,192261.83,0,1
1,162597.7,151377.59,443898.53,191792.06,0,0
2,153441.51,101145.55,407934.54,191050.39,1,0
3,144372.41,118671.85,383199.62,182901.99,0,1
4,142107.34,91391.77,366168.42,166187.94,1,0


In [31]:
# Separate the regression target from the predictors.
target_col = 'Profit'
y = df2[target_col]                  # response: the 'Profit' column
X = df2.drop(columns=[target_col])   # features: every remaining column

In [33]:
# Preview the feature matrix.
X.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Florida,New York
0,165349.2,136897.8,471784.1,0,1
1,162597.7,151377.59,443898.53,0,0
2,153441.51,101145.55,407934.54,1,0
3,144372.41,118671.85,383199.62,0,1
4,142107.34,91391.77,366168.42,1,0


In [35]:
# Preview the target series.
y.head()

0    192261.83
1    191792.06
2    191050.39
3    182901.99
4    166187.94
Name: Profit, dtype: float64

In [37]:
from sklearn.model_selection import train_test_split

In [39]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [41]:
# Number of training rows.
X_train.shape[0]

40

In [43]:
# Number of held-out test rows.
X_test.shape[0]

10

In [44]:
from sklearn.linear_model import LinearRegression

In [46]:
# Ordinary least-squares linear regression with scikit-learn's default settings.
model_mlr = LinearRegression()

In [48]:
# Fit the model on the training split only; the estimator's repr is shown as cell output.
model_mlr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [50]:
# Predicted profit for each held-out row, for comparison with y_test.
y_pred = model_mlr.predict(X_test)
y_pred

array([103265.23521445, 132485.20795435, 132478.75659208,  71939.53387803,
       178552.3128569 , 116205.67576919,  67690.36350872,  98872.29719019,
       114060.75475303, 167901.9110407 ])

In [52]:
# Actual profit values for the held-out rows, to eyeball against the predictions.
y_test

28    103282.38
11    144259.40
10    146121.95
41     77798.83
2     191050.39
27    105008.31
38     81229.06
31     97483.56
22    110352.25
4     166187.94
Name: Profit, dtype: float64

In [53]:
# For scikit-learn regressors, score() returns the coefficient of determination (R^2),
# computed here on the held-out split.
# NOTE(review): the test split has only 10 rows, so this estimate is noisy.
model_mlr.score(X_test, y_test)

0.9341149394808592