In [1]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('../data/50_Startups.csv', delimiter=',')
dataset.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [2]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
R&D Spend          49 non-null float64
Administration     49 non-null float64
Marketing Spend    49 non-null float64
State              49 non-null object
Profit             50 non-null float64
dtypes: float64(4), object(1)
memory usage: 2.0+ KB


In [3]:
dataset.describe()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
count,49.0,49.0,49.0,50.0
mean,74093.608776,121315.290408,215331.732449,112012.6392
std,46301.730529,28307.373863,119665.39155,40306.180338
min,0.0,51283.14,0.0,14681.4
25%,38558.51,103057.49,134050.07,90138.9025
50%,73994.56,122616.84,214634.81,107978.19
75%,101913.08,145077.58,299737.29,139765.9775
max,165349.2,182645.56,471784.1,192261.83


# Dealing With Null Data

In [4]:
dataset.isnull().any()

R&D Spend           True
Administration      True
Marketing Spend     True
State               True
Profit             False
dtype: bool

In [5]:
dataset.median()

R&D Spend           73994.56
Administration     122616.84
Marketing Spend    214634.81
Profit             107978.19
dtype: float64

In [6]:
dataset.loc[:, ['R&D Spend', 'Administration', 'Marketing Spend']] = dataset.loc[:, ['R&D Spend', 'Administration', 'Marketing Spend']].fillna(dataset.median())

In [7]:
dataset.isnull().any()

R&D Spend          False
Administration     False
Marketing Spend    False
State               True
Profit             False
dtype: bool

#  Categorical Missing Sata

In [8]:
dataset.State.unique()

array(['New York', 'California', 'Florida', nan], dtype=object)

In [9]:
dataset.State.describe()

count           49
unique           3
top       New York
freq            17
Name: State, dtype: object

In [10]:
dataset.State.mode()[0]

'New York'

In [11]:
dataset.State.fillna(dataset.State.mode()[0], inplace=True)

In [12]:
dataset.isnull().any()

R&D Spend          False
Administration     False
Marketing Spend    False
State              False
Profit             False
dtype: bool

# Dealing with Dummy Variabls

In [13]:
dataset = pd.get_dummies(dataset, drop_first = True)#dataset['State'], prefix = 'category', drop_first = True

In [14]:
dataset

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,State_Florida,State_New York
0,165349.2,136897.8,471784.1,192261.83,0,1
1,162597.7,151377.59,443898.53,191792.06,0,0
2,153441.51,101145.55,407934.54,191050.39,1,0
3,144372.41,118671.85,383199.62,182901.99,0,1
4,142107.34,91391.77,366168.42,166187.94,1,0
5,131876.9,99814.71,362861.36,156991.12,0,1
6,134615.46,147198.87,127716.82,156122.51,0,0
7,130298.13,145530.06,323876.68,155752.6,1,0
8,120542.52,148718.95,311613.29,152211.77,0,1
9,123334.88,108679.17,304981.62,149759.96,0,0


#  Data Validation

In [15]:
dataset.drop(columns='Profit')

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State_Florida,State_New York
0,165349.2,136897.8,471784.1,0,1
1,162597.7,151377.59,443898.53,0,0
2,153441.51,101145.55,407934.54,1,0
3,144372.41,118671.85,383199.62,0,1
4,142107.34,91391.77,366168.42,1,0
5,131876.9,99814.71,362861.36,0,1
6,134615.46,147198.87,127716.82,0,0
7,130298.13,145530.06,323876.68,1,0
8,120542.52,148718.95,311613.29,0,1
9,123334.88,108679.17,304981.62,0,0


In [16]:
X = dataset.drop(columns='Profit').values
y = dataset['Profit'].values.reshape(-1, 1)

#  Data Validation

In [17]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

#  Data Scaling

In [18]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

sc_y = StandardScaler()
y_train = sc_y.fit_transform(y_train)

# Model Training

In [19]:
from sklearn.linear_model import LinearRegression
MLR = LinearRegression()

MLR.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [21]:
MLR.intercept_, MLR.coef_ # w0, w1,..,w5

(array([4.34262726e-17]),
 array([[ 0.8560062 ,  0.02746304,  0.1355041 , -0.0240939 ,  0.01265784]]))

# Model Prediction

In [22]:
y_pred = MLR.predict(X_test)
y_pred

array([[-0.20304962],
       [ 0.54846668],
       [ 0.51239928],
       [-0.96253069],
       [ 1.6621197 ],
       [ 0.20033414],
       [-1.01923819],
       [-0.27829681],
       [ 0.09129264],
       [ 1.39389698]])

##  Inverse Scaling

In [23]:
sc_y.inverse_transform(y_pred)

array([[101229.90710752],
       [131640.52222306],
       [130181.03049651],
       [ 70496.99244654],
       [176705.24398542],
       [117553.10341926],
       [ 68202.28528884],
       [ 98184.97892362],
       [113140.66559627],
       [165851.43043571]])

# Model Validation 

In [24]:
import sklearn.metrics as mc

In [25]:
rms = np.sqrt(mc.mean_squared_error(y_test, y_pred))
print(rms)

127399.17641447148


In [None]:
print('Thank You')