In [37]:
# Dependencies for this notebook. Uncomment to install them into the kernel's environment:
# %pip install numpy pandas matplotlib seaborn scikit-learn pytest

In [53]:
import numpy as np
import matplotlib.pyplot as plt 
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error
%matplotlib inline

In [39]:
# Load the company records and split them into the independent variables (X)
# and the dependent variable (y). As the preview below shows, the last column
# (Profit) is the target and every column before it is a feature.

companies = pd.read_csv("1000_Companies.csv")
X = companies.iloc[:, :-1].values  # every column except the last
y = companies.iloc[:, -1].values   # last column; replaces the hard-coded index 4

companies.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [40]:
# (rows, columns) of the loaded frame
companies.shape

(1000, 5)

In [41]:
# Pairwise correlation (Pearson, the pandas default) between the numeric columns;
# State is dropped first because it holds text labels
companies.drop(columns="State").corr()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
R&D Spend,1.0,0.582434,0.978407,0.945245
Administration,0.582434,1.0,0.520465,0.74156
Marketing Spend,0.978407,0.520465,1.0,0.91727
Profit,0.945245,0.74156,0.91727,1.0


In [42]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
companies.describe()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
count,1000.0,1000.0,1000.0,1000.0
mean,81668.9272,122963.897612,226205.058419,119546.164656
std,46537.567891,12613.927535,91578.393542,42888.633848
min,0.0,51283.14,0.0,14681.4
25%,43084.5,116640.68485,150969.5846,85943.198543
50%,79936.0,122421.61215,224517.88735,117641.4663
75%,124565.5,129139.118,308189.808525,155577.107425
max,165349.2,321652.14,471784.1,476485.43


In [43]:
# Encoding categorical data

# LabelEncoder is no longer used in this cell; the import is kept so any other
# cell that relies on the name keeps working.
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

# One-hot encode the State column and pass the numeric columns through.
# OneHotEncoder accepts string categories directly, so no LabelEncoder pass is
# needed first. The transformer is fitted on a fresh slice of `companies`
# rather than on `X`, so re-running this cell always produces the same matrix
# instead of re-encoding / re-slicing an already transformed `X`.
ct = ColumnTransformer(
    [('State', OneHotEncoder(), ['State'])],
    remainder='passthrough',
    sparse_threshold=0,  # always return a dense array
)
X_encoded = ct.fit_transform(companies.iloc[:, :-1])

# Avoiding the Dummy Variable Trap: the one-hot State columns come first in the
# transformer output, so dropping column 0 removes one State indicator.
X = X_encoded[:, 1:]

In [46]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the split reproducible

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [47]:
# Fit a multiple linear regression on the training split

from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

In [48]:
# Predict the target for the held-out test rows
y_pred = regressor.predict(X_test)
# NOTE(review): this displays the whole prediction array; a slice such as
# y_pred[:10] would keep the output short
y_pred

array([ 89790.61532915,  88427.07187361,  94894.67836972, 175680.8672561 ,
        83411.73042089, 110571.90200074, 132145.22936441,  91473.37719686,
       164597.05380606,  53222.82667401,  66950.1905099 , 150566.43987004,
       126915.20858596,  59337.8597105 , 177513.91053061,  75316.28143051,
       118248.14406603, 164574.40699902, 170937.2898107 , 182069.11645084,
       118845.03252689,  85669.95112229, 180992.59396143,  84145.08220145,
       105005.83769214, 101233.56772747,  53831.07669091,  56881.41475224,
        68896.39346905, 210040.00765883, 120778.72270894, 111724.87157654,
       101487.90541518, 137959.02649624,  63969.95996743, 108857.91214126,
       186014.72531988, 171442.64130747, 174644.26529204, 117671.49128195,
        96731.37857433, 165452.25779409, 107724.34331255,  50194.54176913,
       116513.89532178,  58632.48986821, 158416.46827609,  78541.4852161 ,
       159727.66671743, 131137.87699644, 184880.70924516, 174609.0826688 ,
        93745.66352059,  

In [49]:
# Fitted coefficient per column of X, in column order: the two remaining State
# indicator columns first, then R&D Spend, Administration and Marketing Spend
regressor.coef_

array([-8.80536598e+02, -6.98169073e+02,  5.25845857e-01,  8.44390881e-01,
        1.07574255e-01])

In [50]:
# Fitted intercept (constant term) of the regression
regressor.intercept_

-51035.22972403784

In [51]:
# Coefficient of determination (R^2) of the predictions on the test split
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.9112695892268856

In [54]:
# Error metrics of the predictions on the held-out test split
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
# RMSE is the square root of the MSE computed above. It was previously stored
# under the misleading name `r2`, which reads as the R^2 score from the
# preceding cell.
rmse = np.sqrt(mse)

print('Mean Absolute Error:', mae)
print('Mean Square Error:', mse)
print('Root Mean Square Error:', rmse)

Mean Absolute Error: 2300.221750984347
Mean Square Error: 192148061.81506786
Root Mean Square Error: 13861.748151480313
