# Multiple Linear Regression

1. Loading the Dataset
2. Descriptive statistics of Data
3. Data Cleaning
4. Data splitting into independent and dependent variables
5. Label encoding 
6. One hot encoding
7. Train-Test Split
8. Model Building
9. Model Summary
10. Model Testing/Evaluation

In [1]:
    #Loading Dependencies
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression 
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler

np.set_printoptions(suppress=True) #prevent numpy exponential

# 1. Loading the Dataset

In [2]:
# Read the raw dataset from the CSV in the working directory and preview
# its first five rows.
data = pd.read_csv("MLR_DATA.csv")
print(data.head(5))


   R&D Spend  Administration  Marketing Spend       State     Profit
0  165349.20       136897.80        471784.10    New York  192261.83
1  162597.70       151377.59        443898.53  California  191792.06
2  153441.51       101145.55        407934.54     Florida  191050.39
3  144372.41       118671.85        383199.62    New York  182901.99
4  142107.34             NaN        366168.42     Florida  166187.94


# 2. Descriptive statistics of Data

In [3]:
# Dimensions of the dataset as (rows, columns).
print(data.shape)
# DataFrame.info() writes its summary to stdout itself and returns None,
# so it is called directly; wrapping it in print() emitted a stray "None".
data.info()
# Summary statistics (count, mean, std, quartiles) of the numeric columns.
print(data.describe())


(50, 5)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
R&D Spend          50 non-null float64
Administration     48 non-null float64
Marketing Spend    50 non-null float64
State              50 non-null object
Profit             50 non-null float64
dtypes: float64(4), object(1)
memory usage: 2.0+ KB
None
           R&D Spend  Administration  Marketing Spend         Profit
count      50.000000       48.000000        50.000000      50.000000
mean    73721.615600   123428.272292    211025.097800  112012.639200
std     45902.256482    26276.227003    122290.310726   40306.180338
min         0.000000    51743.150000         0.000000   14681.400000
25%     39936.370000   107947.135000    129300.132500   90138.902500
50%     73051.080000   123467.895000    212716.240000  107978.190000
75%    101602.800000   145190.700000    299469.085000  139765.977500
max    165349.200000   182645.560000    471784.100000  192261.830000


# 3. Data Cleaning

In [4]:
# Build the null mask once, then report per column whether anything is
# missing and how many values are missing.
missing_mask = data.isnull()
print(missing_mask.any())
print(missing_mask.sum())

R&D Spend          False
Administration      True
Marketing Spend    False
State              False
Profit             False
dtype: bool
R&D Spend          0
Administration     2
Marketing Spend    0
State              0
Profit             0
dtype: int64


In [5]:
# Replace missing Administration values with the column mean.
# NOTE(review): the mean is computed over every row, including rows that the
# later train/test split assigns to the test set -- consider fitting the
# imputation on the training rows only.
admin_mean = data["Administration"].mean()
data["Administration"] = data["Administration"].fillna(admin_mean)

# 4. Split the data into X & Y

In [6]:
#Split the data into X (features) & Y (target)
# The target (Profit) is the last column and X is every column before it.
# Both selections are expressed relative to the last column (-1) so they stay
# complementary, instead of mixing ":-1" with a hard-coded position 4.
X = data.iloc[:, :-1].values
Y = data.iloc[:, -1].values
#.iloc is used whenever index based position locating is carried out in pandas
#.values to convert dataframe to numpy array

# 5. Label encoding

In [7]:
# Convert the categorical column at position 3 (State) of X into integer codes.
LE = LabelEncoder()
state_codes = LE.fit_transform(X[:, 3])
X[:, 3] = state_codes

In [8]:
#X.iloc[1,:]
#X.iloc[1,:].values

# 6. One hot encoding

In [9]:
#Onehot Encoding
# OneHotEncoder(categorical_features=...) was deprecated in scikit-learn 0.20
# and removed in 0.22, so the column to encode is selected with a
# ColumnTransformer instead.
#   * column 3 (State) is one-hot encoded; categories="auto" derives the
#     categories from the data
#   * remainder="passthrough" keeps the other columns, appended after the
#     dummy columns (same layout as before)
#   * sparse_threshold=0 forces a dense array, replacing the old .toarray()
OHE = ColumnTransformer(
    transformers=[("state", OneHotEncoder(categories="auto"), [3])],
    remainder="passthrough",
    sparse_threshold=0,
)
# X is an object array at this point, so cast the stacked result to float.
X = OHE.fit_transform(X).astype(float)
X      #3 dummy variables are created for state variable

array([[     0.        ,      0.        ,      1.        ,
        165349.2       , 136897.8       , 471784.1       ],
       [     1.        ,      0.        ,      0.        ,
        162597.7       , 151377.59      , 443898.53      ],
       [     0.        ,      1.        ,      0.        ,
        153441.51      , 101145.55      , 407934.54      ],
       [     0.        ,      0.        ,      1.        ,
        144372.41      , 118671.85      , 383199.62      ],
       [     0.        ,      1.        ,      0.        ,
        142107.34      , 123428.27229167, 366168.42      ],
       [     0.        ,      0.        ,      1.        ,
        131876.9       ,  99814.71      , 362861.36      ],
       [     1.        ,      0.        ,      0.        ,
        134615.46      , 147198.87      , 127716.82      ],
       [     0.        ,      1.        ,      0.        ,
        130298.13      , 145530.06      , 323876.68      ],
       [     0.        ,      0.        ,      1

In [10]:
#Avoiding Dummy Variable Trap
# Remove the first column of X (the first dummy variable) and keep the rest.
# NOTE(review): re-running this cell removes one more column each time.
X = np.delete(X, 0, axis=1)
X

array([[     0.        ,      1.        , 165349.2       ,
        136897.8       , 471784.1       ],
       [     0.        ,      0.        , 162597.7       ,
        151377.59      , 443898.53      ],
       [     1.        ,      0.        , 153441.51      ,
        101145.55      , 407934.54      ],
       [     0.        ,      1.        , 144372.41      ,
        118671.85      , 383199.62      ],
       [     1.        ,      0.        , 142107.34      ,
        123428.27229167, 366168.42      ],
       [     0.        ,      1.        , 131876.9       ,
         99814.71      , 362861.36      ],
       [     0.        ,      0.        , 134615.46      ,
        147198.87      , 127716.82      ],
       [     1.        ,      0.        , 130298.13      ,
        145530.06      , 323876.68      ],
       [     0.        ,      1.        , 120542.52      ,
        148718.95      , 311613.29      ],
       [     0.        ,      0.        , 123334.88      ,
        108679.17      

# 7. Train-Test Split

In [11]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable.
split_parts = train_test_split(X, Y, test_size=0.2, random_state=0)
X_train, X_test, Y_train, Y_test = split_parts
# NOTE(review): only the training features are wrapped in a DataFrame;
# X_test stays a NumPy array.
X_train = pd.DataFrame(data=X_train)
X_test.shape

(10, 5)

# 8. Model Building

In [12]:
# Fit an ordinary least squares model on the training split.
LR = LinearRegression()
LR.fit(X_train, Y_train)
# fit() returns the estimator itself, so mlr_reg refers to the same object as LR.
mlr_reg = LR
mlr_reg

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

# 9. Model Summary

In [13]:
# Report the fitted parameters and the R^2 score on the training split.
coefficients = mlr_reg.coef_
intercept = mlr_reg.intercept_
train_r_square = mlr_reg.score(X_train, Y_train)

print("Coefficients of the MLR Model:\n", coefficients)
print("\n Intercept of the MLR Model:\n", intercept)
print("\n R_square value of the MLR Model:\n", train_r_square)

Coefficients of the MLR Model:
 [-783.02407331  982.96059598    0.76935778    0.05071577    0.03732572]

 Intercept of the MLR Model:
 40268.98329768915

 R_square value of the MLR Model:
 0.9505092969949016


# 10. Model Testing/Evaluation

In [14]:
# Predict on both splits first.
Y_train_pred = mlr_reg.predict(X_train)
Y_test_pred = mlr_reg.predict(X_test)

# Mean squared error of each split against its true targets.
MSE_Train = mean_squared_error(Y_train, Y_train_pred)
MSE_Test = mean_squared_error(Y_test, Y_test_pred)

# Report the root of each MSE, separated by a dashed rule.
print("RMSE of Training set: ", np.sqrt(MSE_Train))
print(" -"*25)
print("RMSE of Test set: ", np.sqrt(MSE_Test))


RMSE of Training set:  9002.198976631764
 - - - - - - - - - - - - - - - - - - - - - - - - -
RMSE of Test set:  9609.172808064694


In [15]:
# Show the true test targets followed by the model's predictions for them.
print(Y_test, Y_test_pred, sep="\n")

[103282.38 144259.4  146121.95  77798.83 191050.39 105008.31  81229.06
  97483.56 110352.25 166187.94]
[103976.18633735 131698.86509089 132056.04262578  71380.75341604
 177893.50291811 116396.07531112  67075.48923205  99324.89747839
 113962.8804579  168744.6059647 ]


In [16]:
#?train_test_split