### Import the Library

In [1]:
import numpy as np
import pandas as pd
from sklearn import *

### Import the dataset

In [2]:
dataset=pd.read_csv("50_CompList.csv")
x=dataset.iloc[:,:-1].values
y=dataset.iloc[:,4].values

### Encoding dummy variables : categorical feature convert to numerical value

In [3]:
ct = compose.ColumnTransformer([("State", preprocessing.OneHotEncoder(), [3])],remainder = 'passthrough')
x = ct.fit_transform(x)
print(x)

[[0.0 0.0 1.0 165349.2 136897.8 471784.1]
 [1.0 0.0 0.0 162597.7 151377.59 443898.53]
 [0.0 1.0 0.0 153441.51 101145.55 407934.54]
 [0.0 0.0 1.0 144372.41 118671.85 383199.62]
 [0.0 1.0 0.0 142107.34 91391.77 366168.42]
 [0.0 0.0 1.0 131876.9 99814.71 362861.36]
 [1.0 0.0 0.0 134615.46 147198.87 127716.82]
 [0.0 1.0 0.0 130298.13 145530.06 323876.68]
 [0.0 0.0 1.0 120542.52 148718.95 311613.29]
 [1.0 0.0 0.0 123334.88 108679.17 304981.62]
 [0.0 1.0 0.0 101913.08 110594.11 229160.95]
 [1.0 0.0 0.0 100671.96 91790.61 249744.55]
 [0.0 1.0 0.0 93863.75 127320.38 249839.44]
 [1.0 0.0 0.0 91992.39 135495.07 252664.93]
 [0.0 1.0 0.0 119943.24 156547.42 256512.92]
 [0.0 0.0 1.0 114523.61 122616.84 261776.23]
 [1.0 0.0 0.0 78013.11 121597.55 264346.06]
 [0.0 0.0 1.0 94657.16 145077.58 282574.31]
 [0.0 1.0 0.0 91749.16 114175.79 294919.57]
 [0.0 0.0 1.0 86419.7 153514.11 0.0]
 [1.0 0.0 0.0 76253.86 113867.3 298664.47]
 [0.0 0.0 1.0 78389.47 153773.43 299737.29]
 [0.0 1.0 0.0 73994.56 122782.75 3

### As we can see in the above output, the state column has been converted into dummy variables (0 and 1).The first column corresponds to the California State, the second column corresponds to the Florida State, and the third column corresponds to the New York State. 

### <font style="color:purple">Note: We should not use all the dummy variables at the same time, so it must be 1 less than the total number of dummy variables, else it will create a dummy variable trap.</font>

### Now, we are writing a single line of code just to avoid the dummy variable trap. Remove the first column

In [4]:
x = x[:,1:]

### Split the data into train & test set

In [5]:
x_train,x_test,y_train,y_test=model_selection.train_test_split(x,y,test_size=.2,random_state=0)

### Create & train the model

In [6]:
regressor = linear_model.LinearRegression()
regressor.fit(x_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

### Prediction

In [7]:
y_pred = regressor.predict(x_test)

### Get some score

In [8]:
print("R2 Score : ",metrics.r2_score(y_pred,y_test))
print("Intercept : ",regressor.intercept_)
print("Coefficients : ",list(zip(x, regressor.coef_)))
print("Error : ",metrics.mean_squared_error(y_pred,y_test))

R2 Score :  0.9293749209319575
Intercept :  42554.1676177278
Coefficients :  [(array([0.0, 1.0, 165349.2, 136897.8, 471784.1], dtype=object), -959.2841600569366), (array([0.0, 0.0, 162597.7, 151377.59, 443898.53], dtype=object), 699.3690525138201), (array([1.0, 0.0, 153441.51, 101145.55, 407934.54], dtype=object), 0.7734671927330901), (array([0.0, 1.0, 144372.41, 118671.85, 383199.62], dtype=object), 0.032884597536124716), (array([1.0, 0.0, 142107.34, 91391.77, 366168.42], dtype=object), 0.0366100258638653)]
Error :  83502864.03247209


### Get the train & test score

In [9]:
print('Train Score: ', regressor.score(x_train, y_train))
print('Test Score: ', regressor.score(x_test, y_test))  

Train Score:  0.9501847627493607
Test Score:  0.9347068473283249


# <span style="color: red">Backward Elimination</span>
## [Click here for more information](https://www.javatpoint.com/backward-elimination-in-machine-learning)

### Backward elimination is a feature selection technique while building a machine learning model. It is used to remove those features that do not have a significant effect on the dependent variable or prediction of output.

### Steps of Backward Elimination

Below are some main steps which are used to apply backward elimination process:

Step-1: Firstly, We need to select a significance level to stay in the model. (SL=0.05)

Step-2: Fit the complete model with all possible predictors/independent variables.

Step-3: Choose the predictor which has the highest P-value, such that.

    a. If P-value >SL, go to step 4.
    b. Else Finish, and Our model is ready.

Step-4: Remove that predictor.

Step-5: Rebuild and fit the model with the remaining variables.

## <span style="color:green">Step-1:</span>

### Import the library

In [10]:
import statsmodels.api as sm

### Adding a extra feature b0 which is constant term. So, we need to add a column x0=1 that will be helpful

In [11]:
x = np.append(arr=np.ones((50,1)).astype(int),values=x,axis=1)
print(x.shape)

(50, 6)


## <span style="color:green">Step-2: Now, we are actually going to apply a backward elimination process.</span>

### We have to use all possible combination of independent features that are significantly affecting the dependent variable.

In [12]:
x_opt=np.array(x[:, [0, 1, 2, 3, 4, 5]], dtype=float)
regressor_OLS=sm.OLS(endog = y, exog=x_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,169.9
Date:,"Wed, 25 Aug 2021",Prob (F-statistic):,1.34e-27
Time:,22:03:50,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1063.0
Df Residuals:,44,BIC:,1074.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.013e+04,6884.820,7.281,0.000,3.62e+04,6.4e+04
x1,198.7888,3371.007,0.059,0.953,-6595.030,6992.607
x2,-41.8870,3256.039,-0.013,0.990,-6604.003,6520.229
x3,0.8060,0.046,17.369,0.000,0.712,0.900
x4,-0.0270,0.052,-0.517,0.608,-0.132,0.078
x5,0.0270,0.017,1.574,0.123,-0.008,0.062

0,1,2,3
Omnibus:,14.782,Durbin-Watson:,1.283
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.266
Skew:,-0.948,Prob(JB):,2.41e-05
Kurtosis:,5.572,Cond. No.,1450000.0


In [13]:
x_opt=np.array(x[:, [0,2,3,4,5]],dtype=float)
regressor_OLS=sm.OLS(endog = y, exog=x_opt).fit()  
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.946
Method:,Least Squares,F-statistic:,217.2
Date:,"Wed, 25 Aug 2021",Prob (F-statistic):,8.5e-29
Time:,22:03:50,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1061.0
Df Residuals:,45,BIC:,1070.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.018e+04,6747.623,7.437,0.000,3.66e+04,6.38e+04
x1,-136.5042,2801.719,-0.049,0.961,-5779.456,5506.447
x2,0.8059,0.046,17.571,0.000,0.714,0.898
x3,-0.0269,0.052,-0.521,0.605,-0.131,0.077
x4,0.0271,0.017,1.625,0.111,-0.007,0.061

0,1,2,3
Omnibus:,14.892,Durbin-Watson:,1.284
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.665
Skew:,-0.949,Prob(JB):,1.97e-05
Kurtosis:,5.608,Cond. No.,1430000.0


In [14]:
x_opt= np.array(x[:, [0,3,4,5]],dtype=float)  
regressor_OLS=sm.OLS(endog = y, exog=x_opt).fit()  
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,296.0
Date:,"Wed, 25 Aug 2021",Prob (F-statistic):,4.53e-30
Time:,22:03:50,Log-Likelihood:,-525.39
No. Observations:,50,AIC:,1059.0
Df Residuals:,46,BIC:,1066.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.012e+04,6572.353,7.626,0.000,3.69e+04,6.34e+04
x1,0.8057,0.045,17.846,0.000,0.715,0.897
x2,-0.0268,0.051,-0.526,0.602,-0.130,0.076
x3,0.0272,0.016,1.655,0.105,-0.006,0.060

0,1,2,3
Omnibus:,14.838,Durbin-Watson:,1.282
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.442
Skew:,-0.949,Prob(JB):,2.21e-05
Kurtosis:,5.586,Cond. No.,1400000.0


In [15]:
x_opt=np.array(x[:, [0,3,5]],dtype=float)
regressor_OLS=sm.OLS(endog = y, exog=x_opt).fit()  
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.95
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,450.8
Date:,"Wed, 25 Aug 2021",Prob (F-statistic):,2.1600000000000003e-31
Time:,22:03:50,Log-Likelihood:,-525.54
No. Observations:,50,AIC:,1057.0
Df Residuals:,47,BIC:,1063.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.698e+04,2689.933,17.464,0.000,4.16e+04,5.24e+04
x1,0.7966,0.041,19.266,0.000,0.713,0.880
x2,0.0299,0.016,1.927,0.060,-0.001,0.061

0,1,2,3
Omnibus:,14.677,Durbin-Watson:,1.257
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.161
Skew:,-0.939,Prob(JB):,2.54e-05
Kurtosis:,5.575,Cond. No.,532000.0


In [16]:
x_opt=np.array(x[:, [0,3]],dtype=float)
regressor_OLS=sm.OLS(endog = y, exog=x_opt).fit()  
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.947
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,849.8
Date:,"Wed, 25 Aug 2021",Prob (F-statistic):,3.5000000000000004e-32
Time:,22:03:50,Log-Likelihood:,-527.44
No. Observations:,50,AIC:,1059.0
Df Residuals:,48,BIC:,1063.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.903e+04,2537.897,19.320,0.000,4.39e+04,5.41e+04
x1,0.8543,0.029,29.151,0.000,0.795,0.913

0,1,2,3
Omnibus:,13.727,Durbin-Watson:,1.116
Prob(Omnibus):,0.001,Jarque-Bera (JB):,18.536
Skew:,-0.911,Prob(JB):,9.44e-05
Kurtosis:,5.361,Cond. No.,165000.0


### After remove some feature, above summary we can see that p value is low means 0. Now we can use this feature for next step

In [17]:
print(x_opt)

[[1.0000000e+00 1.6534920e+05]
 [1.0000000e+00 1.6259770e+05]
 [1.0000000e+00 1.5344151e+05]
 [1.0000000e+00 1.4437241e+05]
 [1.0000000e+00 1.4210734e+05]
 [1.0000000e+00 1.3187690e+05]
 [1.0000000e+00 1.3461546e+05]
 [1.0000000e+00 1.3029813e+05]
 [1.0000000e+00 1.2054252e+05]
 [1.0000000e+00 1.2333488e+05]
 [1.0000000e+00 1.0191308e+05]
 [1.0000000e+00 1.0067196e+05]
 [1.0000000e+00 9.3863750e+04]
 [1.0000000e+00 9.1992390e+04]
 [1.0000000e+00 1.1994324e+05]
 [1.0000000e+00 1.1452361e+05]
 [1.0000000e+00 7.8013110e+04]
 [1.0000000e+00 9.4657160e+04]
 [1.0000000e+00 9.1749160e+04]
 [1.0000000e+00 8.6419700e+04]
 [1.0000000e+00 7.6253860e+04]
 [1.0000000e+00 7.8389470e+04]
 [1.0000000e+00 7.3994560e+04]
 [1.0000000e+00 6.7532530e+04]
 [1.0000000e+00 7.7044010e+04]
 [1.0000000e+00 6.4664710e+04]
 [1.0000000e+00 7.5328870e+04]
 [1.0000000e+00 7.2107600e+04]
 [1.0000000e+00 6.6051520e+04]
 [1.0000000e+00 6.5605480e+04]
 [1.0000000e+00 6.1994480e+04]
 [1.0000000e+00 6.1136380e+04]
 [1.0000

### Below is the code for Building Multiple Linear Regression model by only using R&D spend:

In [18]:
# importing libraries  
import numpy as nm  
import matplotlib.pyplot as mtp  
import pandas as pd   
  
# Import the Independent and dependent Variable
x_BE=x_opt[:,1].reshape(-1,1)
y_BE=y
  
  
# Splitting the dataset into training and test set.  
from sklearn.model_selection import train_test_split  
x_BE_train, x_BE_test, y_BE_train, y_BE_test= train_test_split(x_BE, y_BE, test_size= 0.2, random_state=0)  
  
#Fitting the MLR model to the training set:  
from sklearn.linear_model import LinearRegression  
regressor= LinearRegression()  
regressor.fit(nm.array(x_BE_train).reshape(-1,1), y_BE_train)  
  
#Predicting the Test set result;  
y_BE_pred = regressor.predict(x_BE_test)  


print("R2 Score : ",metrics.r2_score(y_BE_pred,y_BE_test))
print("Intercept : ",regressor.intercept_)
print("Coefficients : ",list(zip(x, regressor.coef_)))
print("Error : ",metrics.mean_squared_error(y_BE_pred,y_BE_test))
  
#Cheking the score  
print('Train Score: ', regressor.score(x_BE_train, y_BE_train))  
print('Test Score: ', regressor.score(x_BE_test, y_BE_test))

R2 Score :  0.9446480449871659
Intercept :  48416.29766138505
Coefficients :  [(array([1, 0.0, 1.0, 165349.2, 136897.8, 471784.1], dtype=object), 0.8516227998779369)]
Error :  68473440.71905932
Train Score:  0.9449589778363044
Test Score:  0.9464587607787219
