# Importing libararies

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Loading the dataset

In [2]:
data = pd.read_csv('cleaned_data.csv')
data.head()

Unnamed: 0,Year,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Vehicle Size,Vehicle Style,highway MPG,city mpg,MSRP
0,2011,335.0,6.0,0,0,2.0,0,0,26,19,46135
1,2011,300.0,6.0,0,0,2.0,0,1,28,19,40650
2,2011,300.0,6.0,0,0,2.0,0,0,28,20,36350
3,2011,230.0,6.0,0,0,2.0,0,0,28,18,29450
4,2011,230.0,6.0,0,0,2.0,0,1,28,18,34500


# Dataset information

### 1) Displaying rows and columns 

In [3]:
data.shape

(11914, 11)

### 2) Displaying the column names

In [4]:
data.columns

Index(['Year', 'Engine HP', 'Engine Cylinders', 'Transmission Type',
       'Driven_Wheels', 'Number of Doors', 'Vehicle Size', 'Vehicle Style',
       'highway MPG', 'city mpg', 'MSRP'],
      dtype='object')

### 3) Displaying data type of each column  

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11914 entries, 0 to 11913
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Year               11914 non-null  int64  
 1   Engine HP          11845 non-null  float64
 2   Engine Cylinders   11884 non-null  float64
 3   Transmission Type  11914 non-null  int64  
 4   Driven_Wheels      11914 non-null  int64  
 5   Number of Doors    11908 non-null  float64
 6   Vehicle Size       11914 non-null  int64  
 7   Vehicle Style      11914 non-null  int64  
 8   highway MPG        11914 non-null  int64  
 9   city mpg           11914 non-null  int64  
 10  MSRP               11914 non-null  int64  
dtypes: float64(3), int64(8)
memory usage: 1024.0 KB


### 4) displaying the number of null values

In [6]:
data.isnull().sum()

Year                  0
Engine HP            69
Engine Cylinders     30
Transmission Type     0
Driven_Wheels         0
Number of Doors       6
Vehicle Size          0
Vehicle Style         0
highway MPG           0
city mpg              0
MSRP                  0
dtype: int64

### Dropping Null Values

In [7]:
data=data.dropna() #drop rows with atleast a column with missing values


In [8]:
data.isnull().sum()

Year                 0
Engine HP            0
Engine Cylinders     0
Transmission Type    0
Driven_Wheels        0
Number of Doors      0
Vehicle Size         0
Vehicle Style        0
highway MPG          0
city mpg             0
MSRP                 0
dtype: int64

### 5) Some stats on the dataset

In [9]:
data.describe()

Unnamed: 0,Year,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Vehicle Size,Vehicle Style,highway MPG,city mpg,MSRP
count,11815.0,11815.0,11815.0,11815.0,11815.0,11815.0,11815.0,11815.0,11815.0,11815.0,11815.0
mean,2010.359966,249.480491,5.649513,0.858146,1.152433,3.43267,0.834109,4.570631,26.320609,19.32755,40554.37
std,7.594359,109.203463,1.751433,0.664606,0.965569,0.882853,0.776913,3.467384,7.442674,6.52701,60277.5
min,1990.0,55.0,0.0,0.0,0.0,2.0,0.0,0.0,12.0,7.0,2000.0
25%,2007.0,170.0,4.0,1.0,0.0,2.0,0.0,2.0,22.0,16.0,20990.0
50%,2015.0,227.0,6.0,1.0,1.0,4.0,1.0,4.0,26.0,18.0,29960.0
75%,2016.0,300.0,6.0,1.0,2.0,4.0,1.0,6.0,30.0,22.0,42200.0
max,2017.0,1001.0,16.0,3.0,3.0,4.0,2.0,15.0,354.0,137.0,2065902.0


### 6) Checking for duplicate values

In [10]:
print(data[data.duplicated()])

       Year  Engine HP  Engine Cylinders  Transmission Type  Driven_Wheels  \
14     2013      230.0               6.0                  0              0   
18     1992      172.0               6.0                  0              1   
20     1992      172.0               6.0                  0              1   
24     1993      172.0               6.0                  0              1   
25     1993      172.0               6.0                  0              1   
...     ...        ...               ...                ...            ...   
11481  1998       95.0               4.0                  0              3   
11603  2017      302.0               4.0                  1              2   
11604  2017      240.0               4.0                  1              1   
11708  2008      252.0               6.0                  1              2   
11717  2008      252.0               6.0                  1              1   

       Number of Doors  Vehicle Size  Vehicle Style  highway MP

### 7.) Checking the correlation of attributes with each other

In [11]:
data.corr()

Unnamed: 0,Year,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Vehicle Size,Vehicle Style,highway MPG,city mpg,MSRP
Year,1.0,0.35298,-0.034494,0.316264,0.098368,0.262265,0.086846,-0.063135,0.28021,0.232439,0.227381
Engine HP,0.35298,1.0,0.77974,0.239824,-0.010711,-0.10207,0.397373,-0.12747,-0.414183,-0.465903,0.661833
Engine Cylinders,-0.034494,0.77974,1.0,0.109557,-0.02982,-0.137584,0.485464,0.087225,-0.620316,-0.637879,0.543971
Transmission Type,0.316264,0.239824,0.109557,1.0,0.066674,0.177031,0.140977,-0.053247,0.052832,0.044509,0.24188
Driven_Wheels,0.098368,-0.010711,-0.02982,0.066674,1.0,0.23501,0.082344,0.193519,-0.090888,-0.042024,-0.031782
Number of Doors,0.262265,-0.10207,-0.137584,0.177031,0.23501,1.0,0.246079,0.154347,0.120925,0.136574,-0.127367
Vehicle Size,0.086846,0.397373,0.485464,0.140977,0.082344,0.246079,1.0,0.194891,-0.326246,-0.346916,0.117891
Vehicle Style,-0.063135,-0.12747,0.087225,-0.053247,0.193519,0.154347,0.194891,1.0,-0.366084,-0.23367,-0.206249
highway MPG,0.28021,-0.414183,-0.620316,0.052832,-0.090888,0.120925,-0.326246,-0.366084,1.0,0.847022,-0.198942
city mpg,0.232439,-0.465903,-0.637879,0.044509,-0.042024,0.136574,-0.346916,-0.23367,0.847022,1.0,-0.225277


# Aplying Machine Learning Techniques

In [12]:
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler

### Splitting data into train and test set

In [13]:
x = data.iloc[:, :-1].values
y = data.iloc[:,-1].values

In [14]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size= 0.2,random_state=0)

### Feature scaling

In [15]:
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

### Logistic Regression

In [16]:
# lr = LogisticRegression()
# lr.fit(x_train, y_train)

### Decision tree

In [17]:
from sklearn.tree import DecisionTreeRegressor

In [18]:
x = data.iloc[:, :-1].values
y = data.iloc[:,-1].values

In [19]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size= 0.2,random_state=0)

In [20]:
dt = DecisionTreeRegressor(random_state = 0)
dt = dt.fit(x_train, y_train)

In [21]:
y_pred = dt.predict(x_test)

In [22]:
import statsmodels.api as sm
x = np.append(arr = np.ones((11815,1)).astype(int), values = x, axis = 1)
x_opt = x[:, [0,1,2,3,4,5,6,7,8,9]]
regressor_OLS =sm.OLS(endog =y , exog = x_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.502
Model:,OLS,Adj. R-squared:,0.501
Method:,Least Squares,F-statistic:,1320.0
Date:,"Sun, 29 Nov 2020",Prob (F-statistic):,0.0
Time:,09:27:07,Log-Likelihood:,-142700.0
No. Observations:,11815,AIC:,285400.0
Df Residuals:,11805,BIC:,285500.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-3.708e+05,1.51e+05,-2.451,0.014,-6.67e+05,-7.43e+04
x1,138.2873,76.088,1.817,0.069,-10.859,287.433
x2,273.1315,8.078,33.813,0.000,257.298,288.965
x3,1.049e+04,474.878,22.083,0.000,9555.780,1.14e+04
x4,9294.7688,636.158,14.611,0.000,8047.795,1.05e+04
x5,1185.9253,427.279,2.776,0.006,348.387,2023.464
x6,-731.0375,516.955,-1.414,0.157,-1744.355,282.280
x7,-1.504e+04,625.546,-24.036,0.000,-1.63e+04,-1.38e+04
x8,-1643.3951,139.470,-11.783,0.000,-1916.780,-1370.010

0,1,2,3
Omnibus:,23779.208,Durbin-Watson:,0.81
Prob(Omnibus):,0.0,Jarque-Bera (JB):,121788425.982
Skew:,16.3,Prob(JB):,0.0
Kurtosis:,499.314,Cond. No.,782000.0


### Linear regression

In [23]:
from sklearn.linear_model import LinearRegression
x = data.iloc[:, :-1].values
y = data.iloc[:,-1].values
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size= 0.2,random_state=0)
regressor = LinearRegression()
regressor.fit(x_train, y_train)

LinearRegression()

In [24]:
y_pred = regressor.predict(x_test)

In [25]:
data.shape

(11815, 11)

In [26]:
import statsmodels.api as sm
x = np.append(arr = np.ones((11815,1)).astype(int), values = x, axis = 1)
x_opt = x[:, [0,1,2,3,4,5,6,7,8,9]]
regressor_OLS =sm.OLS(endog =y , exog = x_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.502
Model:,OLS,Adj. R-squared:,0.501
Method:,Least Squares,F-statistic:,1320.0
Date:,"Sun, 29 Nov 2020",Prob (F-statistic):,0.0
Time:,09:27:28,Log-Likelihood:,-142700.0
No. Observations:,11815,AIC:,285400.0
Df Residuals:,11805,BIC:,285500.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-3.708e+05,1.51e+05,-2.451,0.014,-6.67e+05,-7.43e+04
x1,138.2873,76.088,1.817,0.069,-10.859,287.433
x2,273.1315,8.078,33.813,0.000,257.298,288.965
x3,1.049e+04,474.878,22.083,0.000,9555.780,1.14e+04
x4,9294.7688,636.158,14.611,0.000,8047.795,1.05e+04
x5,1185.9253,427.279,2.776,0.006,348.387,2023.464
x6,-731.0375,516.955,-1.414,0.157,-1744.355,282.280
x7,-1.504e+04,625.546,-24.036,0.000,-1.63e+04,-1.38e+04
x8,-1643.3951,139.470,-11.783,0.000,-1916.780,-1370.010

0,1,2,3
Omnibus:,23779.208,Durbin-Watson:,0.81
Prob(Omnibus):,0.0,Jarque-Bera (JB):,121788425.982
Skew:,16.3,Prob(JB):,0.0
Kurtosis:,499.314,Cond. No.,782000.0


In [27]:
x_opt = x[:, [0,1,2,3,4,5,6,8,9]]
regressor_OLS =sm.OLS(endog =y , exog = x_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.477
Model:,OLS,Adj. R-squared:,0.477
Method:,Least Squares,F-statistic:,1347.0
Date:,"Sun, 29 Nov 2020",Prob (F-statistic):,0.0
Time:,09:27:31,Log-Likelihood:,-142980.0
No. Observations:,11815,AIC:,286000.0
Df Residuals:,11806,BIC:,286000.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-3.927e+05,1.55e+05,-2.535,0.011,-6.96e+05,-8.91e+04
x1,160.0486,77.919,2.054,0.040,7.314,312.783
x2,261.3428,8.257,31.650,0.000,245.157,277.529
x3,7672.4597,471.326,16.278,0.000,6748.584,8596.336
x4,8814.0164,651.189,13.535,0.000,7537.579,1.01e+04
x5,1166.8096,437.591,2.666,0.008,309.059,2024.560
x6,-4640.0749,502.550,-9.233,0.000,-5625.156,-3654.994
x7,-2062.7034,141.715,-14.555,0.000,-2340.488,-1784.919
x8,738.1704,82.922,8.902,0.000,575.629,900.711

0,1,2,3
Omnibus:,23775.107,Durbin-Watson:,0.759
Prob(Omnibus):,0.0,Jarque-Bera (JB):,121258967.612
Skew:,16.295,Prob(JB):,0.0
Kurtosis:,498.23,Cond. No.,782000.0


In [28]:
x_opt = x[:, [0,1,2,3,4,5,8,9]]
regressor_OLS =sm.OLS(endog =y , exog = x_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.473
Model:,OLS,Adj. R-squared:,0.473
Method:,Least Squares,F-statistic:,1516.0
Date:,"Sun, 29 Nov 2020",Prob (F-statistic):,0.0
Time:,09:27:36,Log-Likelihood:,-143020.0
No. Observations:,11815,AIC:,286100.0
Df Residuals:,11807,BIC:,286100.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-6.849e+04,1.51e+05,-0.452,0.651,-3.65e+05,2.28e+05
x1,-8.6822,76.016,-0.114,0.909,-157.686,140.321
x2,272.4844,8.198,33.239,0.000,256.415,288.553
x3,7441.7253,472.339,15.755,0.000,6515.863,8367.587
x4,8003.6906,647.545,12.360,0.000,6734.395,9272.987
x5,419.3515,431.569,0.972,0.331,-426.595,1265.298
x6,-2190.1332,141.543,-15.473,0.000,-2467.582,-1912.685
x7,727.1606,83.209,8.739,0.000,564.058,890.264

0,1,2,3
Omnibus:,23677.669,Durbin-Watson:,0.756
Prob(Omnibus):,0.0,Jarque-Bera (JB):,118127411.843
Skew:,16.146,Prob(JB):,0.0
Kurtosis:,491.786,Cond. No.,762000.0


In [29]:
from sklearn.linear_model import LinearRegression
x = data.iloc[:, [0,1,2,3,4,5,8,9]].values
y = data.iloc[:,-1].values
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size= 0.2,random_state=0)

In [30]:
regressor = LinearRegression()
regressor.fit(x_train, y_train)
y_pred = regressor.predict(x_test)

#### checking accuacy of Linear regression

In [31]:
from sklearn.metrics import mean_squared_error, r2_score
r2score = r2_score(y_test, y_pred)

In [32]:
print(r2score)

0.41437412891514536


## Polynomial regression

In [33]:
# from sklearn.preprocessing import PolynomialFeatures
# from sklearn.model_selection import train_test_split
# x = data.iloc[:, :-1].values
# y = data.iloc[:,-1].values
# x_train, x_test, y_train, y_test = train_test_split(x,y,test_size= 0.2,random_state=0)
# polynomial_features = PolynomialFeatures(degree = 3)

In [34]:
# x_poly = polynomial_features.fit_transform(x_train)

In [35]:
# regressor1 = LinearRegression()
# regressor1.fit(x_poly,y_train)

In [36]:
# y_pred = dt.predict(x_test)
# from sklearn.metrics import mean_squared_error, r2_score
# r2score = r2_score(y_test, y_pred)
# print(r2score)

## kNN

In [74]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
x = data.iloc[:, :-1].values
y = data.iloc[:,-1].values
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size= 0.2,random_state=0)
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

In [75]:
from sklearn.neighbors import KNeighborsRegressor
regressor = KNeighborsRegressor(n_neighbors = 5, metric = 'minkowski', p=2)

In [76]:
regressor.fit(x_train, y_train)
KNeighborsRegressor()

KNeighborsRegressor()

In [77]:
y_pred = regressor.predict(sc.transform(x_test))

In [78]:
import statsmodels.api as sm
x = np.append(arr = np.ones((11815,1)).astype(int), values = x, axis = 1)
x_opt = x[:, [0,1,2,3,4,5,6,7,8,9]]
regressor_OLS =sm.OLS(endog =y , exog = x_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.502
Model:,OLS,Adj. R-squared:,0.501
Method:,Least Squares,F-statistic:,1320.0
Date:,"Sun, 29 Nov 2020",Prob (F-statistic):,0.0
Time:,09:59:04,Log-Likelihood:,-142700.0
No. Observations:,11815,AIC:,285400.0
Df Residuals:,11805,BIC:,285500.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-3.708e+05,1.51e+05,-2.451,0.014,-6.67e+05,-7.43e+04
x1,138.2873,76.088,1.817,0.069,-10.859,287.433
x2,273.1315,8.078,33.813,0.000,257.298,288.965
x3,1.049e+04,474.878,22.083,0.000,9555.780,1.14e+04
x4,9294.7688,636.158,14.611,0.000,8047.795,1.05e+04
x5,1185.9253,427.279,2.776,0.006,348.387,2023.464
x6,-731.0375,516.955,-1.414,0.157,-1744.355,282.280
x7,-1.504e+04,625.546,-24.036,0.000,-1.63e+04,-1.38e+04
x8,-1643.3951,139.470,-11.783,0.000,-1916.780,-1370.010

0,1,2,3
Omnibus:,23779.208,Durbin-Watson:,0.81
Prob(Omnibus):,0.0,Jarque-Bera (JB):,121788425.982
Skew:,16.3,Prob(JB):,0.0
Kurtosis:,499.314,Cond. No.,782000.0


### accuracy

In [79]:
from sklearn.metrics import accuracy_score

## SVM

In [55]:
from sklearn.svm import SVR

In [56]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
x = data.iloc[:, :-1].values
y = data.iloc[:,-1].values
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size= 0.2,random_state=0)
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)
regressor = SVR(kernel = 'rbf')
regressor.fit(x_train, y_train)

SVR()

In [57]:
y_pred = regressor.predict(x_test)

In [58]:
import statsmodels.api as sm
x = np.append(arr = np.ones((11815,1)).astype(int), values = x, axis = 1)
x_opt = x[:, [0,1,2,3,4,5,6,7,8,9]]
regressor_OLS =sm.OLS(endog =y , exog = x_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.502
Model:,OLS,Adj. R-squared:,0.501
Method:,Least Squares,F-statistic:,1320.0
Date:,"Sun, 29 Nov 2020",Prob (F-statistic):,0.0
Time:,09:51:04,Log-Likelihood:,-142700.0
No. Observations:,11815,AIC:,285400.0
Df Residuals:,11805,BIC:,285500.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-3.708e+05,1.51e+05,-2.451,0.014,-6.67e+05,-7.43e+04
x1,138.2873,76.088,1.817,0.069,-10.859,287.433
x2,273.1315,8.078,33.813,0.000,257.298,288.965
x3,1.049e+04,474.878,22.083,0.000,9555.780,1.14e+04
x4,9294.7688,636.158,14.611,0.000,8047.795,1.05e+04
x5,1185.9253,427.279,2.776,0.006,348.387,2023.464
x6,-731.0375,516.955,-1.414,0.157,-1744.355,282.280
x7,-1.504e+04,625.546,-24.036,0.000,-1.63e+04,-1.38e+04
x8,-1643.3951,139.470,-11.783,0.000,-1916.780,-1370.010

0,1,2,3
Omnibus:,23779.208,Durbin-Watson:,0.81
Prob(Omnibus):,0.0,Jarque-Bera (JB):,121788425.982
Skew:,16.3,Prob(JB):,0.0
Kurtosis:,499.314,Cond. No.,782000.0


In [59]:
x_opt = x[:, [0,1,2,3,4,5,7,8,9]]
regressor_OLS =sm.OLS(endog =y , exog = x_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.501
Model:,OLS,Adj. R-squared:,0.501
Method:,Least Squares,F-statistic:,1484.0
Date:,"Sun, 29 Nov 2020",Prob (F-statistic):,0.0
Time:,09:52:25,Log-Likelihood:,-142700.0
No. Observations:,11815,AIC:,285400.0
Df Residuals:,11806,BIC:,285500.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-3.244e+05,1.48e+05,-2.197,0.028,-6.14e+05,-3.49e+04
x1,113.9322,74.117,1.537,0.124,-31.349,259.213
x2,274.9314,7.977,34.465,0.000,259.295,290.568
x3,1.051e+04,474.701,22.132,0.000,9575.459,1.14e+04
x4,9188.6363,631.742,14.545,0.000,7950.318,1.04e+04
x5,1080.1730,420.702,2.568,0.010,255.528,1904.818
x6,-1.531e+04,593.810,-25.789,0.000,-1.65e+04,-1.42e+04
x7,-1653.7236,139.285,-11.873,0.000,-1926.745,-1380.702
x8,726.4319,80.963,8.972,0.000,567.731,885.133

0,1,2,3
Omnibus:,23764.538,Durbin-Watson:,0.811
Prob(Omnibus):,0.0,Jarque-Bera (JB):,121316314.178
Skew:,16.277,Prob(JB):,0.0
Kurtosis:,498.35,Cond. No.,764000.0


In [60]:
x_opt = x[:, [0,2,3,4,5,7,8,9]]
regressor_OLS =sm.OLS(endog =y , exog = x_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.501
Model:,OLS,Adj. R-squared:,0.501
Method:,Least Squares,F-statistic:,1696.0
Date:,"Sun, 29 Nov 2020",Prob (F-statistic):,0.0
Time:,09:53:03,Log-Likelihood:,-142700.0
No. Observations:,11815,AIC:,285400.0
Df Residuals:,11807,BIC:,285500.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-9.745e+04,3574.159,-27.266,0.000,-1.04e+05,-9.04e+04
x1,282.5605,6.246,45.242,0.000,270.318,294.803
x2,1.022e+04,436.740,23.400,0.000,9363.845,1.11e+04
x3,9354.5557,622.489,15.028,0.000,8134.375,1.06e+04
x4,1142.1122,418.792,2.727,0.006,321.210,1963.014
x5,-1.526e+04,592.621,-25.742,0.000,-1.64e+04,-1.41e+04
x6,-1591.3428,133.250,-11.943,0.000,-1852.535,-1330.151
x7,776.1994,74.212,10.459,0.000,630.731,921.668

0,1,2,3
Omnibus:,23721.717,Durbin-Watson:,0.812
Prob(Omnibus):,0.0,Jarque-Bera (JB):,120066865.2
Skew:,16.211,Prob(JB):,0.0
Kurtosis:,495.791,Cond. No.,2500.0


## RANDOM FOREST

In [94]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
x = data.iloc[:, :-1].values
y = data.iloc[:,-1].values
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size= 0.2,random_state=0)
#sc = StandardScaler()
#x_train = sc.fit_transform(x_train)
#x_test = sc.transform(x_test)
regressor = RandomForestRegressor(n_estimators = 20, random_state = 0)
regressor.fit(x_train, y_train)

RandomForestRegressor(n_estimators=20, random_state=0)

In [95]:
import statsmodels.api as sm
x = np.append(arr = np.ones((11815,1)).astype(int), values = x, axis = 1)
x_opt = x[:, [0,1,2,3,4,5,6,7,8,9]]
regressor_OLS =sm.OLS(endog =y , exog = x_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.502
Model:,OLS,Adj. R-squared:,0.501
Method:,Least Squares,F-statistic:,1320.0
Date:,"Sun, 29 Nov 2020",Prob (F-statistic):,0.0
Time:,10:03:46,Log-Likelihood:,-142700.0
No. Observations:,11815,AIC:,285400.0
Df Residuals:,11805,BIC:,285500.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-3.708e+05,1.51e+05,-2.451,0.014,-6.67e+05,-7.43e+04
x1,138.2873,76.088,1.817,0.069,-10.859,287.433
x2,273.1315,8.078,33.813,0.000,257.298,288.965
x3,1.049e+04,474.878,22.083,0.000,9555.780,1.14e+04
x4,9294.7688,636.158,14.611,0.000,8047.795,1.05e+04
x5,1185.9253,427.279,2.776,0.006,348.387,2023.464
x6,-731.0375,516.955,-1.414,0.157,-1744.355,282.280
x7,-1.504e+04,625.546,-24.036,0.000,-1.63e+04,-1.38e+04
x8,-1643.3951,139.470,-11.783,0.000,-1916.780,-1370.010

0,1,2,3
Omnibus:,23779.208,Durbin-Watson:,0.81
Prob(Omnibus):,0.0,Jarque-Bera (JB):,121788425.982
Skew:,16.3,Prob(JB):,0.0
Kurtosis:,499.314,Cond. No.,782000.0


In [96]:
x_opt = x[:, [0,1,2,3,4,5,7,8,9]]
regressor_OLS =sm.OLS(endog =y , exog = x_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.501
Model:,OLS,Adj. R-squared:,0.501
Method:,Least Squares,F-statistic:,1484.0
Date:,"Sun, 29 Nov 2020",Prob (F-statistic):,0.0
Time:,10:03:58,Log-Likelihood:,-142700.0
No. Observations:,11815,AIC:,285400.0
Df Residuals:,11806,BIC:,285500.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-3.244e+05,1.48e+05,-2.197,0.028,-6.14e+05,-3.49e+04
x1,113.9322,74.117,1.537,0.124,-31.349,259.213
x2,274.9314,7.977,34.465,0.000,259.295,290.568
x3,1.051e+04,474.701,22.132,0.000,9575.459,1.14e+04
x4,9188.6363,631.742,14.545,0.000,7950.318,1.04e+04
x5,1080.1730,420.702,2.568,0.010,255.528,1904.818
x6,-1.531e+04,593.810,-25.789,0.000,-1.65e+04,-1.42e+04
x7,-1653.7236,139.285,-11.873,0.000,-1926.745,-1380.702
x8,726.4319,80.963,8.972,0.000,567.731,885.133

0,1,2,3
Omnibus:,23764.538,Durbin-Watson:,0.811
Prob(Omnibus):,0.0,Jarque-Bera (JB):,121316314.178
Skew:,16.277,Prob(JB):,0.0
Kurtosis:,498.35,Cond. No.,764000.0
