In [2]:
import numpy as np
import pandas as pd
import sklearn
from matplotlib import pyplot as plt


In [5]:
df = pd.read_csv("../train1_clustered.csv")
df_test = pd.read_csv("../test1.csv")

In [9]:
from sklearn.linear_model import LinearRegression
## Linear Regression
X_train = df[['pca_component 0', 'pca_component 1', 'pca_component 2', 'pca_component 3', 'pca_component 4', "pca_component 5", "pca_component 6", "pca_component 7", "pca_component 8", "pca_component 9", "pca_component 10", "cluster"]]
y_train = df['price']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=.1,random_state=105)

# Fitting linear regression
reg = LinearRegression().fit(X_train, y_train)

# R squared value
print('reg score: ', reg.score(X_train, y_train))

# Coefficients
print('reg coef: ', reg.coef_)

# Intercept
print('reg intercept: ', reg.intercept_)


Coeff = pd.DataFrame(columns=["Variable","Coefficient"])
Coeff["Variable"]=X_train.columns
Coeff["Coefficient"]=reg.coef_
Coeff.sort_values("Coefficient")

reg score:  0.44104088856681445
reg coef:  [  0.22600106  -0.13419158  -0.11489242   0.32189021  -0.12944287
   0.13447206 -11.87965683  69.7537505    6.40873734  10.14263389
 -27.23889674 -11.70034992]
reg intercept:  118.90442857368171


Unnamed: 0,Variable,Coefficient
10,pca_component 10,-27.238897
6,pca_component 6,-11.879657
11,cluster,-11.70035
1,pca_component 1,-0.134192
4,pca_component 4,-0.129443
2,pca_component 2,-0.114892
5,pca_component 5,0.134472
0,pca_component 0,0.226001
3,pca_component 3,0.32189
8,pca_component 8,6.408737


In [10]:
# Predicting 
y_pred = reg.predict(X_test)

# Calculate RMSE
from sklearn import metrics
rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
print("root mean squared error: ", rmse)
mae = metrics.mean_absolute_error(y_test, y_pred)
print("mean absolute erroe: ", mae)
msle = metrics.mean_squared_log_error(y_test, y_pred)
print("mean squared log error: ", msle)
r2 = metrics.r2_score(y_test, y_pred)
print("R^2 score: ", r2)

root mean squared error:  40.52493272076187
mean absolute erroe:  31.62332296925668
mean squared log error:  0.16935136508876153
R^2 score:  0.4456953769078621


In [6]:
# Taking a closer look at the estimates
import statsmodels.api as sm

X2 = sm.add_constant(X_train)
est = sm.OLS(y_train, X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.439
Model:                            OLS   Adj. R-squared:                  0.439
Method:                 Least Squares   F-statistic:                     2430.
Date:                Sat, 05 Dec 2020   Prob (F-statistic):               0.00
Time:                        18:29:09   Log-Likelihood:            -1.7463e+05
No. Observations:               34135   AIC:                         3.493e+05
Df Residuals:                   34123   BIC:                         3.494e+05
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const              107.9940      0.218  

In [19]:
from sklearn.preprocessing import PolynomialFeatures

# Polynomial Regression
poly = PolynomialFeatures(degree = 3) 
X_poly = poly.fit_transform(X_train) 
  
poly.fit(X_poly, y_train) 
lin = LinearRegression() 
lin.fit(X_poly, y_train) 

# R squared value
print('polyreg score: ', lin.score(X_poly, y_train))

# Coefficients
print('polyreg coef: ', lin.coef_)

# Intercept
print('polyreg intercept: ', lin.intercept_)

polyreg score:  0.6387816102352926
polyreg coef:  [ 6.40215590e-01 -2.46984001e+01 -4.60903074e+02  3.43696338e+03
 -2.27663098e+03  6.77424892e+03  1.54156737e+04 -2.11645401e+04
 -1.78367552e+04 -6.46718095e+03 -4.24166064e+02  3.78055679e+05
  7.67981496e-02  4.31341080e+00 -1.20366530e+00  1.19711256e+00
 -4.53514419e+00  1.95817195e+01 -2.81456214e+01  1.79588941e+01
 -1.54136842e+01  1.61100215e+01  1.48526697e+02  1.12817240e+02
  3.94569539e+01 -4.48343960e+00  5.13112679e+01  5.37853798e+02
 -5.73300952e+02 -3.06194051e+02 -7.10419233e+02  4.81192214e+02
  5.48291459e+03  2.40729794e+01 -3.30767778e+01  1.59400312e+02
 -5.62952356e+02  9.30403226e+02  7.30617448e+01 -1.90218318e+02
 -1.39993397e+03  3.21810839e+02  9.53852606e+00 -1.03037987e+02
  3.62701362e+02 -6.16862180e+02 -5.78962883e+01  1.11493816e+02
  3.01426958e+02  1.30002029e+03  2.35528151e+02 -1.30168795e+03
  2.12427816e+03 -4.29471987e+02 -1.29392285e+02 -1.97100174e+03
 -1.65727256e+02 -1.68050568e+01  1.3018

In [21]:
# Calculate RMSE
poly_y_pred = lin.predict(poly.fit_transform(X_train))
poly_rmse = np.sqrt(metrics.mean_squared_error(y_train, poly_y_pred))
print("root mean squared error: ", poly_rmse)
poly_mae = metrics.mean_absolute_error(y_train, poly_y_pred)
print("mean absolute error: ", poly_mae)
# poly_msle = metrics.mean_squared_log_error(y_train, poly_y_pred)
# print("mean squared log error: ", poly_msle)
poly_r2 = metrics.r2_score(y_train, poly_y_pred)
print("R^2 score: ", poly_r2)

root mean squared error:  31.75606417810297
mean absolute error:  24.133074090877386
R^2 score:  0.6387816102352926


In [12]:
# Taking a closer look at the estimates
X3 = sm.add_constant(X_poly)
est3 = sm.OLS(y_train, X3)
est4 = est3.fit()
print(est4.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.483
Model:                            OLS   Adj. R-squared:                  0.478
Method:                 Least Squares   F-statistic:                     86.99
Date:                Sat, 05 Dec 2020   Prob (F-statistic):               0.00
Time:                        18:29:26   Log-Likelihood:            -1.7324e+05
No. Observations:               34135   AIC:                         3.472e+05
Df Residuals:                   33771   BIC:                         3.503e+05
Df Model:                         363                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        387.3299   1801.401      0.215      0.8