# Data Analysis

In [1]:
import pandas as pd
import numpy as np
import pathlib

# Data locations, expressed relative to the notebook's working directory.
# NOTE(review): `.parent` is a purely lexical operation, so on the relative
# path `Path()` ('.') it yields '.' again -- PROJECT_ROOT therefore resolves to
# the current working directory, not to a directory two levels up. Confirm
# that this is the intended root.
PROJECT_ROOT = pathlib.Path().parent.parent
DATA_FOLDER = PROJECT_ROOT / 'data'
KAGGLE_CLEANED = DATA_FOLDER / 'kaggle_cleaned.csv'

# Load the CSV at KAGGLE_CLEANED into a DataFrame; the train/test split below
# takes its feature columns and its 'price' target from this frame.
kaggle_df = pd.read_csv(KAGGLE_CLEANED)

## Splitting data

In [2]:
from sklearn.model_selection import train_test_split

# 'price' is the regression target; every remaining column is a predictor.
y = kaggle_df['price']
features = kaggle_df.columns[~(kaggle_df.columns == 'price')]
X = kaggle_df[features]

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

print(
    'Train size: ', len(X_train),
    'Test size: ', len(y_test),
)

Train size:  433 Test size:  109


## Linear Regression

In [None]:
""" Normal Split
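Fits a linear regression on the train split created earlier (this cell does not
split the data itself) and reports metrics on both the train and test splits.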
"""

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Baseline model: linear regression with default settings, fitted on the
# training split only.
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

y_train_pred = lr_model.predict(X_train)

print("Results for linear regression on training data")
print(' Default settings')
print('Internal Parameters:')
print(' Bias is ', lr_model.intercept_)
print(' Coefficients', lr_model.coef_)
# For a regressor, .score() is R^2, so it matches the r2_score line below.
print(' Score', lr_model.score(X_train,y_train))
print()
print('Results for linear regression on train data')
print('MAE is ', mean_absolute_error(y_train,y_train_pred))
print('RMSE is ', np.sqrt(mean_squared_error(y_train, y_train_pred)))
# mean_squared_error yields the mean squared error, hence the 'MSE' label.
print('MSE is ', mean_squared_error(y_train, y_train_pred))
print('R^2 ', r2_score(y_train, y_train_pred))

# Same metrics on the held-out split to gauge generalisation.
y_test_pred = lr_model.predict(X_test)
print()
print('Results for linear regression on test data')
print('MAE is ', mean_absolute_error(y_test,y_test_pred))
print('RMSE is ', np.sqrt(mean_squared_error(y_test, y_test_pred)))
print('MSE is ', mean_squared_error(y_test, y_test_pred))
print('R^2 ', r2_score(y_test, y_test_pred))

Results for linear regression on training data
 Default settings
Internal Parameters:
 Bias is  237206.20674365386
 Coefficients [ 3.07552160e+02  1.12465376e+05  1.02904399e+06  6.40034493e+05
  3.21626815e+05 -3.79562161e+05  3.79562161e+05]
 Score 0.5922244248979526

Results for linear regression on train data
MAE is  842098.3237588818
RMSE is  1136595.8387553697
RSE is  1291850100676.0225
R^2  0.5922244248979526

Results for linear regression on test data
MAE is  950042.4337808738
RMSE is  1227494.4747191283
RSE is  1506742685465.9885
R^2  0.5245574573425862


## Polynomial Regression

In [12]:
from sklearn.preprocessing import PolynomialFeatures

# Degree of the polynomial feature expansion.
power = 4
poly_process = PolynomialFeatures(degree=power, include_bias=False)

# Fit the expansion on the training split only, then apply the already-fitted
# transformer to the test split. Calling fit_transform on the test data would
# re-fit the transformer on data the model is supposed to be evaluated on.
X_train_poly = poly_process.fit_transform(X_train)
X_test_poly = poly_process.transform(X_test)

# A linear model on the expanded features is the polynomial regression.
poly_model = LinearRegression()
poly_model.fit(X_train_poly, y_train)

y_train_pred = poly_model.predict(X_train_poly)

print("Results for polynomial regression on training data")
print(f' Power of {power}')
print('Internal Parameters:')
print(' Bias is ', poly_model.intercept_)
# print(' Coefficients', poly_model.coef_)
print(' Score', poly_model.score(X_train_poly,y_train))
print()
print('Results for polynomial regression on train data')
print('MAE is ', mean_absolute_error(y_train,y_train_pred))
print('RMSE is ', np.sqrt(mean_squared_error(y_train, y_train_pred)))
# mean_squared_error yields the mean squared error, hence the 'MSE' label.
print('MSE is ', mean_squared_error(y_train, y_train_pred))
print('R^2 ', r2_score(y_train, y_train_pred))

# Same metrics on the held-out split.
y_test_pred = poly_model.predict(X_test_poly)
print()
print('Results for polynomial regression on test data')
print('MAE is ', mean_absolute_error(y_test,y_test_pred))
print('RMSE is ', np.sqrt(mean_squared_error(y_test, y_test_pred)))
print('MSE is ', mean_squared_error(y_test, y_test_pred))
print('R^2 ', r2_score(y_test, y_test_pred))

Results for linear regression on training data
 Power of 4
Internal Parameters:
 Bias is  1656080.1882410534
 Score 0.7135284843526224

Results for linear regression on train data
MAE is  710234.5472299962
RMSE is  952656.1603112257
RSE is  907553759778.9277
R^2  0.7135284843526224

Results for linear regression on test data
MAE is  882815.4846795333
RMSE is  1172812.5496438763
RSE is  1375489276602.1697
R^2  0.5659735896687024


## Decision Tree Regressor

In [14]:
from sklearn.tree import DecisionTreeRegressor

# Shallow tree (depth 3). The seed is fixed so that repeated runs build the
# same tree, matching the seed used for the split and the random forest.
tree_model = DecisionTreeRegressor(max_depth=3, random_state=123)
tree_model.fit(X_train,y_train)

y_train_pred = tree_model.predict(X_train)
print('Results for Decision Tree on training data')
print('RMSE is ', np.sqrt(mean_squared_error(y_train, y_train_pred)))
# mean_squared_error yields the mean squared error, hence the 'MSE' label.
print('MSE is ', mean_squared_error(y_train, y_train_pred))
print('R^2 ', r2_score(y_train, y_train_pred))

# Same metrics on the held-out split.
y_test_pred = tree_model.predict(X_test)
print()
print('Results for Decision Tree on test data')
print('RMSE is ', np.sqrt(mean_squared_error(y_test, y_test_pred)))
print('MSE is ', mean_squared_error(y_test, y_test_pred))
print('R^2 ', r2_score(y_test, y_test_pred))

Results for Decision Tree on training data
RMSE is  1222539.1861642832
RSE is  1494602061707.2278
R^2  0.5282252833030394

Results for linear regression on test data
RMSE is  1256334.2066498818
RSE is  1578375638798.5881
R^2  0.5019541596468167


## Random Forest Regressor

In [15]:
from sklearn.ensemble import RandomForestRegressor

# Ensemble of 100 depth-4 trees; the seed is fixed for reproducible results.
rf_model = RandomForestRegressor(max_depth=4, n_estimators=100, random_state=123)
rf_model.fit(X_train,y_train)

y_train_pred = rf_model.predict(X_train)

print('Results for Random Forest on train data')
print('RMSE is ', np.sqrt(mean_squared_error(y_train, y_train_pred)))
# mean_squared_error yields the mean squared error, hence the 'MSE' label.
print('MSE is ', mean_squared_error(y_train, y_train_pred))
print('R^2 ', r2_score(y_train, y_train_pred))

# Same metrics on the held-out split.
y_test_pred = rf_model.predict(X_test)
print()
print('Results for Random Forest on test data')
print('RMSE is ', np.sqrt(mean_squared_error(y_test, y_test_pred)))
print('MSE is ', mean_squared_error(y_test, y_test_pred))
print('R^2 ', r2_score(y_test, y_test_pred))

Results for linear regression on train data
RMSE is  1027976.0632251246
RSE is  1056734786563.8254
R^2  0.6664391363240142

Results for linear regression on test data
RMSE is  1172175.0299036687
RSE is  1373994300729.6667
R^2  0.5664453192724963


## Results
| Model | Type | RMSE | R^2 |
| --- | --- | --- | --- |
| Linear Reg | Train | 1136595 | 0.5922 |
| Linear Reg | Test | 1227494 | 0.5246 |
| Poly Reg Deg 2 | Train | 1067841 | 0.6401 |
| Poly Reg Deg 2 | Test | 1194405 | 0.5498 |
| Poly Reg Deg 3 | Train | 964012 | 0.7067 |
| Poly Reg Deg 3 | Test | 1309440 | 0.4600 |
| Poly Reg Deg 4 | Train | 952656 |  0.7135 |
| Poly Reg Deg 4 | Test | 1172812 | 0.5660 |
| Decision Tree Depth 3 | Train | 1222539 | 0.5282 |
| Decision Tree Depth 3 | Test | 1256334 | 0.5020 |
| Random Forest Depth 4 Est 100 | Train | 1027976 | 0.6664 |
| Random Forest Depth 4 Est 100 | Test | 1172175 | 0.5664 |