## Imports and Data Loading

In [81]:
import pandas as pd
import numpy as np
# Linear regression
import statsmodels.api as sm
from statsmodels.formula.api import ols

In [82]:
data_preprocessed = pd.read_csv('./data/1_Preprocessing/train.csv')
data_not_preprocessed = pd.read_csv('./data/0_Data_Split/train.csv')

#### Multiple Subdatasets

In [83]:
# NaN values
print(data_preprocessed.isnull().sum())
data_preprocessed.head()

model_year        0
milage            0
accident          0
clean_title       0
price             0
horsepower        0
turbo             0
is_luxury         0
is_upper_class    0
dtype: int64


Unnamed: 0,model_year,milage,accident,clean_title,price,horsepower,turbo,is_luxury,is_upper_class
0,2000,194277.0,1,1,2300,222.0,0,0,0
1,2015,13300.0,0,1,62500,449.0,0,0,0
2,2020,30426.0,1,0,29645,256.333333,0,0,0
3,2020,67072.0,0,1,38500,375.0,0,0,0
4,2016,99000.0,1,1,5000,172.0,0,0,0


In [84]:
# Without preprocessing
print(data_not_preprocessed.isnull().sum())
data_not_preprocessed.head()

brand             0
model             0
model_year        0
milage            0
fuel_type       135
engine            0
transmission      0
ext_col           0
int_col           0
accident         87
clean_title     472
price             0
dtype: int64


Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,Nissan,Maxima GLE,2000,"194,277 mi.",Gasoline,222.0HP 3.0L V6 Cylinder Engine Gasoline Fuel,A/T,Blue,Beige,At least 1 accident or damage reported,Yes,2300
1,Mercedes-Benz,S-Class S 550 4MATIC,2015,"13,300 mi.",Gasoline,449.0HP 4.7L 8 Cylinder Engine Gasoline Fuel,A/T,Black,Black,None reported,Yes,62500
2,GMC,Acadia SLE,2020,"30,426 mi.",Gasoline,2.5L I4 16V GDI DOHC,9-Speed Automatic,Ebony Twilight Metallic,Jet Black,,,29645
3,Ford,Expedition Max XLT,2020,"67,072 mi.",Gasoline,375.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,A/T,Purple,Black,None reported,Yes,38500
4,Jeep,Compass Sport,2016,"99,000 mi.",Gasoline,172.0HP 2.4L 4 Cylinder Engine Gasoline Fuel,6-Speed A/T,White,Gray,At least 1 accident or damage reported,Yes,5000


In [85]:
# Complete case
data_not_preprocessed_cc = data_not_preprocessed.dropna()
print("The new shape of the data is: ", data_not_preprocessed_cc.shape)
data_not_preprocessed_cc.isnull().sum()

The new shape of the data is:  (2622, 12)


brand           0
model           0
model_year      0
milage          0
fuel_type       0
engine          0
transmission    0
ext_col         0
int_col         0
accident        0
clean_title     0
price           0
dtype: int64

In [86]:
# How often is "" or "-" in the data?
print(data_not_preprocessed_cc[data_not_preprocessed_cc == ""].count())
print(data_not_preprocessed_cc[data_not_preprocessed_cc == "-"].count())
print(data_not_preprocessed_cc[data_not_preprocessed_cc == "not supported"].count())



brand           0
model           0
model_year      0
milage          0
fuel_type       0
engine          0
transmission    0
ext_col         0
int_col         0
accident        0
clean_title     0
price           0
dtype: int64
brand           0
model           0
model_year      0
milage          0
fuel_type       0
engine          0
transmission    0
ext_col         0
int_col         0
accident        0
clean_title     0
price           0
dtype: int64
brand           0
model           0
model_year      0
milage          0
fuel_type       1
engine          0
transmission    0
ext_col         0
int_col         0
accident        0
clean_title     0
price           0
dtype: int64


In [87]:
data_not_preprocessed_cc.head()

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,Nissan,Maxima GLE,2000,"194,277 mi.",Gasoline,222.0HP 3.0L V6 Cylinder Engine Gasoline Fuel,A/T,Blue,Beige,At least 1 accident or damage reported,Yes,2300
1,Mercedes-Benz,S-Class S 550 4MATIC,2015,"13,300 mi.",Gasoline,449.0HP 4.7L 8 Cylinder Engine Gasoline Fuel,A/T,Black,Black,None reported,Yes,62500
3,Ford,Expedition Max XLT,2020,"67,072 mi.",Gasoline,375.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,A/T,Purple,Black,None reported,Yes,38500
4,Jeep,Compass Sport,2016,"99,000 mi.",Gasoline,172.0HP 2.4L 4 Cylinder Engine Gasoline Fuel,6-Speed A/T,White,Gray,At least 1 accident or damage reported,Yes,5000
7,Toyota,4Runner Limited,2014,"146,700 mi.",Gasoline,270.0HP 4.0L V6 Cylinder Engine Gasoline Fuel,5-Speed A/T,Blue,Black,At least 1 accident or damage reported,Yes,24999


In [88]:
# Most common ext_col and int_col
print(data_not_preprocessed_cc['ext_col'].value_counts())
print(data_not_preprocessed_cc['int_col'].value_counts())

ext_col
Black                      633
White                      585
Gray                       360
Silver                     271
Blue                       247
                          ... 
Glacier Silver Metallic      1
Carrara White Metallic       1
Blue Reflex Mica             1
Shadow Gray Metallic         1
Rich Garnet Metallic         1
Name: count, Length: 98, dtype: int64
int_col
Black            1347
Beige             397
Gray              349
Brown             133
–                  84
                 ... 
Light Slate         1
Medium Pewter       1
Graphite            1
White / Brown       1
Chestnut            1
Name: count, Length: 63, dtype: int64


In [89]:
len(data_not_preprocessed_cc["brand"].unique())

52

In [90]:
# Little Preprocessing of original data

# Tranformation of milage (String to int)
data_not_preprocessed['milage'] = data_not_preprocessed['milage'].str.replace(' mi.', '')
data_not_preprocessed['milage'] = data_not_preprocessed['milage'].str.replace(',', '')

# Dummy variables

def transform_string(series):
    '''
    Function to transform a series of strings into column names
    ----------------
    Parameters:
        series(pd.Series): Series of strings to transform
    ----------------
    Returns:
        pd.DataFrame: DataFrame with dummy variables
    '''
    series = series.str.replace(' ', '_')
    series = series.str.replace('-', '_')
    series = series.str.replace('/', '_')
    print(series)
    return pd.get_dummies(series)

## fuel_type should be a dummy variable
dummy_placeholder = pd.get_dummies(data_not_preprocessed_cc['fuel_type'])
data_not_preprocessed_cc = pd.concat([data_not_preprocessed_cc, dummy_placeholder], axis=1)
data_not_preprocessed_cc.drop('fuel_type', axis=1, inplace=True)
data_not_preprocessed_cc.drop(['not supported', '–'], axis=1, inplace=True)

# Dummy if ext_col is Black, White, Grey, Silver
data_not_preprocessed_cc['is_black'] = data_not_preprocessed_cc['ext_col'].apply(lambda x: 1 if x == 'Black' else 0)
data_not_preprocessed_cc['is_white'] = data_not_preprocessed_cc['ext_col'].apply(lambda x: 1 if x == 'White' else 0)
data_not_preprocessed_cc['is_grey'] = data_not_preprocessed_cc['ext_col'].apply(lambda x: 1 if x == 'Grey' else 0)

# Dummy of brand
dummy_placeholder = pd.get_dummies(data_not_preprocessed_cc['brand'])
data_not_preprocessed_cc = pd.concat([data_not_preprocessed_cc, dummy_placeholder], axis=1)
data_not_preprocessed_cc.drop('brand', axis=1, inplace=True)

# Convert True and False to 1 and 0 for every column
for column in data_not_preprocessed_cc.columns:
    if data_not_preprocessed_cc[column].dtype == 'bool':
        data_not_preprocessed_cc[column] = data_not_preprocessed_cc[column].apply(lambda x: 1 if x == True else 0)

# Convert clean_title to 1 and 0
data_not_preprocessed_cc['clean_title'] = data_not_preprocessed_cc['clean_title'].apply(lambda x: 1 if x == 'Yes' else 0)

# Drop transmission, engine, accident, model
data_not_preprocessed_cc.drop(['transmission', 'engine', 'accident', 'model', 'ext_col', 'int_col'], axis=1, inplace=True)

# Modelyear in age
data_not_preprocessed_cc['age'] = 2024 - data_not_preprocessed_cc['model_year']

# Drop model_year
data_not_preprocessed_cc.drop('model_year', axis=1, inplace=True)

# convert milage to int
data_not_preprocessed_cc['milage'] = data_not_preprocessed_cc['milage'].str.replace(' mi.', '')
data_not_preprocessed_cc['milage'] = data_not_preprocessed_cc['milage'].str.replace(',', '')
data_not_preprocessed_cc['milage'] = data_not_preprocessed_cc['milage'].astype(int)

data_not_preprocessed_cc.head()

Unnamed: 0,milage,clean_title,price,Diesel,E85 Flex Fuel,Gasoline,Hybrid,Plug-In Hybrid,is_black,is_white,...,Saab,Saturn,Scion,Subaru,Suzuki,Toyota,Volkswagen,Volvo,smart,age
0,194277,1,2300,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,24
1,13300,1,62500,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,9
3,67072,1,38500,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
4,99000,1,5000,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,8
7,146700,1,24999,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,10


### Statsmodels of complete case

In [111]:
data = data_not_preprocessed_cc.copy()

# Drop all rows with price over 120000
data = data[data['price'] < 100000]

# Drop all constant columns
data = data.loc[:, (data != data.iloc[0]).any()]

# Drop this columns, because of leave one out: Gasoline
data.drop(['Gasoline'], axis=1, inplace=True)
y = data['price']
X = data.drop('price', axis=1)

model = sm.OLS(y, X).fit()
model.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.601
Model:,OLS,Adj. R-squared:,0.592
Method:,Least Squares,F-statistic:,66.83
Date:,"Sat, 30 Nov 2024",Prob (F-statistic):,0.0
Time:,19:51:38,Log-Likelihood:,-27209.0
No. Observations:,2495,AIC:,54530.0
Df Residuals:,2439,BIC:,54860.0
Df Model:,55,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
milage,-0.1648,0.007,-24.985,0.000,-0.178,-0.152
Diesel,1.565e+04,1580.940,9.896,0.000,1.25e+04,1.87e+04
E85 Flex Fuel,-3134.1690,1446.859,-2.166,0.030,-5971.368,-296.970
Hybrid,5274.8001,1367.645,3.857,0.000,2592.933,7956.667
Plug-In Hybrid,2507.7876,2613.709,0.959,0.337,-2617.531,7633.107
is_black,92.1594,670.121,0.138,0.891,-1221.906,1406.225
is_white,-659.6806,681.424,-0.968,0.333,-1995.910,676.549
Acura,5.204e+04,2254.491,23.081,0.000,4.76e+04,5.65e+04
Alfa,4.702e+04,4232.956,11.108,0.000,3.87e+04,5.53e+04

0,1,2,3
Omnibus:,486.535,Durbin-Watson:,1.977
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1074.805
Skew:,1.109,Prob(JB):,4.06e-234
Kurtosis:,5.329,Cond. No.,4650000.0


In [112]:
# All significant variables
summary = model.summary()
summary = summary.tables[1]
summary = pd.DataFrame(summary.data)
summary.columns = summary.iloc[0]
summary = summary[1:]
summary.sort_values('P>|t|', inplace=True)
# coefs should be converted to float without e-notation
summary['coef'] = summary['coef'].astype(float).apply(lambda x: '{:.2f}'.format(x))

pd.set_option('display.max_rows', None)
summary.head(100)

Unnamed: 0,Unnamed: 1,coef,std err,t,P>|t|,[0.025,0.975]
1,milage,-0.16,0.007,-24.985,0.0,-0.178,-0.152
31,Lexus,57290.0,1466.528,39.063,0.0,54400.0,60200.0
32,Lincoln,50580.0,2330.482,21.705,0.0,46000.0,55200.0
33,Lotus,75060.0,7730.762,9.709,0.0,59900.0,90200.0
34,MINI,39630.0,2723.479,14.552,0.0,34300.0,45000.0
35,Maserati,58460.0,2828.657,20.666,0.0,52900.0,64000.0
36,Maybach,94210.0,13400.0,7.045,0.0,68000.0,120000.0
37,Mazda,41620.0,2019.764,20.606,0.0,37700.0,45600.0
38,Mercedes-Benz,57730.0,1128.956,51.138,0.0,55500.0,59900.0
39,Mercury,58080.0,7781.874,7.464,0.0,42800.0,73300.0


#### Statsmodels of preprocessed data

In [118]:
data = data_preprocessed.copy()

# Drop all rows with price over 120000
data = data[data['price'] < 100000]

# Drop all constant columns
data = data.loc[:, (data != data.iloc[0]).any()]

# Modelyear in age
data['age'] = 2024 - data['model_year']
data.drop('model_year', axis=1, inplace=True)

y = data['price']
X = data.drop('price', axis=1)

model = sm.OLS(y, X).fit()
model.summary()

0,1,2,3
Dep. Variable:,price,R-squared (uncentered):,0.905
Model:,OLS,Adj. R-squared (uncentered):,0.904
Method:,Least Squares,F-statistic:,3573.0
Date:,"Sat, 30 Nov 2024",Prob (F-statistic):,0.0
Time:,19:58:39,Log-Likelihood:,-32786.0
No. Observations:,3024,AIC:,65590.0
Df Residuals:,3016,BIC:,65640.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
milage,-0.1201,0.006,-21.324,0.000,-0.131,-0.109
accident,-1058.9007,514.427,-2.058,0.040,-2067.563,-50.238
clean_title,6818.1986,620.799,10.983,0.000,5600.966,8035.431
horsepower,123.6834,1.466,84.365,0.000,120.809,126.558
turbo,6318.0933,953.359,6.627,0.000,4448.793,8187.394
is_luxury,3730.5404,2973.787,1.254,0.210,-2100.315,9561.396
is_upper_class,3761.1374,747.131,5.034,0.000,2296.200,5226.075
age,-672.6747,47.114,-14.278,0.000,-765.053,-580.296

0,1,2,3
Omnibus:,258.67,Durbin-Watson:,1.982
Prob(Omnibus):,0.0,Jarque-Bera (JB):,437.058
Skew:,0.62,Prob(JB):,1.24e-95
Kurtosis:,4.39,Cond. No.,1130000.0
