In [9]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import Lasso, LassoCV
from sklearn.impute import SimpleImputer


Importing the Data

In [25]:
car_df = pd.read_excel("./Data/train.xlsx")

In [27]:
#might reverse the removal of model variable
car_df.drop(columns=["id", "clean_title"])
car_df.rename(columns={"milage":"mileage"}, inplace=True)
# the number 2 FOR SOME REASON was found in the transmission column IN A COLUMN OF STRINGS 
car_df["transmission"] = car_df["transmission"].replace(2, "2T")

Obtaining a List of All Categorical and Quantitative Independent Variables

In [28]:
numerical_col = {"model_year", "mileage"}
categorical_col = set(car_df.columns).difference(numerical_col.union({"price"}))

In [29]:
car_df[list(categorical_col)].nunique(dropna=True)

engine            1117
ext_col            319
fuel_type            7
transmission        52
brand               57
clean_title          1
accident             2
model             1897
int_col            156
id              188533
dtype: int64

In [14]:
car_df.head()

Unnamed: 0,brand,model,model_year,mileage,fuel_type,engine,transmission,ext_col,int_col,accident,price
0,Chevrolet,Others,2019,61341,Gasoline,Others,8-Speed Automatic,Others,Black,None reported,18853
1,Chevrolet,Others,2019,53607,Gasoline,Others,9-Speed Automatic,Others,Black,At least 1 accident or damage reported,39853
2,Honda,Others,2021,15636,Gasoline,Others,6-Speed Automatic,Others,Black,None reported,15363
3,Chevrolet,Camaro 1SS,2020,30630,Gasoline,Others,8-Speed Automatic,White,Black,None reported,39687
4,Kia,Others,2021,1930,Gasoline,Others,Automatic CVT,Others,Black,None reported,16499


Training and Evaluating Models

Splitting data to training and test sets

In [30]:
# seperating the values that must be trained from the rest of the database
y_df = car_df["price"]
x_df = car_df.drop(columns = ["price"])
x_train, x_test, y_train, y_test = train_test_split(x_df, y_df, test_size=0.3, random_state=56)

Training Models

In [31]:
#does all pre processing steps like standardizing and one hot encoding
col_transform = ColumnTransformer(transformers=[("scale", StandardScaler(), list(numerical_col)), 
                                                ("get dummies", OneHotEncoder(), list(categorical_col))])

In [32]:
lasso_md = Pipeline(steps=[("pre process", col_transform), ("lasso", Lasso())])
lasso_md.fit(x_train, y_train)

In [21]:
grad_boost_md = Pipeline(steps=[("pre_process", col_transform), 
                                ("grad_boost", GradientBoostingRegressor())])

grad_boost_md.fit(x_train, y_train)


In [24]:
#takes too long, don't run
rand_forest_md = Pipeline(steps=[("pre process", col_transform), ("rmd", RandomForestRegressor())])
rand_forest_md.fit(x_train, y_train)

KeyboardInterrupt: 

Testing the Models

In [22]:
#testing lasso regression
def test_lasso():
    lasso_pred = lasso_md.predict(x_test)

    return mean_squared_error(y_true=y_test, y_pred=lasso_pred), r2_score(y_true=y_test, y_pred=lasso_pred)

print(test_lasso())


(np.float64(5825990108.954427), 0.11461571094723799)


In [23]:
#testing gradient boost
def test_gradient_boost():
    grad_pred = grad_boost_md.predict(x_test)

    return mean_squared_error(y_true=y_test, y_pred=grad_pred), r2_score(y_true=y_test, y_pred=grad_pred)

print(test_gradient_boost())


(np.float64(5782783908.688419), 0.1211818207740285)


In [15]:
#testing support vector machines
def test_random_forest():
    rand_pred = rand_forest_md.predict(x_test)
    return mean_squared_error(y_true=y_test, y_pred=rand_pred), r2_score(y_true=y_test, y_pred=rand_pred)

print(test_random_forest())

NameError: name 'svr_md' is not defined