In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys

In [3]:
# Check python version.
# This code should run for python version >=3.6

print("python", sys.version)

python 3.9.13 | packaged by conda-forge | (main, May 27 2022, 17:00:33) 
[Clang 13.0.1 ]


In [15]:
vehicles = pd.read_csv("vehicles.csv", index_col=0)

In [16]:
reduced_df = vehicles.copy()
reduced_df = reduced_df[reduced_df['price'] > 0]
reduced_df = reduced_df[reduced_df['price'].notna()]
reduced_df = reduced_df[reduced_df['price'] < 1000000]
reduced_df = reduced_df[~((reduced_df.manufacturer.isnull()) & (reduced_df.manufacturer.isnull()))]
reduced_df = reduced_df.drop(columns=[
    'county', 
    'id', 
    'region_url', 
    'url', 
    'image_url',
    'VIN'
])

In [17]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(reduced_df, test_size=0.1, random_state=123)

In [18]:
def preprocess(df_in):
    
    df = df_in.copy()
    
    N = 50
    topN_models = df.model.value_counts().head(N).index.to_numpy()
    df.loc[~df['model'].isin(topN_models),'model'] = 'unknown'
    top_models = list(train_set.model.unique())
    
    df = df.drop(columns=[
        'size', 
        'drive', 
        'lat', 
        'long', 
        'posting_date', 
        'paint_color', 
        'description',
        'state',
        'region'
    ])
    
    df.loc[~df['model'].isin(top_models),'model'] = 'unknown'
    
    df = df.dropna(subset=['year', 'odometer','manufacturer'])
    df['type'].fillna('unknown',inplace=True)
    df['title_status'].fillna('clean', inplace=True)
    df['fuel'].fillna('gas', inplace=True)
    df['cylinders'].fillna('unknown', inplace=True)
    df['transmission'].fillna('automatic', inplace=True)
    df['condition'].fillna('good', inplace=True)
    
    
    X = df.drop('price', axis=1)
    y = df['price'].copy()
    
    return(X,y)

In [19]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

X, y = preprocess(train_set)

num_attribs = list(X.select_dtypes('number'))
cat_attribs = list(X.select_dtypes('object'))


pipeline = ColumnTransformer([
    ('num', StandardScaler(), num_attribs),
    ('cat', OneHotEncoder(), cat_attribs)
])

X_prepared = pipeline.fit_transform(X)

In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error

# initializing the model
lin_model = LinearRegression()
# fitting on training data
lin_model.fit(X_prepared, y)
# getting the predictions for the y_train
y_train_pred = lin_model.predict(X_prepared)

mse = mean_squared_error(y, y_train_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y, y_train_pred)
mape = mean_absolute_percentage_error(y, y_train_pred)

print("Mean squared error ", mse)
print("Root mean squared error ", rmse)
print("Mean absolute error ", mae)
print("Mean absolute percentage error", mape)


Mean squared error  138113434.15391302
Root mean squared error  11752.167210940841
Mean absolute error  7609.628529974464
Mean absolute percentage error 87.6946042693848


**Task:** Train a `sklearn.tree.DecisionTreeRegressor` model. First train it on the full training set and then try 3-fold cross validation.

In [None]:
from sklearn.tree import DecisionTreeRegressor


dt_model = DecisionTreeRegressor(max_depth=2, random_state=42)
dt_model.fit(X_prepared, y)
y_train_pred = dt_model.predict(X_prepared)
print("Mean squared error ", mean_squared_error(y, y_train_pred))

from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
cross_validate(dt_model, X_prepared, y, cv=3, scoring="neg_mean_squared_error")
cross_val_score(dt_model, X_prepared, y, cv=3, scoring="neg_mean_squared_error")

In [22]:
%%time
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

params = {'n_estimators':[5, 10], 'max_features': ["sqrt", "log2"]}
grid_search_cv = GridSearchCV(RandomForestRegressor(), params, verbose=1, cv=3)

grid_search_cv.fit(X_prepared, y)
print(grid_search_cv.best_estimator_)
print(grid_search_cv.best_score_)
# output
# Fitting 3 folds for each of 4 candidates, totalling 12 fits
# CPU times: user 12min 12s, sys: 1.81 s, total: 12min 14s
# Wall time: 12min 14s
# GridSearchCV(cv=3, estimator=RandomForestRegressor(),
#              param_grid={'max_features': ['sqrt', 'log2'],
#                          'n_estimators': [5, 10]},
#              verbose=1)
# Best estimator: RandomForestRegressor(max_features='sqrt', n_estimators=10)
# Best score: 0.8007140488524288

0.8007140488524288
CPU times: user 28 µs, sys: 14 µs, total: 42 µs
Wall time: 46.3 µs


In [23]:
from sklearn.model_selection import RandomizedSearchCV
params = {'n_estimators': [5, 10], 'max_features': ["sqrt", "log2"]}
rand_search_cv = RandomizedSearchCV(RandomForestRegressor(), params, verbose=1, cv=3)
rand_search_cv.fit(X_prepared, y)
rand_search_cv.best_estimator_
rand_search_cv.best_score_
# output
# Fitting 3 folds for each of 4 candidates, totalling 12 fits
# RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(),
#                    param_distributions={'max_features': ['sqrt', 'log2'],
#                                         'n_estimators': [5, 10]},
#                    verbose=1)
# Best estimator: RandomForestRegressor(max_features='log2', n_estimators=10)
# Best score: 0.7957024744754345

0.7957024744754345

In [27]:

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error

poly_features = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly_features.fit_transform(X_prepared)

lin_model = LinearRegression()
lin_model.fit(X_poly, y)
y_train_pred = lin_model.predict(X_poly)

mse = mean_squared_error(y, y_train_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y, y_train_pred)
mape = mean_absolute_percentage_error(y, y_train_pred)

print("Mean squared error for poly regression", mse)
print("Root mean squared error ", rmse)
print("Mean absolute error ", mae)
print("Mean absolute percentage error", mape)

# Support vector regressor
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

svm_reg = SVC()

params = {'kernel':["linear","poly","rbf"]}
grid_search_cv = GridSearchCV(SVC(), params, verbose=1, cv=3)

grid_search_cv.fit(X_prepared, y)


Mean squared error for poly regression 79824293.25779378
Root mean squared error  8934.444205309796
Mean absolute error  5000.970054046256
Mean absolute percentage error 67.56168172762246
Fitting 3 folds for each of 3 candidates, totalling 9 fits




In [3]:
msrp_df = pd.read_csv("msrp.csv")

In [4]:
for col in msrp_df.columns:
    new_col = '_'.join(col.lower().split(' '))
    msrp_df.rename({col: new_col}, inplace=True, axis=1)

In [5]:
msrp_df['is_luxury'] = msrp_df.market_category.str.contains('Luxury')

In [6]:
msrp_df = msrp_df.drop(columns=['model','market_category', 'make', 'msrp', 'popularity'])

In [8]:
from sklearn.model_selection import train_test_split
msrp_train_set, msrp_test_set = train_test_split(
    msrp_df.dropna(subset=['is_luxury']), 
    test_size=0.1, 
    random_state=123)
msrp_train_df = msrp_train_set.copy()

**Task:** Make sure you understand what the above code is doing and verify that `msrp_train_df` has the features we need.

**Task:** Get familiar with the training data, assuming now that `is_luxury` is our target variable.

In [9]:
print(msrp_train_df['is_luxury'].head(5))
# print(msrp_train_set.columns)
# print(msrp_train_set.info())
# print(msrp_train_set.describe())

print(msrp_train_df.corr())

print("="*30)
N = len(msrp_train_df.index)
print(msrp_train_df.isnull().sum()/N)

# columns with null values:
# year                 0.000000
# engine_fuel_type     0.000000
# engine_hp            0.007207
# engine_cylinders     0.003671
# transmission_type    0.000000
# driven_wheels        0.000000
# number_of_doors      0.000816
# vehicle_size         0.000000
# vehicle_style        0.000000
# highway_mpg          0.000000
# city_mpg             0.000000
# is_luxury            0.000000

cols_w_null = ['engine_hp','engine_cylinders','number_of_doors']
# printing most common values
for col in cols_w_null:
    print(msrp_train_df[col].value_counts())

# engine hp is a number so we will replace with the average
# engine_cylinders has the numbe 4.0 as the most common
# number of doors has the most common as 4.0



2876     True
4564    False
7937     True
3373    False
8774    False
Name: is_luxury, dtype: object
                      year  engine_hp  engine_cylinders  number_of_doors  \
year              1.000000   0.243878         -0.052106         0.178450   
engine_hp         0.243878   1.000000          0.812305        -0.200619   
engine_cylinders -0.052106   0.812305          1.000000        -0.182974   
number_of_doors   0.178450  -0.200619         -0.182974         1.000000   
highway_mpg       0.198245  -0.456802         -0.603422         0.103950   
city_mpg          0.157091  -0.482778         -0.591754         0.125558   

                  highway_mpg  city_mpg  
year                 0.198245  0.157091  
engine_hp           -0.456802 -0.482778  
engine_cylinders    -0.603422 -0.591754  
number_of_doors      0.103950  0.125558  
highway_mpg          1.000000  0.876425  
city_mpg             0.876425  1.000000  
year                 0.000000
engine_fuel_type     0.000000
engine_hp   

  print(msrp_train_df.corr())


**Task:** Prepare the MSRP training data for machine learning algorithms, treating `is_luxury` as the target variable.

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

def prepare_data(X):

    # replace null values or drop
    
    X['engine_hp'].fillna(msrp_train_df['engine_hp'].mean(), inplace=True)
    X['engine_cylinders'].fillna(4.0, inplace=True)
    X['number_of_doors'].fillna(4.0, inplace=True)

    X['is_luxury'] = X['is_luxury'].astype(int)
    y_ = X['is_luxury'].copy()
    X_unprepared = X.drop('is_luxury', axis=1)
    
    

    num_attribs = list(X_unprepared.select_dtypes('number'))
    cat_attribs = list(X_unprepared.select_dtypes('object'))

    print(num_attribs)
    print()
    print(cat_attribs)

    pipeline = ColumnTransformer([
    ("num", Pipeline([
        ('std_scaler', StandardScaler()),
    ]), num_attribs),
    ("cat", OneHotEncoder(), cat_attribs )
    ])

    X_ = pipeline.fit_transform(X_unprepared)
    
    

    return (X_,y_)


In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, precision_recall_curve
models = []

X_train, y_train= prepare_data(msrp_train_df)


l_reg = LogisticRegression(solver="lbfgs", max_iter=1000, random_state=123).fit(X_train, y_train)
models.append(("LogisticRegression",l_reg, l_reg.predict(X_train)))
tree_clf = DecisionTreeClassifier(random_state=123).fit(X_train, y_train)
models.append(("DecisionTreeClassifier",tree_clf, tree_clf.predict(X_train)))
sgd_clf = SGDClassifier(max_iter=1000, tol=1e-3, random_state=42).fit(X_train, y_train)
models.append(("SGDClassifier",sgd_clf, sgd_clf.predict(X_train)))
svm_clf = SVC(kernel="linear", C=5, random_state=123).fit(X_train, y_train)
models.append(("SVM",svm_clf, svm_clf.predict(X_train)))

for model in models:
    print("MODEL: ", model[0])
    print(" * accuracy ",accuracy_score(y_train, model[2]))
    print(" * precision_score ",precision_score(y_train, model[2]))
    print(" * recall_score ",recall_score(y_train, model[2]))
    print(" * f1_score ",f1_score(y_train, model[2]))
    print(" * confusion_matrix ")
    print(confusion_matrix(y_train, model[2]))

# best from output:
# MODEL:  DecisionTreeClassifier
#  * accuracy  0.9959205874354093
#  * precision_score  0.9955767267778156
#  * recall_score  0.9942235813795447
#  * f1_score  0.9948996939816389
#  * confusion_matrix 
# [[4398   13]
#  [  17 2926]]

['year', 'engine_hp', 'engine_cylinders', 'number_of_doors', 'highway_mpg', 'city_mpg']

['engine_fuel_type', 'transmission_type', 'driven_wheels', 'vehicle_size', 'vehicle_style']
MODEL:  LogisticRegression
 * accuracy  0.8093554528147947
 * precision_score  0.7737122557726466
 * recall_score  0.7400611620795107
 * f1_score  0.756512678013199
 * confusion_matrix 
[[3774  637]
 [ 765 2178]]
MODEL:  DecisionTreeClassifier
 * accuracy  0.9959205874354093
 * precision_score  0.9955767267778156
 * recall_score  0.9942235813795447
 * f1_score  0.9948996939816389
 * confusion_matrix 
[[4398   13]
 [  17 2926]]
MODEL:  SGDClassifier
 * accuracy  0.8079956486265978
 * precision_score  0.7670038367631671
 * recall_score  0.7471967380224261
 * f1_score  0.7569707401032704
 * confusion_matrix 
[[3743  668]
 [ 744 2199]]
MODEL:  SVM
 * accuracy  0.8111231982594507
 * precision_score  0.7965648854961832
 * recall_score  0.709140332993544
 * f1_score  0.750314578464857
 * confusion_matrix 
[[3878  5

In [12]:
msrp_test_df = msrp_test_set.copy()

X_test, y_test = prepare_data(msrp_test_df)



y_test_pred = tree_clf.predict(X_test)

print("Validating on test data with Decision Tree Classifier")
print(" * accuracy ",accuracy_score(y_test, y_test_pred))
print(" * precision_score ",precision_score(y_test, y_test_pred))
print(" * recall_score ",recall_score(y_test, y_test_pred))
print(" * f1_score ",f1_score(y_test, y_test_pred))
print(" * confusion_matrix ")
print(confusion_matrix(y_test, y_test_pred))


['year', 'engine_hp', 'engine_cylinders', 'number_of_doors', 'highway_mpg', 'city_mpg']

['engine_fuel_type', 'transmission_type', 'driven_wheels', 'vehicle_size', 'vehicle_style']


ValueError: X has 40 features, but DecisionTreeClassifier is expecting 43 features as input.