In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import r2_score

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

import matplotlib.pyplot as plt
import dill

In [2]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that indexes the input with ``column``.

    ``fit`` learns nothing; ``transform`` returns ``X[column]``.
    """

    def __init__(self, column):
        # Key used to index X in ``transform``.
        self.column = column

    def fit(self, X, y=None):
        """No-op fit; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X[self.column]``; ``y`` is accepted but not used."""
        selected = X[self.column]
        return selected
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that selects ``key`` through a one-element list.

    Indexing with ``[key]`` (a list) rather than ``key`` is what
    distinguishes this class from ``FeatureSelector``.
    """

    def __init__(self, key):
        # Key wrapped in a list when indexing X in ``transform``.
        self.key = key

    def fit(self, X, y=None):
        """No-op fit; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X[[self.key]]``."""
        wanted = [self.key]
        return X[wanted]

class OHEEncoder(BaseEstimator, TransformerMixin):
    """One-hot encoder built on ``pd.get_dummies`` with a fixed output layout.

    ``fit`` records the dummy column names produced for the training data;
    ``transform`` returns exactly those columns, in that order, adding a
    zero-filled column for every recorded name that is absent from the
    data being transformed.
    """

    def __init__(self, key):
        # Prefix handed to pd.get_dummies for the generated column names.
        self.key = key
        # Dummy column names recorded by ``fit``; empty until then.
        self.columns = []

    def fit(self, X, y=None):
        """Record the dummy column names generated for ``X``; returns self."""
        self.columns = list(pd.get_dummies(X, prefix=self.key).columns)
        return self

    def transform(self, X):
        """Encode ``X`` and align the result to the columns seen in ``fit``."""
        encoded = pd.get_dummies(X, prefix=self.key)
        # Snapshot of the columns present before any filler columns are added.
        present = list(encoded.columns)
        absent = [name for name in self.columns if name not in present]
        for name in absent:
            # Category seen during fit but missing here: all-zero indicator.
            encoded[name] = 0
        # Selecting by the recorded names also drops categories unseen in fit.
        return encoded[self.columns]

class FeatureDrop(BaseEstimator, TransformerMixin):
    """Stateless transformer that removes the column ``key`` from the input."""

    def __init__(self, key):
        # Label of the column removed in ``transform``.
        self.key = key

    def fit(self, X, y=None):
        """No-op fit; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` without the ``key`` column; ``y`` is not used."""
        to_remove = [self.key]
        return X.drop(labels=to_remove, axis=1)

In [2]:
# Load the feature table (X) and the target table (y) from CSV files
# given by relative path.
X = pd.read_csv('X_train.csv')
y = pd.read_csv('y_train.csv')
# Disabled alternative: join features and targets into one frame on carID.
#df = X.merge(y, left_on='carID', right_on='carID')

In [20]:
# Preview the first rows of the feature table.
X.head()

Unnamed: 0,carID,brand,model,year,transmission,mileage,fuelType,tax,mpg,engineSize
0,13207,hyundi,Santa Fe,2019,Semi-Auto,4223,Diesel,145.0,39.8,2.2
1,17314,vauxhall,GTC,2015,Manual,47870,Diesel,125.0,60.1,2.0
2,12342,audi,RS4,2019,Automatic,5151,Petrol,145.0,29.1,2.9
3,13426,vw,Scirocco,2016,Automatic,20423,Diesel,30.0,57.6,2.0
4,16004,skoda,Scala,2020,Semi-Auto,3569,Petrol,145.0,47.1,1.0


In [3]:
# List the distinct values found in the brand column.
X['brand'].unique()

array(['hyundi', 'vauxhall', 'audi', 'vw', 'skoda', 'merc', 'toyota',
       'bmw', 'ford'], dtype=object)

In [4]:
# For each brand, collect the set of distinct values of the model column.
grouped_models = X.groupby('brand')['model'].apply(set)

In [5]:
# Display the per-brand sets of model names.
grouped_models

brand
audi        { SQ5,  S3,  S4,  A8,  RS5,  R8,  RS6,  Q8,  S...
bmw         { X4,  i3,  M4,  M2,  8 Series,  i8,  7 Series...
ford        { Tourneo Custom,  Mustang,  Grand C-MAX,  Pum...
hyundi          { IX20,  Santa Fe,  IX35,  I800,  Getz,  I40}
merc        { G Class,  GLB Class,  SLK,  CLK,  M Class,  ...
skoda              { Rapid,  Scala,  Kamiq,  Yeti,  Roomster}
toyota      { IQ,  Avensis,  Land Cruiser,  Verso,  GT86, ...
vauxhall    { Zafira Tourer,  Antara,  GTC,  Vivaro,  Agil...
vw          { CC,  Tiguan Allspace,  Shuttle,  Caravelle, ...
Name: model, dtype: object

In [24]:
# Value range of engineSize, shown as a (min, max) tuple.
X['engineSize'].min(), X['engineSize'].max()

(0.0, 6.6)

In [4]:
# Hold out 25% of the rows for evaluation; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [5]:
# Preview the first three rows of the training split.
X_train.head(3)

Unnamed: 0,carID,brand,model,year,transmission,mileage,fuelType,tax,mpg,engineSize
3790,18677,vw,Arteon,2020,Semi-Auto,2000,Diesel,145.0,53.3,2.0
156,17341,merc,V Class,2018,Manual,30577,Diesel,145.0,46.3,2.1
4116,15968,merc,V Class,2019,Semi-Auto,13347,Diesel,145.0,45.6,2.2


In [6]:
# Columns routed to the categorical branch of the preprocessor.
categorical_columns = ['brand', 'model', 'transmission', 'fuelType']
# Columns routed to the numeric branch of the preprocessor.
continuous_columns = ['year', 'mileage', 'tax', 'mpg', 'engineSize']
# NOTE(review): not referenced by any other visible cell; carID is simply
# absent from both column lists above.
drop_features = ['carID']

In [7]:
# Numeric branch: fill gaps with the column median, then standardize.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the literal 'missing', then one-hot
# encode; handle_unknown='ignore' keeps unseen categories from raising.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Apply each branch to its own group of columns.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, continuous_columns),
    ('cat', categorical_transformer, categorical_columns),
])

In [8]:
# End-to-end estimator: preprocessing followed by a seeded random forest.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('forest', RandomForestRegressor(random_state=42)),
])
# Train on the training split with the price column as the target.
model.fit(X_train, y_train['price'])
# Predict the held-out rows and report R^2 against their true prices.
preds = model.predict(X_test)
r2_score(y_true=y_test['price'], y_pred=preds)

0.9573736259594715

In [11]:
# Put the predictions next to the true targets and order the rows by true
# price, so the two curves can be compared along an increasing reference line.
# assign() returns a new frame, leaving y_test itself untouched.
results = y_test.assign(preds=preds).sort_values('price')

# Explicit figure/axes with per-line labels, a title and axis labels so the
# figure is readable on its own.
fig, ax = plt.subplots()
ax.plot(results['preds'].values, label='preds')
ax.plot(results['price'].values, label='true')
ax.set(
    title='Predicted vs. true price on the test split',
    xlabel='Test rows, sorted by true price',
    ylabel='price',
)
ax.legend()
plt.show()

In [12]:
# Serialize the fitted pipeline with dill into model_pipeline.dill
# (relative path, opened in binary write mode).
with open("model_pipeline.dill", "wb") as f:
    dill.dump(model, f)

In [13]:
# from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# max_depth = [int(x) for x in np.linspace(10, 110, num = 3)]
# max_depth.append(None)
# model_name = 'forest'
# random_grid = {f'{model_name}__n_estimators': [int(x) for x in np.linspace(start = 200, stop = 2000, num = 3)],
#                f'{model_name}__max_depth': max_depth,
#                f'{model_name}__min_samples_split': [2, 5],
#                f'{model_name}__min_samples_leaf': [1, 2]}

# print(random_grid)

In [14]:
# rf_random = GridSearchCV(estimator = model, param_grid = random_grid, cv = 3, verbose=2, n_jobs = -1, scoring='r2')
# rf_random.fit(X_train, y_train['price'])

In [15]:
# rf_random.best_params_