In [3]:
import numpy as np
import glob
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler


In [4]:
# Load one CSV per manufacturer, tag each frame with its brand in a new
# `make` column, and stack everything into a single DataFrame.
path = r'D:\RiceBootCamp\Homework\Oct 22\Project3\Data' # use your path

# glob.glob returns matches in arbitrary, OS-dependent order. Because files
# are paired with `brands` purely by position, sort the listing so the
# pairing is deterministic on every machine.
# NOTE(review): this assumes the CSV file names sort (case-insensitively) in
# the same order as `brands` below - confirm against the data folder.
all_files = sorted(glob.glob(path + "/*.csv"), key=str.lower)

brands = ["Audi","BMW","Ford","Hyundi","Mercedes Benz","Skoda","Toyota","Volkswagen"]

# zip() silently stops at the shorter input, which would drop or mislabel
# data without any warning; fail loudly instead.
if len(all_files) != len(brands):
    raise ValueError(
        f"Expected {len(brands)} CSV files in {path!r} (one per brand), "
        f"found {len(all_files)}"
    )

li = []
for filename, brand in zip(all_files, brands):
    frame = pd.read_csv(filename, index_col=None, header=0)
    frame["make"] = brand
    li.append(frame)

# ignore_index=True gives the combined frame a fresh 0..n-1 index.
df = pd.concat(li, axis=0, ignore_index=True)
df


Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,make
0,A1,2017,12500,Manual,15735,Petrol,150,55.4,1.4,Audi
1,A6,2016,16500,Automatic,36203,Diesel,20,64.2,2.0,Audi
2,A1,2016,11000,Manual,29946,Petrol,30,55.4,1.4,Audi
3,A4,2017,16800,Automatic,25952,Diesel,145,67.3,2.0,Audi
4,A3,2019,17300,Manual,1998,Petrol,145,49.6,1.0,Audi
...,...,...,...,...,...,...,...,...,...,...
85550,Eos,2012,5990,Manual,74000,Diesel,125,58.9,2.0,Volkswagen
85551,Fox,2008,1799,Manual,88102,Petrol,145,46.3,1.2,Volkswagen
85552,Fox,2009,1590,Manual,70000,Petrol,200,42.0,1.4,Volkswagen
85553,Fox,2006,1250,Manual,82704,Petrol,150,46.3,1.2,Volkswagen


In [5]:
# Store `year` as strings so the dtype-based column selection later in the
# notebook picks it up as a categorical feature rather than a number.
df = df.astype({'year': str})

In [6]:
# Split the frame into predictors (everything except price) and the target.
X, y = df.drop(columns=["price"]), df["price"]
print(X.shape, y.shape)

(85555, 9) (85555,)


In [7]:
# Hold out a test split using scikit-learn's default test size; the fixed
# seed keeps the split repeatable between runs.
splits = train_test_split(X, y, random_state=42)
X_train, X_test, y_train, y_test = splits

In [8]:
# Partition the feature columns by dtype for the preprocessing pipelines.
# Numeric columns are selected with include='number' so that integer columns
# (e.g. mileage, tax) are handled together with the float ones; matching only
# 'float64' silently left them out of imputation/scaling and sent them through
# the column transformer's passthrough remainder instead.
num_cols = X.select_dtypes(include='number').columns
# Everything that is not numeric (the string-valued columns, including the
# stringified `year`) is treated as categorical.
cat_cols = X.select_dtypes(exclude='number').columns

In [9]:
# Show which columns were picked as categorical features.
cat_cols

Index(['model', 'year', 'transmission', 'fuelType', 'make'], dtype='object')

In [10]:
# Collect, for every categorical column, the array of distinct values seen in
# the full feature frame; these arrays are later given to the encoders as
# their fixed category lists. Any None entry is replaced by the same
# 'missing' placeholder the imputers use as fill value.
categories = []
for column in cat_cols:
    levels = X[column].unique()
    # Elementwise comparison against None is intentional here (hence noqa).
    levels[levels == None] = 'missing'  # noqa
    categories.append(levels)

In [11]:
# Show the per-column category arrays that are handed to the encoders.
categories

[array([' A1', ' A6', ' A4', ' A3', ' Q3', ' Q5', ' A5', ' S4', ' Q2',
        ' A7', ' TT', ' Q7', ' RS6', ' RS3', ' A8', ' Q8', ' RS4', ' RS5',
        ' R8', ' SQ5', ' S8', ' SQ7', ' S3', ' S5', ' A2', ' RS7',
        ' 5 Series', ' 6 Series', ' 1 Series', ' 7 Series', ' 2 Series',
        ' 4 Series', ' X3', ' 3 Series', ' X5', ' X4', ' i3', ' X1', ' M4',
        ' X2', ' X6', ' 8 Series', ' Z4', ' X7', ' M5', ' i8', ' M2',
        ' M3', ' M6', ' Z3', ' Fiesta', ' Focus', ' Puma', ' Kuga',
        ' EcoSport', ' C-MAX', ' Mondeo', ' Ka+', ' Tourneo Custom',
        ' S-MAX', ' B-MAX', ' Edge', ' Tourneo Connect', ' Grand C-MAX',
        ' KA', ' Galaxy', ' Mustang', ' Grand Tourneo Connect', ' Fusion',
        ' Ranger', ' Streetka', ' Escort', ' Transit Tourneo', ' I20',
        ' Tucson', ' I10', ' IX35', ' I30', ' I40', ' Ioniq', ' Kona',
        ' Veloster', ' I800', ' IX20', ' Santa Fe', ' Accent', ' Terracan',
        ' Getz', ' Amica', ' SLK', ' S Class', ' SL CLASS', ' G C

In [12]:
# Preprocessing intended for non-linear estimators: categorical columns are
# imputed with a constant placeholder and ordinal-encoded against the fixed
# category lists; numeric columns are mean-imputed; all other columns pass
# through untouched.
# NOTE(review): the categorical imputer only matches None; values that pandas
# loaded as NaN would not be filled - confirm the data has no missing entries.
cat_proc_nlin = Pipeline(steps=[
    ('simpleimputer', SimpleImputer(missing_values=None,
                                    strategy='constant',
                                    fill_value='missing')),
    ('ordinalencoder', OrdinalEncoder(categories=categories)),
])

num_proc_nlin = Pipeline(steps=[
    ('simpleimputer', SimpleImputer(strategy='mean')),
])

processor_nlin = make_column_transformer(
    (cat_proc_nlin, cat_cols),
    (num_proc_nlin, num_cols),
    remainder='passthrough',
)


In [13]:
# Preprocessing intended for linear estimators: categorical columns are
# imputed with a constant placeholder and one-hot encoded against the fixed
# category lists; numeric columns are mean-imputed and standardised; all
# other columns pass through untouched.
# NOTE(review): the categorical imputer only matches None; values that pandas
# loaded as NaN would not be filled - confirm the data has no missing entries.
cat_proc_lin = Pipeline(steps=[
    ('simpleimputer', SimpleImputer(missing_values=None,
                                    strategy='constant',
                                    fill_value='missing')),
    ('onehotencoder', OneHotEncoder(categories=categories)),
])

num_proc_lin = Pipeline(steps=[
    ('simpleimputer', SimpleImputer(strategy='mean')),
    ('standardscaler', StandardScaler()),
])

processor_lin = make_column_transformer(
    (cat_proc_lin, cat_cols),
    (num_proc_lin, num_cols),
    remainder='passthrough',
)

In [14]:
# Unbind the names that are no longer needed. The train/test splits and the
# preprocessors built earlier stay available for the modelling cells below.
del df
del X, y
del cat_cols, num_cols, categories

In [15]:
# Random-forest regressor (seeded for repeatability) fed by the one-hot /
# scaling preprocessor, wrapped in one pipeline so that raw feature frames
# can be passed straight to fit/predict.
forest = RandomForestRegressor(random_state=42)
rf = Pipeline(steps=[
    ('preprocess', processor_lin),
    ('regressor', forest),
])
# Last expression: fit on the training split and display the fitted pipeline.
rf.fit(X_train, y_train)

Pipeline(steps=[('preprocess',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('pipeline-1',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(fill_value='missing',
                                                                                 missing_values=None,
                                                                                 strategy='constant')),
                                                                  ('onehotencoder',
                                                                   OneHotEncoder(categories=[array([' A1', ' A6', ' A4', ' A3', ' Q3', ' Q5', ' A5', ' S4', ' Q2',
       ' A7', ' TT', ' Q7', ' RS6', ' RS3', ' A8', ' Q8', ' RS4', ' R...
                                                                                             array(['Audi', 'BMW', 'Ford', 'Hyundi',

In [16]:
# Evaluate the fitted pipeline on the held-out split; for a regressor,
# scikit-learn's `score` reports the coefficient of determination (R^2).
rf.score(X_test,y_test)

0.9478601231696431

In [19]:
# Predict prices for the first five rows of the test split as a spot check.
first_rows = X_test.iloc[:5]
encoded_predictions = rf.predict(first_rows)

In [20]:
# Compare the spot-check predictions with the true prices of the same rows.
# The model is a regressor, so the labels say "prices" rather than
# "classes"/"Labels". `.tolist()` converts to plain Python numbers so the
# printout does not show NumPy scalar wrappers on newer NumPy versions, and
# `.iloc` makes the positional slice explicit.
print(f"Predicted prices: {encoded_predictions}")
print(f"Actual prices: {y_test.iloc[:5].tolist()}")

Predicted classes: [13129.66       10262.42       20025.83       14614.47866667
 17736.475     ]
Actual Labels: [14644, 10095, 23025, 14299, 16995]


In [22]:
# Persist the fitted pipeline (preprocessing + forest) so it can be reloaded
# later without retraining. Only unpickle this file from a trusted location:
# loading a pickle can execute arbitrary code.
pkl_filename = "RandomForest_model_OneHot-PipeLine.pickle"
with open(pkl_filename, mode='wb') as model_file:
    pickle.dump(rf, model_file)