In [4]:
#importing the important libraries
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
%matplotlib inline

In [5]:
#reading the data
df = pd.read_csv("model_data.csv", sep=";")
df.head()

Unnamed: 0,product_tier,make_name,price,first_zip_digit,first_registration_year,search_views,detail_views,stock_days,ctr,created_month,deleted_month
0,Basic,Mitsubishi,16750,5,2013,3091,123,31,0.037803,7,8
1,Basic,Mercedes-Benz,35950,4,2015,3283,223,52,0.067926,8,10
2,Basic,Mercedes-Benz,11950,3,1998,3247,265,51,0.081614,7,9
3,Basic,Ford,1750,6,2003,1856,26,101,0.014009,7,10
4,Basic,Mercedes-Benz,26500,3,2014,490,20,11,0.040816,8,9


To get the data ready for modelling we need to:

- Drop the "search_views" column to avoid multi-collinearity.

- Get numerical values for the categorical data (one hot encoding).

- Get a scaled copy of the data.

- Split the dataset into training and testing set and seperate the features from the target value. 

In [6]:
#dropping the search views and the duplicates
df.drop("search_views", axis=1, inplace=True)

df.drop_duplicates(inplace=True)

In [7]:
df.duplicated().sum()

0

In [8]:
#splitting features and target values
X = df.iloc[:, 1:]

y = df.iloc[:, 0]

In [9]:
X.head()

Unnamed: 0,make_name,price,first_zip_digit,first_registration_year,detail_views,stock_days,ctr,created_month,deleted_month
0,Mitsubishi,16750,5,2013,123,31,0.037803,7,8
1,Mercedes-Benz,35950,4,2015,223,52,0.067926,8,10
2,Mercedes-Benz,11950,3,1998,265,51,0.081614,7,9
3,Ford,1750,6,2003,26,101,0.014009,7,10
4,Mercedes-Benz,26500,3,2014,20,11,0.040816,8,9


In [10]:
y.head()

0    Basic
1    Basic
2    Basic
3    Basic
4    Basic
Name: product_tier, dtype: object

In [11]:
X.make_name.nunique()

91

- The column "make_name" has so many categories that if we on hot encoded them, we shall increase the dimensionality drastically.

- I will choose only the top 20 classes and one hot encode them, and give 0 for the rest of the classes.

In [12]:
#finding the top 20 classes
top20 = X.make_name.value_counts(ascending=False).head(20).index

In [13]:
#assigning the other classes to the value "n"
X.make_name = X.make_name.apply(lambda x: x.replace(x,"n") if x not in top20 else x)

In [14]:
X.make_name.value_counts(ascending=False)

Volkswagen       9417
n                7132
Renault          6929
Peugeot          5444
Opel             5215
Ford             5196
Mercedes-Benz    4757
BMW              4551
Volvo            3992
Toyota           3924
Audi             3311
Citroen          3017
Fiat             2586
Nissan           2015
Kia              1931
Hyundai          1802
SEAT             1772
Skoda            1515
Suzuki           1312
MINI             1234
Mazda            1198
Name: make_name, dtype: int64

In [15]:
X.head()

Unnamed: 0,make_name,price,first_zip_digit,first_registration_year,detail_views,stock_days,ctr,created_month,deleted_month
0,n,16750,5,2013,123,31,0.037803,7,8
1,Mercedes-Benz,35950,4,2015,223,52,0.067926,8,10
2,Mercedes-Benz,11950,3,1998,265,51,0.081614,7,9
3,Ford,1750,6,2003,26,101,0.014009,7,10
4,Mercedes-Benz,26500,3,2014,20,11,0.040816,8,9


In [16]:
#getting dummy variables for the categorical values
X = pd.get_dummies(X, drop_first=True)

In [17]:
X.head()

Unnamed: 0,price,first_zip_digit,first_registration_year,detail_views,stock_days,ctr,created_month,deleted_month,make_name_BMW,make_name_Citroen,...,make_name_Opel,make_name_Peugeot,make_name_Renault,make_name_SEAT,make_name_Skoda,make_name_Suzuki,make_name_Toyota,make_name_Volkswagen,make_name_Volvo,make_name_n
0,16750,5,2013,123,31,0.037803,7,8,0,0,...,0,0,0,0,0,0,0,0,0,1
1,35950,4,2015,223,52,0.067926,8,10,0,0,...,0,0,0,0,0,0,0,0,0,0
2,11950,3,1998,265,51,0.081614,7,9,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1750,6,2003,26,101,0.014009,7,10,0,0,...,0,0,0,0,0,0,0,0,0,0
4,26500,3,2014,20,11,0.040816,8,9,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
#removing the make_name part from the column names
X.columns = X.columns.str.replace('make_name_', '')

In [19]:
X.columns

Index(['price', 'first_zip_digit', 'first_registration_year', 'detail_views',
       'stock_days', 'ctr', 'created_month', 'deleted_month', 'BMW', 'Citroen',
       'Fiat', 'Ford', 'Hyundai', 'Kia', 'MINI', 'Mazda', 'Mercedes-Benz',
       'Nissan', 'Opel', 'Peugeot', 'Renault', 'SEAT', 'Skoda', 'Suzuki',
       'Toyota', 'Volkswagen', 'Volvo', 'n'],
      dtype='object')

Because of the imbalance in the data, I will perform a stratified splitting to keep the distribution of the target variable in all the splits. Then, I will try oversampling techniques to boost the model's performance.

In [20]:
#Data splitting 
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y)

In [21]:
X_train

Unnamed: 0,price,first_zip_digit,first_registration_year,detail_views,stock_days,ctr,created_month,deleted_month,BMW,Citroen,...,Opel,Peugeot,Renault,SEAT,Skoda,Suzuki,Toyota,Volkswagen,Volvo,n
50435,5495,2,2008,1332,74,0.048487,11,2,0,0,...,0,0,0,0,0,0,0,0,1,0
39217,5000,3,2008,3,0,0.046154,8,8,0,0,...,0,0,0,0,0,0,0,0,0,0
11704,13488,7,2016,31,9,0.038462,11,11,0,0,...,0,0,1,0,0,0,0,0,0,0
65578,28490,1,2017,73,44,0.065354,9,10,0,0,...,0,0,0,0,0,0,0,0,0,0
40324,4850,2,2005,60,18,0.045968,9,9,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4537,9500,7,2010,44,7,0.083491,9,9,0,0,...,0,0,0,0,0,0,0,1,0,0
57547,13444,2,2013,264,6,0.054122,11,12,0,0,...,1,0,0,0,0,0,0,0,0,0
27469,18997,6,2017,28,22,0.068796,9,10,0,0,...,1,0,0,0,0,0,0,0,0,0
57738,1995,6,2005,54,45,0.029917,11,1,0,1,...,0,0,0,0,0,0,0,0,0,0


In [22]:
X_test

Unnamed: 0,price,first_zip_digit,first_registration_year,detail_views,stock_days,ctr,created_month,deleted_month,BMW,Citroen,...,Opel,Peugeot,Renault,SEAT,Skoda,Suzuki,Toyota,Volkswagen,Volvo,n
71956,6950,3,2015,61,14,0.102694,9,9,0,0,...,0,1,0,0,0,0,0,0,0,0
15465,22450,3,2016,37,49,0.030731,7,9,0,0,...,0,0,0,0,0,0,0,0,0,1
42949,3425,3,2004,43,9,0.090717,9,9,0,0,...,0,0,0,1,0,0,0,0,0,0
17447,44900,8,2010,434,45,0.037295,11,12,0,0,...,0,0,0,0,0,0,0,0,0,0
64678,8250,1,2011,8,39,0.013817,7,8,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40815,4490,6,2006,124,57,0.034792,7,9,0,0,...,0,0,0,0,0,0,0,0,0,0
73358,7725,3,2013,301,89,0.061055,11,2,0,0,...,0,0,0,0,0,0,1,0,0,0
5580,30000,6,2016,92,18,0.087287,9,9,0,0,...,0,0,0,0,0,0,0,0,0,0
28339,2950,4,2005,3,2,0.035185,9,9,0,0,...,0,1,0,0,0,0,0,0,0,0


In [23]:
#getting copies of the data to be scaled 
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

In [24]:
#feature scaling 

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

#features to be normalized
sc = ['price', 'first_registration_year', 'detail_views',
       'stock_days', 'ctr']

X_train_scaled[sc] = scaler.fit_transform(X_train[sc])
X_test_scaled[sc] = scaler.transform(X_test[sc])

In [25]:
X_train_scaled

Unnamed: 0,price,first_zip_digit,first_registration_year,detail_views,stock_days,ctr,created_month,deleted_month,BMW,Citroen,...,Opel,Peugeot,Renault,SEAT,Skoda,Suzuki,Toyota,Volkswagen,Volvo,n
50435,-0.590182,2,-0.472990,5.479198,1.179099,-0.020271,11,2,0,0,...,0,0,0,0,0,0,0,0,1,0
39217,-0.620917,3,-0.472990,-0.399342,-1.118702,-0.023188,8,8,0,0,...,0,0,0,0,0,0,0,0,0,0
11704,-0.093893,7,0.751916,-0.275490,-0.839240,-0.032805,11,11,0,0,...,0,0,1,0,0,0,0,0,0,0
65578,0.837588,1,0.905029,-0.089712,0.247558,0.000815,9,10,0,0,...,0,0,0,0,0,0,0,0,0,0
40324,-0.630230,2,-0.932329,-0.147215,-0.559778,-0.023421,9,9,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4537,-0.341509,7,-0.166763,-0.217988,-0.901343,0.023490,9,9,0,0,...,0,0,0,0,0,0,0,1,0,0
57547,-0.096625,2,0.292576,0.755134,-0.932394,-0.013227,11,12,0,0,...,1,0,0,0,0,0,0,0,0,0
27469,0.248164,6,0.905029,-0.288760,-0.435572,0.005118,9,10,0,0,...,1,0,0,0,0,0,0,0,0,0
57738,-0.807498,6,-0.932329,-0.173755,0.278609,-0.043487,11,1,0,1,...,0,0,0,0,0,0,0,0,0,0


In [26]:
X_test_scaled

Unnamed: 0,price,first_zip_digit,first_registration_year,detail_views,stock_days,ctr,created_month,deleted_month,BMW,Citroen,...,Opel,Peugeot,Renault,SEAT,Skoda,Suzuki,Toyota,Volkswagen,Volvo,n
71956,-0.499840,3,0.598803,-0.142792,-0.683983,0.047496,9,9,0,0,...,0,1,0,0,0,0,0,0,0,0
15465,0.462562,3,0.751916,-0.248951,0.402815,-0.042470,7,9,0,0,...,0,0,0,0,0,0,0,0,0,1
42949,-0.718709,3,-1.085443,-0.222411,-0.839240,0.032524,9,9,0,0,...,0,0,0,1,0,0,0,0,0,0
17447,1.856493,8,-0.166763,1.507092,0.278609,-0.034263,11,12,0,0,...,0,0,0,0,0,0,0,0,0,0
64678,-0.419123,1,-0.013650,-0.377226,0.092301,-0.063615,7,8,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40815,-0.652583,6,-0.779216,0.135875,0.651226,-0.037392,7,9,0,0,...,0,0,0,0,0,0,0,0,0,0
73358,-0.451720,3,0.292576,0.918796,1.644870,-0.004559,11,2,0,0,...,0,0,0,0,0,0,1,0,0,0
5580,0.931345,6,0.751916,-0.005670,-0.559778,0.028235,9,9,0,0,...,0,0,0,0,0,0,0,0,0,0
28339,-0.748202,4,-0.932329,-0.399342,-1.056600,-0.036901,9,9,0,0,...,0,1,0,0,0,0,0,0,0,0


# Classification models 


- I will try different classification models to solve the first problem (Is it possible to predict the product tier from the information given in the other columns?)

- Due to the imbalance in the data, I performed a stratified splitting to keep the distribution the same and if it didn't work I will use other techinques to boost the model's performance.

In [27]:
#filtering out the warning messages
import warnings
warnings.filterwarnings('ignore')

In [28]:
#logistic regression base model 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

logit = LogisticRegression()

#getting the cross val score
np.mean(cross_val_score(logit, X_train_scaled, y_train, scoring="f1_macro"))

0.35395451682412293

- Due to the imbalance in the data, I had to use the f1_score metric and it seems that the model is finding it hard.

- I will fit the model and try to predict the training data to see if the model is able to do it.

In [29]:
#fitting the model 
logit.fit(X_train_scaled, y_train)

#making a prediction
y_pred_train = logit.predict(X_train_scaled)

In [30]:
#using confusion matrix and classification report for evaluation
from sklearn.metrics import confusion_matrix, classification_report

cm = confusion_matrix(y_train, y_pred_train)

print(cm)
print(classification_report(y_train, y_pred_train, digits=3))

[[56397     0   115]
 [  429     0     3]
 [ 1671     0    72]]
              precision    recall  f1-score   support

       Basic      0.964     0.998     0.981     56512
        Plus      0.000     0.000     0.000       432
     Premium      0.379     0.041     0.074      1743

    accuracy                          0.962     58687
   macro avg      0.448     0.346     0.352     58687
weighted avg      0.940     0.962     0.947     58687



- Now it is clear that the model is finding no problem in predicting the Basic class with good f1 score, given that class dominates over 96% of the data.

- But when it comes to the less represented classes the model is doing really bad.

- Now I will try a Tree based model (RandomForestCalssifier) to see if it can perform better.

In [31]:
#trying a base model 
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()
np.mean(cross_val_score(rfc, X_train, y_train, scoring="f1_macro"))

0.4639003441408115

- So the base random forest model is doing slightly better, but still getting very poor f1_score

- I will use grid search to find better parameters for the model

In [40]:
# number of trees
n_estimators = [int(i) for i in np.linspace(10, 80, num=10)]

#number of features
max_features = ['auto', 'sqrt']

#tree depth
max_depth = [2, 4]

#min samples per split
min_samples_split = [2, 5]

#min samples per leaf
min_samples_leaf = [1, 2]

#bootstrapping
bootstrap = [True, False]

#parameters_grid
p_grid = {'n_estimators': n_estimators, 'max_features': max_features,'max_depth': max_depth,
          'min_samples_split': min_samples_split,'min_samples_leaf': min_samples_leaf,
          'bootstrap': bootstrap}

In [41]:
#grid search 
from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(estimator= rfc, param_grid=p_grid, cv=3, verbose=2, n_jobs=4)
grid.fit(X_train, y_train)

Fitting 3 folds for each of 320 candidates, totalling 960 fits


GridSearchCV(cv=3, estimator=RandomForestClassifier(), n_jobs=4,
             param_grid={'bootstrap': [True, False], 'max_depth': [2, 4],
                         'max_features': ['auto', 'sqrt'],
                         'min_samples_leaf': [1, 2],
                         'min_samples_split': [2, 5],
                         'n_estimators': [10, 17, 25, 33, 41, 48, 56, 64, 72,
                                          80]},
             verbose=2)

In [42]:
#the best parameters
grid.best_params_

{'bootstrap': True,
 'max_depth': 4,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 10}

In [46]:
#trying out the model again with the best paramaters
rfc = RandomForestClassifier(n_estimators=10, min_samples_split=2, 
                             bootstrap=True, max_depth=4, 
                             max_features='sqrt', min_samples_leaf=1)
                                
np.mean(cross_val_score(rfc, X_train, y_train, scoring="f1_macro"))

0.3270398759134675

- Even the tree based model with the best paramerters is still getting a very poor f1_score. 

- So it looks like it is pretty hard to predict the product tier with the info given in the other columns, especially with the imbalance in the data. 

- That's why I will try other techniques to handle the baise in the data to be sure.

In [2]:
#using the imblearn library to handle the imbalance
#installing the library 

!pip install imblearn

Collecting imblearn
  Using cached imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Collecting imbalanced-learn
  Downloading imbalanced_learn-0.8.1-py3-none-any.whl (189 kB)
Installing collected packages: imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.8.1 imblearn-0.0


You should consider upgrading via the 'c:\users\moham\appdata\local\programs\python\python39\python.exe -m pip install --upgrade pip' command.


In [3]:

# check version number
import imblearn
print(imblearn.__version__)

0.8.1
