In [2]:
#importing the important libraries
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
%matplotlib inline

  return f(*args, **kwds)
  return f(*args, **kwds)


In [3]:
# Load the semicolon-delimited CSV into a DataFrame, then preview the first rows.
df = pd.read_csv("model_data.csv", delimiter=";")
df.head()

Unnamed: 0,product_tier,make_name,price,first_zip_digit,first_registration_year,search_views,detail_views,stock_days,ctr,created_month,deleted_month
0,Basic,Mitsubishi,16750,5,2013,3091,123,31,0.037803,7,8
1,Basic,Mercedes-Benz,35950,4,2015,3283,223,52,0.067926,8,10
2,Basic,Mercedes-Benz,11950,3,1998,3247,265,51,0.081614,7,9
3,Basic,Ford,1750,6,2003,1856,26,101,0.014009,7,10
4,Basic,Mercedes-Benz,26500,3,2014,490,20,11,0.040816,8,9


To get the data ready for modelling we need to:

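- Drop the "search_views" column to avoid multicollinearity (in the rows previewed above, `ctr` is approximately `detail_views / search_views`, so keeping all three columns would be redundant).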

- Get numerical values for the categorical data (one hot encoding).

- Get a scaled copy of the data.

- Split the dataset into training and testing sets and separate the features from the target value.

In [4]:
# Remove the "search_views" column and any fully duplicated rows.
# `df` is rebound to a new frame instead of being mutated with inplace=True,
# and errors="ignore" lets the drop succeed when the column is already gone,
# so re-running this cell no longer raises KeyError.
df = df.drop(columns="search_views", errors="ignore").drop_duplicates()

In [5]:
# sanity check: number of rows in df that are exact duplicates of an earlier row
df.duplicated().sum()

0

In [6]:
# Split features and target by column name rather than by column position,
# so the split does not silently break if the column order of df changes.
# DataFrame.drop returns a new frame, so X is independent of df and can be
# modified in later cells without pandas' SettingWithCopyWarning.
X = df.drop(columns="product_tier")

# target: the product_tier label of every row
y = df["product_tier"]

In [7]:
# preview the feature columns (everything except the target)
X.head()

Unnamed: 0,make_name,price,first_zip_digit,first_registration_year,detail_views,stock_days,ctr,created_month,deleted_month
0,Mitsubishi,16750,5,2013,123,31,0.037803,7,8
1,Mercedes-Benz,35950,4,2015,223,52,0.067926,8,10
2,Mercedes-Benz,11950,3,1998,265,51,0.081614,7,9
3,Ford,1750,6,2003,26,101,0.014009,7,10
4,Mercedes-Benz,26500,3,2014,20,11,0.040816,8,9


In [8]:
# preview the target values
y.head()

0    Basic
1    Basic
2    Basic
3    Basic
4    Basic
Name: product_tier, dtype: object

In [9]:
# number of distinct values in the make_name column
X["make_name"].nunique()

91

- The column "make_name" has so many categories (91) that one-hot encoding all of them would increase the dimensionality drastically.

- I will keep only the 20 most frequent classes and group all remaining classes into a single placeholder category ("n"), then one-hot encode the result.

In [10]:
# The 20 most frequent make_name values: value_counts() sorts by descending
# frequency by default, so the first 20 index labels are the top 20 classes.
make_frequencies = X["make_name"].value_counts()
top20 = make_frequencies.index[:20]

In [11]:
# Collapse every make that is not one of the top 20 into the placeholder
# category "n".
# - isin/where is vectorised and replaces the row-wise apply, whose
#   `x.replace(x, "n")` was only a roundabout way of writing "n" and raised
#   AttributeError on missing (NaN) values; missing values now become "n" too.
# - assign() builds a new frame, so nothing is written into a slice of df.
is_top_make = X["make_name"].isin(top20)
X = X.assign(make_name=X["make_name"].where(is_top_make, "n"))

In [12]:
# frequency of each make after grouping, most frequent first (the default order)
X["make_name"].value_counts()

Volkswagen       9417
n                7132
Renault          6929
Peugeot          5444
Opel             5215
Ford             5196
Mercedes-Benz    4757
BMW              4551
Volvo            3992
Toyota           3924
Audi             3311
Citroen          3017
Fiat             2586
Nissan           2015
Kia              1931
Hyundai          1802
SEAT             1772
Skoda            1515
Suzuki           1312
MINI             1234
Mazda            1198
Name: make_name, dtype: int64

In [13]:
# preview X again to confirm that makes outside top20 now read "n"
X.head()

Unnamed: 0,make_name,price,first_zip_digit,first_registration_year,detail_views,stock_days,ctr,created_month,deleted_month
0,n,16750,5,2013,123,31,0.037803,7,8
1,Mercedes-Benz,35950,4,2015,223,52,0.067926,8,10
2,Mercedes-Benz,11950,3,1998,265,51,0.081614,7,9
3,Ford,1750,6,2003,26,101,0.014009,7,10
4,Mercedes-Benz,26500,3,2014,20,11,0.040816,8,9


In [14]:
# One-hot encode the make_name column.
# - columns=[...] names the only categorical column explicitly, so no other
#   column can be encoded by accident.
# - drop_first=True drops the first category's dummy column; that category is
#   represented by all remaining dummies being 0.
# - dtype=np.uint8 pins the dummies to 0/1 integers; newer pandas versions
#   default to booleans (True/False) instead.
X = pd.get_dummies(X, columns=["make_name"], drop_first=True, dtype=np.uint8)

In [15]:
# preview the frame after one-hot encoding
X.head()

Unnamed: 0,price,first_zip_digit,first_registration_year,detail_views,stock_days,ctr,created_month,deleted_month,make_name_BMW,make_name_Citroen,...,make_name_Opel,make_name_Peugeot,make_name_Renault,make_name_SEAT,make_name_Skoda,make_name_Suzuki,make_name_Toyota,make_name_Volkswagen,make_name_Volvo,make_name_n
0,16750,5,2013,123,31,0.037803,7,8,0,0,...,0,0,0,0,0,0,0,0,0,1
1,35950,4,2015,223,52,0.067926,8,10,0,0,...,0,0,0,0,0,0,0,0,0,0
2,11950,3,1998,265,51,0.081614,7,9,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1750,6,2003,26,101,0.014009,7,10,0,0,...,0,0,0,0,0,0,0,0,0,0
4,26500,3,2014,20,11,0.040816,8,9,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Strip the "make_name_" text that get_dummies added to the dummy column
# names, leaving only the make itself (e.g. "make_name_BMW" -> "BMW").
X.columns = [column.replace('make_name_', '') for column in X.columns]

In [17]:
# list the final feature column names
X.columns

Index(['price', 'first_zip_digit', 'first_registration_year', 'detail_views',
       'stock_days', 'ctr', 'created_month', 'deleted_month', 'BMW', 'Citroen',
       'Fiat', 'Ford', 'Hyundai', 'Kia', 'MINI', 'Mazda', 'Mercedes-Benz',
       'Nissan', 'Opel', 'Peugeot', 'Renault', 'SEAT', 'Skoda', 'Suzuki',
       'Toyota', 'Volkswagen', 'Volvo', 'n'],
      dtype='object')

Because of the imbalance in the data, I will perform a stratified splitting to keep the distribution of the target variable in all the splits. Then, I will try oversampling techniques to boost the model's performance.

In [18]:
# Data splitting
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing. stratify=y keeps the class
# proportions of the target the same in both splits.
# random_state fixes the shuffle, so the split (and every result computed
# from it) is the same on each run instead of changing with every execution.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)

In [19]:
# inspect the training features (the notebook truncates the display of long frames)
X_train

Unnamed: 0,price,first_zip_digit,first_registration_year,detail_views,stock_days,ctr,created_month,deleted_month,BMW,Citroen,...,Opel,Peugeot,Renault,SEAT,Skoda,Suzuki,Toyota,Volkswagen,Volvo,n
37407,10950,5,2013,39,40,0.042763,8,9,0,0,...,0,1,0,0,0,0,0,0,0,0
3087,1150,6,2000,143,28,0.013845,8,9,0,0,...,0,0,0,0,0,0,0,0,0,0
550,8999,1,2006,184,59,0.039570,8,10,0,0,...,0,0,0,0,0,0,0,0,0,0
31837,2250,2,2006,4,19,0.010554,9,10,0,0,...,0,0,0,0,0,0,0,0,0,0
54183,8750,6,2014,22,8,0.048946,11,11,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32093,17400,1,2014,16,3,0.082902,10,10,0,0,...,0,0,0,0,0,0,0,0,0,0
58728,16950,5,2015,53,7,0.075284,10,10,0,0,...,0,0,0,0,0,0,0,0,0,0
37818,7950,5,2014,10,4,0.048309,8,8,0,0,...,0,0,0,0,0,0,0,1,0,0
29810,5950,5,2008,7,15,0.042683,7,8,0,0,...,1,0,0,0,0,0,0,0,0,0


In [20]:
# inspect the test features (the notebook truncates the display of long frames)
X_test

Unnamed: 0,price,first_zip_digit,first_registration_year,detail_views,stock_days,ctr,created_month,deleted_month,BMW,Citroen,...,Opel,Peugeot,Renault,SEAT,Skoda,Suzuki,Toyota,Volkswagen,Volvo,n
43976,6400,5,2006,17,123,0.015125,10,2,0,0,...,0,0,0,0,0,0,0,0,0,0
73411,11940,6,2015,21,15,0.040936,8,9,0,0,...,1,0,0,0,0,0,0,0,0,0
32727,12800,9,2017,0,23,0.000000,8,9,0,1,...,0,0,0,0,0,0,0,0,0,0
34579,19950,6,2018,4,45,0.014925,9,10,0,0,...,0,0,1,0,0,0,0,0,0,0
13702,27450,6,2017,4,7,0.038835,11,11,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60405,7950,6,2009,154,57,0.049296,9,10,0,0,...,0,0,0,0,0,0,0,0,0,1
53548,38740,1,2017,94,50,0.048429,11,12,0,0,...,0,1,0,0,0,0,0,0,0,0
33969,20495,7,2013,25,32,0.060096,11,12,0,0,...,0,0,0,0,0,0,0,0,0,0
52527,24450,1,2014,38,5,0.149020,7,7,0,0,...,0,0,0,0,0,0,0,0,0,0
