In [12]:
import pandas as pd
import numpy as np
from datetime import date

# Data preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

# Data split
from sklearn.model_selection import train_test_split

# Data exploration

In this section I will be exploring the dataset in regards of different aspects such as the completeness, the data type, and more. This will guide me in the data cleaning process

In [3]:
# Loading the data
mist_data = pd.read_csv("../data/Mistplay_challenge_data_science.csv") 

In [5]:
# Analyzing each of the variables in the dataset
mist_data.describe()

Unnamed: 0,Target variable Y,Gender,Year of birth,User attribute 1,User attribute 2,User attribute 3,User attribute 4,User attribute 5,User attribute 6,User attribute 7,User attribute 8,User attribute 9,User attribute 11,User attribute 12
count,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0
mean,0.014901,0.392039,1989.784778,28.215222,0.021602,2.40114,0.721972,0.027241,0.165094,0.055006,74.493949,0.037804,0.350835,0.540254
std,0.121165,0.52875,11.395502,11.395502,0.158551,2.319327,0.844542,0.033359,0.111647,0.369055,49.215692,0.190731,0.477255,0.498402
min,0.0,0.0,1918.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
25%,0.0,0.0,1984.0,20.0,0.0,1.0,0.0,0.0,0.080745,0.0,42.0,0.0,0.0,0.0
50%,0.0,0.0,1993.0,25.0,0.0,2.0,1.0,0.018692,0.144578,0.0,61.0,0.0,0.0,1.0
75%,0.0,1.0,1998.0,34.0,0.0,3.0,1.0,0.041096,0.226467,0.0,93.0,0.0,1.0,1.0
max,1.0,2.0,2006.0,100.0,3.0,26.0,4.0,0.320513,1.0,7.0,976.0,1.0,1.0,1.0


In [6]:
# Data type and missing values
mist_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9999 entries, 0 to 9998
Data columns (total 20 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Target variable Y   9999 non-null   int64  
 1   User id             9999 non-null   object 
 2   Phone OS version    9999 non-null   object 
 3   Phone device model  9999 non-null   object 
 4   Countries           9999 non-null   object 
 5   Gender              9999 non-null   int64  
 6   Year of birth       9999 non-null   int64  
 7   User source         8230 non-null   object 
 8   User attribute 1    9999 non-null   int64  
 9   User attribute 2    9999 non-null   int64  
 10  User attribute 3    9999 non-null   int64  
 11  User attribute 4    9999 non-null   int64  
 12  User attribute 5    9999 non-null   float64
 13  User attribute 6    9999 non-null   float64
 14  User attribute 7    9999 non-null   float64
 15  User attribute 8    9999 non-null   int64  
 16  User a

In [7]:
mist_data.columns

Index(['Target variable Y', 'User id', 'Phone OS version',
       'Phone device model', 'Countries', 'Gender', 'Year of birth',
       'User source', 'User attribute 1', 'User attribute 2',
       'User attribute 3', 'User attribute 4', 'User attribute 5',
       'User attribute 6', 'User attribute 7', 'User attribute 8',
       'User attribute 9', 'User attribute 10', 'User attribute 11',
       'User attribute 12'],
      dtype='object')

## Variable Exploration

This section will help me learning about the target and the different variables the data has

In [56]:
categorical_features = ['Phone OS version',
       'Phone device model', 'Countries','User source', 'Year of birth', 'User attribute 10']

In [57]:
# This section counts the number of observations per level of each variable
for feat in categorical_features:
    print('Feature: %s' %(feat))
    print('------------')
    print(mist_data[feat].value_counts())
    print('\n\n')

Feature: Phone OS version
------------
8.0.0    3454
7        1728
8.1.0    1474
7.1.1     931
6.0.1     905
9         474
7.1.2     309
5.1.1     295
6         207
4.4.2      61
5.1        55
5          37
5.0.1      28
5.0.2      26
4.4.4      12
7.1         1
4.4.3       1
4.4         1
Name: Phone OS version, dtype: int64



Feature: Phone device model
------------
samsung_SM-G950F          217
samsung_SM-G960U          207
samsung_SM-G950U          192
samsung_SM-G965U          143
samsung_SM-G930F          140
                         ... 
IUNI_IUNI N1                1
motorola_XT1031             1
samsung_SM-G955N            1
HUAWEI_HUAWEI MT7-TL10      1
Xiaomi_Redmi 4              1
Name: Phone device model, Length: 1319, dtype: int64



Feature: Countries
------------
US          5765
GB           870
CA           687
FR           411
SG           407
SE           331
FI           328
AU           297
NZ           278
DK           212
DE           186
NO           134
CA_US 

## Data cleaning & Feature Engineering

In this section I am going to clean the NA values and create new variables such as the age (given that we have the year of birth)

In [121]:
# This section converts the bi-level variable into a binary one so it is easier to model
mist_data['Age'] = date.today().year - mist_data['Year of birth']

mist_data['Phone brand'] = mist_data['Phone device model'].str.extract(r"([A-Za-z]*)_")

# Reducing the number of levels in Phone brand/user source
series = pd.value_counts(mist_data['Phone brand'])
threshold = (series<= 10)

mist_data['Phone brand sh'] = np.where(mist_data['Phone brand'].isin(series[threshold].index),'Other',mist_data['Phone brand'])

series = pd.value_counts(mist_data['User source'])
threshold = series<= 10

mist_data['User source sh'] = np.where(mist_data['User source'].isin(series[threshold].index),'Other',mist_data['User source'])

# This part will be useful later for modeling. Please, note I am not using the user ID since it is not useful for modeling
categorical_features = ['Phone OS version', 'Countries','User source sh', 'User attribute 10', 'Phone brand sh']

numeric_features = ['User attribute 1', 'User attribute 2',
       'User attribute 3', 'User attribute 4', 'User attribute 5',
       'User attribute 6', 'User attribute 7', 'User attribute 8',
       'User attribute 9', 'User attribute 11',
       'User attribute 12', 'Age']

target = ['Target variable Y']

In [122]:
mist_data[categorical_features] = mist_data[categorical_features].fillna(value='MV')
mist_data.head()

Unnamed: 0,Target variable Y,User id,Phone OS version,Phone device model,Countries,Gender,Year of birth,User source,User attribute 1,User attribute 2,...,User attribute 7,User attribute 8,User attribute 9,User attribute 10,User attribute 11,User attribute 12,Age,Phone brand,Phone brand sh,User source sh
0,0,id61539,7.1.1,asus_ASUS_X00HD,FR,0,1997,MV,21,0,...,0.0,83,0,MV,0,0,23,asus,asus,MV
1,0,id131313,7,samsung_SM-A310F,DK,0,1999,MV,19,0,...,0.0,102,0,MV,0,0,21,samsung,samsung,MV
2,0,id57636,8.0.0,samsung_SM-G935F,SE,2,1996,MV,22,0,...,0.0,94,0,MV,0,1,24,samsung,samsung,MV
3,0,id138112,8.0.0,samsung_SM-A600T,US,0,1976,MV,42,0,...,0.0,50,0,MV,0,1,44,samsung,samsung,MV
4,0,id140462,7,samsung_SM-N920P,US,0,1989,Bidalgo - Facebook,29,0,...,0.0,70,0,MV,0,0,31,samsung,samsung,Bidalgo - Facebook


In [147]:
for feat in categorical_features + target:
    print('Feature: %s' %(feat))
    print('------------')
    print(mist_data[feat].value_counts())
    print('\n\n')

Feature: Phone OS version
------------
8.0.0    3454
7        1728
8.1.0    1474
7.1.1     931
6.0.1     905
9         474
7.1.2     309
5.1.1     295
6         207
4.4.2      61
5.1        55
5          37
5.0.1      28
5.0.2      26
4.4.4      12
7.1         1
4.4.3       1
4.4         1
Name: Phone OS version, dtype: int64



Feature: Countries
------------
US          5765
GB           870
CA           687
FR           411
SG           407
SE           331
FI           328
AU           297
NZ           278
DK           212
DE           186
NO           134
CA_US         18
MX             8
MY             7
VN             6
NL             5
GB_US          2
CH             2
MX_US          2
DK_US          2
ES             2
TR             2
AT             2
TH             2
NZ_US          2
NO_SE          1
DZ             1
AU_GB          1
AU_SG          1
DK_SE          1
GB_SG          1
TR_US_VE       1
AT_DE          1
PL             1
SA             1
IE             1
CA_GB_US

As you can see there is a huge imbalance problem

In [125]:
length_data = len(mist_data['Target variable Y'])
print(f"I will work with {length_data} Observations")

I will work with 9999 Observations


In [126]:
mist_data_chopped = mist_data[numeric_features + categorical_features + target]

In [127]:
# I am using a train-test split of 70%
train, test = train_test_split(mist_data_chopped, train_size = 0.7, random_state = 1234)
train.to_csv("../data/processed_data/train_before-preprocessed.csv")

y_train = train[target]
y_test = test[target]

X_train = train[numeric_features + categorical_features]
X_test = test[numeric_features + categorical_features]

In [128]:
X_train

Unnamed: 0,User attribute 1,User attribute 2,User attribute 3,User attribute 4,User attribute 5,User attribute 6,User attribute 7,User attribute 8,User attribute 9,User attribute 11,User attribute 12,Age,Phone OS version,Countries,User source sh,User attribute 10,Phone brand sh
7408,19,0,4,1,0.093023,0.139535,0.0,43,0,0,0,21,7,US,Facebook,MV,motorola
358,24,0,1,2,0.161290,0.096774,0.0,62,0,1,1,26,8.1.0,US,Bidalgo - Facebook,MV,LGE
9389,30,0,0,0,0.015385,0.215385,0.0,65,0,0,0,32,6.0.1,US,MV,MV,LGE
1745,58,0,2,2,0.000000,0.237288,0.0,59,0,1,0,60,7,US,Bidalgo - Facebook,MV,LGE
3470,38,0,4,2,0.056338,0.112676,0.0,71,0,1,0,40,7.1.1,US,Other,MV,ZTE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
664,23,0,0,0,0.052632,0.052632,0.0,19,0,0,0,25,7,GB,Bidalgo - Instagram,MV,HUAWEI
7540,18,0,3,2,0.078125,0.312500,0.0,64,0,0,0,20,7,GB,Facebook,MV,samsung
7221,39,0,8,0,0.023077,0.438462,0.0,130,0,0,1,41,8.0.0,US,Bidalgo - Facebook,MV,motorola
1318,18,0,1,0,0.000000,0.289474,0.0,38,0,0,0,20,7.1.1,AU,Facebook,MV,samsung


# Preprocessing

This step is needed as some of the models use distance metric, and also categorical features must be turned into dummies

In [141]:
# For this part I am turning the categorical variables into dummy variables and the numeric variables
# into a normal distribution
preprocessor = ColumnTransformer(
    transformers=[
        ('scale', StandardScaler(), numeric_features),
        ('ohe', OneHotEncoder(handle_unknown='ignore'), categorical_features)])

In [142]:
train = pd.DataFrame(preprocessor.fit_transform(X_train).toarray(),
                       index=X_train.index,
                       columns=(numeric_features +
                                list(preprocessor.named_transformers_['ohe']
                                     .get_feature_names(categorical_features))))


test = pd.DataFrame(preprocessor.transform(X_test).toarray(),
                      index=X_test.index,
                      columns=train.columns)

In [143]:
train['Y'] = y_train
test['Y'] = y_test

In [144]:
train.head()

Unnamed: 0,User attribute 1,User attribute 2,User attribute 3,User attribute 4,User attribute 5,User attribute 6,User attribute 7,User attribute 8,User attribute 9,User attribute 11,...,Phone brand sh_Vodafone,Phone brand sh_WIKO,Phone brand sh_Xiaomi,Phone brand sh_Yulong,Phone brand sh_ZTE,Phone brand sh_asus,Phone brand sh_google,Phone brand sh_motorola,Phone brand sh_samsung,Y
7408,-0.809329,-0.128863,0.690229,0.319265,1.980534,-0.232107,-0.148071,-0.630596,-0.193261,-0.727669,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
358,-0.370439,-0.128863,-0.611086,1.495336,4.033341,-0.61624,-0.148071,-0.249282,-0.193261,1.374252,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
9389,0.15623,-0.128863,-1.044857,-0.856806,-0.354078,0.449277,-0.148071,-0.189075,-0.193261,-0.727669,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1745,2.614017,-0.128863,-0.177314,1.495336,-0.816697,0.646043,-0.148071,-0.30949,-0.193261,1.374252,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3470,0.858455,-0.128863,0.690229,1.495336,0.877401,-0.473389,-0.148071,-0.06866,-0.193261,1.374252,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0


In [145]:
test.head()

Unnamed: 0,User attribute 1,User attribute 2,User attribute 3,User attribute 4,User attribute 5,User attribute 6,User attribute 7,User attribute 8,User attribute 9,User attribute 11,...,Phone brand sh_Vodafone,Phone brand sh_WIKO,Phone brand sh_Xiaomi,Phone brand sh_Yulong,Phone brand sh_ZTE,Phone brand sh_asus,Phone brand sh_google,Phone brand sh_motorola,Phone brand sh_samsung,Y
2374,0.244008,-0.128863,0.256458,-0.856806,-0.816697,0.063255,-0.148071,-0.329559,-0.193261,-0.727669,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
1784,-0.458217,-0.128863,-0.177314,-0.856806,-0.816697,0.126798,-0.148071,-0.710872,-0.193261,-0.727669,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
6301,0.331786,-0.128863,1.557772,-0.856806,-0.816697,0.867183,-0.148071,0.192238,-0.193261,-0.727669,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
1600,-0.458217,-0.128863,2.425316,-0.856806,-0.230913,3.822739,-0.148071,1.597075,-0.193261,-0.727669,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
7920,0.858455,-0.128863,3.292859,-0.856806,-0.436061,1.243519,-0.148071,0.091892,-0.193261,1.374252,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0


In [146]:
# Printing the data
train.to_csv("../data/processed_data/mist_train.csv")
test.to_csv("../data/processed_data/mist_test.csv")