## Pre-processing

In [1]:
import pandas as pd
import numpy as np
import joblib
import pickle
import os

# Pipeline
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

# Transformers
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from lib.transformers import\
                            FeatureNamer,\
                            FeatureRemover,\
                            CustomLabelEncoder,\
                            AgeReplacer,\
                            TimeFeatureCreator,\
                            ColumnSelector

# Suppress library upgrade/deprecation warnings so they do not clutter cell outputs.
# NOTE(review): "ignore" without a category silences every warning, not only
# upgrade notices — confirm that hiding all of them is intended.
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Configuration constants (paths are relative to the notebook's working directory)

# CSV file holding the training data read by this notebook.
DATA_PATH = 'data/df_train.csv'
# Destination file for the serialized pre-processing pipeline.
PREPARATION_PATH = 'outputs/data_pipeline.sav'

In [3]:
# Load the raw training data. The CSV carries a saved index, which pandas
# reads as a regular 'Unnamed: 0' column.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
# Transposed preview: one row per column makes a wide frame easier to read.
df.head().transpose()

Unnamed: 0,0,1,2,3,4
Unnamed: 0,192293,141719,16671,87691,116909
id,wqpe22hwjb,l144s4m9mu,y6extte83s,1u2r6m6ets,oleaq2ki3k
date_account_created,2014-05-20,2014-01-14,2012-02-09,2013-07-01,2013-10-15
timestamp_first_active,20140520182222,20140114180853,20120209001110,20130701060451,20131015032212
gender,-unknown-,FEMALE,-unknown-,MALE,-unknown-
age,,28.0,,30.0,57.0
signup_method,basic,basic,basic,basic,basic
signup_flow,0,0,0,0,0
language,en,en,en,en,en
affiliate_channel,sem-non-brand,other,direct,direct,direct


In [4]:
# Build the feature matrix: remove the target variable and the columns that
# do not add useful information to the model.
# `drop` returns a new frame, so `df` itself is left untouched.
X = df.drop(columns=['Unnamed: 0', 'country_destination', 'id'])
X.head(2)

Unnamed: 0,date_account_created,timestamp_first_active,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser
0,2014-05-20,20140520182222,-unknown-,,basic,0,en,sem-non-brand,google,omg,Web,Windows Desktop,IE
1,2014-01-14,20140114180853,FEMALE,28.0,basic,0,en,other,other,omg,Web,Mac Desktop,Safari


In [5]:
# Split the columns into numerical and categorical groups.
# The time columns are kept out of both groups because they are handled by
# their own feature-engineering pipeline.

features = X.columns
time_features = ['date_account_created', 'timestamp_first_active']

# Numeric columns, in their original order, minus the time columns.
num_features = [col for col in X.select_dtypes(include=np.number).columns
                if col not in time_features]
df_num = X[num_features]
print('Numéricos: ', num_features)

# Every remaining non-numeric column is treated as categorical.
cat_features = [col for col in X.select_dtypes(exclude=np.number).columns
                if col not in time_features]
df_cat = X[cat_features]
print('Categóricos: ', cat_features)

Numéricos:  ['age', 'signup_flow']
Categóricos:  ['gender', 'signup_method', 'language', 'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser']


# Feature Engineering with time data

In [6]:
# Pipeline for the time columns: select them, then derive date-based features.
time_transformer = Pipeline([
    ('selector', ColumnSelector(time_features)),
    ('feature_engineering', TimeFeatureCreator()),
])

# Fit on the full feature matrix and preview the engineered columns.
df_time = time_transformer.fit_transform(X)
df_time.head()

Unnamed: 0,timestamp_first_active,first_active_on_creation_date,register_year,register_month,register_day,register_weekday
0,20140520182222,1,2014,5,20,2
1,20140114180853,1,2014,1,14,2
2,20120209001110,1,2012,2,9,4
3,20130701060451,1,2013,7,1,1
4,20131015032212,1,2013,10,15,2


# Categorical Variables

In [7]:
# Pipeline for the categorical columns:
#   1. keep only the categorical columns,
#   2. replace missing values with the constant 'N/A',
#   3. re-attach the column names (presumably because the imputer returns a
#      bare array — verify against lib.transformers.FeatureNamer).
# verbose=True makes the pipeline report the time spent in each step.
cat_transformer = Pipeline(
    steps=[
        ('selector', ColumnSelector(cat_features)),
        ('imputer', SimpleImputer(strategy='constant', fill_value='N/A')),
        ('namer', FeatureNamer(columns=cat_features)),
    ],
    verbose=True,
)

df_cat = cat_transformer.fit_transform(X)
# Per-column summary (count / unique / top / freq), one row per feature.
df_cat.describe().T

[Pipeline] .......... (step 1 of 3) Processing selector, total=   0.0s
[Pipeline] ........... (step 2 of 3) Processing imputer, total=   0.1s
[Pipeline] ............. (step 3 of 3) Processing namer, total=   0.0s


Unnamed: 0,count,unique,top,freq
gender,146082,4,-unknown-,67053
signup_method,146082,3,basic,104491
language,146082,25,en,141210
affiliate_channel,146082,8,direct,93838
affiliate_provider,146082,17,direct,93648
first_affiliate_tracked,146082,8,untracked,74341
signup_app,146082,4,Web,124263
first_device_type,146082,9,Mac Desktop,60666
first_browser,146082,47,Chrome,43080


## Label encoder

In [8]:
# Extend the categorical pipeline with the label-encoding step.
# The append is guarded so that re-running this cell does not add a second
# step called 'normalizar'; Pipeline requires step names to be unique and
# would fail on the next fit otherwise.
if 'normalizar' not in cat_transformer.named_steps:
    cat_transformer.steps.append(('normalizar', CustomLabelEncoder()))

# Refit the whole pipeline so the new step is trained, then preview the codes.
df_cat = cat_transformer.fit_transform(X)
df_cat.head()

[Pipeline] .......... (step 1 of 4) Processing selector, total=   0.0s
[Pipeline] ........... (step 2 of 4) Processing imputer, total=   0.1s
[Pipeline] ............. (step 3 of 4) Processing namer, total=   0.0s
[Pipeline] ........ (step 4 of 4) Processing normalizar, total=   0.4s


Unnamed: 0,gender,signup_method,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser
0,0,0,5,6,7,4,2,6,18
1,1,0,5,3,11,4,2,3,36
2,0,0,5,2,3,0,2,4,0
3,2,0,5,2,3,4,2,3,8
4,0,0,5,2,3,7,2,7,25


# Numerical Features handling

## Missing value imputation

In [9]:
# Total number of missing cells across every column of the feature matrix.
print('Quantidade de valores inválidos (NaN) categóricos ou numéricos:',
      X.isna().sum().sum())

# Number of negative cells in the numeric columns.
# Counting the boolean mask directly counts each negative value; masking and
# then calling dropna() would only count rows in which *every* numeric column
# is negative. Reading from X (rather than df_num) keeps the result
# independent of later cells that rebind df_num.
print('Quantidade valores negativos (só numéricos):',
      (X[num_features] < 0).sum().sum())

Quantidade de valores inválidos (NaN) categóricos ou numéricos: 67019
Quantidade valores negativos (só numéricos): 0


In [10]:
# Pipeline for the numeric columns: select them, fill missing values with the
# column mean, then re-attach the column names.
# NOTE(review): the mean is computed before implausible ages are handled, so
# raw values such as birth years entered as ages would influence the imputed
# value — consider cleaning ages before imputing; confirm the intended order.
num_transformer = Pipeline([
    ('selector', ColumnSelector(num_features)),
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('namer', FeatureNamer(columns=num_features)),
])

df_num = num_transformer.fit_transform(X)

# Check that the missing values were actually replaced.
print('Quantidade de valores inválidos:', df_num.isna().sum().sum())

Quantidade de valores inválidos: 0


## Dealing with incorrect ages

In [11]:
# Ages below MIN_AGE, or at/above MAX_AGE, are treated as invalid entries.
MIN_AGE = 7
MAX_AGE = 120

# List the distinct out-of-range ages (sorted) on each side of the valid interval.
print(f"Valores acima do limiar de idade: {np.sort(df_num.loc[df_num['age'] >= MAX_AGE, 'age'].unique())}")
print(f"Valores abaixo do limiar de idade: {np.sort(df_num.loc[df_num['age'] < MIN_AGE, 'age'].unique())}")

Valores acima do limiar de idade: [1924. 1926. 1927. 1928. 1929. 1931. 1932. 1933. 1935. 1936. 1938. 1942.
 1947. 1949. 1952. 1953. 1995. 2008. 2013. 2014.]
Valores abaixo do limiar de idade: [1. 2. 4. 5.]


In [12]:
# Extend the numeric pipeline with the step that replaces out-of-range ages.
# The append is guarded so that re-running this cell does not add a duplicate
# 'age_outlier_replacement' step (Pipeline step names must be unique).
if 'age_outlier_replacement' not in num_transformer.named_steps:
    num_transformer.steps.append(('age_outlier_replacement', AgeReplacer('age', MIN_AGE, MAX_AGE)))

# Refit so the new step is applied.
df_num = num_transformer.fit_transform(X)

# Repeat the range check; no out-of-range ages should remain after the replacement.
print(f"Valores acima do limiar de idade: {np.sort(df_num[df_num['age'] >= MAX_AGE]['age'].unique())}")
print(f"Valores abaixo do limiar de idade: {np.sort(df_num[df_num['age'] < MIN_AGE]['age'].unique())}")

Valores acima do limiar de idade: []
Valores abaixo do limiar de idade: []


## Feature normalization

In [13]:
# Summary statistics before scaling, one row per numeric feature.
df_num.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,146082.0,42.409145,12.195242,7.0,32.0,49.266314,49.266314,115.0
signup_flow,146082.0,3.366062,7.750881,0.0,0.0,0.0,0.0,25.0


In [14]:
# Extend the numeric pipeline with min-max scaling, followed by a namer step
# that re-attaches the column names after scaling.
# Each append is guarded so that re-running this cell does not add duplicate
# steps (Pipeline step names must be unique).
if 'normalizer' not in num_transformer.named_steps:
    num_transformer.steps.append(('normalizer', MinMaxScaler()))
if 'name' not in num_transformer.named_steps:
    num_transformer.steps.append(('name', FeatureNamer(columns=num_features)))

# Refit the full numeric pipeline and keep the result as a named-column frame.
df_num = pd.DataFrame(num_transformer.fit_transform(X), columns=num_features)

In [15]:
# Summary statistics after scaling, one row per feature, ordered by feature name.
(
    df_num
    .describe()
    .transpose()
    .sort_index()
)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,146082.0,0.327862,0.112919,0.0,0.231481,0.391355,0.391355,1.0
signup_flow,146082.0,0.134642,0.310035,0.0,0.0,0.0,0.0,1.0


# Saving the data pre-processing pipeline

In [16]:
# Output column names, in the same order in which the union below
# concatenates its branches: time, numeric, categorical.
features = df_time.join(df_num).join(df_cat).columns

# Run the three branch pipelines on the same input and stack their outputs side by side.
preprocessing_pipeline = FeatureUnion(
    transformer_list=[
        ('time_transformer', time_transformer),
        ('num_transformer', num_transformer),
        ('cat_transformer', cat_transformer),
    ]
)

# Wrap the union so the combined output gets its column names back.
final_pipeline = Pipeline(
    steps=[
        ('pre processing pipeline', preprocessing_pipeline),
        ('namer', FeatureNamer(features)),
    ]
)

In [17]:
# Fit the complete pre-processing pipeline on the feature matrix and preview the result.
# NOTE(review): 'timestamp_first_active' passes through as a raw numeric
# timestamp and is not scaled like the other numeric columns — confirm this is intended.
processado = final_pipeline.fit_transform(X)
processado.head(n=5)

[Pipeline] .......... (step 1 of 4) Processing selector, total=   0.0s
[Pipeline] ........... (step 2 of 4) Processing imputer, total=   0.1s
[Pipeline] ............. (step 3 of 4) Processing namer, total=   0.0s
[Pipeline] ........ (step 4 of 4) Processing normalizar, total=   0.4s


Unnamed: 0,timestamp_first_active,first_active_on_creation_date,register_year,register_month,register_day,register_weekday,age,signup_flow,gender,signup_method,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser
0,20140520000000.0,1.0,2014.0,5.0,20.0,2.0,0.391355,0.0,0.0,0.0,5.0,6.0,7.0,4.0,2.0,6.0,18.0
1,20140110000000.0,1.0,2014.0,1.0,14.0,2.0,0.194444,0.0,1.0,0.0,5.0,3.0,11.0,4.0,2.0,3.0,36.0
2,20120210000000.0,1.0,2012.0,2.0,9.0,4.0,0.391355,0.0,0.0,0.0,5.0,2.0,3.0,0.0,2.0,4.0,0.0
3,20130700000000.0,1.0,2013.0,7.0,1.0,1.0,0.212963,0.0,2.0,0.0,5.0,2.0,3.0,4.0,2.0,3.0,8.0
4,20131020000000.0,1.0,2013.0,10.0,15.0,2.0,0.462963,0.0,0.0,0.0,5.0,2.0,3.0,7.0,2.0,7.0,25.0


In [18]:
# Persist the fitted pipeline so it can be reloaded with joblib.load.
# Create the destination folder first: joblib.dump does not create missing
# directories and would fail on a fresh checkout without it.
output_dir = os.path.dirname(PREPARATION_PATH)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

joblib.dump(final_pipeline, PREPARATION_PATH)

['outputs/data_pipeline.sav']