In [2]:
import sys
import pandas as pd
import numpy as np
import datetime, time
import seaborn as sns

from sklearn.preprocessing import StandardScaler, OneHotEncoder, KBinsDiscretizer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [8]:
from src.utils import convierte_a_minusculas

In [3]:
#### Asisting configuration
sys.path.append("..")

In [4]:
pd.set_option('display.max_rows', 200)

In [488]:
df = pd.read_csv("../data/ds_challenge_apr2021.csv")

## Limpieza

In [489]:
df = df.rename(columns={'ID_USER':'id_user'})
df['fecha'] = pd.to_datetime(df['fecha'])

In [490]:
df = convierte_a_minusculas(df)

In [491]:
df['fraude'] = df['fraude'].apply(lambda x: 1 if x==True else 0)
df['is_prime'] = df['is_prime'].apply(lambda x: 'si_prime' if x==True else 'no_prime')

In [492]:
df['tipo_tc'] = df['tipo_tc'].replace(['física'],'fisica')
df['genero'] = df['genero'].replace(['--'],'no_def')

In [493]:
df['establecimiento']=df['establecimiento'].fillna('NA_establecimiento')

In [494]:
df['ciudad']=df['ciudad'].fillna('NA_ciudad')

### Transformación de variables

In [495]:
standard_scaler = StandardScaler()

transformers = [('one_hot', OneHotEncoder(sparse = False, categories = "auto"),
                 ['tipo_tc', 'is_prime', 'genero', 'establecimiento', 'ciudad']),
               ('standar_scaler', standard_scaler, 
                ['monto', 'dcto','cashback','linea_tc','interes_tc'
                ])]
col_trans = ColumnTransformer(transformers, remainder="drop", n_jobs=-1, verbose=True)

In [496]:
df_regressors = col_trans.fit(df)
df_regressors

ColumnTransformer(n_jobs=-1,
                  transformers=[('one_hot', OneHotEncoder(sparse=False),
                                 ['tipo_tc', 'is_prime', 'genero',
                                  'establecimiento', 'ciudad']),
                                ('standar_scaler', StandardScaler(),
                                 ['monto', 'dcto', 'cashback', 'linea_tc',
                                  'interes_tc'])],
                  verbose=True)

In [497]:
df_transformed = df_regressors.transform(df)

In [498]:
A = [sorted(df.tipo_tc.unique()),
    sorted(df.is_prime.unique()),
    sorted(df.genero.unique()),
    sorted(df.establecimiento.unique()),
    sorted(df.ciudad.unique())]

colnames = sum(A, [])

In [499]:
colnames.append('monto')
colnames.append('dcto')
colnames.append('cashback')
colnames.append('linea_tc')
colnames.append('interes_tc')

In [500]:
colnames

['fisica',
 'virtual',
 'no_prime',
 'si_prime',
 'f',
 'm',
 'no_def',
 'NA_establecimiento',
 'abarrotes',
 'farmacia',
 'mpago',
 'restaurante',
 'super',
 'NA_ciudad',
 'guadalajara',
 'merida',
 'monterrey',
 'toluca',
 'monto',
 'dcto',
 'cashback',
 'linea_tc',
 'interes_tc']

In [501]:
final_df = pd.DataFrame(df_transformed, columns = colnames)

In [502]:
#df[df['fecha']=='2020-01-31']

In [503]:
df['dia'] = pd.DatetimeIndex(df['fecha']).day

In [504]:
dias = 30 

final_df['sin_dia'] = np.sin(2*np.pi*df['dia']/dias)
final_df['cos_dia'] = np.cos(2*np.pi*df['dia']/dias)

In [505]:
horas = 24 

final_df['sin_hora'] = np.sin(2*np.pi*df['hora']/horas)
final_df['cos_hora'] = np.cos(2*np.pi*df['hora']/horas)

In [506]:
df_transformed=final_df.values

In [507]:
#final_df['id']=df['id_user']

In [508]:
colnames.append('sin_dia')
colnames.append('cos_dia')
colnames.append('sin_hora')
colnames.append('cos_hora')

In [509]:
df_transformed.shape

(26975, 27)

In [510]:
final_df.shape

(26975, 27)

In [454]:
final_df

Unnamed: 0,fisica,virtual,no_prime,si_prime,f,m,no_def,NA_establecimiento,abarrotes,farmacia,...,monto,dcto,cashback,linea_tc,interes_tc,sin_dia,cos_dia,sin_hora,cos_hora,id
0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.377721,1.263136,-0.176315,0.389427,0.289465,-9.510565e-01,-0.309017,-0.866025,0.500000,0
1,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,-1.418404,-0.508714,-1.006380,0.389427,0.289465,5.665539e-16,-1.000000,0.965926,-0.258819,0
2,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,1.005752,-0.508714,2.139704,1.440304,0.811001,-4.067366e-01,-0.913545,0.258819,-0.965926,1
3,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.926139,-0.508714,0.316006,1.440304,0.811001,8.660254e-01,0.500000,0.258819,-0.965926,1
4,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,-0.846509,-0.508714,-0.834270,1.440304,0.811001,-1.133108e-15,1.000000,0.500000,-0.866025,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26970,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.854196,-0.508714,0.269322,-1.026972,0.811001,9.945219e-01,-0.104528,-0.258819,-0.965926,3999
26971,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.530346,-0.508714,0.708078,-1.026972,0.811001,2.079117e-01,-0.978148,-0.707107,-0.707107,3999
26972,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,-0.856813,-0.508714,-0.277543,-1.026972,0.811001,-2.079117e-01,-0.978148,0.258819,-0.965926,3999
26973,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,-1.641479,-0.438280,-1.355553,-1.026972,0.811001,4.067366e-01,-0.913545,-0.965926,-0.258819,3999


## Selección de variables (VarianceThreshold)

In [461]:
from sklearn.feature_selection import VarianceThreshold

In [462]:
variance_threshold = VarianceThreshold(threshold=0.1)
variance_threshold.fit(df_transformed)

VarianceThreshold(threshold=0.1)

In [463]:
variance_threshold.transform(df_transformed)

array([[ 1.        ,  0.        ,  1.        , ..., -0.30901699,
        -0.8660254 ,  0.5       ],
       [ 0.        ,  1.        ,  1.        , ..., -1.        ,
         0.96592583, -0.25881905],
       [ 0.        ,  1.        ,  1.        , ..., -0.91354546,
         0.25881905, -0.96592583],
       ...,
       [ 0.        ,  1.        ,  1.        , ..., -0.9781476 ,
         0.25881905, -0.96592583],
       [ 1.        ,  0.        ,  1.        , ..., -0.91354546,
        -0.96592583, -0.25881905],
       [ 1.        ,  0.        ,  1.        , ...,  0.80901699,
        -0.8660254 ,  0.5       ]])

In [458]:
#variance_threshold.variances_

In [466]:
impor_var = pd.DataFrame(colnames ,variance_threshold.variances_)

In [467]:
impor_var['predictores'] = impor_var.loc[:,0]

In [468]:
impor_var.reset_index().iloc[:, [2,0]].sort_values(by = 'index', ascending=False)

Unnamed: 0,predictores,index
20,cashback,1.0
22,interes_tc,1.0
21,linea_tc,1.0
19,dcto,1.0
18,monto,1.0
25,sin_hora,0.525616
23,sin_dia,0.514248
24,cos_dia,0.484718
26,cos_hora,0.472809
5,m,0.249999


### Selección de variables (RF)

In [469]:
y = df['fraude'].values
y

array([0, 0, 0, ..., 0, 0, 0])

In [470]:
X = df_transformed
X

array([[ 1.        ,  0.        ,  1.        , ..., -0.30901699,
        -0.8660254 ,  0.5       ],
       [ 0.        ,  1.        ,  1.        , ..., -1.        ,
         0.96592583, -0.25881905],
       [ 0.        ,  1.        ,  1.        , ..., -0.91354546,
         0.25881905, -0.96592583],
       ...,
       [ 0.        ,  1.        ,  1.        , ..., -0.9781476 ,
         0.25881905, -0.96592583],
       [ 1.        ,  0.        ,  1.        , ..., -0.91354546,
        -0.96592583, -0.25881905],
       [ 1.        ,  0.        ,  1.        , ...,  0.80901699,
        -0.8660254 ,  0.5       ]])

In [471]:
print(X.shape, y.shape)

(26975, 27) (26975,)


In [472]:
# ocuparemos un RF
classifier = RandomForestClassifier(oob_score=True, random_state=1234)

# separando en train, test
X_train, X_test, y_train, y_test = train_test_split(X, y)

# definicion de los hiperparametros que queremos probar
hyper_param_grid = {'n_estimators': [100,200,300], 
                    'max_depth': [1, 2, 3],
                    'min_samples_split': [2, 3, 5, 7]}

# ocupemos grid search!
gs = GridSearchCV(classifier, 
                           hyper_param_grid, 
                           scoring = 'precision', #
                           cv = 5, 
                           n_jobs = -1)

start_time = time.time()
gs.fit(X, y)
print("Tiempo de ejecución: ", time.time() - start_time)

Tiempo de ejecución:  203.53188395500183


In [473]:
gs.best_params_

{'max_depth': 1, 'min_samples_split': 2, 'n_estimators': 100}

In [474]:
gs.best_estimator_

RandomForestClassifier(max_depth=1, oob_score=True, random_state=1234)

In [475]:
gs.best_estimator_.oob_score_

0.9699721964782205

In [476]:
etiquetas_predichas = gs.predict(X_test)

In [477]:
etiquetas_predichas

array([0, 0, 0, ..., 0, 0, 0])

In [478]:
score_predicho = gs.predict_proba(X_test)
score_predicho

array([[0.97015106, 0.02984894],
       [0.969568  , 0.030432  ],
       [0.96980907, 0.03019093],
       ...,
       [0.96945524, 0.03054476],
       [0.96961627, 0.03038373],
       [0.96997128, 0.03002872]])

In [479]:
gs.best_estimator_.feature_importances_

array([0.  , 0.  , 0.01, 0.01, 0.02, 0.02, 0.  , 0.04, 0.  , 0.01, 0.03,
       0.02, 0.02, 0.01, 0.01, 0.  , 0.01, 0.02, 0.08, 0.13, 0.11, 0.09,
       0.06, 0.06, 0.09, 0.08, 0.07])

In [480]:
importancias = pd.DataFrame(colnames ,gs.best_estimator_.feature_importances_)

In [481]:
importancias['predictores'] = importancias.loc[:,0]

In [482]:
importancias.reset_index().iloc[:, [2,0]].sort_values(by = 'index', ascending=False)

Unnamed: 0,predictores,index
19,dcto,0.13
20,cashback,0.11
24,cos_dia,0.09
21,linea_tc,0.09
25,sin_hora,0.08
18,monto,0.08
26,cos_hora,0.07
23,sin_dia,0.06
22,interes_tc,0.06
7,NA_establecimiento,0.04


In [483]:
final_df

Unnamed: 0,fisica,virtual,no_prime,si_prime,f,m,no_def,NA_establecimiento,abarrotes,farmacia,...,monto,dcto,cashback,linea_tc,interes_tc,sin_dia,cos_dia,sin_hora,cos_hora,id
0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.377721,1.263136,-0.176315,0.389427,0.289465,-9.510565e-01,-0.309017,-0.866025,0.500000,0
1,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,-1.418404,-0.508714,-1.006380,0.389427,0.289465,5.665539e-16,-1.000000,0.965926,-0.258819,0
2,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,1.005752,-0.508714,2.139704,1.440304,0.811001,-4.067366e-01,-0.913545,0.258819,-0.965926,1
3,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.926139,-0.508714,0.316006,1.440304,0.811001,8.660254e-01,0.500000,0.258819,-0.965926,1
4,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,-0.846509,-0.508714,-0.834270,1.440304,0.811001,-1.133108e-15,1.000000,0.500000,-0.866025,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26970,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.854196,-0.508714,0.269322,-1.026972,0.811001,9.945219e-01,-0.104528,-0.258819,-0.965926,3999
26971,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.530346,-0.508714,0.708078,-1.026972,0.811001,2.079117e-01,-0.978148,-0.707107,-0.707107,3999
26972,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,-0.856813,-0.508714,-0.277543,-1.026972,0.811001,-2.079117e-01,-0.978148,0.258819,-0.965926,3999
26973,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,-1.641479,-0.438280,-1.355553,-1.026972,0.811001,4.067366e-01,-0.913545,-0.965926,-0.258819,3999


In [484]:
import joblib

def save_df(df, path):
    joblib.dump(df, path)

In [513]:
# Agregamos etiqueta antes de guardar
final_df['fraude']=df['fraude']
final_df['fecha']=df['fecha']

In [514]:
save_df(final_df,'../tmp/fe_df.pkl' )