In [13]:
import sys
import pandas as pd
import numpy as np
import datetime, time
import seaborn as sns

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from src.utils.utils import convierte_a_minusculas

In [14]:
#### Asisting configuration
sys.path.append("..")

In [15]:
pd.set_option('display.max_rows', 200)

## Preprocesamiento y limpieza

In [16]:
def preprocesamiento_limpieza(data):
    """
    Preprocesamiento y limpieza de datos
    :param data: dataframe
    :return: dataframe en minúsculas
    ==========
    Ejemplo:
        >> dataframe = preprocesamiento_limpieza(dataframe)
    """

    data = convierte_a_minusculas(data)
    data = data.rename(columns={'ID_USER': 'id_user'})
    data['fecha'] = pd.to_datetime(data['fecha'])
    data['fraude'] = data['fraude'].apply(lambda x: 1 if x == True else 0)
    data['is_prime'] = data['is_prime'].apply(lambda x: 'si_prime' if x == True else 'no_prime')
    data['tipo_tc'] = data['tipo_tc'].replace(['física'], 'fisica')
    data['genero'] = data['genero'].replace(['--'], 'no_def')
    data['establecimiento'] = data['establecimiento'].fillna('NA_establecimiento')
    data['ciudad'] = data['ciudad'].fillna('NA_ciudad')
    return data

In [17]:
df = pd.read_csv("../data/ds_challenge_apr2021.csv")

In [18]:
df=preprocesamiento_limpieza(df)

In [19]:
df.head(5)

Unnamed: 0,id_user,genero,monto,fecha,hora,dispositivo,establecimiento,ciudad,tipo_tc,linea_tc,interes_tc,status_txn,is_prime,dcto,cashback,fraude
0,0,f,608.345634,2020-01-21,20,"{'model': 2020, 'device_score': 3, 'os': 'andr...",super,merida,fisica,71000,51,aceptada,no_prime,60.834563,5.475111,0
1,0,f,88.719243,2020-01-15,7,"{'model': 2020, 'device_score': 1, 'os': 'andr...",NA_establecimiento,merida,virtual,71000,51,aceptada,no_prime,0.0,1.774385,0
2,1,f,790.037653,2020-01-17,11,"{'model': 2020, 'device_score': 1, 'os': '.'}",NA_establecimiento,guadalajara,virtual,94000,56,en proceso,no_prime,0.0,15.800753,0
3,1,f,767.005387,2020-01-05,11,"{'model': 2020, 'device_score': 3, 'os': 'web'}",mpago,guadalajara,fisica,94000,56,aceptada,no_prime,0.0,7.670054,0
4,1,f,254.171053,2020-01-30,10,"{'model': 2020, 'device_score': 3, 'os': '%%'}",NA_establecimiento,guadalajara,fisica,94000,56,aceptada,no_prime,0.0,2.541711,0


## Feature Engineering

In [22]:
def one_hot_scaler(data):
    standard_scaler = StandardScaler()

    transformers = [('one_hot',
                     OneHotEncoder(sparse=False,
                                   categories="auto"),
                     ['tipo_tc', 'is_prime', 'genero',
                      'establecimiento', 'ciudad']),
                    ('standar_scaler',
                     standard_scaler,
                     ['monto', 'dcto', 'cashback',
                      'linea_tc', 'interes_tc'
                      ])]

    col_trans = ColumnTransformer(transformers, remainder="drop",
                                  n_jobs=-1, verbose=True)

    df_regressors = col_trans.fit(data)
    df_transformed = df_regressors.transform(data)

    a = [sorted(data.tipo_tc.unique()),
         sorted(data.is_prime.unique()),
         sorted(data.genero.unique()),
         sorted(data.establecimiento.unique()),
         sorted(data.ciudad.unique())]

    colnames = sum(a, [])

    colnames.append('monto')
    colnames.append('dcto')
    colnames.append('cashback')
    colnames.append('linea_tc')
    colnames.append('interes_tc')

    final_df = pd.DataFrame(df_transformed, columns=colnames)

    final_df['fraude'] = data['fraude']
    final_df['fecha'] = data['fecha']
    final_df['hora'] = data['hora']

    return final_df

In [23]:
def horas_dias_ciclo(data):
    data['dia'] = pd.DatetimeIndex(data['fecha']).day

    dias = 30

    data['sin_dia'] = np.sin(2 * np.pi * data['dia'] / dias)
    data['cos_dia'] = np.cos(2 * np.pi * data['dia'] / dias)

    data = data.drop('fecha', 1)
    data = data.drop('dia', 1)

    horas = 24

    data['sin_hora'] = np.sin(2 * np.pi * data['hora'] / horas)
    data['cos_hora'] = np.cos(2 * np.pi * data['hora'] / horas)

    data = data.drop('hora', 1)

    return data

In [25]:
final_df = one_hot_scaler(df)

In [26]:
final_df = horas_dias_ciclo(final_df)

In [28]:
final_df.head(5)

Unnamed: 0,fisica,virtual,no_prime,si_prime,f,m,no_def,NA_establecimiento,abarrotes,farmacia,...,monto,dcto,cashback,linea_tc,interes_tc,fraude,sin_dia,cos_dia,sin_hora,cos_hora
0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.377721,1.263136,-0.176315,0.389427,0.289465,0,-0.9510565,-0.309017,-0.866025,0.5
1,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,-1.418404,-0.508714,-1.00638,0.389427,0.289465,0,5.665539e-16,-1.0,0.965926,-0.258819
2,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,1.005752,-0.508714,2.139704,1.440304,0.811001,0,-0.4067366,-0.913545,0.258819,-0.965926
3,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.926139,-0.508714,0.316006,1.440304,0.811001,0,0.8660254,0.5,0.258819,-0.965926
4,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,-0.846509,-0.508714,-0.83427,1.440304,0.811001,0,-1.133108e-15,1.0,0.5,-0.866025


## Selección de variables (VarianceThreshold)

In [31]:
from sklearn.feature_selection import VarianceThreshold

In [33]:
df_transformed = final_df.drop('fraude',1).values

In [34]:
variance_threshold = VarianceThreshold(threshold=0.1)
variance_threshold.fit(df_transformed)

VarianceThreshold(threshold=0.1)

In [146]:
#variance_threshold.transform(df_transformed)

In [38]:
colnames = final_df.drop('fraude',1).columns

In [39]:
impor_var = pd.DataFrame(colnames ,variance_threshold.variances_)

In [40]:
impor_var['predictores'] = impor_var.loc[:,0]

In [41]:
impor_var.reset_index().iloc[:, [2,0]].sort_values(by = 'index', ascending=False)

Unnamed: 0,predictores,index
20,cashback,1.0
22,interes_tc,1.0
21,linea_tc,1.0
19,dcto,1.0
18,monto,1.0
25,sin_hora,0.525616
23,sin_dia,0.514248
24,cos_dia,0.484718
26,cos_hora,0.472809
5,m,0.249999


### Selección de variables (RF)

In [145]:
y = df['fraude'].values

In [144]:
X = final_df.drop('fraude',1).values

In [44]:
print(X.shape, y.shape)

(26975, 27) (26975,)


In [71]:
df_no_fraude = df[df['fraude'] == 0]
df_si_fraude = df[df['fraude'] == 1]

In [79]:
muestra_no_fraude = df_no_fraude.sample(n=810,random_state=12)
muestra_si_fraude = df_si_fraude

In [80]:
df_nuevo = pd.concat([muestra_si_fraude,muestra_no_fraude],axis=0)

In [91]:
data=preprocesamiento_limpieza(df_nuevo)

In [100]:
standard_scaler = StandardScaler()

transformers = [('one_hot',
                 OneHotEncoder(sparse=False,
                               categories="auto"),
                 ['tipo_tc', 'is_prime', 'genero',
                  'establecimiento', 'ciudad']),
                ('standar_scaler',
                 standard_scaler,
                 ['monto', 'dcto', 'cashback',
                  'linea_tc', 'interes_tc'
                  ])]

col_trans = ColumnTransformer(transformers, remainder="drop",
                              n_jobs=-1, verbose=True)

df_regressors = col_trans.fit(data)
df_transformed = df_regressors.transform(data)

a = [sorted(data.tipo_tc.unique()),
     sorted(data.is_prime.unique()),
     sorted(data.genero.unique()),
     sorted(data.establecimiento.unique()),
     sorted(data.ciudad.unique())]

colnames = sum(a, [])
colnames.append('monto')
colnames.append('dcto')
colnames.append('cashback')
colnames.append('linea_tc')
colnames.append('interes_tc')

final_df = pd.DataFrame(df_transformed, columns=colnames)

In [102]:
final_df['fraude'] = df_nuevo['fraude'].values
final_df['fecha'] = df_nuevo['fecha'].values
final_df['hora'] = df_nuevo['hora'].values

In [110]:
final_df_nuevo = horas_dias_ciclo(final_df)

In [114]:
X = final_df_nuevo.drop(columns=['fraude'], axis=1).values
y = final_df_nuevo['fraude'].values

In [115]:
# ocuparemos un RF
classifier = RandomForestClassifier(oob_score=True, random_state=1234)

# separando en train, test
X_train, X_test, y_train, y_test = train_test_split(X, y)

# definicion de los hiperparametros que queremos probar
hyper_param_grid = {'n_estimators': [100,200,300], 
                    'max_depth': [1, 2, 3],
                    'min_samples_split': [2, 3, 5, 7]}

# ocupemos grid search!
gs = GridSearchCV(classifier, 
                           hyper_param_grid, 
                           scoring = 'precision', #
                           cv = 5, 
                           n_jobs = -1)

start_time = time.time()
gs.fit(X_train, y_train)
print("Tiempo de ejecución: ", time.time() - start_time)

Tiempo de ejecución:  74.46220397949219


In [143]:
#gs.best_params_

In [142]:
#gs.best_estimator_

In [141]:
#gs.best_estimator_.oob_score_

In [119]:
etiquetas_predichas = gs.predict(X_test)

In [122]:
score_predicho = gs.predict_proba(X_test)

In [137]:
colnames=final_df_nuevo.drop('fraude',1).columns

In [138]:
importancias = pd.DataFrame(colnames ,gs.best_estimator_.feature_importances_)

In [139]:
importancias['predictores'] = importancias.loc[:,0]

## Variables seleccionadas de acuerdo al modelo

In [140]:
importancias.reset_index().iloc[:, [2,0]].sort_values(by = 'index', ascending=False)

Unnamed: 0,predictores,index
19,cashback,0.153146
17,monto,0.122936
20,linea_tc,0.11791
18,dcto,0.091119
25,cos_hora,0.075046
21,interes_tc,0.072162
23,cos_dia,0.061191
24,sin_hora,0.050217
22,sin_dia,0.040305
1,virtual,0.025472
