# Importar pacotes necessarios

In [1]:
import pandas as pd
import numpy as np
import operator
import gc
import pickle

from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.neural_network import MLPClassifier
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

from lightgbm import LGBMClassifier

# Carregar datasets

In [2]:
df_transaction = catalog.load('train_transaction')
df_id = catalog.load('train_identity')

2022-05-18 10:31:37,925 - kedro.io.data_catalog - INFO - Loading data from `train_transaction` (CSVDataSet)...
2022-05-18 10:32:20,612 - kedro.io.data_catalog - INFO - Loading data from `train_identity` (CSVDataSet)...


Realizar o cruzamento dos dois datasets para que estejam em um so DataFrame

In [3]:
df_join = df_transaction.merge(df_id, how='left', left_index=True, right_index=True)

In [4]:
del df_transaction
del df_id
gc.collect()

3

Total de colunas

In [5]:
df_join.shape[1]

433

# Tratamento dos Dados

Verificar o percentual de nulos da base

In [6]:
df_aux = pd.DataFrame(df_join.isna().sum(), columns=['count_na'])

In [7]:
df_aux['perc_na'] = df_aux['count_na']/df_join.shape[0]

In [8]:
df_aux

Unnamed: 0,count_na,perc_na
isFraud,0,0.000000
TransactionDT,0,0.000000
TransactionAmt,0,0.000000
ProductCD,0,0.000000
card1,0,0.000000
...,...,...
id_36,303998,0.739324
id_37,303998,0.739324
id_38,303998,0.739324
DeviceType,304145,0.739681


Retirar  colunas que possuam mais do que 80% dos valores nulos

In [9]:
cols_not_na = list(df_aux.loc[df_aux.perc_na < 0.8].index)

In [10]:
len(cols_not_na)

359

In [11]:
df_join2 = df_join[cols_not_na]

Selecionar colunas categoricas

In [12]:
object_cols = list(df_join2.select_dtypes('object').columns)

Selecionar colunas numericas (sem a variavel resposta)

In [13]:
df_aux = df_join2.select_dtypes('number').drop(['isFraud'], axis=1).fillna(0)

Selecionar colunas que possuem correlacao menor do que 70%

In [14]:
# list_corr = []
# for col in df_aux.columns:
#     cols_ = list(set(list(df_aux.columns)) - set(list_corr))
#     aux_series = df_aux[cols_].corrwith(df_aux[col])
#     if aux_series[(aux_series >= 0.7) & (aux_series <= 0.9999)].shape[0] != 0:
#         list_corr.append(col)
#     else:
#         pass    

In [15]:
# with open('list_corr.pickle', 'wb') as f0:
#     pickle.dump(list_corr, f0)

In [16]:
with open('list_corr.pickle', 'rb') as f0:
    list_corr = pickle.load(f0)

In [17]:
final_cols = object_cols + list_corr
final_cols.append('isFraud')

Selecionar colunas 

In [18]:
df_aux1 = df_join2[final_cols].fillna(0)

In [19]:
del df_join
del df_join2
del df_aux
gc.collect()

0

Transformar as variaveis categoricas em ordinais

In [20]:
oe = OrdinalEncoder()

aux1_categ = oe.fit_transform(df_aux1.select_dtypes('object').astype(str))
df_aux1_categ = pd.DataFrame(aux1_categ, columns=df_aux1.select_dtypes('object').columns, 
    index=df_aux1.index
    )

In [21]:
df_aux2 = df_aux1.drop(df_aux1.select_dtypes('object'), axis=1)\
    .merge(df_aux1_categ, on=df_aux1.index)\
    .drop('key_0', axis=1)

Calcular o Mutual Information para verificar a importancia das variaveis

In [22]:
# mi = mutual_info_classif(df_aux2.to_numpy(), df_aux2.isFraud.to_numpy())

In [23]:
# with open('mi.pickle', 'wb') as f:
#     pickle.dump(mi, f)

In [24]:
with open('mi.pickle', 'rb') as f:
    mi = pickle.load(f)

In [25]:
mi_columns = df_aux2.columns

In [26]:
del df_aux1
del df_aux1_categ

gc.collect()

0

Separar a base em treino e teste

In [27]:
X = df_aux2.drop('isFraud', axis=1)
y = df_aux2['isFraud']

Standard Scaler

In [28]:
sc = StandardScaler()

In [33]:
X2 = pd.DataFrame(sc.fit_transform(X), columns=X.columns, index=X.index)

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X2, y, random_state=42, test_size=0.30)

In [37]:
del X
del X2
del y

gc.collect()

959

# Treinamento

Treinar o modelo para ser utilizado como padrao

In [38]:
model = LGBMClassifier(random_state=42, max_depth=7)

In [39]:
model.fit(X_train, y_train)

LGBMClassifier(max_depth=7, random_state=42)

In [40]:
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

In [41]:
print(classification_report(y_train, y_pred_train))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99    277732
           1       0.93      0.43      0.59     10096

    accuracy                           0.98    287828
   macro avg       0.96      0.71      0.79    287828
weighted avg       0.98      0.98      0.98    287828



In [42]:
print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99    118992
           1       0.89      0.41      0.56      4364

    accuracy                           0.98    123356
   macro avg       0.94      0.71      0.78    123356
weighted avg       0.98      0.98      0.97    123356



<p style="color:blue">Aprendizado Federado Vertical - colunas separadas sem criterio</p>

Treinar o modelo separadamente e enviar os resultados para um parte central, que ira consolidar os resultados e treinar um novo modelo com essas informacoes

- Separar as colunas

In [43]:
columns_half = int(len(df_aux2.columns.drop('isFraud'))/2)

In [44]:
columns1 = df_aux2.columns.drop('isFraud')[1:columns_half]
columns2 = df_aux2.columns.drop('isFraud')[columns_half:]

In [45]:
X_train_1 = X_train[columns1]
X_train_2 = X_train[columns2]

X_test_1 = X_test[columns1]
X_test_2 = X_test[columns2]

- Treinar os dois modelos localmente

In [46]:
model_bank1 = LGBMClassifier(random_state=42)
model_bank2 = LGBMClassifier(random_state=42)

In [47]:
model_bank1.fit(X_train_1, y_train)
model_bank2.fit(X_train_2, y_train)

LGBMClassifier(random_state=42)

- Preparar as bases para envio 

In [48]:
pred1 = model_bank1.predict_proba(X_train_1)
pred2 = model_bank2.predict_proba(X_train_2)

pred1_test = model_bank1.predict_proba(X_test_1)
pred2_test = model_bank2.predict_proba(X_test_2)

In [49]:
pred_final = []

for i in range(len(X_train_1)):
    pred_final.append(np.hstack((pred1[i], pred2[i])))

In [50]:
pred_final_test = []

for i in range(len(X_test_1)):
    pred_final_test.append(np.hstack((pred1_test[i], pred2_test[i])))

- Treino do novo modelo no servidor central

In [51]:
model_final = MLPClassifier(
    hidden_layer_sizes=(500, 100, 30), learning_rate_init=0.001, verbose=True
)

In [52]:
model_final.fit(pred_final, y_train.reset_index()['isFraud'])

Iteration 1, loss = 0.08133433
Iteration 2, loss = 0.07622985
Iteration 3, loss = 0.07575908
Iteration 4, loss = 0.07555859
Iteration 5, loss = 0.07557893
Iteration 6, loss = 0.07538105
Iteration 7, loss = 0.07541593
Iteration 8, loss = 0.07526647
Iteration 9, loss = 0.07526692
Iteration 10, loss = 0.07518245
Iteration 11, loss = 0.07525846
Iteration 12, loss = 0.07515714
Iteration 13, loss = 0.07508917
Iteration 14, loss = 0.07504916
Iteration 15, loss = 0.07514512
Iteration 16, loss = 0.07512429
Iteration 17, loss = 0.07506293
Iteration 18, loss = 0.07505275
Iteration 19, loss = 0.07496977
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.


MLPClassifier(hidden_layer_sizes=(500, 100, 30), verbose=True)

- Verificar os resultados

In [53]:
print(classification_report(y_test.reset_index()['isFraud'], model_final.predict(pred_final_test)))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99    118992
           1       0.81      0.49      0.61      4364

    accuracy                           0.98    123356
   macro avg       0.90      0.74      0.80    123356
weighted avg       0.98      0.98      0.98    123356



- Verificar resultados dos modelos locais

In [54]:
print(classification_report(y_test, model_bank1.predict(X_test_1)))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99    118992
           1       0.88      0.37      0.52      4364

    accuracy                           0.98    123356
   macro avg       0.93      0.69      0.76    123356
weighted avg       0.97      0.98      0.97    123356



In [55]:
print(classification_report(y_test, model_bank2.predict(X_test_2)))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99    118992
           1       0.89      0.31      0.46      4364

    accuracy                           0.97    123356
   macro avg       0.93      0.65      0.72    123356
weighted avg       0.97      0.97      0.97    123356



<p style="color:blue">Aprendizado Federado Vertical - colunas separadas por Mutual Information</p>

Treinar o modelo separadamente e enviar os resultados para um parte central, que ira consolidar os resultados e treinar um novo modelo com essas informacoes. Contudo, as colunas serao ordenadas de acordo com o valor do mutual information

In [56]:
colunas = {}
for i, col in enumerate(mi_columns):
    colunas[col] = mi[i]

del colunas['isFraud']

In [57]:
colunas_sort = sorted(colunas.items(), key=operator.itemgetter(1), reverse=True)

In [58]:
cols_1 = []
cols_2 = []

for i in range(len(colunas_sort)):
    if i % 2 == 0:
        cols_1.append(colunas_sort[i][0])
    else:
        cols_2.append(colunas_sort[i][0])

Separar colunas

In [59]:
X_train_1 = X_train[cols_1]
X_train_2 = X_train[cols_2]

X_test_1 = X_test[cols_1]
X_test_2 = X_test[cols_2]

- Treinar os dois modelos localmente

In [60]:
model_bank1_1 = LGBMClassifier(random_state=42)
model_bank2_1 = LGBMClassifier(random_state=42)

In [61]:
model_bank1_1.fit(X_train_1, y_train)
model_bank2_1.fit(X_train_2, y_train)

LGBMClassifier(random_state=42)

- Preparar as bases para envio 

In [62]:
pred1 = model_bank1_1.predict_proba(X_train_1)
pred2 = model_bank2_1.predict_proba(X_train_2)

pred1_test = model_bank1_1.predict_proba(X_test_1)
pred2_test = model_bank2_1.predict_proba(X_test_2)

In [63]:
pred_final = []

for i in range(len(X_train_1)):
    pred_final.append(np.hstack((pred1[i], pred2[i])))

In [64]:
pred_final_test = []

for i in range(len(X_test_1)):
    pred_final_test.append(np.hstack((pred1_test[i], pred2_test[i])))

- Treino do novo modelo no servidor central

In [65]:
model_final = MLPClassifier(
    hidden_layer_sizes=(500, 100, 30), learning_rate_init=0.001, verbose=True
)

In [66]:
model_final.fit(pred_final, y_train.reset_index()['isFraud'])

Iteration 1, loss = 0.08648239
Iteration 2, loss = 0.07580718
Iteration 3, loss = 0.07554184
Iteration 4, loss = 0.07542203
Iteration 5, loss = 0.07513672
Iteration 6, loss = 0.07504209
Iteration 7, loss = 0.07502907
Iteration 8, loss = 0.07496239
Iteration 9, loss = 0.07494645
Iteration 10, loss = 0.07490171
Iteration 11, loss = 0.07486076
Iteration 12, loss = 0.07478729
Iteration 13, loss = 0.07473195
Iteration 14, loss = 0.07473886
Iteration 15, loss = 0.07475297
Iteration 16, loss = 0.07469417
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.


MLPClassifier(hidden_layer_sizes=(500, 100, 30), verbose=True)

- Verificar os resultados

In [67]:
print(classification_report(y_test.reset_index()['isFraud'], model_final.predict(pred_final_test)))

              precision    recall  f1-score   support

           0       0.98      0.99      0.99    118992
           1       0.78      0.50      0.61      4364

    accuracy                           0.98    123356
   macro avg       0.88      0.75      0.80    123356
weighted avg       0.97      0.98      0.98    123356



- Verificar resultados dos modelos locais

In [68]:
print(classification_report(y_test, model_bank1_1.predict(X_test_1)))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99    118992
           1       0.88      0.37      0.52      4364

    accuracy                           0.98    123356
   macro avg       0.93      0.68      0.75    123356
weighted avg       0.97      0.98      0.97    123356



In [69]:
print(classification_report(y_test, model_bank2_1.predict(X_test_2)))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99    118992
           1       0.88      0.40      0.55      4364

    accuracy                           0.98    123356
   macro avg       0.93      0.70      0.77    123356
weighted avg       0.98      0.98      0.97    123356



<p style="color:red">Aprendizado Federado Horizontal - linhas separadas sem criterio</p>

Treinar o modelo separadamente e enviar os resultados para um parte central, que ira consolidar os resultados e treinar um novo modelo com essas informacoes

- Separar as bases

In [70]:
bank1_clients_index_train = X_train.sample(frac=0.7, random_state=42).index
bank2_clients_index_train = X_train.index.difference(bank1_clients_index_train)

y_bank1_train = y_train.loc[bank1_clients_index_train]
y_bank2_train = y_train.loc[bank2_clients_index_train]

bank1_clients_index_test = X_test.sample(frac=0.7, random_state=42).index
bank2_clients_index_test = X_test.index.difference(bank1_clients_index_test)
y_bank1_test = y_test.loc[bank1_clients_index_test]
y_bank2_test = y_test.loc[bank2_clients_index_test]

In [71]:
X_train_1 = X_train.loc[bank1_clients_index_train]
X_train_2 = X_train.loc[bank2_clients_index_train]

X_test_1 = X_test.loc[bank1_clients_index_test]
X_test_2 = X_test.loc[bank2_clients_index_test]

- Treinar modelo localmente

In [72]:
model_bank1 = LGBMClassifier(random_state=42)
model_bank2 = LGBMClassifier(random_state=42)

In [73]:
model_bank1.fit(X_train_1, y_bank1_train)
model_bank2.fit(X_train_2, y_bank2_train)

LGBMClassifier(random_state=42)

- Preparar as bases para envio 

In [74]:
pred1 = model_bank1.predict_proba(X_train_1)
pred2 = model_bank2.predict_proba(X_train_2)

pred1_test = model_bank1.predict_proba(X_test_1)
pred2_test = model_bank2.predict_proba(X_test_2)

In [75]:
pred_final = np.append(pred1, pred2, axis=0)
y_train_f = np.append(y_bank1_train, y_bank2_train, axis=0)

In [76]:
pred_final_test = np.append(pred1_test, pred2_test, axis=0)
y_test_f = np.append(y_bank1_test, y_bank2_test, axis=0)

- Treino do novo modelo no servidor central

In [77]:
model_final = MLPClassifier(
    hidden_layer_sizes=(500, 100, 30), learning_rate_init=0.001, verbose=True
)

In [78]:
model_final.fit(pred_final, y_train_f)

Iteration 1, loss = 0.07460784
Iteration 2, loss = 0.06835254
Iteration 3, loss = 0.06804165
Iteration 4, loss = 0.06798858
Iteration 5, loss = 0.06777473
Iteration 6, loss = 0.06774203
Iteration 7, loss = 0.06757376
Iteration 8, loss = 0.06763026
Iteration 9, loss = 0.06755479
Iteration 10, loss = 0.06751219
Iteration 11, loss = 0.06740739
Iteration 12, loss = 0.06743029
Iteration 13, loss = 0.06738449
Iteration 14, loss = 0.06733324
Iteration 15, loss = 0.06734228
Iteration 16, loss = 0.06728537
Iteration 17, loss = 0.06724223
Iteration 18, loss = 0.06726593
Iteration 19, loss = 0.06719835
Iteration 20, loss = 0.06726547
Iteration 21, loss = 0.06722433
Iteration 22, loss = 0.06724271
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.


MLPClassifier(hidden_layer_sizes=(500, 100, 30), verbose=True)

- Verificar os resultados

In [79]:
print(classification_report(y_test_f, model_final.predict(pred_final_test)))

              precision    recall  f1-score   support

           0       0.98      0.99      0.99    118992
           1       0.77      0.50      0.61      4364

    accuracy                           0.98    123356
   macro avg       0.87      0.75      0.80    123356
weighted avg       0.97      0.98      0.97    123356



- Verificar resultados dos modelos locais

In [80]:
print(classification_report(y_bank1_test, model_bank1.predict(X_test_1)))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99     83256
           1       0.87      0.42      0.57      3093

    accuracy                           0.98     86349
   macro avg       0.92      0.71      0.78     86349
weighted avg       0.98      0.98      0.97     86349



In [81]:
print(classification_report(y_bank2_test, model_bank2.predict(X_test_2)))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99     35736
           1       0.88      0.39      0.54      1271

    accuracy                           0.98     37007
   macro avg       0.93      0.69      0.76     37007
weighted avg       0.98      0.98      0.97     37007



<p style="color:red">Aprendizado Federado Horizontal - linhas separadas por cluster</p>

Treinar o modelo separadamente e enviar os resultados para um parte central, que ira consolidar os resultados e treinar um novo modelo com essas informacoes

In [82]:
pca = PCA(n_components=50)

In [83]:
pca.fit(X_train)

PCA(n_components=50)

In [84]:
X_train_aux = pca.transform(X_train)
X_test_aux = pca.transform(X_test)

In [85]:
cluster = KMeans(n_clusters=3)

In [86]:
X_train_aux = np.c_[X_train_aux, cluster.fit_predict(X_train_aux)]
X_test_aux = np.c_[X_test_aux, cluster.predict(X_test_aux)]

In [87]:
df_train = pd.DataFrame(X_train_aux, index=X_train.index)
df_test = pd.DataFrame(X_test_aux, index=X_test.index)

In [88]:
X_train_1 = df_train[df_train[50] == 0]
X_train_2 = df_train[df_train[50] == 1]
X_train_3 = df_train[df_train[50] == 2]

X_test_1 = df_test[df_test[50] == 0]
X_test_2 = df_test[df_test[50] == 1]
X_test_3 = df_test[df_test[50] == 2]

In [89]:
y_bank1_train = y_train.loc[X_train_1.index]
y_bank2_train = y_train.loc[X_train_2.index]
y_bank3_train = y_train.loc[X_train_3.index]

y_bank1_test = y_test.loc[X_test_1.index]
y_bank2_test = y_test.loc[X_test_2.index]
y_bank3_test = y_test.loc[X_test_3.index]

- Treinar modelo localmente

In [90]:
model_bank1 = LGBMClassifier(random_state=42)
model_bank2 = LGBMClassifier(random_state=42)
model_bank3 = LGBMClassifier(random_state=42)

In [91]:
model_bank1.fit(X_train_1, y_bank1_train)
model_bank2.fit(X_train_2, y_bank2_train)
model_bank3.fit(X_train_3, y_bank3_train)

LGBMClassifier(random_state=42)

- Preparar as bases para envio 

In [92]:
pred1 = model_bank1.predict_proba(X_train_1)
pred2 = model_bank2.predict_proba(X_train_2)
pred3 = model_bank2.predict_proba(X_train_3)

pred1_test = model_bank1.predict_proba(X_test_1)
pred2_test = model_bank2.predict_proba(X_test_2)
pred3_test = model_bank2.predict_proba(X_test_3)

In [93]:
pred_final = np.append(pred1, pred2, axis=0)
pred_final = np.append(pred_final, pred3, axis=0)

y_train_f = np.append(y_bank1_train, y_bank2_train, axis=0)
y_train_f = np.append(y_train_f, y_bank3_train, axis=0)

In [94]:
pred_final_test = np.append(pred1_test, pred2_test, axis=0)
pred_final_test = np.append(pred_final_test, pred3_test, axis=0)

y_test_f = np.append(y_bank1_test, y_bank2_test, axis=0)
y_test_f = np.append(y_test_f, y_bank3_test, axis=0)

- Treino do novo modelo no servidor central

In [95]:
model_final = MLPClassifier(
    hidden_layer_sizes=(500, 100, 30), learning_rate_init=0.001, verbose=True
)

In [96]:
model_final.fit(pred_final, y_train_f)

Iteration 1, loss = 0.09583766
Iteration 2, loss = 0.09079454
Iteration 3, loss = 0.09063121
Iteration 4, loss = 0.09038221
Iteration 5, loss = 0.09025831
Iteration 6, loss = 0.09005750
Iteration 7, loss = 0.08994660
Iteration 8, loss = 0.08986661
Iteration 9, loss = 0.08982562
Iteration 10, loss = 0.08975864
Iteration 11, loss = 0.08971135
Iteration 12, loss = 0.08979448
Iteration 13, loss = 0.08959808
Iteration 14, loss = 0.08956733
Iteration 15, loss = 0.08967196
Iteration 16, loss = 0.08957351
Iteration 17, loss = 0.08954935
Iteration 18, loss = 0.08955734
Iteration 19, loss = 0.08956709
Iteration 20, loss = 0.08955112
Iteration 21, loss = 0.08943137
Iteration 22, loss = 0.08948273
Iteration 23, loss = 0.08946377
Iteration 24, loss = 0.08945785
Iteration 25, loss = 0.08942979
Iteration 26, loss = 0.08942277
Iteration 27, loss = 0.08944788
Iteration 28, loss = 0.08943804
Iteration 29, loss = 0.08944206
Iteration 30, loss = 0.08947941
Iteration 31, loss = 0.08935305
Iteration 32, los

MLPClassifier(hidden_layer_sizes=(500, 100, 30), verbose=True)

- Verificar os resultados

In [97]:
print(classification_report(y_test_f, model_final.predict(pred_final_test)))

              precision    recall  f1-score   support

           0       0.98      0.99      0.98    118992
           1       0.61      0.31      0.41      4364

    accuracy                           0.97    123356
   macro avg       0.79      0.65      0.70    123356
weighted avg       0.96      0.97      0.96    123356



- Verificar resultados dos modelos locais

In [98]:
print(classification_report(y_bank1_test, model_bank1.predict(X_test_1)))

              precision    recall  f1-score   support

           0       0.95      0.99      0.97     11771
           1       0.85      0.58      0.69      1566

    accuracy                           0.94     13337
   macro avg       0.90      0.78      0.83     13337
weighted avg       0.93      0.94      0.93     13337



In [99]:
print(classification_report(y_bank2_test, model_bank2.predict(X_test_2)))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99     88590
           1       0.80      0.09      0.16      1908

    accuracy                           0.98     90498
   macro avg       0.89      0.55      0.58     90498
weighted avg       0.98      0.98      0.97     90498



In [100]:
print(classification_report(y_bank3_test, model_bank3.predict(X_test_3)))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98     18631
           1       0.85      0.37      0.52       890

    accuracy                           0.97     19521
   macro avg       0.91      0.68      0.75     19521
weighted avg       0.97      0.97      0.96     19521

