In [1]:
import os

import dask
import dask.dataframe as dd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from dask import delayed, compute
from imblearn.under_sampling import TomekLinks
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

sns.set()


# Remove the column cap so wide DataFrames are displayed with every column.
pd.options.display.max_columns = None

In [2]:
# Resolve every dataset from a single directory, relative to the notebook, so
# the cell does not depend on a machine-specific absolute path.
DATA_DIR = os.path.join('data', 'Credit Cards Transactions')

# Only the first 500,000 transactions are loaded for the exploratory work; the
# card and user tables are read in full.
transactions_sample = pd.read_csv(os.path.join(DATA_DIR, 'credit_card_transactions-ibm_v2.csv'), nrows=500_000)
card_info_sample = pd.read_csv(os.path.join(DATA_DIR, 'sd254_cards.csv'))
user_info_sample = pd.read_csv(os.path.join(DATA_DIR, 'sd254_users.csv'))

In [3]:
# Preview the first transaction row to see the raw format of each column.
transactions_sample.head(1)

Unnamed: 0,User,Card,Year,Month,Day,Time,Amount,Use Chip,Merchant Name,Merchant City,Merchant State,Zip,MCC,Errors?,Is Fraud?
0,0,0,2002,9,1,06:21,$134.09,Swipe Transaction,3527213246127876953,La Verne,CA,91750.0,5300,,No


# Limpieza de la información
Primero limpiaremos `transactions` usando la muestra, para tener una idea de qué debería contener cada columna.

In [4]:
# Summarise column dtypes and non-null counts of the transaction sample.
transactions_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500000 entries, 0 to 499999
Data columns (total 15 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   User            500000 non-null  int64  
 1   Card            500000 non-null  int64  
 2   Year            500000 non-null  int64  
 3   Month           500000 non-null  int64  
 4   Day             500000 non-null  int64  
 5   Time            500000 non-null  object 
 6   Amount          500000 non-null  object 
 7   Use Chip        500000 non-null  object 
 8   Merchant Name   500000 non-null  int64  
 9   Merchant City   500000 non-null  object 
 10  Merchant State  432257 non-null  object 
 11  Zip             428629 non-null  float64
 12  MCC             500000 non-null  int64  
 13  Errors?         8864 non-null    object 
 14  Is Fraud?       500000 non-null  object 
dtypes: float64(1), int64(7), object(7)
memory usage: 57.2+ MB


In [5]:
# Rows missing any of these fields cannot be analysed, so they are discarded.
transactions_sample = transactions_sample.dropna(subset=['User', 'Card', 'Month', 'Day', 'Is Fraud?'])

# Amount arrives as text with a leading '$'; strip it and cast to float. The
# pattern is a raw string because '\$' in a plain literal is an invalid escape.
transactions_sample['Amount'] = transactions_sample['Amount'].replace({r'\$': ''}, regex=True).astype('float')

# A missing 'Errors?' value means the transaction had no error, so the row is
# kept and tagged '0'. The result is assigned back instead of calling fillna
# in place on the selected column, which does not reliably update the frame.
transactions_sample['Errors?'] = transactions_sample['Errors?'].fillna('0')

# Missing merchant-location values may just be unregistered, which can itself
# be useful information, so they are labelled rather than dropped. Zip is cast
# to object first so it can hold the text label.
location_cols = ['Merchant Name', 'Merchant City', 'Merchant State', 'Zip']
transactions_sample['Zip'] = transactions_sample['Zip'].astype('object')
transactions_sample[location_cols] = transactions_sample[location_cols].replace({'NaN': 'No registrado',
                                                                                 np.nan: 'No registrado'})

# Keep only the hour of the 'HH:MM' time so transactions can be grouped by it.
# An explicit format avoids per-value format inference.
transactions_sample['Hour'] = pd.to_datetime(transactions_sample['Time'], format='%H:%M').dt.hour
transactions_sample = transactions_sample.drop(columns='Time')

# Encode the target as 0/1. map() avoids the downcasting deprecation that
# replace() triggers on an object column; the int cast fails loudly if a label
# other than 'No'/'Yes' shows up.
transactions_sample['Fraud'] = transactions_sample['Is Fraud?'].map({'No': 0, 'Yes': 1}).astype('int64')
transactions_sample = transactions_sample.drop(columns='Is Fraud?')


In [6]:
# Integer-encode the categorical transaction columns. A separate encoder is
# fitted and stored per column so each mapping can be reused or inverted later;
# refitting one shared encoder would only retain the last column's mapping.
cols_to_label = ['Use Chip', 'Merchant Name', 'Errors?']
transaction_encoders = {}

for col in cols_to_label:
    l_enc = LabelEncoder()
    transactions_sample[col] = l_enc.fit_transform(transactions_sample[col])
    transaction_encoders[col] = l_enc

Necesitamos datos de la tarjeta

In [7]:
# List the columns available in the card table.
card_info_sample.columns

Index(['User', 'CARD INDEX', 'Card Brand', 'Card Type', 'Card Number',
       'Expires', 'CVV', 'Has Chip', 'Cards Issued', 'Credit Limit',
       'Acct Open Date', 'Year PIN last Changed', 'Card on Dark Web'],
      dtype='object')

In [8]:
# Integer-encode the card attributes. A separate encoder is fitted and stored
# per column so each mapping can be reused or inverted later; refitting one
# shared encoder would only retain the last column's mapping.
cols_to_label = ['Card Brand', 'Card Type', 'Has Chip', 'Cards Issued', 'Year PIN last Changed', 'Card on Dark Web']
card_encoders = {}

for col in cols_to_label:
    l_enc = LabelEncoder()
    card_info_sample[col] = l_enc.fit_transform(card_info_sample[col])
    card_encoders[col] = l_enc

In [9]:
# Both date columns are stored as 'MM/YYYY' text; parse them into timestamps.
for date_col in ('Expires', 'Acct Open Date'):
    card_info_sample[date_col] = pd.to_datetime(card_info_sample[date_col], format='%m/%Y')

# Whole days between the account opening date and the card expiry date.
card_lifetime = card_info_sample['Expires'] - card_info_sample['Acct Open Date']
card_info_sample['days_until_expire'] = card_lifetime.dt.days

In [10]:
# Credit Limit is text with a leading '$'; strip it and cast to float. The
# pattern is a raw string because '\$' in a plain string literal is an invalid
# escape sequence, which newer Python versions warn about.
card_info_sample['Credit Limit'] = card_info_sample['Credit Limit'].replace({r'\$': ''}, regex=True).astype('float')

In [11]:
# Attach the card attributes to every transaction. 'Card' on the transaction
# side is matched against 'CARD INDEX' on the card side; the inner join drops
# transactions without a matching card.
transactions_sample = pd.merge(
    transactions_sample,
    card_info_sample,
    how='inner',
    left_on=['User', 'Card'],
    right_on=['User', 'CARD INDEX'],
)

Porcentaje del gasto comparado con su línea de crédito

In [12]:
# Share of the card's credit limit consumed by each individual transaction.
# NOTE(review): a credit limit of 0 would yield inf here - confirm none exist.
transactions_sample['pct_Limit_amount'] = transactions_sample['Amount'].div(transactions_sample['Credit Limit'])

Gasto promedio mensual por tipo de MCC para cada usuario y tarjeta

In [13]:
# Mean transaction amount per MCC for every (User, Card, Year, Month); MCCs
# with no transactions in a given month are filled with 0. Each resulting
# column is labelled 'MCC_<code>'.
month_keys = ['User', 'Card', 'Year', 'Month']
count_mcc = (
    transactions_sample
    .pivot_table(index=month_keys, columns='MCC', values='Amount', aggfunc='mean')
    .fillna(0)
    .add_prefix('MCC_')
    .reset_index()
)

# Broadcast the monthly profile back onto every transaction of that month.
transactions_sample = transactions_sample.merge(count_mcc, on=month_keys, how='inner')

In [14]:
# Total amount spent per merchant state for every (User, Card, Year, Month);
# states with no spend in a given month are filled with 0. Each resulting
# column is labelled 'Merch_State_<state>'.
month_keys = ['User', 'Card', 'Year', 'Month']
count_state = (
    transactions_sample
    .pivot_table(index=month_keys, columns='Merchant State', values='Amount', aggfunc='sum')
    .fillna(0)
    .add_prefix('Merch_State_')
    .reset_index()
)

# Broadcast the monthly profile back onto every transaction of that month.
transactions_sample = transactions_sample.merge(count_state, on=month_keys, how='inner')

Ahora los datos que se necesitan de cada usuario

In [15]:
# List the columns available in the user table.
user_info_sample.columns

Index(['Person', 'Current Age', 'Retirement Age', 'Birth Year', 'Birth Month',
       'Gender', 'Address', 'Apartment', 'City', 'State', 'Zipcode',
       'Latitude', 'Longitude', 'Per Capita Income - Zipcode',
       'Yearly Income - Person', 'Total Debt', 'FICO Score',
       'Num Credit Cards'],
      dtype='object')

In [16]:
# Integer-encode the categorical user columns. The fitted encoder is stored per
# column so the mapping can be reused or inverted later.
cols_to_label = ['Gender']
user_encoders = {}

for col in cols_to_label:
    l_enc = LabelEncoder()
    user_info_sample[col] = l_enc.fit_transform(user_info_sample[col])
    user_encoders[col] = l_enc

In [17]:
# Users lacking an identity or a location are dropped. The index is left
# untouched on purpose: the later merge with the transactions joins on it.
user_info_sample = user_info_sample.dropna(subset=['Person', 'City', 'State', 'Zipcode'])
# Every remaining gap is tagged with a text placeholder (literal kept as-is).
user_info_sample = user_info_sample.fillna('NoRecod')

# The monetary columns are text with a leading '$'; strip it and cast to float.
# The pattern is a raw string because '\$' in a plain string literal is an
# invalid escape sequence, which newer Python versions warn about.
money_cols = ['Per Capita Income - Zipcode', 'Yearly Income - Person', 'Total Debt']
for col in money_cols:
    user_info_sample[col] = user_info_sample[col].replace({r'\$': ''}, regex=True).astype('float')

In [18]:
# Attach the user attributes to every transaction by matching 'User' against
# the row index of the user table; the inner join drops unmatched transactions.
# NOTE(review): assumes a user's row position in the users file is their id - confirm.
transactions_sample = pd.merge(
    transactions_sample,
    user_info_sample,
    how='inner',
    left_on='User',
    right_index=True,
)

In [19]:
# Identifier-like columns plus the raw transaction amount.
id_cols = ['User', 'Card', 'Year', 'Month', 'Day', 'Amount']

# The monthly aggregate columns are recognised by their name prefix.
mc_cols = [name for name in transactions_sample.columns if name.startswith('MCC_')]
merch_cols = [name for name in transactions_sample.columns if name.startswith('Merch_')]

# Integer-encoded categorical columns.
cat_cols = [
    'Use Chip', 'Merchant Name', 'Errors?', 'Card Brand',
    'Card Type', 'Has Chip', 'Year PIN last Changed', 'Gender',
]
# Numeric user attributes.
cont_cols = [
    'Current Age', 'Retirement Age', 'Per Capita Income - Zipcode',
    'Yearly Income - Person', 'Total Debt', 'FICO Score', 'Num Credit Cards',
]

# Engineered numeric features; 'InState' and 'InCity' are created in the cell
# that follows this one.
created_num = ['days_until_expire', 'pct_Limit_amount', 'InState', 'InCity']

In [20]:
# Flag (1/0) whether the purchase happened in the cardholder's home state or
# city. The comparison is vectorised over whole columns rather than evaluated
# row by row with apply, which is far slower on a frame this size.
# NOTE(review): InCity compares city names only, so same-named cities in
# different states also match - confirm this is intended.
transactions_sample['InState'] = (transactions_sample['State'] == transactions_sample['Merchant State']).astype('int64')
transactions_sample['InCity'] = (transactions_sample['City'] == transactions_sample['Merchant City']).astype('int64')

In [21]:
# Keep only the modelling columns, with the target 'Fraud' placed last.
model_columns = id_cols + mc_cols + merch_cols + cat_cols + cont_cols + created_num + ['Fraud']
transactions_sample = transactions_sample.loc[:, model_columns]

# Primeras pruebas de modelo

In [22]:
# Fixed seed so the split, and every metric computed from it, is reproducible.
RANDOM_STATE = 42

X = transactions_sample[id_cols + mc_cols + merch_cols + cat_cols + cont_cols + created_num]
y = transactions_sample['Fraud']

# Fraud is a tiny fraction of the rows, so the 70/30 split is stratified to
# keep the same fraud rate in the training and validation sets.
Xt, Xv, yt, yv = train_test_split(X, y, train_size=0.7, stratify=y, random_state=RANDOM_STATE)

# Tomek-link under-sampling is applied to the training part only, so the
# validation set keeps the original class distribution.
tl = TomekLinks(sampling_strategy='auto')
Xt, yt = tl.fit_resample(Xt, yt)

In [23]:
# Gradient-boosted tree classifier with the library's default hyper-parameters.
# XGBClassifier is imported in the notebook's top import cell.
xgclas = XGBClassifier()

In [24]:
# Train on the resampled training set, then predict the validation set.
predicted = xgclas.fit(Xt, yt).predict(Xv)

In [25]:
# Per-class precision, recall and F1 on the validation set.
# classification_report is imported in the notebook's top import cell.
print(classification_report(y_true=yv, y_pred=predicted))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    149859
           1       0.81      0.71      0.75       141

    accuracy                           1.00    150000
   macro avg       0.90      0.85      0.88    150000
weighted avg       1.00      1.00      1.00    150000



In [None]:
    print(classification_report(yt, xgclas.predict(Xt)))

# Ejecución distribuida
Para poder ejecutar el proceso sobre toda la información usaremos Dask, que distribuye el trabajo y evita que muera el kernel.

In [None]:
# Resolve every dataset from a single directory, relative to the notebook, so
# the cell does not depend on a machine-specific absolute path.
DATA_DIR = os.path.join('data', 'Credit Cards Transactions')

# 'Errors?' is forced to object so that dtype is not inferred for it.
transactions = dd.read_csv(os.path.join(DATA_DIR, 'credit_card_transactions-ibm_v2.csv'), dtype={'Errors?': 'object'})

user_info = dd.read_csv(os.path.join(DATA_DIR, 'sd254_users.csv'))
card_info = dd.read_csv(os.path.join(DATA_DIR, 'sd254_cards.csv'))

In [None]:
# Attach user attributes by matching 'User' against the user table's index.
# NOTE(review): assumes the users file loads as a single partition so its
# index is the row position in the file - confirm.
union = dd.merge(transactions, user_info, left_on='User', right_index=True, how='left')

# Attach card attributes using both the user and the card number within that
# user, the same keys as the pandas merge on the sample. Joining on 'User'
# alone would pair each transaction with every card the user owns.
union = dd.merge(union, card_info, left_on=['User', 'Card'], right_on=['User', 'CARD INDEX'], how='left')
