In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

import dask
import dask.dataframe as dd
from dask import delayed, compute

import os 

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

from imblearn.under_sampling import TomekLinks
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, precision_score, f1_score

from xgboost import XGBClassifier

from sklearn.ensemble import RandomForestClassifier

import warnings

# Notebook-wide settings: never truncate the column list when displaying
# wide frames, and silence FutureWarning messages.
pd.set_option("display.max_columns", None)
warnings.filterwarnings(action="ignore", category=FutureWarning)

In [2]:
# Directory holding the raw CSV inputs, and the entry names found in it.
dataPath = 'data/'
files = list(os.listdir(dataPath))
files

['credit_card_transactions-ibm_v2.csv', 'sd254_users.csv', 'sd254_cards.csv']

In [3]:
# Open the three raw CSV files as dask DataFrames.
# 'Errors?' is pinned to object dtype instead of being inferred
# (presumably because the column is mostly empty -- TODO confirm).
transactions = dd.read_csv(
    f'{dataPath}credit_card_transactions-ibm_v2.csv',
    dtype={'Errors?': 'object'},
)
user_info = dd.read_csv(f'{dataPath}sd254_users.csv')
card_info = dd.read_csv(f'{dataPath}sd254_cards.csv')

In [4]:
# Build a proper datetime column from the Year/Month/Day parts, then keep
# only the transactions dated 2019-01-01 or later.
date_parts = transactions[["Year", "Month", 'Day']]
transactions["Date"] = dd.to_datetime(date_parts)
is_recent = transactions.Date >= '2019-01-01'
transactions = transactions[is_recent]

In [5]:
# Attach the cardholder attributes to every transaction.
# NOTE(review): this matches the transaction's 'User' value against the
# positional row index of user_info, so it assumes row i of sd254_users.csv
# describes user i -- confirm against the data.
union = dd.merge(
    transactions, user_info,
    left_on='User', right_index=True, how='left',
)

# Attach the attributes of the specific card used in the transaction.
# Joining on 'User' alone pairs each transaction with *every* card the user
# owns, duplicating rows and attaching the wrong card's brand/limit/expiry.
# NOTE(review): assumes card_info exposes the per-user card number as
# 'CARD INDEX' (matching transactions['Card']) -- confirm the column name.
union = dd.merge(
    union, card_info,
    left_on=['User', 'Card'], right_on=['User', 'CARD INDEX'], how='left',
)

In [6]:
# Rows missing any of these fields cannot be analysed, so they are dropped.
required_cols = [
    # from transactions
    'User', 'Card', 'Month', 'Day', 'Is Fraud?',
    # from user_info
    'Person', 'City', 'State', 'Zipcode',
    # from card_info
    'Card Brand', 'Card Type', 'Has Chip', 'Cards Issued',
    'Credit Limit', 'Expires', 'Acct Open Date',
]
union = union.dropna(subset=required_cols)

def _dollars_to_float(col):
    """Strip the literal '$' from a string column and cast it to float64.

    Parameters
    ----------
    col : string-typed Series (dask or pandas) such as "$12.50".

    Returns
    -------
    The same Series with '$' removed, as float64.
    """
    # regex=False makes '$' a literal character. As a regular expression '$'
    # is the end-of-string anchor, and the default value of `regex` differs
    # between pandas versions, so the intent is spelled out explicitly.
    return col.str.replace('$', '', regex=False).astype('float64')


# Convert the money columns from "$..." strings to numbers.
# 'Amount' is overwritten in place; the others get underscore-style names.
union = union.assign(
    Amount=_dollars_to_float(union['Amount']),
    Credit_Limit=_dollars_to_float(union['Credit Limit']),
    Total_Debt=_dollars_to_float(union['Total Debt']),
    Per_Capita_Income_Zipcode=_dollars_to_float(union['Per Capita Income - Zipcode']),
    Yearly_Income_Person=_dollars_to_float(union['Yearly Income - Person']),
)

# The original string versions are no longer needed.
union = union.drop(
    ['Credit Limit', 'Total Debt', 'Per Capita Income - Zipcode', 'Yearly Income - Person'],
    axis=1)


# Parse the card dates, which are stored as month/year strings.
# 'Expires' is converted in place; 'Acct Open Date' is replaced by the
# parsed 'Acct_Open_Date' column and the raw string column is removed.
month_year_fmt = '%m/%Y'
opened_on = dd.to_datetime(union['Acct Open Date'], format=month_year_fmt)
expires_on = dd.to_datetime(union['Expires'], format=month_year_fmt)
union = union.assign(Acct_Open_Date=opened_on, Expires=expires_on)
union = union.drop(['Acct Open Date'], axis=1)

# Cast Zip to object so it can hold the text placeholder used below.
union = union.assign(Zip=union['Zip'].astype('object'))

# Replace missing merchant/location values with an explicit placeholder.
# Keys are the output column names, values the columns they are read from;
# the three merchant columns are written under new underscore-style names
# (the raw columns are kept), while Zip is overwritten.
missing_label = 'No registrado'
fill_targets = {
    'Merchant_Name': 'Merchant Name',
    'Merchant_City': 'Merchant City',
    'Merchant_State': 'Merchant State',
    'Zip': 'Zip',
}
union = union.assign(**{
    target: union[source].replace(np.nan, missing_label)
    for target, source in fill_targets.items()
})



# 'Errors': missing error text becomes the string '0'.
# 'Fraud': the 'No'/'Yes' label becomes a 0/1 target.
fraud_codes = {'No': 0, 'Yes': 1}
errors_filled = union['Errors?'].fillna('0')
fraud_flag = union['Is Fraud?'].map(fraud_codes)
union = union.assign(Errors=errors_filled, Fraud=fraud_flag)

# Drop the raw columns now that their cleaned versions exist.
union = union.drop(['Errors?', 'Is Fraud?'], axis=1)



In [7]:
def calc_days_until_expire(df):
    """Add 'days_until_expire': whole days between account opening and card expiry.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain datetime columns 'Expires' and 'Acct_Open_Date'.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra integer-days column; `df` itself is left
        untouched.

    Note: despite the name, the value is measured from the account opening
    date, not from the transaction date.
    """
    # assign() returns a new frame instead of writing into the caller's one;
    # functions handed to map_partitions should not mutate their input.
    return df.assign(
        days_until_expire=(df['Expires'] - df['Acct_Open_Date']).dt.days
    )

# Run calc_days_until_expire on every partition of the dask frame.
union = union.map_partitions(calc_days_until_expire)

In [8]:
def calc_pct_amount_limit(df):
    """Add 'pct_limit_amount': transaction amount as a fraction of the credit limit.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain numeric columns 'Amount' and 'Credit_Limit'.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra column; `df` itself is left untouched.
        Rows whose credit limit is 0 get NaN rather than +/-inf.
    """
    # Mask zero limits so the division yields NaN instead of infinity;
    # scikit-learn estimators reject infinite feature values.
    nonzero_limit = df['Credit_Limit'].where(df['Credit_Limit'] != 0)
    # assign() returns a new frame instead of writing into the caller's one;
    # functions handed to map_partitions should not mutate their input.
    return df.assign(pct_limit_amount=df['Amount'] / nonzero_limit)

# Run calc_pct_amount_limit on every partition of the dask frame.
union = union.map_partitions(calc_pct_amount_limit)

In [9]:
# 1 when the merchant is in the cardholder's home state / city, else 0.
# A vectorised column comparison replaces the row-wise apply(), which ran a
# Python lambda once per transaction. A missing merchant location counts as
# "not equal" (0); fillna(False) makes that explicit for nullable dtypes.
# NOTE(review): InCity compares city names only, so same-named cities in
# different states will match -- confirm this is intended.
union['InState'] = union['State'].eq(union['Merchant State']).fillna(False).astype('int64')
union['InCity'] = union['City'].eq(union['Merchant City']).fillna(False).astype('int64')

In [10]:
# Feature groups used for modelling.

# Identifier / raw transaction fields.
id_cols = ['User', 'Card', 'Year', 'Month', 'Day', 'Amount']

# Categorical fields; these are label-encoded before training.
# NOTE(review): the raw 'Merchant Name' is listed here although a cleaned
# 'Merchant_Name' column exists, while city/state use the cleaned versions
# -- confirm which one is intended.
cat_cols = [
    'Use Chip',
    'Merchant Name',
    'Merchant_City',
    'Merchant_State',
    'Errors',
    'Card Brand',
    'Card Type',
    'Has Chip',
    'Year PIN last Changed',
    'Gender',
]

# Continuous cardholder attributes.
cont_cols = [
    'Current Age',
    'Retirement Age',
    'Per_Capita_Income_Zipcode',
    'Yearly_Income_Person',
    'Total_Debt',
    'FICO Score',
    'Num Credit Cards',
]

# Engineered numeric features.
created_num = ['days_until_expire', 'pct_limit_amount', 'InState', 'InCity']

# Keep only the model inputs plus the 'Fraud' target.
columns_use = [*id_cols, *cat_cols, *cont_cols, *created_num]
union = union[[*columns_use, 'Fraud']]

In [11]:
# Train and evaluate one random-forest fraud classifier per user.

SEED = 42  # fixed seed so the split and the forest are reproducible
rforest = RandomForestClassifier(random_state=SEED)

# Materialise the dask graph once. Filtering and computing inside the loop
# re-ran the whole pipeline (CSV read, merges, cleaning) for every user.
# NOTE(review): assumes the filtered, column-reduced frame fits in memory.
union_pd = union.compute()

# sort=False keeps users in order of first appearance.
for user, user_df in union_pd.groupby('User', sort=False):
    # .copy() so the label encoding below writes to an independent frame
    # rather than to a slice of union_pd (avoids SettingWithCopyWarning).
    X = user_df[columns_use].copy()
    y = user_df['Fraud']

    # With a single class there is nothing to learn and precision/recall/F1
    # for the fraud class are undefined; this also skips one-row users, for
    # which train_test_split would raise.
    if y.nunique() < 2:
        print(f'Usuario {user}: una sola clase en Fraud, se omite')
        continue

    # Encode each categorical column as integer codes (fitted per user).
    for col in cat_cols:
        X[col] = LabelEncoder().fit_transform(X[col])

    Xt, Xv, yt, yv = train_test_split(X, y, train_size=0.6, random_state=SEED)
    rforest.fit(Xt, yt)  # fit() discards any previous user's trees
    y_pred = rforest.predict(Xv)

    # zero_division=0: report 0 instead of warning when the validation split
    # or the predictions contain no fraud cases.
    print('Para el usuario: ', user)
    print(f'Presicion : {precision_score(yv, y_pred, zero_division=0)}')
    print(f'Recall : {recall_score(yv, y_pred, zero_division=0)}')
    print(f'F1 : {f1_score(yv, y_pred, zero_division=0)}')
    

: 

: 