In [1]:
pip install mglearn

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.0.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [13]:
%matplotlib inline
from IPython.display import display
import matplotlib_inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import mglearn
from cycler import cycler
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import *
import os

# Libreria per importare CSV da Drive
# from google.colab import drive

matplotlib_inline.backend_inline.set_matplotlib_formats('pdf', 'png')
plt.rcParams['savefig.dpi'] = 300
plt.rcParams['image.cmap'] = "viridis"
plt.rcParams['image.interpolation'] = "none"
plt.rcParams['savefig.bbox'] = "tight"
plt.rcParams['lines.linewidth'] = 2
plt.rcParams['legend.numpoints'] = 1
plt.rc('axes', prop_cycle=(
    cycler('color', mglearn.plot_helpers.cm_cycle.colors) +
    cycler('linestyle', ['-', '-', "--", (0, (3, 3)), (0, (1.5, 1.5))])))

np.set_printoptions(precision=3, suppress=True)

pd.set_option("display.max_columns", 16)
pd.set_option('display.precision', 2)

__all__ = ['np', 'mglearn', 'display', 'plt', 'pd']


In [3]:
from sklearn.preprocessing import OneHotEncoder

def ohe(df, list_of_columns):
    # Inizializzare OneHotEncoder
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    
    # Manteniamo una copia del DataFrame originale
    df_encoded = df.copy()

    # Iterare sulle colonne da codificare
    for col in list_of_columns:
        # Applicare OneHotEncoder alla colonna specifica
        encoded = encoder.fit_transform(df[[col]])

        # Creare un DataFrame con le nuove colonne codificate, usando gli stessi indici del DataFrame originale
        encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out([col]), index=df.index)

        # Concatenare le nuove colonne codificate al DataFrame originale
        df_encoded = pd.concat([df_encoded, encoded_df], axis=1)
    df_encoded = df_encoded.drop(list_of_columns, axis=1)

    # Restituire il DataFrame finale, mantenendo le colonne originali
    return df_encoded

In [4]:
def train(df, model):
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuratezza: {accuracy}')

    # Calcolare la precisione
    precision = precision_score(y_test, y_pred)
    print(f'Precisione: {precision}')

    # Calcolare il recall (sensibilità)
    recall = recall_score(y_test, y_pred)
    print(f'Recall: {recall}')

    # Calcolare FPR (False Positive Ration)
    print(f'FPR: {1 - recall}')

    # Matrice di confusione
    cm = confusion_matrix(y_test, y_pred)
    print(f'Matrice di confusione:\n{cm}')
    return model

In [5]:
def test(df, model):
    
    df = ohe(df,['employment_status','payment_type','housing_status', 'source', 'device_os'])
    y_pred = model.predict(df.drop(['fraud_bool'], axis=1))
    y_test = df['fraud_bool']

    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuratezza: {accuracy}')

    # Calcolare la precisione
    precision = precision_score(y_test, y_pred)
    print(f'Precisione: {precision}')

    # Calcolare il recall (sensibilità)
    recall = recall_score(y_test, y_pred)
    print(f'Recall: {recall}')

    # Calcolare FPR (False Positive Ration)
    print(f'FPR: {1 - recall}')

    # Matrice di confusione
    cm = confusion_matrix(y_test, y_pred)
    print(f'Matrice di confusione:\n{cm}')

    print('\n__________\n')

In [29]:

#Recall = TN / (TN + FP)
#FPR = FP / = 1 - Recall

0.849744711889132

# (75-25 con rapporto 1 a 50)

In [6]:
df = pd.read_csv("Base.csv").drop(['device_fraud_count', 'month'], axis=1)

In [7]:
df_frodi = df[(df['fraud_bool'] == 1)]
n = df[(df['fraud_bool'] == 1)].shape[0]
print(n)
df_not_frodi = df[(df['fraud_bool'] != 1)].head(n*50)

df = pd.concat([df_frodi, df_not_frodi])

from sklearn.model_selection import train_test_split
df = ohe(df,['employment_status','payment_type','housing_status', 'source', 'device_os'])
X_train, X_test, y_train, y_test = train_test_split(df.drop(['fraud_bool'], axis=1), df['fraud_bool'], random_state=0)

11029


## Train RandomForestClassifier

In [8]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()

model = train(df, model)

Accuratezza: 0.9829967287725786
Precisione: 0.8268156424581006
Recall: 0.16192560175054704
FPR: 0.838074398249453
Matrice di confusione:
[[137785     93]
 [  2298    444]]


# Cross Validation

In [26]:
df = pd.read_csv("Base.csv").drop(['device_fraud_count', 'month'], axis=1)

df_frodi = df[(df['fraud_bool'] == 1)]
n = df[(df['fraud_bool'] == 1)].shape[0]
print(n)
df_not_frodi = df[(df['fraud_bool'] != 1)].head(n*50)

df = pd.concat([df_frodi, df_not_frodi])

df = ohe(df,['employment_status','payment_type','housing_status', 'source', 'device_os'])

k_folds = KFold(n_splits = 5)

scores = cross_val_score(model, df.drop('fraud_bool', axis=1), df['fraud_bool'])
print("Cross-validation scores: {}".format(scores))

11029
Cross-validation scores: [0.98  0.98  0.981 0.982 0.982]


# Information gain

In [30]:
from sklearn.feature_selection import mutual_info_classif

# Calculate Information Gain using mutual_info_classif
info_gain = mutual_info_classif(df.drop('fraud_bool', axis=1), df['fraud_bool'])
print("Information Gain for each feature:", info_gain)

Information Gain for each feature: [0.021 0.002 0.005 0.004 0.032 0.001 0.002 0.    0.002 0.006 0.013 0.002
 0.01  0.006 0.097 0.054 0.103 0.009 0.017 0.034 0.001 0.    0.103 0.132
 0.114 0.01  0.001 0.001 0.    0.002 0.    0.03  0.041 0.023 0.006 0.
 0.017 0.024 0.058 0.001 0.012 0.    0.    0.119 0.    0.048 0.002 0.039
 0.029 0.   ]


In [34]:
print("Information Gain for each feature:", info_gain)
print(df.drop('fraud_bool', axis=1).columns)

Information Gain for each feature: [0.021 0.002 0.005 0.004 0.032 0.001 0.002 0.    0.002 0.006 0.013 0.002
 0.01  0.006 0.097 0.054 0.103 0.009 0.017 0.034 0.001 0.    0.103 0.132
 0.114 0.01  0.001 0.001 0.    0.002 0.    0.03  0.041 0.023 0.006 0.
 0.017 0.024 0.058 0.001 0.012 0.    0.    0.119 0.    0.048 0.002 0.039
 0.029 0.   ]
Index(['income', 'name_email_similarity', 'prev_address_months_count',
       'current_address_months_count', 'customer_age', 'days_since_request',
       'intended_balcon_amount', 'zip_count_4w', 'velocity_6h', 'velocity_24h',
       'velocity_4w', 'bank_branch_count_8w',
       'date_of_birth_distinct_emails_4w', 'credit_risk_score',
       'email_is_free', 'phone_home_valid', 'phone_mobile_valid',
       'bank_months_count', 'has_other_cards', 'proposed_credit_limit',
       'foreign_request', 'session_length_in_minutes', 'keep_alive_session',
       'device_distinct_emails_8w', 'employment_status_CA',
       'employment_status_CB', 'employment_status

Le feature con più information gain sono:
- device_distinct_emails_8w (0.132)
- source_INTERNET (0.119)
- employment_status_CA (0.114)
- phone_mobile_valid (0.103)
- keep_alive_session (0.103)
- email_is_free (0.097)
- housing_status_BC (0.058)
- phone_home_valid (0.054)
- device_os_linux (0.048)

Punteggi totali:

0.    : 'employment_status_CE'
0.    : 'employment_status_CG'      
0.    : 'housing_status_BF'
0.    : 'housing_status_BG'   
0.    : 'session_length_in_minutes'
0.    : 'source_TELEAPP'
0.    : 'zip_count_4w'
0.: 'device_os_x11'
0.: 'payment_type_AE'
0.001 : 'days_since_request'    
0.001 : 'employment_status_CC'
0.001 : 'employment_status_CD'
0.001 : 'foreign_request'
0.001 : 'housing_status_BD'   
0.002 : 'device_os_macintosh'
0.002 : 'employment_status_CF'
0.002 : 'intended_balcon_amount'
0.002 : 'name_email_similarity'
0.002 : 'velocity_6h'
0.002: 'bank_branch_count_8w'   
0.004 : 'current_address_months_count'
0.005 : 'prev_address_months_count'
0.006 : 'credit_risk_score'  
0.006 : 'payment_type_AD'
0.006 : 'velocity_24h'    
0.009 : 'bank_months_count'
0.01  : 'date_of_birth_distinct_emails_4w'
0.01  : 'employment_status_CB'
0.012 : 'housing_status_BE'
0.013 : 'velocity_4w'
0.017 : 'has_other_cards'
0.017 : 'housing_status_BA'    
0.021 : 'income'
0.023 : 'payment_type_AC'      
0.024 : 'housing_status_BB'
0.029 : 'device_os_windows'   
0.03  : 'payment_type_AA'
0.032 : 'customer_age'
0.034 : 'proposed_credit_limit'
0.039: 'device_os_other'
0.041 : 'payment_type_AB'
0.048 : 'device_os_linux'      
0.054 : 'phone_home_valid'
0.058 : 'housing_status_BC'
0.097 : 'email_is_free'
0.103 : 'keep_alive_session'    
0.103 : 'phone_mobile_valid'  
0.114 : 'employment_status_CA'
0.119 : 'source_INTERNET'
0.132: 'device_distinct_emails_8w'