In [250]:
import numpy as np
import pandas as pd
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
from sklearn.mixture import BayesianGaussianMixture

In [251]:
# Numeric codes that replace the upper-cased `device_type` labels.
# NOTE(review): 'PAGO' is also a transaction type in the map below -- confirm it
# is a genuine device label and not a copy/paste leftover.
device_cod = {
    'DESCONOCIDO': 1.1,
    'TABLETA': 1.2,
    'MÓVIL': 1.3,
    'CAJERO AUTOMÁTICO': 1.4,
    'PAGO': 1.5,
    'POS': 1.6,
    'DISPOSITIVO_SOSPECHOSO': 1.7,
}

# Numeric codes that replace the upper-cased `transaction_type` labels.
transaction_cod = {
    'PAGO': 2.1,
    'RETIRO': 2.2,
    'DEPÓSITO': 2.3,
    'TRANSFERENCIA': 2.4,
    'COMPRA': 2.5,
}

# Numeric codes that replace the upper-cased `location_type` labels.
loc_type_cod = {
    'NACIONAL': 1,
    'INTERNACIONAL': 2,
}

In [252]:
# Load the raw transactions. Despite its name, `file_path` holds the DataFrame
# returned by read_csv, not a path.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# run on another machine without editing it.
file_path = pd.read_csv(
    'C:/Users/allan/src/projects/grupal_project/anomaly_detection/datos_transaccion.csv'
)

# `data` is an alias of the same DataFrame object (no copy), so the in-place
# encoding below is also visible through `file_path`.
data = file_path

# Upper-case each categorical label and swap it for its numeric code.
# Labels missing from a map become NaN.
for _column, _codes in (
    ('transaction_type', transaction_cod),
    ('location_type', loc_type_cod),
    ('device_type', device_cod),
):
    data[_column] = data[_column].str.upper().map(_codes)

data.head()

Unnamed: 0,transaction_id,client_id,transaction_date,amount,transaction_type,location,device_type,is_fraud,location_type,currency
0,TRANS_1,CLIENTE_722,2020-10-09 03:34:03,279.44,2.2,"De (Guanacaste, Cañas) hacia (Puntarenas, Buen...",1.2,0,1,USD
1,TRANS_2,CLIENTE_1693,2020-02-15 17:39:36,572.66,2.2,"De (Alajuela, Palmares) hacia (Limón, Talamanca)",1.1,0,1,USD
2,TRANS_3,CLIENTE_1105,2020-03-25 12:01:28,16227.15,2.3,"De (Guanacaste, Nicoya) hacia (Alajuela, Alaju...",1.4,0,1,USD
3,TRANS_4,CLIENTE_439,2021-12-02 06:54:46,4665.99,2.4,"De (Guanacaste, Liberia) hacia (Puntarenas, Bu...",1.3,0,1,USD
4,TRANS_5,CLIENTE_2178,2020-03-16 01:33:17,192.46,2.1,"De (Puntarenas, Aguirre) hacia (Bulgaria, Sofía)",1.2,0,2,USD


In [253]:
# Build the modelling frame: an independent copy of the encoded columns, renamed
# to the short names used by the following cells.
data_var = data[['location_type', 'device_type', 'transaction_type', 'amount', 'transaction_date']].copy()
data_var.columns = ['loc_cod', 'device_cod', 'trans_cod', 'amount', 'transaction_date']

data_var['transaction_date'] = pd.to_datetime(data_var['transaction_date'])

# Drop incomplete rows BEFORE turning dates into integers. Casting a missing
# date (NaT) to int64 yields a sentinel integer rather than NaN, so a dropna
# performed after the cast would silently keep those rows with a bogus timestamp.
data_var = data_var.dropna()

# Whole seconds since the Unix epoch. Timestamp/Timedelta arithmetic is used
# instead of `astype('int64') // 10**9` so the result does not depend on the
# datetime column having nanosecond resolution.
data_var['transaction_date'] = (
    (data_var['transaction_date'] - pd.Timestamp('1970-01-01')) // pd.Timedelta(seconds=1)
)

data_var.head()

Unnamed: 0,loc_cod,device_cod,trans_cod,amount,transaction_date
0,1,1.2,2.2,279.44,1602214443
1,1,1.1,2.2,572.66,1581788376
2,1,1.4,2.3,16227.15,1585137688
3,1,1.3,2.4,4665.99,1638428086
4,2,1.2,2.1,192.46,1584322397


In [254]:
# Columns screened by the percentile rule and fed to the mixture model
# (`loc_cod` is not part of either).
_model_cols = ['device_cod', 'trans_cod', 'amount', 'transaction_date']
_features = data_var[_model_cols]

# Tukey-style fences at 1.5x the spread between two percentiles.
# NOTE(review): the variables are named Q1/Q3/IQR but use the 30th and 70th
# percentiles, not the 25th/75th quartiles -- confirm this is intentional.
Q1 = _features.quantile(0.30)
Q3 = _features.quantile(0.70)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# A row is flagged when at least one of its screened columns lies outside the fences.
_outside_fences = _features.lt(lower_bound) | _features.gt(upper_bound)
data_var['is_anomaly'] = _outside_fences.any(axis=1).astype(int)  # 1 == anomalous, 0 == normal

# Standardise the same columns before fitting the mixture model.
scaler = StandardScaler()
data_scaled = scaler.fit_transform(_features)

# Bayesian Gaussian mixture with a fixed seed so the fit is repeatable.
gmm = BayesianGaussianMixture(
    n_components=4,
    covariance_type='full',
    random_state=42,
    max_iter=200,
    tol=1e-4,
    init_params='random',
    weight_concentration_prior=1e-3,
).fit(data_scaled)

# score_samples gives the per-row log-likelihood under the fitted mixture, so
# lower (more negative) values mean the row is less typical.
data_var['anomaly_score'] = gmm.score_samples(data_scaled)

data_var.head(10)

Unnamed: 0,loc_cod,device_cod,trans_cod,amount,transaction_date,is_anomaly,anomaly_score
0,1,1.2,2.2,279.44,1602214443,0,-2.251179
1,1,1.1,2.2,572.66,1581788376,0,-3.047424
2,1,1.4,2.3,16227.15,1585137688,1,-5.790767
3,1,1.3,2.4,4665.99,1638428086,0,-0.948562
4,2,1.2,2.1,192.46,1584322397,0,-3.76877
5,1,1.7,2.2,620.99,1610830999,0,-3.063381
6,1,1.6,2.5,14299.57,1679311578,1,-7.979784
7,1,1.4,2.1,220.6,1647359982,0,-2.928998
8,1,1.3,2.5,26.26,1679042662,0,-3.324536
9,1,1.7,2.5,207.72,1658492109,0,-3.81304


In [255]:
# Fit a Bayesian Gaussian mixture on the standardised features in `data_scaled`
# (fixed random_state for a repeatable fit).
gmm = BayesianGaussianMixture(n_components=4, covariance_type='full', random_state=42, max_iter=200, 
    tol=1e-4,  init_params='random',weight_concentration_prior=1e-3  
)

gmm.fit(data_scaled)

# Per-row log-likelihood under the fitted mixture; lower means less typical.
data_var['anomaly_score'] = gmm.score_samples(data_scaled)

# Rows scoring below the 25th percentile are flagged, i.e. roughly the
# lowest-likelihood quarter of the rows.
threshold = data_var['anomaly_score'].quantile(0.25)  # 25th percentile

# Overwrites any existing `is_anomaly` column with a boolean flag based on the score.
data_var['is_anomaly'] = data_var['anomaly_score'] < threshold

# errors='ignore' keeps this cell re-runnable: after the first run the column is
# already gone, and a plain drop would raise KeyError on the second run.
data_var = data_var.drop(columns=['transaction_date'], errors='ignore')

data_var.head(10)

Unnamed: 0,loc_cod,device_cod,trans_cod,amount,is_anomaly,anomaly_score
0,1,1.2,2.2,279.44,False,-2.251179
1,1,1.1,2.2,572.66,False,-3.047424
2,1,1.4,2.3,16227.15,True,-5.790767
3,1,1.3,2.4,4665.99,False,-0.948562
4,2,1.2,2.1,192.46,False,-3.76877
5,1,1.7,2.2,620.99,False,-3.063381
6,1,1.6,2.5,14299.57,True,-7.979784
7,1,1.4,2.1,220.6,False,-2.928998
8,1,1.3,2.5,26.26,False,-3.324536
9,1,1.7,2.5,207.72,False,-3.81304


In [256]:
# Preview the first 10 rows of the loaded transactions. `file_path` is the
# DataFrame returned by pd.read_csv (not a path). Its categorical columns were
# replaced with numeric codes in place through the `data` alias, so the coded
# values are what is displayed here.
file_path.head(10)

Unnamed: 0,transaction_id,client_id,transaction_date,amount,transaction_type,location,device_type,is_fraud,location_type,currency
0,TRANS_1,CLIENTE_722,2020-10-09 03:34:03,279.44,2.2,"De (Guanacaste, Cañas) hacia (Puntarenas, Buen...",1.2,0,1,USD
1,TRANS_2,CLIENTE_1693,2020-02-15 17:39:36,572.66,2.2,"De (Alajuela, Palmares) hacia (Limón, Talamanca)",1.1,0,1,USD
2,TRANS_3,CLIENTE_1105,2020-03-25 12:01:28,16227.15,2.3,"De (Guanacaste, Nicoya) hacia (Alajuela, Alaju...",1.4,0,1,USD
3,TRANS_4,CLIENTE_439,2021-12-02 06:54:46,4665.99,2.4,"De (Guanacaste, Liberia) hacia (Puntarenas, Bu...",1.3,0,1,USD
4,TRANS_5,CLIENTE_2178,2020-03-16 01:33:17,192.46,2.1,"De (Puntarenas, Aguirre) hacia (Bulgaria, Sofía)",1.2,0,2,USD
5,TRANS_6,CLIENTE_1868,2021-01-16 21:03:19,620.99,2.2,"De (Cartago, Turrialba) hacia (Cartago, Alvarado)",1.7,0,1,USD
6,TRANS_7,CLIENTE_2124,2023-03-20 11:26:18,14299.57,2.5,"De (Alajuela, Palmares) hacia (Guanacaste, Cañas)",1.6,1,1,USD
7,TRANS_8,CLIENTE_2162,2022-03-15 15:59:42,220.6,2.1,"De (Puntarenas, Coto Brus) hacia (Alajuela, Na...",1.4,0,1,USD
8,TRANS_9,CLIENTE_1874,2023-03-17 08:44:22,26.26,2.5,"De (Puntarenas, Puntarenas) hacia (Cartago, La...",1.3,0,1,USD
9,TRANS_10,CLIENTE_2496,2022-07-22 12:15:09,207.72,2.5,"De (Alajuela, Grecia) hacia (Heredia, Belén)",1.7,0,1,USD
