In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

In [None]:
client_measures = pd.read_csv('client_measures.csv')
transactions = pd.read_csv('transactions.csv')
test = pd.read_csv('test.csv')
train = pd.read_csv('train.csv')

In [None]:
transactions.shape

(737908, 44)

In [None]:
transactions.head()

Unnamed: 0,client_id,trxn_direction,trxn_date,trxn_ammount,fl_0,fl_1,fl_2,fl_3,fl_4,fl_5,...,fl_30,fl_31,fl_32,fl_33,fl_34,fl_35,fl_36,fl_37,fl_38,fl_39
0,23112,0,2023-12-26,2.79757,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,23112,1,2023-12-27,2.79757,0,0,1,0,0,0,...,1,1,0,0,0,0,0,0,0,0
2,20472,1,2023-12-01,2.00785,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,20472,0,2023-11-28,2.75568,0,1,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
4,20472,0,2023-11-23,2.46082,0,0,0,1,0,0,...,1,1,0,1,0,0,0,0,0,1


In [None]:
print(transactions.columns)
print(transactions.dtypes)

Index(['client_id', 'trxn_direction', 'trxn_date', 'trxn_ammount', 'fl_0',
       'fl_1', 'fl_2', 'fl_3', 'fl_4', 'fl_5', 'fl_6', 'fl_7', 'fl_8', 'fl_9',
       'fl_10', 'fl_11', 'fl_12', 'fl_13', 'fl_14', 'fl_15', 'fl_16', 'fl_17',
       'fl_18', 'fl_19', 'fl_20', 'fl_21', 'fl_22', 'fl_23', 'fl_24', 'fl_25',
       'fl_26', 'fl_27', 'fl_28', 'fl_29', 'fl_30', 'fl_31', 'fl_32', 'fl_33',
       'fl_34', 'fl_35', 'fl_36', 'fl_37', 'fl_38', 'fl_39'],
      dtype='object')
client_id           int64
trxn_direction      int64
trxn_date          object
trxn_ammount      float64
fl_0                int64
fl_1                int64
fl_2                int64
fl_3                int64
fl_4                int64
fl_5                int64
fl_6                int64
fl_7                int64
fl_8                int64
fl_9                int64
fl_10               int64
fl_11               int64
fl_12               int64
fl_13               int64
fl_14               int64
fl_15               int64
fl_16

In [None]:
transactions['trxn_date'] = transactions['trxn_date'].str.replace("-","").astype(int)

In [None]:
transactions.columns[transactions.isna().any()].tolist()

[]

In [None]:
transactions = transactions.fillna(0.0)

In [None]:
transactions.columns[transactions.isna().any()].tolist()

[]

In [None]:
transactions.head()

Unnamed: 0,client_id,trxn_direction,trxn_date,trxn_ammount,fl_0,fl_1,fl_2,fl_3,fl_4,fl_5,...,fl_30,fl_31,fl_32,fl_33,fl_34,fl_35,fl_36,fl_37,fl_38,fl_39
0,23112,0,20231226,2.79757,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,23112,1,20231227,2.79757,0,0,1,0,0,0,...,1,1,0,0,0,0,0,0,0,0
2,20472,1,20231201,2.00785,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,20472,0,20231128,2.75568,0,1,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
4,20472,0,20231123,2.46082,0,0,0,1,0,0,...,1,1,0,1,0,0,0,0,0,1


In [None]:
client_measures.head()

Unnamed: 0,client_id,days_from_start,reg_code,activity_code,index_1,index_2,index_3,index_4
0,1,185.0,79.0,49.41,,,,45.0
1,2,1932.0,79.0,43.21,,0.0,,
2,3,9.0,79.0,47.91,,0.0,,
3,4,19.0,79.0,41.2,,,,
4,5,48.0,80.0,1.41,,0.0,,


In [None]:
client_measures = client_measures.fillna(0.0)

In [None]:
client_measures.shape

(35199, 8)

In [None]:
print(client_measures.columns)
print(client_measures.dtypes)

Index(['client_id', 'days_from_start', 'reg_code', 'activity_code', 'index_1',
       'index_2', 'index_3', 'index_4'],
      dtype='object')
client_id            int64
days_from_start    float64
reg_code           float64
activity_code      float64
index_1            float64
index_2            float64
index_3             object
index_4            float64
dtype: object


In [None]:
client_measures.columns[client_measures.isna().any()].tolist()

[]

In [None]:
client_measures = pd.get_dummies(client_measures, columns=['index_3'], dtype=int)

In [None]:
client_measures

Unnamed: 0,client_id,days_from_start,reg_code,activity_code,index_1,index_2,index_4,index_3_0.0,index_3_l1,index_3_l2,index_3_l3
0,1,185.0,79.0,49.41,0.0,0.0,45.0,1,0,0,0
1,2,1932.0,79.0,43.21,0.0,0.0,0.0,1,0,0,0
2,3,9.0,79.0,47.91,0.0,0.0,0.0,1,0,0,0
3,4,19.0,79.0,41.20,0.0,0.0,0.0,1,0,0,0
4,5,48.0,80.0,1.41,0.0,0.0,0.0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
35194,35339,10.0,45.0,46.74,0.0,0.0,61.0,1,0,0,0
35195,35340,13.0,45.0,46.73,0.0,0.0,66.0,0,0,0,1
35196,35341,69.0,45.0,46.90,0.0,0.0,48.0,0,0,1,0
35197,35342,7.0,45.0,43.21,0.0,0.0,61.0,1,0,0,0


In [None]:
client_measures.columns[client_measures.isna().any()].tolist()

[]

In [None]:
train.head()

Unnamed: 0,client_id,trigger_dt,target
0,2,2023-06-26,0
1,3,2023-04-23,0
2,4,2023-10-03,0
3,5,2023-07-06,0
4,6,2023-07-03,0


In [None]:
train.shape

(26005, 3)

In [None]:
print(train.columns)
print(train.dtypes)

Index(['client_id', 'trigger_dt', 'target'], dtype='object')
client_id      int64
trigger_dt    object
target         int64
dtype: object


In [None]:
train['trigger_dt'] = train['trigger_dt'].str.replace("-","").astype(int)

In [None]:
train.head()

Unnamed: 0,client_id,trigger_dt,target
0,2,20230626,0
1,3,20230423,0
2,4,20231003,0
3,5,20230706,0
4,6,20230703,0


In [None]:
train = pd.merge(train, client_measures, on='client_id')

In [None]:
train = pd.merge(train, transactions, on='client_id')

In [None]:
train

Unnamed: 0,client_id,trigger_dt,target,days_from_start,reg_code,activity_code,index_1,index_2,index_4,index_3_0.0,...,fl_30,fl_31,fl_32,fl_33,fl_34,fl_35,fl_36,fl_37,fl_38,fl_39
0,2,20230626,0,1932.0,79.0,43.21,0.0,0.0,0.0,1,...,1,0,0,0,1,0,0,0,0,0
1,2,20230626,0,1932.0,79.0,43.21,0.0,0.0,0.0,1,...,1,0,0,0,0,0,0,0,0,0
2,2,20230626,0,1932.0,79.0,43.21,0.0,0.0,0.0,1,...,1,1,0,0,0,0,0,0,0,0
3,3,20230423,0,9.0,79.0,47.91,0.0,0.0,0.0,1,...,1,0,0,0,0,0,0,0,0,0
4,3,20230423,0,9.0,79.0,47.91,0.0,0.0,0.0,1,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
551845,35343,20231024,1,15.0,45.0,14.11,0.0,0.0,57.0,0,...,1,1,0,0,0,0,0,0,0,0
551846,35343,20231024,1,15.0,45.0,14.11,0.0,0.0,57.0,0,...,1,0,0,0,0,0,0,0,0,0
551847,35343,20231024,1,15.0,45.0,14.11,0.0,0.0,57.0,0,...,1,0,0,0,1,0,0,0,0,0
551848,35343,20231024,1,15.0,45.0,14.11,0.0,0.0,57.0,0,...,1,0,0,0,0,0,0,0,0,0


In [None]:
train['trigger_dt']

0         20230626
1         20230626
2         20230626
3         20230423
4         20230423
            ...   
551845    20231024
551846    20231024
551847    20231024
551848    20231024
551849    20231024
Name: trigger_dt, Length: 551850, dtype: int64

In [None]:
X = train.drop(columns=['target', 'client_id'])
y = train['target']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
model = RandomForestClassifier()
model.fit(X_train, y_train)

In [None]:
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.99      0.98     75282
           1       0.98      0.92      0.95     35088

    accuracy                           0.97    110370
   macro avg       0.97      0.96      0.96    110370
weighted avg       0.97      0.97      0.97    110370

[[74677   605]
 [ 2792 32296]]
ROC AUC Score: 0.9561960934773986
