In [1]:
# Import all libraries that will be used for this project
import pandas as pd
import numpy as np
import datetime as dt
import re
# import seaborn as sns
import matplotlib.pyplot as plt

In [11]:
# Load the raw training data; the file uses ';' as its field delimiter
df = pd.read_csv("user_data_train.csv", delimiter=";")

In [12]:
# Preview the first rows to confirm the ';'-separated file parsed into the expected columns
df.head()

Unnamed: 0.1,Unnamed: 0,uid,is_scammer,source,trx_date,report_date,registereddate,birthday,gender,job_position,...,dormancy_count_trx,kyc_total_failed,kyc_total_revoked,avg_topup_weight_1,avg_x2x_weight_1,avg_other_weight_1,centrality_outdegree_p2p,centrality_indegree_p2p,centrality_undirected_p2p,centrality_outdegree_sendmoney
0,22341,a265c05d-6ed6-4991-971e-6ffab1074379,1,CS_REPORT_SCAMMER,30/06/2021,01/07/2021,01/02/2020,01/11/1999,Female,PELAJAR / MAHASISWA,...,71,100.0,0.0,0,0,0,0.0,0.0,0.0,0.0
1,29180,4d6ebe72-f050-49fd-82fe-a8764bb7a1a2,0,CS_REPORT_VICTIM,03/06/2022,11/06/2022,01/09/2021,01/04/2002,Male,PELAJAR / MAHASISWA,...,26,0.0,0.0,1770833,1927083,416667,0.0,0.0,0.0,0.0
2,8678,08744e6e-fec5-4168-bf38-68438dec2f88,0,INCOMPLETE_CS_REPORT_VICTIM,14/01/2023,14/01/2023,01/06/2021,01/01/1988,Male,WIRASWASTA,...,1354,0.0,0.0,106041667,1046875,253125,141348700.0,2826975000.0,3533719000.0,0.0
3,34607,f2a873e9-1e28-4db6-9652-cede968ab5d5,0,CS_REPORT_VICTIM,02/11/2022,08/11/2022,01/04/2019,01/03/2004,Female,PELAJAR / MAHASISWA,...,190,20.0,0.0,10833333,7760417,26145833,70674370.0,70674370.0,141348700.0,97956030.0
4,5770,c78d7429-7173-4c60-b911-dd8b10246baa,0,INCOMPLETE_CS_REPORT_VICTIM,02/03/2023,02/03/2023,01/02/2023,01/01/1982,Male,WIRASWASTA,...,63,20.0,0.0,41666667,58333333,50833333,424046200.0,141348700.0,565395000.0,1959121000.0


In [14]:
# Summarise column names, non-null counts and dtypes of the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40717 entries, 0 to 40716
Data columns (total 46 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Unnamed: 0                       40717 non-null  int64  
 1   uid                              40717 non-null  object 
 2   is_scammer                       40717 non-null  int64  
 3   source                           40717 non-null  object 
 4   trx_date                         40717 non-null  object 
 5   report_date                      40717 non-null  object 
 6   registereddate                   40717 non-null  object 
 7   birthday                         40717 non-null  object 
 8   gender                           40717 non-null  object 
 9   job_position                     40717 non-null  object 
 10  is_verified                      40717 non-null  int64  
 11  aqc_freq_prepaid_mobile          40717 non-null  int64  
 12  aqc_mean_prepaid_m

In [15]:
# Drop the leftover index column that came in with the CSV.
# errors='ignore' keeps this cell idempotent: because `df` is rebound here,
# re-running the cell would otherwise raise KeyError once the column is gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [16]:
# pylint: disable=invalid-name
def create(df: pd.DataFrame, reference_date=None) -> pd.DataFrame:
    """Return a copy of ``df`` with parsed dates and engineered features added.

    The input frame is not modified. It must contain the columns
    ``trx_date``, ``birthday``, ``report_date`` and ``registereddate``
    (``DD/MM/YYYY`` strings) plus ``dormancy_count_trx``,
    ``dormancy_max_gmt_pay_diff_days`` and ``aqc_freq_x2x``.

    Parameters
    ----------
    df : pd.DataFrame
        Source data.
    reference_date : str, datetime-like or None, optional
        Date whose calendar year is used to compute ``age``. ``None`` (the
        default) uses today's date, which makes ``age`` depend on when the
        code is run; pass a fixed date for reproducible features.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` with the four date columns converted to datetime and
        these columns appended, in order: ``account_lifetime``, ``age``,
        ``count_trx_per_lifetime``, ``max_gmt_pay_diff_days_per_lifetime``,
        ``freq_x2x_per_lifetime``,
        ``dormancy_max_gmt_pay_diff_days_per_count_trx``.

    Raises
    ------
    KeyError
        If a required column is missing.
    ValueError
        If a date string does not match ``DD/MM/YYYY``.
    """
    # Added to the lifetime denominator so a zero-day lifetime does not divide by zero.
    epsilon = 0.000001

    X = df.copy()

    # Parse every date column exactly once, with a strict day-first format.
    for date_col in ("trx_date", "birthday", "report_date", "registereddate"):
        X[date_col] = pd.to_datetime(X[date_col], format="%d/%m/%Y")

    # Days between registration and the transaction date.
    raw_lifetime = (X["trx_date"] - X["registereddate"]).dt.days

    # A transaction up to 14 days *before* registration is clamped to 0;
    # anything earlier than that (or a missing date) is treated as invalid (NaN).
    X["account_lifetime"] = np.select(
        [
            raw_lifetime < -14,
            (raw_lifetime >= -14) & (raw_lifetime < 0),
            raw_lifetime >= 0,
        ],
        [np.nan, 0, raw_lifetime],
        default=np.nan,
    )

    # Age is a calendar-year difference (month and day are ignored).
    if reference_date is None:
        reference_year = pd.Timestamp.today().year
    else:
        reference_year = pd.Timestamp(reference_date).year
    X["age"] = reference_year - X["birthday"].dt.year

    # Per-lifetime rates share one epsilon-shifted denominator.
    lifetime_denominator = X["account_lifetime"] + epsilon
    X["count_trx_per_lifetime"] = X["dormancy_count_trx"] / lifetime_denominator
    X["max_gmt_pay_diff_days_per_lifetime"] = (
        X["dormancy_max_gmt_pay_diff_days"] / lifetime_denominator
    )
    X["freq_x2x_per_lifetime"] = X["aqc_freq_x2x"] / lifetime_denominator

    # NOTE(review): a zero dormancy_count_trx is not guarded here, so this
    # ratio follows pandas division-by-zero rules (inf, or NaN for 0/0).
    # Confirm whether downstream consumers tolerate those values.
    X["dormancy_max_gmt_pay_diff_days_per_count_trx"] = (
        X["dormancy_max_gmt_pay_diff_days"] / X["dormancy_count_trx"]
    )

    return X

In [17]:
# Build the feature-engineered frame; create() works on a copy, so df itself is not modified
df2 = create(df)

In [19]:
# Display the frame with the engineered columns appended (pandas elides the middle rows)
df2

Unnamed: 0,uid,is_scammer,source,trx_date,report_date,registereddate,birthday,gender,job_position,is_verified,...,centrality_outdegree_p2p,centrality_indegree_p2p,centrality_undirected_p2p,centrality_outdegree_sendmoney,account_lifetime,age,count_trx_per_lifetime,max_gmt_pay_diff_days_per_lifetime,freq_x2x_per_lifetime,dormancy_max_gmt_pay_diff_days_per_count_trx
0,a265c05d-6ed6-4991-971e-6ffab1074379,1,CS_REPORT_SCAMMER,2021-06-30,2021-07-01,2020-02-01,1999-11-01,Female,PELAJAR / MAHASISWA,1,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,515.0,24,0.137864,6.038835,0.005825,43.802817
1,4d6ebe72-f050-49fd-82fe-a8764bb7a1a2,0,CS_REPORT_VICTIM,2022-06-03,2022-06-11,2021-09-01,2002-04-01,Male,PELAJAR / MAHASISWA,1,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,275.0,21,0.094545,3.890909,0.170909,41.153846
2,08744e6e-fec5-4168-bf38-68438dec2f88,0,INCOMPLETE_CS_REPORT_VICTIM,2023-01-14,2023-01-14,2021-06-01,1988-01-01,Male,WIRASWASTA,1,...,1.413487e+08,2.826975e+09,3.533719e+09,0.000000e+00,592.0,35,2.287162,1.047297,0.942568,0.457903
3,f2a873e9-1e28-4db6-9652-cede968ab5d5,0,CS_REPORT_VICTIM,2022-11-02,2022-11-08,2019-04-01,2004-03-01,Female,PELAJAR / MAHASISWA,1,...,7.067437e+07,7.067437e+07,1.413487e+08,9.795603e+07,1311.0,19,0.144928,3.867277,0.045004,26.684211
4,c78d7429-7173-4c60-b911-dd8b10246baa,0,INCOMPLETE_CS_REPORT_VICTIM,2023-03-02,2023-03-02,2023-02-01,1982-01-01,Male,WIRASWASTA,1,...,4.240462e+08,1.413487e+08,5.653950e+08,1.959121e+09,29.0,41,2.172414,0.689655,1.724138,0.317460
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40712,e191689a-09b1-4cbc-886c-d85ab9c797ed,0,CS_REPORT_VICTIM,2021-10-23,2021-11-03,2019-02-01,1986-05-01,Male,KARYAWAN SWASTA,1,...,0.000000e+00,0.000000e+00,0.000000e+00,1.632600e+08,995.0,37,0.543719,0.351759,0.240201,0.646950
40713,1453c0f3-812d-4d9c-ac8c-c316ad72c5b6,0,CS_REPORT_VICTIM,2021-05-07,2021-05-11,2018-12-01,2003-06-01,Male,PELAJAR / MAHASISWA,1,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,888.0,20,0.012387,2.387387,0.027027,192.727273
40714,55dcf997-3706-44f1-859f-b7cd8fc6f9fa,0,INCOMPLETE_CS_REPORT_VICTIM,2023-02-08,2023-02-08,2020-09-01,1980-09-01,Female,BELUM / TIDAK BEKERJA,1,...,5.583276e+09,2.332254e+10,6.714066e+09,1.306080e+09,890.0,43,1.348315,0.573034,0.891011,0.425000
40715,7461b796-ea62-4431-ba8a-9464638c1e77,0,CS_REPORT_VICTIM,2021-10-02,2021-10-17,2018-09-01,1993-01-01,Female,PELAJAR / MAHASISWA,1,...,0.000000e+00,7.067437e+07,7.067437e+07,3.265201e+08,1127.0,30,0.316770,0.523514,0.214729,1.652661


In [21]:
# Check that the date columns are now datetime64 and review non-null counts of the new frame
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40717 entries, 0 to 40716
Data columns (total 51 columns):
 #   Column                                        Non-Null Count  Dtype         
---  ------                                        --------------  -----         
 0   uid                                           40717 non-null  object        
 1   is_scammer                                    40717 non-null  int64         
 2   source                                        40717 non-null  object        
 3   trx_date                                      40717 non-null  datetime64[ns]
 4   report_date                                   40717 non-null  datetime64[ns]
 5   registereddate                                40717 non-null  datetime64[ns]
 6   birthday                                      40717 non-null  datetime64[ns]
 7   gender                                        40717 non-null  object        
 8   job_position                                  40717 non-null  obje

In [22]:
# Persist the engineered features with the same ';' delimiter as the input file.
# index=False leaves the row index out of the written CSV.
df2.to_csv("user_data_train_feature_creator.csv", index=False, sep=';')