In [8]:
# Import all libraries that will be used for this project
import pandas as pd
import numpy as np
import datetime as dt
import re
# import seaborn as sns
import matplotlib.pyplot as plt

In [11]:
# Load the comma-separated user dataset into a DataFrame
df = pd.read_csv(
    "user_data_test.csv",
    sep=',',
)

In [12]:
# Preview the first five rows of the loaded frame
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,uid,is_scammer,source,trx_date,report_date,registereddate,birthday,gender,job_position,...,dormancy_count_trx,kyc_total_failed,kyc_total_revoked,avg_topup_weight_1,avg_x2x_weight_1,avg_other_weight_1,centrality_outdegree_p2p,centrality_indegree_p2p,centrality_undirected_p2p,centrality_outdegree_sendmoney
0,10,a4377cf6-8fe3-408f-b438-534671da6ed1,0,CS_REPORT_VICTIM,8/2/2021,20/02/2021,1/3/2019,1/7/1938,Female,MENGURUS RUMAH TANGGA,...,573,30.0,0.0,104167,52083,208333,0.0,0.0,0.0,0.0
1,44,b5c4d8ab-5d0e-483e-8c74-968e90efd07f,1,INCOMPLETE_CS_REPORT_SCAMMER,17/01/2023,17/01/2023,1/10/2022,1/7/1945,Male,PETANI / PEKEBUN,...,23,0.0,0.0,6145833,11666667,6666667,70674374.89,141348700.0,212023100.0,326520100.0
2,45,36a93fc1-b26c-4d34-be3e-404d233b1cbe,1,CS_REPORT_SCAMMER,21/12/2022,3/1/2023,1/1/2020,1/8/1945,Male,PETANI / PEKEBUN,...,57,100.0,0.0,416667,1979167,0,0.0,0.0,0.0,0.0
3,69,88296630-c0b9-46eb-b41c-829edc38a101,1,INCOMPLETE_CS_REPORT_SCAMMER,3/3/2023,3/3/2023,1/8/2021,1/10/1947,Female,MENGURUS RUMAH TANGGA,...,118,0.0,0.0,375,5046875,15104167,0.0,35125160000.0,35125160000.0,653040.2
4,88,b91c38fe-3061-409a-9a28-dff26a4587a4,1,CS_REPORT_SCAMMER,3/12/2022,4/12/2022,1/9/2022,1/7/1949,Female,MENGURUS RUMAH TANGGA,...,78,60.0,0.0,3802083,6927083,833333,0.0,0.0,0.0,0.0


In [4]:
# Print the column names, non-null counts and dtypes of the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2142 entries, 0 to 2141
Data columns (total 54 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Unnamed: 0                       2142 non-null   int64  
 1   trx_date                         2142 non-null   float64
 2   report_date                      2142 non-null   float64
 3   registereddate                   2142 non-null   float64
 4   birthday                         2142 non-null   float64
 5   is_verified                      2142 non-null   int64  
 6   aqc_freq_prepaid_mobile          2142 non-null   float64
 7   aqc_mean_prepaid_mobile_amount   2142 non-null   float64
 8   aqc_freq_topup                   2142 non-null   float64
 9   aqc_freq_topup_within_7d         2142 non-null   float64
 10  aqc_mean_topup_amount            2142 non-null   float64
 11  aqc_mean_topup_amount_7d         2142 non-null   float64
 12  aqc_mean_topup_amoun

In [13]:
# Remove the 'Unnamed: 0' column (presumably an index saved by an earlier
# export -- confirm). errors='ignore' keeps this cell safe to re-run: because
# the result is assigned back to `df`, a second run would otherwise raise
# KeyError once the column is already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [14]:
# pylint: disable=invalid-name
def create(df: pd.DataFrame) -> pd.DataFrame:
    """Derive account-lifetime, age and ratio features from a user frame.

    The input is copied first, so the caller's frame is never modified.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the date columns ``trx_date``, ``report_date``,
        ``registereddate`` and ``birthday`` written as ``day/month/year``
        strings, plus the numeric columns ``dormancy_count_trx``,
        ``dormancy_max_gmt_pay_diff_days`` and ``aqc_freq_x2x``.

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` whose four date columns are converted to datetimes,
        with these columns appended (in this order): ``account_lifetime``,
        ``age``, ``count_trx_per_lifetime``,
        ``max_gmt_pay_diff_days_per_lifetime``, ``freq_x2x_per_lifetime`` and
        ``dormancy_max_gmt_pay_diff_days_per_count_trx``.

    Raises
    ------
    ValueError
        Raised by ``pd.to_datetime`` when ``trx_date``, ``report_date`` or
        ``registereddate`` holds a value that does not match ``%d/%m/%Y``.
        A malformed ``birthday`` does not raise; it becomes ``NaT`` and the
        resulting ``age`` is ``NaN``.
    """
    X = df.copy()

    date_format = "%d/%m/%Y"

    # Transaction, report and registration dates are parsed strictly: a value
    # that does not match the format is an error, not something to hide.
    for column in ("trx_date", "report_date", "registereddate"):
        X[column] = pd.to_datetime(X[column], format=date_format)

    # Birthdays are parsed exactly once, leniently. Unparseable values become
    # NaT instead of aborting the whole run.
    X["birthday"] = pd.to_datetime(
        X["birthday"], format=date_format, errors="coerce"
    )

    # Days between registration and the transaction date.
    lifetime_days = (X["trx_date"] - X["registereddate"]).dt.days

    # A transaction up to 14 days *before* registration is tolerated and
    # clamped to 0; anything earlier than that is treated as invalid (NaN).
    # Rows with a missing lifetime match no condition and fall to the default.
    X["account_lifetime"] = np.select(
        [
            lifetime_days < -14,
            (lifetime_days >= -14) & (lifetime_days < 0),
            lifetime_days >= 0,
        ],
        [np.nan, 0, lifetime_days],
        default=np.nan,
    )

    # Age is measured against today's calendar year, so it depends on the day
    # the notebook is run (only the birth year is used, not month/day).
    current_year = pd.Timestamp.today().year
    X["age"] = current_year - X["birthday"].dt.year

    # Small offset so a zero-day lifetime yields a very large ratio rather
    # than a division by zero.
    epsilon = 0.000001
    lifetime_denominator = X["account_lifetime"] + epsilon

    X["count_trx_per_lifetime"] = X["dormancy_count_trx"] / lifetime_denominator
    X["max_gmt_pay_diff_days_per_lifetime"] = (
        X["dormancy_max_gmt_pay_diff_days"] / lifetime_denominator
    )
    X["freq_x2x_per_lifetime"] = X["aqc_freq_x2x"] / lifetime_denominator

    # No offset here: a zero transaction count produces inf (or NaN for 0/0).
    X["dormancy_max_gmt_pay_diff_days_per_count_trx"] = (
        X["dormancy_max_gmt_pay_diff_days"] / X["dormancy_count_trx"]
    )

    return X

In [15]:
# Build the feature-enriched frame; create() works on a copy, so `df` is unchanged
df2 = create(df)

In [16]:
# Display the engineered frame (the rendered output elides middle rows and columns)
df2

Unnamed: 0,uid,is_scammer,source,trx_date,report_date,registereddate,birthday,gender,job_position,is_verified,...,centrality_outdegree_p2p,centrality_indegree_p2p,centrality_undirected_p2p,centrality_outdegree_sendmoney,account_lifetime,age,count_trx_per_lifetime,max_gmt_pay_diff_days_per_lifetime,freq_x2x_per_lifetime,dormancy_max_gmt_pay_diff_days_per_count_trx
0,a4377cf6-8fe3-408f-b438-534671da6ed1,0,CS_REPORT_VICTIM,2021-02-08,2021-02-20,2019-03-01,1938-07-01,Female,MENGURUS RUMAH TANGGA,1,...,0.00,0.000000e+00,0.000000e+00,0.000000e+00,710.0,85,0.807042,5.971831,0.770423,7.399651
1,b5c4d8ab-5d0e-483e-8c74-968e90efd07f,1,INCOMPLETE_CS_REPORT_SCAMMER,2023-01-17,2023-01-17,2022-10-01,1945-07-01,Male,PETANI / PEKEBUN,1,...,70674374.89,1.413487e+08,2.120231e+08,3.265201e+08,108.0,78,0.212963,3.981481,0.148148,18.695652
2,36a93fc1-b26c-4d34-be3e-404d233b1cbe,1,CS_REPORT_SCAMMER,2022-12-21,2023-01-03,2020-01-01,1945-08-01,Male,PETANI / PEKEBUN,1,...,0.00,0.000000e+00,0.000000e+00,0.000000e+00,1085.0,78,0.052535,0.857143,0.022120,16.315789
3,88296630-c0b9-46eb-b41c-829edc38a101,1,INCOMPLETE_CS_REPORT_SCAMMER,2023-03-03,2023-03-03,2021-08-01,1947-10-01,Female,MENGURUS RUMAH TANGGA,1,...,0.00,3.512516e+10,3.512516e+10,6.530402e+05,579.0,76,0.203800,1.191710,0.167530,5.847458
4,b91c38fe-3061-409a-9a28-dff26a4587a4,1,CS_REPORT_SCAMMER,2022-12-03,2022-12-04,2022-09-01,1949-07-01,Female,MENGURUS RUMAH TANGGA,1,...,0.00,0.000000e+00,0.000000e+00,0.000000e+00,93.0,74,0.838710,3.225806,0.634409,3.846154
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2137,fb77cca3-d9d8-4d80-bab7-63d0fdb92183,1,CS_REPORT_SCAMMER,2021-06-18,2021-07-03,2021-06-01,1900-01-01,,LAINNYA,0,...,0.00,0.000000e+00,0.000000e+00,0.000000e+00,17.0,123,1.000000,0.000000,0.529412,0.000000
2138,51e6482a-f277-434f-b42c-641637856554,1,CS_REPORT_SCAMMER,2022-04-26,2022-05-02,2022-04-01,1900-01-01,,LAINNYA,0,...,0.00,0.000000e+00,0.000000e+00,0.000000e+00,25.0,123,2.800000,0.400000,0.000000,0.142857
2139,3e9c9ba5-e85d-4c39-a609-0dcb8896e409,1,CS_REPORT_SCAMMER,2022-04-19,2022-04-23,2022-04-01,1900-01-01,,LAINNYA,0,...,0.00,0.000000e+00,0.000000e+00,0.000000e+00,18.0,123,0.944444,0.000000,0.000000,0.000000
2140,b0d62892-ecad-442d-b2c1-56666b673e36,0,CS_REPORT_VICTIM,2021-04-05,2021-04-13,2018-09-01,1900-01-01,,LAINNYA,0,...,0.00,0.000000e+00,0.000000e+00,0.000000e+00,947.0,123,0.026399,2.914467,0.000000,110.400000


In [17]:
# Print the column names, non-null counts and dtypes of the engineered frame
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2142 entries, 0 to 2141
Data columns (total 51 columns):
 #   Column                                        Non-Null Count  Dtype         
---  ------                                        --------------  -----         
 0   uid                                           2142 non-null   object        
 1   is_scammer                                    2142 non-null   int64         
 2   source                                        2142 non-null   object        
 3   trx_date                                      2142 non-null   datetime64[ns]
 4   report_date                                   2142 non-null   datetime64[ns]
 5   registereddate                                2142 non-null   datetime64[ns]
 6   birthday                                      2142 non-null   datetime64[ns]
 7   gender                                        1853 non-null   object        
 8   job_position                                  2142 non-null   object

In [18]:
# Persist the engineered features as a semicolon-delimited CSV without the index
df2.to_csv(
    "user_data_test_feature_creator.csv",
    sep=';',
    index=False,
)