In [1]:
# Import all libraries that will be used for this project
import pandas as pd
import numpy as np
import datetime as dt
import re
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Mount Google Drive at /content/drive so the dataset files stored there
# can be read and written by path in the cells below
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Load the semicolon-separated training data from the mounted Drive folder
df = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/Capstone/datasets/user_data_train.csv",
    sep=";",
)

In [6]:
# Preview the first rows of the loaded frame as a quick sanity check
df.head()

Unnamed: 0.1,Unnamed: 0,uid,is_scammer,source,trx_date,report_date,registereddate,birthday,gender,job_position,...,dormancy_count_trx,kyc_total_failed,kyc_total_revoked,avg_topup_weight_1,avg_x2x_weight_1,avg_other_weight_1,centrality_outdegree_p2p,centrality_indegree_p2p,centrality_undirected_p2p,centrality_outdegree_sendmoney
0,22341,a265c05d-6ed6-4991-971e-6ffab1074379,1,CS_REPORT_SCAMMER,30/06/2021,01/07/2021,01/02/2020,01/11/1999,Female,PELAJAR / MAHASISWA,...,71,100.0,0.0,0,0,0,0.0,0.0,0.0,0.0
1,29180,4d6ebe72-f050-49fd-82fe-a8764bb7a1a2,0,CS_REPORT_VICTIM,03/06/2022,11/06/2022,01/09/2021,01/04/2002,Male,PELAJAR / MAHASISWA,...,26,0.0,0.0,1770833,1927083,416667,0.0,0.0,0.0,0.0
2,8678,08744e6e-fec5-4168-bf38-68438dec2f88,0,INCOMPLETE_CS_REPORT_VICTIM,14/01/2023,14/01/2023,01/06/2021,01/01/1988,Male,WIRASWASTA,...,1354,0.0,0.0,106041667,1046875,253125,141348700.0,2826975000.0,3533719000.0,0.0
3,34607,f2a873e9-1e28-4db6-9652-cede968ab5d5,0,CS_REPORT_VICTIM,02/11/2022,08/11/2022,01/04/2019,01/03/2004,Female,PELAJAR / MAHASISWA,...,190,20.0,0.0,10833333,7760417,26145833,70674370.0,70674370.0,141348700.0,97956030.0
4,5770,c78d7429-7173-4c60-b911-dd8b10246baa,0,INCOMPLETE_CS_REPORT_VICTIM,02/03/2023,02/03/2023,01/02/2023,01/01/1982,Male,WIRASWASTA,...,63,20.0,0.0,41666667,58333333,50833333,424046200.0,141348700.0,565395000.0,1959121000.0


In [7]:
# EDA: count the missing values in every column
df.isna().sum(axis=0)

Unnamed: 0                            0
uid                                   0
is_scammer                            0
source                                0
trx_date                              0
report_date                           0
registereddate                        0
birthday                              0
gender                                0
job_position                          0
is_verified                           0
aqc_freq_prepaid_mobile               0
aqc_mean_prepaid_mobile_amount        0
aqc_freq_topup                        0
aqc_freq_topup_within_7d              0
aqc_mean_topup_amount                 0
aqc_mean_topup_amount_7d              0
aqc_mean_topup_amount_30d             0
aqc_mean_topup_amount_90d             0
aqc_total_topup_amount_7d             0
aqc_total_topup_amount_90d            0
aqc_freq_x2x                          0
aqc_freq_x2x_within_60d               0
aqc_freq_x2x_within_90d               0
aqc_mean_x2x_amount                   0


In [8]:
# Rename "Unnamed: 0" to "id", then build df1 without the "uid" and "id"
# columns (df itself keeps both, so uid can be re-attached later)
df = df.rename({"Unnamed: 0": "id"}, axis="columns")
df1 = df.drop(["uid", "id"], axis=1)

In [9]:
# Count how many rows come from each report source
df.loc[:, "source"].value_counts()

CS_REPORT_SCAMMER               11919
CS_REPORT_VICTIM                10819
INCOMPLETE_CS_REPORT_VICTIM      9848
INCOMPLETE_CS_REPORT_SCAMMER     8131
Name: source, dtype: int64

In [12]:
# pylint: disable=invalid-name
def create(df: pd.DataFrame, reference_date=None) -> pd.DataFrame:
    """Return a copy of ``df`` with parsed dates and derived features added.

    The input frame is not modified. The columns ``trx_date``, ``birthday``,
    ``report_date`` and ``registereddate`` are parsed from ``DD/MM/YYYY``
    strings into datetimes, and these columns are added:

    * ``account_lifetime`` -- days from ``registereddate`` to ``trx_date``.
      Values in ``[-14, 0)`` are clamped to 0; values below -14 (and missing
      values) become NaN.
    * ``age`` -- year of ``reference_date`` minus the birth year.
    * ``count_trx_per_lifetime``, ``max_gmt_pay_diff_days_per_lifetime``,
      ``freq_x2x_per_lifetime`` -- ratios over ``account_lifetime`` (a small
      epsilon is added to the denominator to avoid division by zero).
    * ``dormancy_max_gmt_pay_diff_days_per_count_trx`` -- ratio of
      ``dormancy_max_gmt_pay_diff_days`` to ``dormancy_count_trx``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the date columns listed above plus
        ``dormancy_count_trx``, ``dormancy_max_gmt_pay_diff_days`` and
        ``aqc_freq_x2x``.
    reference_date : timestamp-like, optional
        Date whose year is used to compute ``age``. Anything accepted by
        ``pd.Timestamp`` works. Defaults to today, which matches the previous
        behaviour but makes ``age`` depend on when the notebook is run; pass
        a fixed date for reproducible output.

    Returns
    -------
    pd.DataFrame
        A new frame with the parsed date columns and the derived features.

    Raises
    ------
    ValueError
        If a date value does not match the ``DD/MM/YYYY`` format.
    """
    X = df.copy()

    # Change trx_date, birthday, report_date, & registereddate format.
    # Parsing is strict: a malformed value raises instead of becoming NaT.
    for date_col in ("trx_date", "birthday", "report_date", "registereddate"):
        X[date_col] = pd.to_datetime(X[date_col], format="%d/%m/%Y")

    # account lifetime to incidents
    raw_lifetime = (X["trx_date"] - X["registereddate"]).dt.days

    # A transaction up to 2 weeks before registration is tolerated and
    # counted as lifetime 0; anything earlier is treated as invalid (NaN).
    X["account_lifetime"] = np.select(
        [
            raw_lifetime < -14,
            (raw_lifetime >= -14) & (raw_lifetime < 0),
            raw_lifetime >= 0,
        ],
        [np.nan, 0, raw_lifetime],
        default=np.nan,
    )

    # age, measured in whole calendar years against the reference date
    if reference_date is None:
        reference = pd.Timestamp.today()
    else:
        reference = pd.Timestamp(reference_date)
    X["age"] = reference.year - X["birthday"].dt.year

    # feature creation
    # The epsilon keeps the ratios finite when account_lifetime is 0.
    lifetime_denominator = X["account_lifetime"] + 0.000001
    X["count_trx_per_lifetime"] = X["dormancy_count_trx"] / lifetime_denominator
    X["max_gmt_pay_diff_days_per_lifetime"] = (
        X["dormancy_max_gmt_pay_diff_days"] / lifetime_denominator
    )
    X["freq_x2x_per_lifetime"] = X["aqc_freq_x2x"] / lifetime_denominator

    # NOTE(review): no epsilon here, so a zero dormancy_count_trx yields
    # inf (or NaN for 0/0) -- confirm downstream steps handle that.
    X["dormancy_max_gmt_pay_diff_days_per_count_trx"] = (
        X["dormancy_max_gmt_pay_diff_days"] / X["dormancy_count_trx"]
    )

    return X

In [15]:
# Run the feature-creation step on the frame without identifier columns
df2 = create(df=df1)

In [16]:
# Summarise df2 after feature creation: column names, non-null counts and dtypes
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40717 entries, 0 to 40716
Data columns (total 50 columns):
 #   Column                                        Non-Null Count  Dtype         
---  ------                                        --------------  -----         
 0   is_scammer                                    40717 non-null  int64         
 1   source                                        40717 non-null  object        
 2   trx_date                                      40717 non-null  datetime64[ns]
 3   report_date                                   40717 non-null  datetime64[ns]
 4   registereddate                                40717 non-null  datetime64[ns]
 5   birthday                                      40717 non-null  datetime64[ns]
 6   gender                                        40717 non-null  object        
 7   job_position                                  40717 non-null  object        
 8   is_verified                                   40717 non-null  int6

In [17]:
# Re-attach the user identifier to a copy of the engineered features,
# then export the result as a semicolon-separated CSV without the index
df3 = df2.assign(uid=df["uid"])
df3.to_csv(
    "/content/drive/MyDrive/Capstone/datasets/user_data_train_feature_creator.csv",
    sep=";",
    index=False,
)