In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
cd "/content/drive/MyDrive/Selection-of-criminals_with_syndata/CTGAN_syndata_generaton" 

/content/drive/MyDrive/Selection-of-criminals_with_syndata/CTGAN_syndata_generaton


# 4. Review Generated Data Similarity (Regression)

## Check Statistical Similarity using Regression Model

- To check whether the synthetic data retains the statistical properties of the original data, we will build regression models.
- We will use the previously generated synthetic import declaration dataset.
- It is composed of several import-declaration attributes, plus attributes that indicate violations.
- After training three regression models on both the real and the synthetic data, we will use their predictions to select the data with a high risk of crime.
- We then generate the data needed to check the detection rate.
- Using each dataset's violation results, we calculate its detection rate.
- Using the result, we check the statistical similarity.

## Import Library
- Load libraries.

In [4]:
import pandas as pd
import copy
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import seaborn as sns
from matplotlib import pyplot as plt 

In [5]:
import time
import warnings
warnings.filterwarnings("ignore")

In [6]:
# Jupyter cell full-screen view: stretch the notebook container to the full browser width.
# display/HTML come from IPython.display; importing them from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
# Useful for debugging (print the results of both formulas and functions entered in one cell of Jupyter)
#from IPython.core.interactiveshell import InteractiveShell
#InteractiveShell.ast_node_interactivity = "all"
# Do not truncate column contents (None = unlimited; the old -1 sentinel was deprecated in pandas 1.0)
pd.set_option('display.max_colwidth', None)
# Show up to 500 rows
pd.set_option('display.max_rows', 500)
# Display up to 500 columns
pd.set_option('display.max_columns', 500)
# Width of the display in characters before a data frame is wrapped
pd.set_option('display.width', 1000)

print('ready to run')
# Record the start time of the run
startTime = time.time()

ready to run


## Declaring Functions

- Training models with several algorithms involves a lot of repetitive tasks, so we will declare and use functions.
- In this exercise, training each of the three models on two datasets involves a lot of repetition. Therefore, we use functions to improve the readability of the code.

### Declaring Variables
- Declare variables used for preprocessing.
- Separate category type and numeric type columns.

In [7]:
# Categorical columns; df_preprocessing label-encodes each of these.
# 'crime_yn' (the target) and 'key_exposure' are kept in this list and dropped later by df_splist.
category_cols = [
    'imp_dec_code',
    'dec_custom_code',
    'imp_trd_code',
    'imp_typ_code',
    'collect_code',
    'typ_transport_code',
    'dec_mark',
    'importer',
    'ovs_cust_code',
    'exps_carr_code',
    'HS10',
    'country_ship_code',
    'country_orig_code',
    'trff_class_code',
    'country_orig_mark_code',
    'crime_yn',
    'key_exposure',
]

In [8]:
# Numeric columns; df_preprocessing min-max normalizes each of these.
number_cols = [
    'trff_rate',
    'dec_weight',
    'taxabal_price_KRW',
]

### Min-max Normalization Function
- Normalize using min-max normalization function.

In [9]:
def normalize(column):
    """Min-max normalize a numeric column to the range [0, 1].

    Args:
        column: pandas Series (or other array-like supporting ``min``/``max``
            and arithmetic) of numeric values.

    Returns:
        ``(column - min) / (max - min)``. A constant column, whose range is
        zero, is mapped to all zeros instead of the NaN that 0/0 would produce.
    """
    lowest = column.min()
    span = column.max() - lowest
    # Guard the scalar zero-range case only: dividing by 1 keeps the float result
    # type while turning every (value - min) == 0 entry into 0.0.
    if np.ndim(span) == 0 and span == 0:
        span = 1
    return (column - lowest) / span

### Function for Data Preprocessing
- Execute functions for data preprocessing.

In [10]:
def df_preprocessing(df):
    """Label-encode the categorical columns and min-max normalize the numeric columns.

    Args:
        df: DataFrame containing every column listed in the module-level
            ``category_cols`` and ``number_cols``.

    Returns:
        A ``(category_df, number_df)`` tuple. ``category_df`` holds the
        ``category_cols`` columns with each value replaced by its
        ``LabelEncoder`` integer label; ``number_df`` holds the ``number_cols``
        columns after ``normalize``. The input ``df`` is not modified.
    """
    # Work on explicit copies, separated by data type, so the assignments below
    # never write into (a view of) the caller's frame.
    copy_df_category = df[category_cols].copy()
    copy_df_number = df[number_cols].copy()

    # Load encoding object.
    encoder = LabelEncoder()

    # Encode each categorical column independently.
    # DataFrame.items() is used because iteritems() was removed in pandas 2.0.
    for column_name, item in copy_df_category.items():
        copy_df_category[column_name] = encoder.fit_transform(item)

    # Perform min-max normalization on each numeric column.
    copy_df_number_norm = copy_df_number.apply(normalize)

    # Output both frames.
    return copy_df_category, copy_df_number_norm

### Function that Separates Data into Training Data and Test Data

- We will use train_test_split, a preprocessing function included in scikit-learn.
- Target column is 'crime_yn', which denotes whether a crime occurred or not.
- Remove target column 'crime_yn' from the training columns.
- Among the training columns, 'key_exposure' is too highly correlated with 'crime_yn', which makes it hard for the model to make use of the other columns. Thus, we remove 'key_exposure' from the training columns.
- Remove 'imp_dec_code' from the training columns, since we previously confirmed that it has low correlation with the target.
- Finally, we split training data and test data by ratio of 8:2.

In [1]:
def df_splist(df):
    """Split a preprocessed frame into training and test features/targets.

    The target is the 'crime_yn' column. 'crime_yn', 'key_exposure' and
    'imp_dec_code' are excluded from the features. The split keeps 20% of
    the rows for testing and uses a fixed random_state of 42.

    Returns:
        (X_train, X_test, y_train, y_test)
    """
    # Columns that must not be used as features: the target itself plus the
    # columns whose correlation with the target is too high or too low.
    excluded_columns = ['crime_yn', 'key_exposure', 'imp_dec_code']

    features = df.drop(columns=excluded_columns)
    target = df['crime_yn']

    parts = train_test_split(features, target, test_size=0.2, random_state=42)
    return tuple(parts)

### Function for Calculating Crime Rate Against the Group with Upper 5% Regression Value in the Given Data
- Calculate crime rate against the group with upper 5% regression value.

In [12]:
def reg_top_5per(df_reg, target_y):
    """Crime rate among the rows whose prediction lies in the top 5%.

    Args:
        df_reg: DataFrame with an 'index' column (row labels of the scored
            data) and a 'pred' column (predicted regression values).
        target_y: Series of actual 'crime_yn' values whose index labels
            correspond to the values in df_reg['index'].

    Returns:
        One-element Series labelled 'crime_yn': the count of selected rows
        with crime_yn == 1 divided by the count of selected rows (NaN when
        no row is selected).
    """
    # Rows strictly above the 95th percentile of the predictions form the top group.
    cutoff = df_reg['pred'].quantile(0.95)
    top_labels = df_reg.loc[df_reg['pred'] > cutoff, 'index']

    # Actual outcomes of the selected rows, held in a frame so that count()
    # yields a Series labelled 'crime_yn'.
    in_top_group = target_y.index.isin(top_labels)
    outcomes = pd.DataFrame({'crime_yn': target_y[in_top_group]})

    crime_rows = outcomes.loc[outcomes['crime_yn'].eq(1)]
    return crime_rows.count() / outcomes.count()

### Function that Calculates the Crime Rate of the Group with Top 10% Regression Value in the Given Data
- Calculates the crime rate in the group with top 10% regression value.

In [13]:
def reg_top_10per(df_reg, target_y):
    """Crime rate among the rows whose prediction lies in the top 10%.

    Args:
        df_reg: DataFrame with an 'index' column (row labels of the scored
            data) and a 'pred' column (predicted regression values).
        target_y: Series of actual 'crime_yn' values whose index labels
            correspond to the values in df_reg['index'].

    Returns:
        One-element Series labelled 'crime_yn': the count of selected rows
        with crime_yn == 1 divided by the count of selected rows (NaN when
        no row is selected).
    """
    # Rows strictly above the 90th percentile of the predictions form the top group.
    cutoff = df_reg['pred'].quantile(0.90)
    top_labels = df_reg.loc[df_reg['pred'] > cutoff, 'index']

    # Actual outcomes of the selected rows, held in a frame so that count()
    # yields a Series labelled 'crime_yn'.
    in_top_group = target_y.index.isin(top_labels)
    outcomes = pd.DataFrame({'crime_yn': target_y[in_top_group]})

    crime_rows = outcomes.loc[outcomes['crime_yn'].eq(1)]
    return crime_rows.count() / outcomes.count()

## Loading Data / Preprocessing Data

- To train the AI model, we generally split the data into a training set and a test set at an 8:2 ratio.
- Each set contains training (feature) data and target data.
- This means that we need 4 dataframes in order to train one model.
- Since we are going to train on two datasets (real data and synthetic data), we are going to create 8 dataframes in this exercise.

Real data (X_train, X_test, y_train, y_test) -> 4 dataframes
<br>Synthetic data (X_syn_train, X_syn_test, y_syn_train, y_syn_test) -> 4 dataframes


### Real Data

- Load real data and perform preprocessing. Also separate into training set and test set.

In [14]:
df_base = pd.read_csv('df_syn_en.csv', encoding='utf-8-sig') # Baseline dataset used as the "real" data in this notebook (note: the file itself is named df_syn_en.csv).

- We continue by copying the loaded data.

In [15]:
copy_base = copy.deepcopy(df_base)

- We use the preprocessing function to preprocess and separate mixed data.

In [16]:
copy_base_category, copy_base_number_norm = df_preprocessing(copy_base)

#### Combining Data

- Combine preprocessed data into a single table.

In [17]:
copy_base_total = pd.concat([copy_base_category,copy_base_number_norm], axis=1)

#### Separate Training Data and Test Data
- Separate into training data and test data.

In [18]:
X_train, X_test, y_train, y_test = df_splist(copy_base_total)

### Synthetic Data

- We repeat the process of data preprocessing and separation of training set and test set, similar to what we did previously against the real data. 

In [19]:
df_syn = pd.read_csv('./data_sample/df_syn_en_14.csv', encoding='utf-8-sig') #New synthetic data

- We continue by copying loaded data.

In [20]:
copy_syn = copy.deepcopy(df_syn)

- We use preprocessing functions to preprocess and separate mixed data.

In [21]:
copy_syn_category, copy_syn_number_norm = df_preprocessing(copy_syn)

#### Combining Data

- Combine preprocessed data into a single table.

In [22]:
copy_syn_total = pd.concat([copy_syn_category,copy_syn_number_norm], axis=1)

#### Separation of Training Data and Test Data
- Separate training data and test data.

In [23]:
X_syn_train, X_syn_test, y_syn_train, y_syn_test = df_splist(copy_syn_total)

## Evaluating Model
- The model outputs a crime probability score for each import declaration, nominally a value between 0 and 1. We take the declarations with the top 5% and top 10% of predicted scores, compute the actual crime rate among them, and use that as the performance metric.

- This metric shows how closely the crime detection rate obtained with synthetic data matches the one obtained with real data. Comparing the detection rates of the top 5% and top 10% groups also lets us compare the statistical similarity of the two datasets.

### Linear Regression Analysis

- Linear regression is a statistical method used to model the relationship between a dependent variable and one or more independent variables.
- We assume a linear relationship between the variables, and use the method of least squares to estimate the regression parameters.
- Linear regression is used for prediction and inference in various fields, but we need to carefully consider its assumptions and limitations.

In [24]:
from sklearn.linear_model import LinearRegression

#### Real Data (X_train, X_test, y_train, y_test)

- We train real data for the regression model.

In [25]:
lr = LinearRegression()
lr.fit(X_train, y_train)

R<sup>2</sup>
- `score` calculates the R<sup>2</sup> score (coefficient of determination) of the regression on the test data.
- The closer the value is to 0, the weaker the linear relationship between x and y.
- We can see that there is only a weak linear relationship between the independent variables and the dependent variable in this data.

In [26]:
score = lr.score(X_test, y_test)
print("Model score:", score)

Model score: 0.009809196173191426


Predicted value of real data

- Output predicted regression value by inputting test data into a model, which is trained with real data.
- The model outputs the probability of crime against each data as a value between 0 and 1, and used the crime detection rate of top 5% and 10% items as the evaluation metric.

In [27]:
y_pred = lr.predict(X_test)
index = X_test.index.tolist()
df = pd.DataFrame({'index' : index ,'pred': y_pred})

Crime detection rate of top 5% crime regression prediction value against real data

- We select top 5% data from the predicted values, and determine whether crime occurred or not.

In [28]:
base_5 = reg_top_5per(df,y_test)

Crime detection rate of top 10% crime regression prediction value against real data

- We select top 10% data from the predicted values, and determine whether crime occurred or not.

In [29]:
base_10 = reg_top_10per(df,y_test)

#### Synthetic Data (X_syn_train, X_syn_test, y_syn_train, y_syn_test)

Synthetic data regression model training
- Train regression model with synthetic data.

In [30]:
lr_syn = LinearRegression()
lr_syn.fit(X_syn_train, y_syn_train)

- As it has low value, we can see that the synthetic data also have weak linear relationship between the dependent variable and the independent variable.

In [31]:
score = lr_syn.score(X_syn_test, y_syn_test)
print("Model score:", score)

Model score: -0.027774698230059514


Predicted value for synthetic data

- Using the model trained with synthetic data, output predicted regression value by inputting test data.
- Model outputs the probability of crime against each data as a value between 0 and 1. The crime detection rates against top 5% and 10% items were used as the evaluation metric.

In [32]:
y_syn_pred = lr_syn.predict(X_syn_test)
index = X_syn_test.index.tolist()
df = pd.DataFrame({'index' : index ,'pred': y_syn_pred})

Detection rate of top 5% crime regression predicted value against synthetic data 
- Using data of only top 5% predicted value, determine whether a crime occurred or not.

In [33]:
syn_5 = reg_top_5per(df,y_syn_test)

Detection rate of top 10% crime regression predicted value against synthetic data 
- Using data of only top 10% predicted value, determine whether a crime occurred or not.

In [34]:
syn_10 = reg_top_10per(df,y_syn_test)

####  Detection Rate Summary - Linear Regression Analysis


- There is little difference in detection rate.
- Although there is little difference, detection rate decreases similarly going from 5% group to 10% group.
- We can infer that there is a similar distribution.

In [35]:
# base_5, base_10, syn_5 and syn_10 are one-element Series labelled 'crime_yn' (the row label in the output below);
# that shared index is what lets the scalar 'category' value broadcast into a single-row frame.
df_result_lr = pd.DataFrame({'category': 'LinearRegression','base_5' : base_5, 'base_10': base_10,'syn_5': syn_5,'syn_10': syn_10})

In [36]:
df_result_lr

Unnamed: 0,category,base_5,base_10,syn_5,syn_10
crime_yn,LinearRegression,0.353704,0.311111,0.375,0.28125


### Random Forest

Random forest is a machine learning algorithm that builds several decision trees and collects predictions. This is used to increase accuracy and reduce overfitting.

Here are the main characteristics:
- Random selection of samples and features for building each tree.
- Ensemble training for increasing predictability.
- Ability to process high-dimension large dataset.

In [37]:
from sklearn.ensemble import RandomForestRegressor

#### Real Data(X_train, X_test, y_train, y_test)

Real data random forest regression model training
- Train real data using random forest regression model.

In [38]:
rf = RandomForestRegressor(n_estimators=70, random_state=42)

In [39]:
rf.fit(X_train, y_train)

Real data predicted value.
- Calculate real data predicted value.

In [40]:
y_pred = rf.predict(X_test)

In [41]:
index = X_test.index.tolist()

In [42]:
df = pd.DataFrame({'index' : index ,'pred': y_pred})

Detection rate of top 5% crime regression predicted value against real data 
- Determine whether crime occurred or not by selecting only top 5% predicted value.

In [43]:
base_5 = reg_top_5per(df,y_test)

Detection rate of top 10% crime regression predicted value against real data 
- Determine whether crime occurred or not by selecting only top 10% predicted value.

In [44]:
base_10 = reg_top_10per(df,y_test)

#### Synthetic Data(X_syn_train, X_syn_test, y_syn_train, y_syn_test)

Synthetic data random forest regression model training
- Train synthetic data using random forest regression model.

In [45]:
rf_syn = RandomForestRegressor(n_estimators=70, random_state=42)

In [46]:
rf_syn.fit(X_syn_train, y_syn_train)

Synthetic data predicted value
- Calculate Synthetic data predicted value.

In [47]:
y_syn_pred = rf_syn.predict(X_syn_test)

In [48]:
index = X_syn_test.index.tolist()

In [49]:
df = pd.DataFrame({'index' : index ,'pred': y_syn_pred})

Detection rate of top 5% crime regression predicted value against synthetic data 
- Determine whether crime occurred or not by selecting only top 5% predicted value.

In [50]:
syn_5 = reg_top_5per(df,y_syn_test)

Detection rate of top 10% crime regression predicted value against synthetic data 
- Determine whether crime occurred or not by selecting only top 10% predicted value.

In [51]:
syn_10 = reg_top_10per(df,y_syn_test)

#### Detection Rate Summary - Random Forest

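- We can see that the synthetic data also works with random forest.
- As with the linear regression analysis, the detection rate follows the same trend for both datasets: it decreases from the 5% group to the 10% group. Note that in the run shown below, the gap between real and synthetic data is larger than for linear regression (about 0.94 vs. 0.73 in the 5% group and 0.87 vs. 0.43 in the 10% group).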

In [52]:
# base_5, base_10, syn_5 and syn_10 are one-element Series labelled 'crime_yn' (the row label in the output below);
# that shared index is what lets the scalar 'category' value broadcast into a single-row frame.
df_result_rf = pd.DataFrame({'category': 'RandomForestRegressor','base_5' : base_5, 'base_10': base_10,'syn_5': syn_5,'syn_10': syn_10})

In [53]:
df_result_rf

Unnamed: 0,category,base_5,base_10,syn_5,syn_10
crime_yn,RandomForestRegressor,0.937618,0.871747,0.733333,0.433333


### xgBoost

Last algorithm is xgBoost. 

xgBoost is a powerful machine learning algorithm designed to improve the performance of gradient boosting algorithm.

The main features are as follows:

- Parallel processing and hardware optimization for fast and efficient model training.
- Overfitting prevention and normalization technology for better generalization.
- By default provides handling for missing values and feature selection processing.

In [54]:
import xgboost as xgb

#### Real Data (X_train, X_test, y_train, y_test)

Real data xgBoost regression model training
- Train real data using xgBoost regression model.

In [55]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [56]:
params = {
    "max_depth": 2,
    "eta": 0.1,
    "subsample": 0.5,
    "colsample_bytree": 0.5,
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
}
num_round = 100

In [57]:
xgb_model = xgb.train(params, dtrain, num_round)

Real data predicted value
- Calculate real data predicted value.

In [58]:
y_pred = xgb_model.predict(dtest)

In [59]:
index = X_test.index.tolist()

In [60]:
df = pd.DataFrame({'index' : index ,'pred': y_pred})

Detection rate of top 5% crime regression predicted value against real data 
- Determine whether crime occurred or not by selecting only top 5% predicted value.

In [61]:
base_5 = reg_top_5per(df,y_test)

Detection rate of top 10% crime regression predicted value against real data 
- Determine whether crime occurred or not by selecting only top 10% predicted value.

In [62]:
base_10 = reg_top_10per(df,y_test)

#### Synthetic Data (X_syn_train, X_syn_test, y_syn_train, y_syn_test)

Synthetic data xgBoost regression model training
- Unlike other models, we perform training by inputting DMatrix type data as part of xgb.

In [63]:
dtrain_syn = xgb.DMatrix(X_syn_train, label=y_syn_train)
dtest_syn = xgb.DMatrix(X_syn_test, label=y_syn_test)

Parameters for training the model.

We can also find appropriate training method by adjusting parameter values.

In [64]:
params = {
    "max_depth": 2,
    "eta": 0.1,
    "subsample": 0.5,
    "colsample_bytree": 0.5,
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
}
num_round = 100

Train by inputting parameters, data and execution count.

In [65]:
xgb_model_syn = xgb.train(params, dtrain_syn, num_round)

Synthetic data predicted value.
- Calculate synthetic data predicted value.

In [66]:
y_syn_pred = xgb_model_syn.predict(dtest_syn)

In [67]:
index = X_syn_test.index.tolist()

In [68]:
df = pd.DataFrame({'index' : index ,'pred': y_syn_pred})

Detection rate of top 5% crime regression predicted value against synthetic data 
- Determine whether crime occurred or not by selecting only top 5% predicted value.

In [69]:
syn_5 = reg_top_5per(df,y_syn_test)

Detection rate of top 10% crime regression predicted value against synthetic data 
- Determine whether crime occurred or not by selecting only top 10% predicted value.

In [70]:
syn_10 = reg_top_10per(df,y_syn_test)

#### Detection Rate Summary - xgBoost 


- Detection rate decreases proportionally from 5% group to 10% group.

In [71]:
# base_5, base_10, syn_5 and syn_10 are one-element Series labelled 'crime_yn' (the row label in the output below);
# that shared index is what lets the scalar 'category' value broadcast into a single-row frame.
df_result_xgb = pd.DataFrame({'category': 'xgboost','base_5' : base_5, 'base_10': base_10,'syn_5': syn_5,'syn_10': syn_10})

In [72]:
df_result_xgb

Unnamed: 0,category,base_5,base_10,syn_5,syn_10
crime_yn,xgboost,0.401852,0.390741,0.5625,0.375


## Detection Rate Summary for Each Model

- We summarized detection rate using representative models.
- We can see that, with synthetic data as with real data, the detection rate decreases from the 5% group to the 10% group for every model.
- This means that we can get results with similar trends to real data when you use the same algorithm to perform training.
- These numbers are only one example.
- The statistical properties could change by various parameters such as amount of extracted data for CTGAN, or amount of generated data.
- We need to find appropriate algorithm and parameter for each project.

In [73]:
total_result = pd.concat([df_result_lr, df_result_rf, df_result_xgb])

In [74]:
total_result = total_result.reset_index(drop=True)

In [75]:
total_result

Unnamed: 0,category,base_5,base_10,syn_5,syn_10
0,LinearRegression,0.353704,0.311111,0.375,0.28125
1,RandomForestRegressor,0.937618,0.871747,0.733333,0.433333
2,xgboost,0.401852,0.390741,0.5625,0.375
