In [1]:
"""
This replication adds the handling of missing values.
Steps that are identical to the previous replication are grouped together.
"""

In [1]:
# Basic packages to manipulate data
import numpy as np
import pandas as pd

# Load the raw data
# The file 'fraud_machinelearning.csv' is expected in the current working directory
# `data` is a DataFrame holding all the observations and all the features
data = pd.read_csv("./fraud_machinelearning.csv")

# The paper uses only 24 features to make the prediction
# `first_col` names the identifier columns and the label/fraud column first,
# followed by those 24 features
first_col = [
    'GVKEY', 'FYEAR', 'AAER_fraud',
    'CHE', 'RECT', 'INVT', 'STI', 'PPEGT', 'IVAO', 'AT',
    'DLC', 'TXP', 'LCT', 'DLTT', 'LT', 'SEQ', 'PSTK', 'RE', 'SALE', 'COGS',
    'DP', 'XINT', 'TXT', 'IB', 'NI', 'PRCC_F', 'CSHO',
]

# `data_first` is an independent (deep) copy restricted to the columns used below,
# so later changes to it cannot touch `data`
data_first = data.loc[:, first_col].copy()

# Training set: observations whose fiscal year lies between 1991 and 2001 (inclusive)
# "FYEAR" is the column used to select the observations
in_training_window = (data_first.FYEAR > 1990) & (data_first.FYEAR < 2002)
train_91_01 = data_first[in_training_window]

# Test set: observations from fiscal year 2003
test_03 = data_first[data_first.FYEAR == 2003]

# Build the list that holds only the names of the 24 features:
# every entry of `first_col`, in order, except the identifier and label columns
first_col_features = [
    column_name
    for column_name in first_col
    if column_name not in ("GVKEY", "FYEAR", "AAER_fraud")
]

In [2]:
# Here the new part starts
# SimpleImputer (scikit-learn >= 0.20) replaces preprocessing.Imputer, which was removed in 0.22
from sklearn.impute import SimpleImputer
try:
    from sklearn.preprocessing import Imputer  # legacy name, only importable on scikit-learn < 0.22
except ImportError:
    pass

In [3]:
# In this example, the observations used to impute a missing value are the ones
# that are in the same fiscal year (FYEAR)
# We take a simple approach: replace a missing value with the most frequent value of the feature


def impute_most_frequent(frame):
    """Return a copy of `frame` with NaNs replaced by each column's most frequent value.

    The imputer is fitted on `frame` itself, so only the rows passed in
    contribute to the fill values.

    Parameters
    ----------
    frame : pandas.DataFrame
        Observations to impute; every column is handed to the imputer.

    Returns
    -------
    pandas.DataFrame
        The imputed values under the same column names, with a fresh 0..n-1 index.
    """
    imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    # NOTE(review): a column with no observed value at all in `frame` appears to be
    # dropped by the imputer, in which case the column labels no longer fit and the
    # DataFrame constructor raises -- confirm against the data
    return pd.DataFrame(imputer.fit_transform(frame), columns=frame.columns)


# Impute training set, one fiscal year at a time (years kept in order of first appearance)
# NOTE(review): the identifier and label columns go through the imputer as well;
# presumably they contain no missing values -- confirm
imp_lis_freq = [
    impute_most_frequent(year_rows)
    for _, year_rows in train_91_01.groupby("FYEAR", sort=False)
]

# Stack however many years are present instead of assuming exactly eleven.
# pd.concat replaces DataFrame.append (removed in pandas 2.0); ignore_index gives the
# stacked frame a unique index rather than repeating 0..n-1 for every year.
imputed_train_freq = pd.concat(imp_lis_freq, ignore_index=True)

# Impute test set from its own observations
imputed_test_freq = impute_most_frequent(test_03)

In [4]:
# Split each imputed frame into its label column and its feature columns
fraud_freq_91_01 = imputed_train_freq["AAER_fraud"]
fraud_freq_03 = imputed_test_freq["AAER_fraud"]

feature_freq_91_01 = imputed_train_freq.loc[:, first_col_features]
feature_freq_03 = imputed_test_freq.loc[:, first_col_features]

In [5]:
# Use Random Forest from Scikit-learn as classifier
# The reason to choose Random Forest is that its implementation in Scikit-learn
# has the option to utilize multi-core CPU processing
from sklearn.ensemble import RandomForestClassifier
# Area under the Receiver Operating Characteristics (ROC) curve (AUC)
# is used to evaluate the performance
from sklearn.metrics import roc_auc_score

# Set up the basic parameters: build up 500 trees, and use 4 CPU cores for the process
# random_state is fixed so that the forest, and therefore the AUC values printed
# below, come out the same on every run
clf_RF = RandomForestClassifier(n_estimators=500, n_jobs=4, random_state=0)

# train/fit the classifier
clf_RF.fit(feature_freq_91_01, fraud_freq_91_01)

# After training, classifier(clf_RF) can predict the class label of a given observation
# Or it can predict the probability that the given observations belong to each class
# We use the predicted probability: predict_proba returns one column per class and
# column 1 is taken as the fraud score (assumes AAER_fraud is coded 0/1 -- TODO confirm)
train_fraud_score = clf_RF.predict_proba(feature_freq_91_01)[:, 1]
test_fraud_score = clf_RF.predict_proba(feature_freq_03)[:, 1]

# The first figure is computed on the very samples the forest was fitted on,
# so only the 2003 figure says anything about out-of-sample performance
print("The AUC of the classifier on 1991~2001 training samples is %.3f"
      % roc_auc_score(fraud_freq_91_01, train_fraud_score))
print("The AUC of the classifier on 2003 test year is %.3f"
      % roc_auc_score(fraud_freq_03, test_fraud_score))

The AUC of the classifier on 1991~2001 training samples is 1.000
The AUC of the classifier on 2003 test year is 0.745


In [None]:
# Compared with deleting the observations that have missing values, imputation seems to
# improve the performance: test set AUC increases to 0.745 in the run shown above
# There are two other simple imputation approaches: replacing missing value with the mean or median value
# of observations considered
# Next step is about tuning parameters with Cross-Validation