In [None]:
"""
This is the most basic replication
"""

In [1]:
# Basic packages to manipulate data
import numpy as np
import pandas as pd

In [2]:
# Read data
# file 'fraud_machinelearning.csv' is supposedly stored under the current directory
# data is a type of DataFrame, and contains all the observations and features
data = pd.read_csv("./fraud_machinelearning.csv")

In [3]:
# Only 24 features were used in the paper to make the prediction
# We create a list that contains the names of these 24 features, of identifier, and of label/fraud
first_col = ['GVKEY', 'FYEAR', 'AAER_fraud', 'CHE', 'RECT', 'INVT', 'STI', 'PPEGT', 'IVAO', 'AT', 
             'DLC', 'TXP', 'LCT', 'DLTT', 'LT', 'SEQ', 'PSTK', 'RE', 'SALE', 'COGS',
             'DP', 'XINT', 'TXT', 'IB', 'NI', 'PRCC_F', 'CSHO']

In [4]:
# data_first is a new DataFrame that contains only variables that we will use
data_first  = data[first_col].copy(deep=True)

In [5]:
# In this replication, we will use observations between 1991 and 2001 as training set
# "FYEAR" is the identifier to select the observations
train_91_01 = data_first[data_first.FYEAR > 1990]
train_91_01 = train_91_01[train_91_01.FYEAR < 2002]

# We will use observations in 2003 as test set
test_03 = data_first[data_first.FYEAR == 2003]

In [6]:
# Drop observations that have missing value on any of their features
train_91_01 = train_91_01.dropna(axis=0, how='any')
test_03 = test_03.dropna(axis=0, how='any')

In [7]:
# Create a new list that contains only the names of the 24 features
first_col_features = []
for i in first_col:
    first_col_features.append(i)
first_col_features.remove("GVKEY")
first_col_features.remove("FYEAR")
first_col_features.remove("AAER_fraud")

In [8]:
# Split training and test sets into features and label/fraud
feature_91_01 = train_91_01[first_col_features]
fraud_91_01 = train_91_01.AAER_fraud

feature_03 = test_03[first_col_features]
fraud_03 = test_03.AAER_fraud

In [9]:
# Use Random Forest from Scikit-learn as classifier
# The reason to choose Random Forst is because its implementation in Scikit-learn
# has the option to utilize multi-core CPU processing
from sklearn.ensemble import RandomForestClassifier

In [10]:
# Set up the basic parameters: build up 500 trees, and use 4 CPU cores for the process
clf_RF = RandomForestClassifier(n_estimators=500, n_jobs=4)

In [11]:
# train/fit the classifier
clf_RF.fit(feature_91_01, fraud_91_01)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=4, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [12]:
# Use area under the Receiver Operating Characteristics (ROC) curve (AUC) 
# to evaluate the performance
from sklearn.metrics import roc_auc_score

In [13]:
# After training, classifier(clf_RF) can predict the class label of a given observation
# Or it can predict the probability that the given observations belong to each class
# We use the predicted probability.
print("The AUC of the classifier on 1991~2001 training samples is %.3f" 
       %(roc_auc_score(fraud_91_01, clf_RF.predict_proba(feature_91_01)[:,1])))
print("The AUC of the classifier on 2003 test year is %.3f" 
       %(roc_auc_score(fraud_03, clf_RF.predict_proba(feature_03)[:,1])))

The AUC of the classifier on 1991~2001 training samples is 1.000
The AUC of the classifier on 2003 test year is 0.707


In [None]:
# Apart from AUC, the paper also proposes three other metrics to evaluate
# classifier's performance: NDCG@K, Sensitivity, and Precision
# For the sake of code clarity, we will only use AUC for the moment being
# The next step is about imputing missing value