In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Load the train/test CSVs and the external WRDS CSV.
train, test, external = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/detecting-accounting-errors/Restate_int_train.csv',
        '../input/detecting-accounting-errors/Restate_int_test.csv',
        '../input/smu-forcasting/WRDS.csv',  # external data
    )
)
# The data dictionary lists the features that we are going to use.
data_dict = pd.read_excel('../input/smu-forcasting/DataDictionary_v2.xlsx')

In [3]:
# Preview the first rows of the data dictionary (columns: Formula, Name, Account).
data_dict.head()

Unnamed: 0,Formula,Name,Account
0,gvkey,Global Company Key,
1,fyear,Data Year - Fiscal,
2,acominc,Accumulated Other Comprehensive Income (Loss),Comprehensive Income
3,aedi,Accrued Expenses and Deferred Income,Accrued Expenses
4,act,Current Assets - Total,Assets


# 1. Exploratory Data Analysis

## 1.1 Find percentage of missing data

In [4]:
# NOTE(review): this repeats the data-dictionary preview already shown right
# after loading the data; it adds no new information in this section.
data_dict.head()

Unnamed: 0,Formula,Name,Account
0,gvkey,Global Company Key,
1,fyear,Data Year - Fiscal,
2,acominc,Accumulated Other Comprehensive Income (Loss),Comprehensive Income
3,aedi,Accrued Expenses and Deferred Income,Accrued Expenses
4,act,Current Assets - Total,Assets


In [5]:
# Preview the first rows of the external WRDS table before any filtering.
external.head()

Unnamed: 0,GVKEY,datadate,fyear,indfmt,consol,popsrc,datafmt,conm,curcd,acominc,act,aedi,am,ap,aqc,artfs,at,bkvlps,capx,ceq,ceqt,ch,che,ci,citotal,clt,cogs,cstk,dc,dlcch,dltt,dp,dpact,dptb,dptc,drc,drlt,dv,dvc,dvpd,...,ivpt,ivst,lcat,lct,lt,ni,opiti,optex,ppent,pstk,pvt,re,rect,revt,rveqt,rvti,seq,tstk,ui,urect,utfdoc,wcap,xacc,xlr,xpp,xt,costat,dvpsp_f,dvpsx_f,au,auop,auopic,conml,ggroup,gind,gsector,gsubind,sic,spcsrc,ipodate
0,1004,20050531,2004,INDL,C,D,STD,AAR CORP,USD,-19.779,474.542,,,77.015,0.0,,732.23,9.6589,13.033,314.744,267.491,40.508,50.338,,10.566,,598.172,35.853,2.878,,227.159,27.95,116.067,,,0.0,8.455,0.0,0.0,,...,,9.83,,160.025,417.486,15.453,,3.414,139.137,0.0,,142.45,127.121,747.848,,,314.744,50.497,,,,314.517,79.265,,,,A,0.0,0.0,6.0,1.0,1.0,AAR Corp,2010.0,201010.0,20.0,20101010.0,5080,B,19880101.0
1,1004,20060531,2005,INDL,C,D,STD,AAR CORP,USD,-13.842,624.454,,,97.002,0.0,,978.819,11.5326,16.296,422.717,375.928,121.738,121.738,,41.1,,704.081,40.789,5.956,,318.576,29.222,129.896,,,0.0,7.994,0.0,0.0,,...,,0.0,,187.788,556.102,35.163,,3.08,213.38,0.0,,183.55,136.272,897.284,,,422.717,69.664,,,,436.666,88.497,,,,A,0.0,0.0,6.0,1.0,1.0,AAR Corp,2010.0,201010.0,20.0,20101010.0,5080,B,19880101.0
2,1004,20070531,2006,INDL,C,D,STD,AAR CORP,USD,-13.899,645.721,,,110.239,38.478,,1067.633,13.0998,29.891,494.243,418.105,83.317,83.317,,62.14,,837.171,42.23,5.327,,253.611,32.199,145.518,,,0.0,7.533,0.0,0.0,,...,,0.0,,256.506,573.39,58.66,,2.038,260.167,0.0,,242.153,181.691,1061.169,,,494.243,79.813,,,,389.215,72.022,,,,A,0.0,0.0,6.0,1.0,1.0,AAR Corp,2010.0,201010.0,20.0,20101010.0,5080,B,19880101.0
3,1004,20080531,2007,INDL,C,D,STD,AAR CORP,USD,-13.012,783.431,,,99.073,85.21,,1362.01,15.0944,30.334,585.255,454.089,109.391,112.435,,76.031,,1080.895,43.932,10.294,,507.918,39.952,166.07,,,0.0,7.071,0.0,0.0,,...,,3.044,,218.499,776.755,75.144,,1.271,310.393,0.0,,318.184,202.472,1384.919,,,585.255,100.935,,,,564.932,81.909,,,,A,0.0,0.0,6.0,1.0,1.0,AAR Corp,2010.0,201010.0,20.0,20101010.0,5080,B,19880101.0
4,1004,20090531,2008,INDL,C,D,STD,AAR CORP,USD,-23.996,851.312,,4.852,100.651,0.0,,1377.511,16.8937,27.535,656.895,505.9,112.505,112.505,,67.667,,1110.677,44.201,6.503,,392.984,40.551,174.873,,,0.0,6.61,0.0,0.0,,...,,0.0,,254.418,720.616,78.651,,0.945,245.586,0.0,,385.851,227.3,1423.976,,,656.895,103.159,,,,596.894,83.685,,,,A,0.0,0.0,6.0,1.0,1.0,AAR Corp,2010.0,201010.0,20.0,20101010.0,5080,B,19880101.0


In [6]:
# Keep only the columns listed in DataDictionary_v2.xlsx and rename the key
# columns to 'gvkey' and 'year' so they match the train/test frames.
#
# This is written as one non-mutating chain instead of in-place renames:
# both renames are applied first (DataFrame.rename ignores labels that are
# not present), then the columns are selected under their final names.
# The cell can therefore be re-run on its own output without a KeyError,
# and no in-place operation is performed on a column-subset frame.
feature_list = data_dict['Formula'].values.tolist()  # create a list of features
# Same list as feature_list, with 'fyear' replaced by its final name 'year'.
selected_columns = ['year' if col == 'fyear' else col for col in feature_list]
external = external.rename(columns={'GVKEY': 'gvkey', 'fyear': 'year'})[selected_columns]


In [7]:

def find_missing(data):
    """Summarise missing values per column of a DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data`` (indexed by column name) with:
        ``missing_count`` - number of null entries in that column, and
        ``missing_ratio_percentage`` - that count as a percentage of the
        number of rows in ``data``.
    """
    n_rows = len(data)  # total records
    null_counts = data.isna().sum().values  # nulls per column
    null_pct = null_counts / n_rows * 100  # share of rows that are null, in percent

    summary = pd.DataFrame(
        index=data.columns.values,
        data={
            'missing_count': null_counts,
            'missing_ratio_percentage': null_pct,
        },
    )
    return summary
# Build the per-column missing-value summary for the external data.
df_missing = find_missing(external)
# Display the summary with left-aligned cell text.
df_missing.style.set_properties(**{'text-align': 'left'})
# Alternative view, sorted by missingness (left disabled):
# df_missing.sort_values(['missing_count','missing_ratio_percentage'],ascending = False)

Unnamed: 0,missing_count,missing_ratio_percentage
gvkey,0,0.0
year,0,0.0
acominc,3985,11.2269
aedi,35495,100.0
act,10504,29.5929
am,11745,33.0892
ap,4005,11.2833
aqc,4835,13.6216
artfs,35495,100.0
at,3761,10.5959


In [8]:
# We only take the features with less than 15% missing values.
# reset_index moves the feature names out of the index into an 'index' column.
df_missing = df_missing.reset_index(drop=False)
ext_feature = df_missing.loc[
    df_missing['missing_ratio_percentage'].lt(15), 'index'
].values

In [9]:
# Restrict the external data to the selected features and drop the
# object-typed columns (for now we don't take categorical features),
# then left-join it onto train and test by ('gvkey', 'year').
external = external[ext_feature].select_dtypes(exclude=['object'])
train = pd.merge(train, external, how='left', on=['gvkey', 'year'])
test = pd.merge(test, external, how='left', on=['gvkey', 'year'])

In [10]:
# Check the (rows, columns) of train after merging in the external features.
train.shape

(15213, 45)

In [11]:
# Parse the 'Date' column of both frames into pandas datetimes.
train = train.assign(Date=pd.to_datetime(train['Date']))
test = test.assign(Date=pd.to_datetime(test['Date']))

train.head()

Unnamed: 0,gvkey,year,Filing,Date,Restate_Int,acominc,ap,aqc,at,bkvlps,capx,ceq,ceqt,ch,che,cogs,cstk,dltt,dp,dv,dvc,dvt,ebit,ebitda,epsfi,epspi,gdwl,gp,intan,invt,ivst,lt,ni,ppent,pstk,re,rect,revt,seq,tstk,dvpsp_f,dvpsx_f,au,auop,auopic
0,1004,2005,0001104659-05-033688,2005-07-22,0,-13.842,97.002,0.0,978.819,11.5326,16.296,422.717,375.928,121.738,121.738,704.081,40.789,318.576,29.222,0.0,0.0,0.0,62.655,91.877,0.94,1.05,44.432,193.203,46.789,323.592,0.0,556.102,35.163,213.38,0.0,183.55,136.272,897.284,422.717,69.664,0.0,0.0,6.0,1.0,1.0
1,1004,2006,0001104659-06-047248,2006-07-17,0,-13.899,110.239,38.478,1067.633,13.0998,29.891,494.243,418.105,83.317,83.317,837.171,42.23,253.611,32.199,0.0,0.0,0.0,86.708,118.907,1.4,1.61,72.687,223.998,76.138,342.593,0.0,573.39,58.66,260.167,0.0,242.153,181.691,1061.169,494.243,79.813,0.0,0.0,6.0,1.0,1.0
2,1004,2007,0001104659-07-055173,2007-07-20,0,-13.012,99.073,85.21,1362.01,15.0944,30.334,585.255,454.089,109.391,112.435,1080.895,43.932,507.918,39.952,0.0,0.0,0.0,128.57,168.522,1.76,2.02,101.52,304.024,131.166,435.608,3.044,776.755,75.144,310.393,0.0,318.184,202.472,1384.919,585.255,100.935,0.0,0.0,6.0,1.0,1.0
3,1004,2008,0001047469-08-008126,2008-07-11,0,-23.996,100.651,0.0,1377.511,16.8937,27.535,656.895,505.9,112.505,112.505,1110.677,44.201,392.984,40.551,0.0,0.0,0.0,125.529,166.08,1.87,2.07,109.751,313.299,150.995,477.424,0.0,720.616,78.651,245.586,0.0,385.851,227.3,1423.976,656.895,103.159,0.0,0.0,6.0,1.0,1.0
4,1004,2009,0001047469-09-006783,2009-07-16,0,-29.646,114.906,193.989,1501.042,18.9167,28.855,746.906,577.478,79.37,79.37,1065.902,44.87,336.191,38.93,0.0,0.0,0.0,95.415,134.345,1.16,1.17,111.544,286.249,169.428,496.904,0.0,754.692,44.628,334.43,0.0,389.641,238.466,1352.151,746.906,104.447,0.0,0.0,6.0,4.0,1.0


In [12]:
# Count how many rows fall into each value of the 'Restate_Int' column.
train['Restate_Int'].value_counts()

0    14850
1      363
Name: Restate_Int, dtype: int64

In [13]:
# Order both frames by company key ('gvkey'), then by 'year'.
train = train.sort_values(by=['gvkey', 'year'])
test = test.sort_values(by=['gvkey', 'year'])

In [14]:
# Count the number of train rows per 'year'.
train['year'].value_counts()

2005    3112
2009    3111
2006    3017
2008    2998
2007    2975
Name: year, dtype: int64

# 2. Feature engineering

In [15]:
# Day of the week of each 'Date' (Monday=0 ... Sunday=6).
train['Weekday'] = train['Date'].dt.weekday
test['Weekday'] = test['Date'].dt.weekday
test.head()

Unnamed: 0,gvkey,year,Filing,Date,acominc,ap,aqc,at,bkvlps,capx,ceq,ceqt,ch,che,cogs,cstk,dltt,dp,dv,dvc,dvt,ebit,ebitda,epsfi,epspi,gdwl,gp,intan,invt,ivst,lt,ni,ppent,pstk,re,rect,revt,seq,tstk,dvpsp_f,dvpsx_f,au,auop,auopic,Weekday
0,1004,2010,0001047469-10-006500,2010-07-16,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4
1,1045,2010,0000006201-10-000006,2010-02-17,-2755.0,1156.0,0.0,25088.0,-11.8309,1962.0,-3945.0,-4877.0,168.0,4946.0,18138.0,339.0,9253.0,995.0,0.0,0.0,0.0,308.0,1303.0,-1.41,-1.41,0.0,4032.0,932.0,594.0,4778.0,29033.0,-471.0,15082.0,0.0,-8362.0,738.0,22170.0,-3945.0,367.0,0.0,0.0,4.0,1.0,1.0,2
2,1050,2010,0001193125-10-069639,2010-03-29,-1.608,9.712,0.0,74.791,2.4565,0.654,35.174,16.27,5.792,5.792,106.692,0.144,10.8,1.758,0.0,0.0,0.0,4.64,6.398,0.15,0.15,14.713,33.91,18.904,12.777,0.0,39.617,2.105,5.88,0.0,-7.851,26.772,140.602,35.174,0.356,0.0,0.0,11.0,1.0,0.0,0
3,1072,2010,0000859163-10-000020,2010-05-20,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3
4,1075,2010,0000950123-10-014366,2010-02-19,-159.767,236.354,0.0,12362.703,33.8634,765.152,3683.327,3498.375,110.188,110.188,2125.206,2421.372,3045.794,414.555,216.979,224.305,224.305,723.884,1138.439,3.27,3.28,,1138.439,184.952,202.989,0.0,8587.477,350.053,9393.867,0.0,1264.194,419.518,3263.645,3683.327,2.239,2.1,2.1,5.0,4.0,1.0,4


In [16]:
# Date_lag  : the previous 'Date' of the same company (gvkey).
# Date_diff : days between consecutive dates, minus 365, i.e. how many days
#             later (+) or earlier (-) than "one year after the previous one".
#
# The lag is computed over the combined train + test history. Shifting inside
# each split separately leaves every company's first row in that split with
# no previous date; the preview of `test` shows a single 2010 row per gvkey,
# so a test-only shift would make Date_lag / Date_diff missing for all of test
# while train has real values.
# NOTE(review): assumes test rows are later in time than the same company's
# train rows (train shows years 2005-2009, test shows 2010) - confirm. Under
# that assumption the train values are identical to a train-only shift.
_filings = pd.concat(
    [train[['gvkey', 'year', 'Date']], test[['gvkey', 'year', 'Date']]],
    keys=['train', 'test'],  # outer index level records which split a row came from
).sort_values(['gvkey', 'year', 'Date'])
_prev_date = _filings.groupby('gvkey')['Date'].shift(1)

# xs() drops the split level, leaving each frame's own row labels, so the
# assignment aligns row by row.
train['Date_lag'] = _prev_date.xs('train', level=0)
test['Date_lag'] = _prev_date.xs('test', level=0)

train['Date_diff'] = (train['Date'] - train['Date_lag']).dt.days - 365
test['Date_diff'] = (test['Date'] - test['Date_lag']).dt.days - 365

In [17]:
# Inspect the new Weekday / Date_lag / Date_diff columns on the first rows.
train.head()

Unnamed: 0,gvkey,year,Filing,Date,Restate_Int,acominc,ap,aqc,at,bkvlps,capx,ceq,ceqt,ch,che,cogs,cstk,dltt,dp,dv,dvc,dvt,ebit,ebitda,epsfi,epspi,gdwl,gp,intan,invt,ivst,lt,ni,ppent,pstk,re,rect,revt,seq,tstk,dvpsp_f,dvpsx_f,au,auop,auopic,Weekday,Date_lag,Date_diff
0,1004,2005,0001104659-05-033688,2005-07-22,0,-13.842,97.002,0.0,978.819,11.5326,16.296,422.717,375.928,121.738,121.738,704.081,40.789,318.576,29.222,0.0,0.0,0.0,62.655,91.877,0.94,1.05,44.432,193.203,46.789,323.592,0.0,556.102,35.163,213.38,0.0,183.55,136.272,897.284,422.717,69.664,0.0,0.0,6.0,1.0,1.0,4,NaT,
1,1004,2006,0001104659-06-047248,2006-07-17,0,-13.899,110.239,38.478,1067.633,13.0998,29.891,494.243,418.105,83.317,83.317,837.171,42.23,253.611,32.199,0.0,0.0,0.0,86.708,118.907,1.4,1.61,72.687,223.998,76.138,342.593,0.0,573.39,58.66,260.167,0.0,242.153,181.691,1061.169,494.243,79.813,0.0,0.0,6.0,1.0,1.0,0,2005-07-22,-5.0
2,1004,2007,0001104659-07-055173,2007-07-20,0,-13.012,99.073,85.21,1362.01,15.0944,30.334,585.255,454.089,109.391,112.435,1080.895,43.932,507.918,39.952,0.0,0.0,0.0,128.57,168.522,1.76,2.02,101.52,304.024,131.166,435.608,3.044,776.755,75.144,310.393,0.0,318.184,202.472,1384.919,585.255,100.935,0.0,0.0,6.0,1.0,1.0,4,2006-07-17,3.0
3,1004,2008,0001047469-08-008126,2008-07-11,0,-23.996,100.651,0.0,1377.511,16.8937,27.535,656.895,505.9,112.505,112.505,1110.677,44.201,392.984,40.551,0.0,0.0,0.0,125.529,166.08,1.87,2.07,109.751,313.299,150.995,477.424,0.0,720.616,78.651,245.586,0.0,385.851,227.3,1423.976,656.895,103.159,0.0,0.0,6.0,1.0,1.0,4,2007-07-20,-8.0
4,1004,2009,0001047469-09-006783,2009-07-16,0,-29.646,114.906,193.989,1501.042,18.9167,28.855,746.906,577.478,79.37,79.37,1065.902,44.87,336.191,38.93,0.0,0.0,0.0,95.415,134.345,1.16,1.17,111.544,286.249,169.428,496.904,0.0,754.692,44.628,334.43,0.0,389.641,238.466,1352.151,746.906,104.447,0.0,0.0,6.0,4.0,1.0,3,2008-07-11,5.0


In [18]:
# ISO week number of each 'Date'.
# `Series.dt.week` was deprecated in pandas 1.1 and removed in pandas 2.0;
# `Series.dt.isocalendar().week` is its replacement. It returns the nullable
# UInt32 dtype, so cast back to int64 to keep the dtype `dt.week` produced.
# NOTE(review): the int64 cast raises if 'Date' contains NaT - presumably
# every row has a date; confirm.
train['Week_num'] = train['Date'].dt.isocalendar().week.astype('int64')
test['Week_num'] = test['Date'].dt.isocalendar().week.astype('int64')
train.head()

Unnamed: 0,gvkey,year,Filing,Date,Restate_Int,acominc,ap,aqc,at,bkvlps,capx,ceq,ceqt,ch,che,cogs,cstk,dltt,dp,dv,dvc,dvt,ebit,ebitda,epsfi,epspi,gdwl,gp,intan,invt,ivst,lt,ni,ppent,pstk,re,rect,revt,seq,tstk,dvpsp_f,dvpsx_f,au,auop,auopic,Weekday,Date_lag,Date_diff,Week_num
0,1004,2005,0001104659-05-033688,2005-07-22,0,-13.842,97.002,0.0,978.819,11.5326,16.296,422.717,375.928,121.738,121.738,704.081,40.789,318.576,29.222,0.0,0.0,0.0,62.655,91.877,0.94,1.05,44.432,193.203,46.789,323.592,0.0,556.102,35.163,213.38,0.0,183.55,136.272,897.284,422.717,69.664,0.0,0.0,6.0,1.0,1.0,4,NaT,,29
1,1004,2006,0001104659-06-047248,2006-07-17,0,-13.899,110.239,38.478,1067.633,13.0998,29.891,494.243,418.105,83.317,83.317,837.171,42.23,253.611,32.199,0.0,0.0,0.0,86.708,118.907,1.4,1.61,72.687,223.998,76.138,342.593,0.0,573.39,58.66,260.167,0.0,242.153,181.691,1061.169,494.243,79.813,0.0,0.0,6.0,1.0,1.0,0,2005-07-22,-5.0,29
2,1004,2007,0001104659-07-055173,2007-07-20,0,-13.012,99.073,85.21,1362.01,15.0944,30.334,585.255,454.089,109.391,112.435,1080.895,43.932,507.918,39.952,0.0,0.0,0.0,128.57,168.522,1.76,2.02,101.52,304.024,131.166,435.608,3.044,776.755,75.144,310.393,0.0,318.184,202.472,1384.919,585.255,100.935,0.0,0.0,6.0,1.0,1.0,4,2006-07-17,3.0,29
3,1004,2008,0001047469-08-008126,2008-07-11,0,-23.996,100.651,0.0,1377.511,16.8937,27.535,656.895,505.9,112.505,112.505,1110.677,44.201,392.984,40.551,0.0,0.0,0.0,125.529,166.08,1.87,2.07,109.751,313.299,150.995,477.424,0.0,720.616,78.651,245.586,0.0,385.851,227.3,1423.976,656.895,103.159,0.0,0.0,6.0,1.0,1.0,4,2007-07-20,-8.0,28
4,1004,2009,0001047469-09-006783,2009-07-16,0,-29.646,114.906,193.989,1501.042,18.9167,28.855,746.906,577.478,79.37,79.37,1065.902,44.87,336.191,38.93,0.0,0.0,0.0,95.415,134.345,1.16,1.17,111.544,286.249,169.428,496.904,0.0,754.692,44.628,334.43,0.0,389.641,238.466,1352.151,746.906,104.447,0.0,0.0,6.0,4.0,1.0,3,2008-07-11,5.0,29
