In [40]:
import pandas as pd
import numpy as np
from pandas import DataFrame
from pandas import Series
# from matplotlib import pyplot as plt
import seaborn as sns
import xgboost as xgb

% matplotlib inline

In [41]:
## Load the raw competition tables.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
severityType = pd.read_csv('severity_type.csv')
resourceType = pd.read_csv('resource_type.csv')
logFeature = pd.read_csv('log_feature.csv')
eventType = pd.read_csv('event_type.csv')
sampleSubmission = pd.read_csv('sample_submission.csv')

trainLabels = train['fault_severity'].values
trainValues = train.drop('fault_severity', axis=1).values
testValues = test.values

## Stack train on top of test so features are engineered once for both.
## dfAllValues is kept for backward compatibility; dfAll itself is built with
## pd.concat so every column keeps its own dtype (np.vstack of mixed int/str
## values yields an all-object array, which left `id` as object dtype).
dfAllValues = np.vstack((trainValues,testValues))
dfAll = pd.concat([train[['id', 'location']], test[['id', 'location']]], ignore_index=True)

## Keep only the numeric token of the location string (values are assumed to
## look like 'location <n>', going by the prefix the original code stripped).
## str.lstrip('location ') removes a *set of characters*, not a prefix, and only
## worked because digits are not part of that set.
dfAll['location'] = dfAll['location'].str.split(' ').str[-1].astype(int)

## Table information
## DataFrame.info() prints its report itself and returns None, so each print()
## also emits a "None" line; kept so the output matches earlier runs.
infoTables = [train, test, severityType, resourceType, logFeature, eventType, dfAll]
for position, table in enumerate(infoTables):
    if position:
        print('------------------------')
    print(table.info())

## Confirm Distinct ID count of each tables
for table in (severityType, train, test, logFeature, resourceType, eventType, dfAll):
    print(table['id'].nunique())


dfAll.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7381 entries, 0 to 7380
Data columns (total 3 columns):
id                7381 non-null int64
location          7381 non-null object
fault_severity    7381 non-null int64
dtypes: int64(2), object(1)
memory usage: 230.7+ KB
None
------------------------
<class 'pandas.core.frame.DataFrame'>
Int64Index: 11171 entries, 0 to 11170
Data columns (total 2 columns):
id          11171 non-null int64
location    11171 non-null object
dtypes: int64(1), object(1)
memory usage: 261.8+ KB
None
------------------------
<class 'pandas.core.frame.DataFrame'>
Int64Index: 18552 entries, 0 to 18551
Data columns (total 2 columns):
id               18552 non-null int64
severity_type    18552 non-null object
dtypes: int64(1), object(1)
memory usage: 434.8+ KB
None
------------------------
<class 'pandas.core.frame.DataFrame'>
Int64Index: 21076 entries, 0 to 21075
Data columns (total 2 columns):
id               21076 non-null int64
resource_type    21076 non-

Unnamed: 0,id,location
0,14121,118
1,9320,91
2,14394,152
3,8218,931
4,14804,120


In [42]:
## create pivot table (logFeature, resourceType, eventType) by index: Id, column: distinct feature values, values: 0/1
## To merge features to Train/Test by ID

def _indicatorPivot(table, column):
    """Return an id x category frame holding 1 where the (id, category) pair occurs, else 0.

    `table` is a long-format frame with an 'id' column and a categorical `column`.
    The cells are derived from null-ness rather than by regex-replacing the
    category strings, so the result is integer-typed regardless of how pandas
    chooses to downcast object columns.
    """
    pivoted = table.pivot(index='id', columns=column, values=column)
    return pivoted.notnull().astype(int)


def _idToColumn(pivoted):
    """Return a copy of `pivoted` with its index copied into a trailing 'id' column and a fresh 0..n-1 index."""
    flat = pivoted.copy()
    flat['id'] = flat.index
    return flat.reset_index(drop=True)


## number of log_feature rows per id
logFeatureGroup = logFeature.groupby('id', as_index=False).agg({'log_feature':pd.Series.count})
logFeatureGroup.columns =['id','featureCount']

## log features keep their volume as the cell value (0 where the feature is absent);
## the three categorical tables become 0/1 indicators
logFeaturePivot = _idToColumn(
    logFeature.pivot(index='id', columns='log_feature', values='volume').fillna(0))
resourceTypePivot = _idToColumn(_indicatorPivot(resourceType, 'resource_type'))
eventTypePivot = _idToColumn(_indicatorPivot(eventType, 'event_type'))
severityTypePivot = _idToColumn(_indicatorPivot(severityType, 'severity_type'))

## per-id row totals (the id column itself is excluded from the sum)
logFeaturePivot['FeatureCnt'] = logFeaturePivot.drop('id', axis=1).sum(axis=1)   ## Feature sum
resourceTypePivot['resouceCnt'] = resourceTypePivot.drop('id', axis=1).sum(axis=1) ## ResourceType sum
eventTypePivot['eventCnt'] = eventTypePivot.drop('id', axis=1).sum(axis=1)    ## eventType sum

In [43]:
## Join Features to dfAll Dataset.
## The joins are inner joins, so an id missing from any feature table would be
## dropped silently. The train/test split further down slices dfAll by position
## and assigns labels positionally, so a changed row count would misalign the
## data. Check the row count after every merge and fail loudly instead.
expectedRows = dfAll.shape[0]
for featureTable in (logFeaturePivot, logFeatureGroup, resourceTypePivot,
                     eventTypePivot, severityTypePivot):
    dfAll = pd.merge(left=dfAll, right=featureTable, how='inner', on='id')
    assert dfAll.shape[0] == expectedRows, \
        'merge on id changed the row count: %d -> %d' % (expectedRows, dfAll.shape[0])

dfAll.head()

Unnamed: 0,id,location,feature 1,feature 10,feature 100,feature 101,feature 102,feature 103,feature 104,feature 105,...,event_type 6,event_type 7,event_type 8,event_type 9,eventCnt,severity_type 1,severity_type 2,severity_type 3,severity_type 4,severity_type 5
0,14121,118,0,0,0,0,0,0,0,0,...,0,0,0,0,2,0,1,0,0,0
1,9320,91,0,0,0,0,0,0,0,0,...,0,0,0,0,2,0,1,0,0,0
2,14394,152,0,0,0,0,0,0,0,0,...,0,0,0,0,2,0,1,0,0,0
3,8218,931,0,0,0,0,0,0,0,0,...,0,0,0,0,2,1,0,0,0,0
4,14804,120,0,0,0,0,0,0,0,0,...,0,0,0,0,4,1,0,0,0,0


In [44]:
## Make Final dfAll, Train, Test Dataset.

trainSize = train.shape[0]
testSize = test.shape[0]


def _addSeverityProbs(frame, levels=range(1, 6)):
    """Add severity<k>_prob = severityType<k>_cnt / locAppear to `frame` for each k in `levels`.

    The columns are added in place and the same frame is returned.
    """
    for level in levels:
        frame['severity%d_prob' % level] = frame['severityType%d_cnt' % level] / frame['locAppear']
    return frame


## .copy() so the label column is written to an independent frame and not to a
## slice of dfAll (avoids SettingWithCopyWarning and accidental aliasing).
train = dfAll[:trainSize].copy()
train['fault_severity'] = trainLabels

## Per-location statistics from the training rows: number of rows at the
## location (locAppear) and the sum of each severity_type indicator.
locationAgg = {'id': pd.Series.count}
locationNames = {'id': 'locAppear'}
for level in range(1, 6):
    locationAgg['severity_type %d' % level] = pd.Series.sum
    locationNames['severity_type %d' % level] = 'severityType%d_cnt' % level
trainLocationGroup = train.groupby('location', as_index=False).agg(locationAgg)
## Rename by name, not by position: the column order of a dict-keyed agg follows
## dict iteration order, so a positional rename mislabels the counts whenever
## that order differs.
trainLocationGroup = trainLocationGroup.rename(columns=locationNames)
## Same column order the positional rename used to produce.
trainLocationGroup = trainLocationGroup[['location','severityType2_cnt','severityType5_cnt','severityType4_cnt',
                                         'severityType3_cnt','locAppear','severityType1_cnt']]

train = pd.merge(left=train, right=trainLocationGroup, how='left', on='location')
train = _addSeverityProbs(train)

## Test rows get the statistics of their location as seen in train; locations
## that never occur in train come out of the left join as NaN and are set to 0.
test = pd.merge(left=dfAll[trainSize:], right=trainLocationGroup, how='left', on='location')
test = _addSeverityProbs(test).fillna(0)


trainId = dfAll['id'][:trainSize]
testId = dfAll['id'][trainSize:]
## NOTE(review): the model matrices below are taken from dfAll, so the per-location
## columns added to train/test above are not part of trainValues/testValues -- confirm
## this is intended.
dfAllValues = dfAll.drop('id', axis=1).values    ## Use location as a feature!!
# dfAllValues = dfAll.drop(['id','location'], axis=1).values      ## Don't use location as a feature!!
trainValues = dfAllValues[:trainSize]
testValues = dfAllValues[trainSize:]


## export data for matlab
pd.DataFrame(trainValues).to_csv('trainValues.csv')
pd.DataFrame(testValues).to_csv('testValues.csv')
pd.DataFrame(trainLabels).to_csv('trainLabels.csv')
pd.DataFrame(trainId).to_csv('trainId.csv')
pd.DataFrame(testId).to_csv('testId.csv')

In [47]:
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import linear_model as lm
from sklearn import cross_validation
import scipy as sp

## Integer-encode the labels (only the commented-out models below consume `y`).
le = LabelEncoder()
y = le.fit_transform(trainLabels)

## Random Forest
# rf = RandomForestClassifier(n_estimators=300, max_depth=25,min_samples_split=3, random_state=0)  ## 0.7619(25,3)
# rf.fit(trainValues, y)
# y_pred_rf = rf.predict_proba(testValues)

## Xgboost
xg = xgb.XGBClassifier(n_estimators=300, learning_rate=0.1, max_depth=8)  ## max depth = 7
xg.fit(trainValues, trainLabels)
y_pred_xg = xg.predict_proba(testValues)


## Logistic Regression
# lgr = lm.LogisticRegression(penalty='l2', dual=False,\
#                                       tol=0.0001, C=1.5, fit_intercept=True,\
#                                       intercept_scaling=1, class_weight='balanced',\
#                                       random_state=None)
# lgr.fit(trainValues, y)
# y_pred_logit = lgr.predict_proba(testValues)


## export to csv file
# LogitSubmission = pd.DataFrame({'id':testId, 'predict_0':y_pred_logit[:,0],'predict_1':y_pred_logit[:,1],\
#                                'predict_2':y_pred_logit[:,2]})
# RfSubmission = pd.DataFrame({'id':testId, 'predict_0':y_pred_rf[:,0],'predict_1':y_pred_rf[:,1],\
#                                'predict_2':y_pred_rf[:,2]})

## One probability column per class, next to the test ids.
xgSubmission = pd.DataFrame({'id': testId})
for classIdx in range(3):
    xgSubmission['predict_%d' % classIdx] = y_pred_xg[:, classIdx]


# LogitSubmission.to_csv('logit_sub.csv', index=False)
# RfSubmission.to_csv('rf_sub.csv',index=False)
xgSubmission.to_csv('xg_sub.csv',index=False)

# rf_cv = cross_validation.cross_val_score(rf, trainValues, trainLabels, cv=7,\
#                                             n_jobs=1, verbose=0)
# lgr_cv = cross_validation.cross_val_score(lgr, trainValues, trainLabels, cv=7,\
#                                             n_jobs=1, verbose=0)

## 7-fold cross-validation of the xgboost model using the estimator's default scorer.
xg_cv = cross_validation.cross_val_score(
    xg, trainValues, trainLabels,
    cv=7, n_jobs=1, verbose=0)


# print np.mean(rf_cv)
# print np.mean(lgr_cv)
print(np.mean(xg_cv))

0.766839176649


In [9]:
## evaluation function
def logloss(act, pred):
    """Mean binary cross-entropy (log loss) of predicted probabilities.

    Parameters
    ----------
    act : array-like
        Actual outcomes; the formula treats them as 0/1 indicators.
    pred : array-like
        Predicted probabilities, same shape as `act`.

    Returns
    -------
    The negated mean, over the first axis, of
    act*log(pred) + (1-act)*log(1-pred). A scalar for 1-D input.
    """
    epsilon = 1e-15
    act = np.asarray(act)
    # Clip predictions into [epsilon, 1-epsilon] so log() never sees 0.
    pred = np.clip(pred, epsilon, 1 - epsilon)
    # NumPy functions are used directly; the aliases that used to live in the
    # scipy root namespace (sp.log, sp.maximum, ...) were deprecated and removed.
    ll = np.sum(act * np.log(pred) + (1 - act) * np.log(1 - pred), axis=0)
    ll = ll * -1.0 / len(act)
    return ll

In [18]:
## Cross-validation
## Candidate grids; only the xgboost depth grid is used by the active loop below.
rf_parameter = [range(100, 200), range(25, 31), range(1, 5)]  ## n_estimators, max_depth, min_samples_split
lgr_parameter = [range(1, 5), range(4, 8), range(1, 5)]  ## tolerance, C, intercept_scaling
xg_parameter = [5, 6, 7, 8]


## Mean / std of the fold scores for every setting tried.
scores_rf, scores_lgr = [], []
scores_std_rf, scores_std_lgr = [], []

# for maxDepth in rf_parameter[1]:
#     for minSplit in rf_parameter[2]:
#         rf.max_depth = maxDepth
#         rf.min_samples_split = minSplit
#         cv_score = cross_validation.cross_val_score(rf, trainValues, trainLabels, cv=7,\
#                                             n_jobs=1, verbose=0)

#         print '(' + str(maxDepth) + ',' + str(minSplit) +')' + '--->' + 'cv_score_mean:' + str(cv_score.mean())
#         scores_rf.append(np.mean(cv_score))
#         scores_std_rf.append(np.std(cv_score))    


## Score the xgboost model with 7-fold CV at each candidate max_depth.
## Note: the results are collected in the scores_rf / scores_std_rf lists.
for maxDepth in xg_parameter:
    xg.max_depth = maxDepth
    cv_score = cross_validation.cross_val_score(
        xg, trainValues, trainLabels,
        cv=7, n_jobs=1, verbose=0)

    print('(' + str(maxDepth) + ')--->cv_score_mean:' + str(cv_score.mean()))
    scores_rf.append(cv_score.mean())
    scores_std_rf.append(cv_score.std())


        
# for tol in lgr_parameter[0]:
#     for c in lgr_parameter[1]:
#         lgr.tol = 0.01*tol
#         lgr.C = 1+c
#         cv_score = cross_validation.cross_val_score(lgr, trainValues, trainLabels, cv=7,\
#                                             n_jobs=1, verbose=0)

#         print '(' + str(tol) + ',' + str(c) +')' + '--->' + 'cv_score_mean:' + str(cv_score.mean())
#         scores_lgr.append(np.mean(cv_score))
#         scores_std_lgr.append(np.std(cv_score))

(5)--->cv_score_mean:0.75532214966
(6)--->cv_score_mean:0.75328740725
(7)--->cv_score_mean:0.753423069909
(8)--->cv_score_mean:0.753427055719


In [6]:
from sklearn.decomposition import PCA, KernelPCA

## Kernel PCA for dimension reduction
# kpca = KernelPCA(kernel="rbf", fit_inverse_transform=True, gamma=10)
# trainValues_kpca = kpca.fit_transform(trainValues)
# trainValues_back = kpca.inverse_transform(trainValues_kpca)


## PCA for dimension reduction
pca = PCA(n_components=200)
trainValues_pca = pca.fit_transform(trainValues)
## Project the test rows onto the components learned from the training rows.
## Calling fit_transform here would refit the PCA on the test set, leaving the
## train and test matrices in different bases.
testValues_pca = pca.transform(testValues)

In [45]:
## Preview the first rows only; avoid rendering the whole frame.
test.head(10)

Unnamed: 0,id,location,feature 1,feature 10,feature 100,feature 101,feature 102,feature 103,feature 104,feature 105,...,severityType5_cnt,severityType4_cnt,severityType3_cnt,locAppear,severityType1_cnt,severity1_prob,severity2_prob,severity3_prob,severity4_prob,severity5_prob
0,11066,481,0,0,0,0,0,0,0,0,...,0,0,0,20,2,0.100000,0.900000,0.0,0.000000,0
1,18000,962,0,0,0,0,0,0,0,0,...,0,0,0,45,13,0.288889,0.711111,0.0,0.000000,0
2,16964,491,0,0,0,0,0,0,0,0,...,0,0,0,15,1,0.066667,0.933333,0.0,0.000000,0
3,4795,532,0,0,0,0,0,0,0,0,...,0,1,0,3,2,0.666667,0.000000,0.0,0.333333,0
4,3392,600,0,0,0,0,0,0,0,0,...,0,0,0,64,9,0.140625,0.859375,0.0,0.000000,0
5,3795,794,0,0,0,0,0,0,0,0,...,0,0,0,36,36,1.000000,0.000000,0.0,0.000000,0
6,2881,375,0,0,0,0,0,0,0,0,...,0,0,0,11,0,0.000000,1.000000,0.0,0.000000,0
7,1903,638,0,0,0,0,0,0,0,0,...,0,3,0,25,22,0.880000,0.000000,0.0,0.120000,0
8,5245,690,0,0,0,0,0,0,0,0,...,0,0,0,10,9,0.900000,0.100000,0.0,0.000000,0
9,6726,893,0,0,0,0,0,0,0,0,...,0,0,0,20,20,1.000000,0.000000,0.0,0.000000,0
