In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from scipy.stats.mstats import mode

%matplotlib inline
from matplotlib.pyplot import rcParams
rcParams["figure.figsize"] = 12 , 4

In [2]:
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")
event_type = pd.read_csv("data/event_type.csv")
log_feature = pd.read_csv("data/log_feature.csv")
resource_type = pd.read_csv("data/resource_type.csv")
severity_type = pd.read_csv("data/severity_type.csv")

In [4]:
event_type["id"].unique()

array([6597, 8011, 2597, ..., 6488,  878, 4464])

In [5]:
log_feature["id"].unique()

array([6597, 8011, 2597, ..., 6488,  878, 4464])

In [6]:
resource_type["id"].unique()

array([6597, 8011, 2597, ..., 6488,  878, 4464])

In [7]:
severity_type["id"].unique()

array([6597, 8011, 2597, ..., 6488,  878, 4464])

In [8]:
train["id"].unique()

array([14121,  9320, 14394, ..., 14111, 15189, 17067])

In [9]:
severity_type.shape

(18552, 2)

In [10]:
severity_type

Unnamed: 0,id,severity_type
0,6597,severity_type 2
1,8011,severity_type 2
2,2597,severity_type 2
3,5022,severity_type 1
4,6852,severity_type 1
5,5611,severity_type 2
6,14838,severity_type 1
7,2588,severity_type 1
8,4848,severity_type 1
9,6914,severity_type 1


In [3]:
train.head()

Unnamed: 0,id,location,fault_severity
0,14121,location 118,1
1,9320,location 91,0
2,14394,location 152,1
3,8218,location 931,1
4,14804,location 120,0


In [4]:
train["source"] = "train"
test["source"] = "test"
data = pd.concat([train , test] , ignore_index=True)

In [5]:
data.head()

Unnamed: 0,fault_severity,id,location,source
0,1.0,14121,location 118,train
1,0.0,9320,location 91,train
2,1.0,14394,location 152,train
3,1.0,8218,location 931,train
4,0.0,14804,location 120,train


In [6]:
print train.shape
print test.shape
print data.shape
print event_type.shape
print log_feature.shape
print resource_type.shape
print severity_type.shape

(7381, 4)
(11171, 3)
(18552, 4)
(31170, 2)
(58671, 3)
(21076, 2)
(18552, 2)


In [7]:
print len(data["location"].unique())

1126


In [8]:
data["fault_severity"].value_counts()

0.0    4784
1.0    1871
2.0     726
Name: fault_severity, dtype: int64

In [9]:
log_feature.head()

Unnamed: 0,id,log_feature,volume
0,6597,feature 68,6
1,8011,feature 68,7
2,2597,feature 68,1
3,5022,feature 172,2
4,5022,feature 56,1


In [10]:
resource_type.head()

Unnamed: 0,id,resource_type
0,6597,resource_type 8
1,8011,resource_type 8
2,2597,resource_type 8
3,5022,resource_type 8
4,6852,resource_type 8


In [11]:
severity_type.head()

Unnamed: 0,id,severity_type
0,6597,severity_type 2
1,8011,severity_type 2
2,2597,severity_type 2
3,5022,severity_type 1
4,6852,severity_type 1


In [12]:
event_type.head()

Unnamed: 0,id,event_type
0,6597,event_type 11
1,8011,event_type 15
2,2597,event_type 15
3,5022,event_type 15
4,5022,event_type 11


In [16]:
event_type["event_type"].unique()
len(event_type["id"].unique())


18552

In [17]:
event_type = event_type.merge(data , on="id")

In [18]:
event_type.head()

Unnamed: 0,id,event_type,fault_severity,location,source
0,6597,event_type 11,,location 1,test
1,8011,event_type 15,0.0,location 1,train
2,2597,event_type 15,,location 1,test
3,5022,event_type 15,,location 1,test
4,5022,event_type 11,,location 1,test


In [19]:
event_type_unq = pd.DataFrame(event_type["event_type"].value_counts())
event_type_unq.head()

Unnamed: 0,event_type
event_type 11,7888
event_type 35,6615
event_type 34,5927
event_type 15,4395
event_type 20,1458


In [28]:
pivot_test = event_type.pivot_table(values = "source" , index = "event_type" , aggfunc= lambda x: sum(x == "train"))
pivot_test.head()

Unnamed: 0_level_0,source
event_type,Unnamed: 1_level_1
event_type 1,1
event_type 10,58
event_type 11,3068
event_type 12,2
event_type 13,247


In [29]:
# Determine % of training samples in certain event_type
event_type_unq["PercTrain"] = event_type.pivot_table(values = "source" , index = "event_type" , aggfunc = lambda x: sum(x == "train") / float(len(x)))
event_type_unq.head()

Unnamed: 0,event_type,PercTrain
event_type 11,7888,0.388945
event_type 35,6615,0.407105
event_type 34,5927,0.406783
event_type 15,4395,0.392264
event_type 20,1458,0.38203


In [36]:
pivot_test2 = event_type.loc[event_type["source"] == "train"].pivot_table(values = "fault_severity", index = "event_type" , aggfunc = mode)
pivot_test2.head()

Unnamed: 0_level_0,fault_severity
event_type,Unnamed: 1_level_1
event_type 1,"([0.0], [1.0])"
event_type 10,"([1.0], [30.0])"
event_type 11,"([0.0], [1677.0])"
event_type 12,"([0.0], [1.0])"
event_type 13,"([1.0], [136.0])"


In [37]:
help(mode)

Help on function mode in module scipy.stats.mstats_basic:

mode(a, axis=0)
    Returns an array of the modal (most common) value in the passed array.
    
    Parameters
    ----------
    a : array_like
        n-dimensional array of which to find mode(s).
    axis : int or None, optional
        Axis along which to operate. Default is 0. If None, compute over
        the whole array `a`.
    
    Returns
    -------
    mode : ndarray
        Array of modal values.
    count : ndarray
        Array of counts for each mode.
    
    Notes
    -----
    For more details, see `stats.mode`.



In [38]:
event_type_unq["Mode_Severity"] = event_type.loc[event_type["source"] == "train"].pivot_table(values = "fault_severity" , index = "event_type" , aggfunc = lambda x:mode(x).mode[0])

In [190]:
event_type_unq.head()

Unnamed: 0,event_type,PercTrain,Mode_Severity,preprocess
event_type 11,7888,0.388945,0.0,event_type 11
event_type 35,6615,0.407105,0.0,event_type 35
event_type 34,5927,0.406783,0.0,event_type 34
event_type 15,4395,0.392264,0.0,event_type 15
event_type 20,1458,0.38203,0.0,event_type 20


In [191]:
event_type_unq.dtypes

event_type         int64
PercTrain        float64
Mode_Severity    float64
preprocess        object
dtype: object

In [193]:
event_type_unq.iloc[:33]

Unnamed: 0,event_type,PercTrain,Mode_Severity,preprocess
event_type 11,7888,0.388945,0.0,event_type 11
event_type 35,6615,0.407105,0.0,event_type 35
event_type 34,5927,0.406783,0.0,event_type 34
event_type 15,4395,0.392264,0.0,event_type 15
event_type 20,1458,0.38203,0.0,event_type 20
event_type 54,684,0.385965,0.0,event_type 54
event_type 13,582,0.424399,1.0,event_type 13
event_type 42,478,0.387029,0.0,event_type 42
event_type 44,466,0.381974,0.0,event_type 44
event_type 23,429,0.461538,0.0,event_type 23


In [194]:
event_type_unq.iloc[:50]

Unnamed: 0,event_type,PercTrain,Mode_Severity,preprocess
event_type 11,7888,0.388945,0.0,event_type 11
event_type 35,6615,0.407105,0.0,event_type 35
event_type 34,5927,0.406783,0.0,event_type 34
event_type 15,4395,0.392264,0.0,event_type 15
event_type 20,1458,0.38203,0.0,event_type 20
event_type 54,684,0.385965,0.0,event_type 54
event_type 13,582,0.424399,1.0,event_type 13
event_type 42,478,0.387029,0.0,event_type 42
event_type 44,466,0.381974,0.0,event_type 44
event_type 23,429,0.461538,0.0,event_type 23


In [40]:
event_type_unq.iloc[-15:]

Unnamed: 0,event_type,PercTrain,Mode_Severity
event_type 53,17,0.294118,2.0
event_type 9,14,0.214286,0.0
event_type 19,14,0.285714,2.0
event_type 31,10,0.5,1.0
event_type 37,10,0.4,0.0
event_type 12,6,0.333333,0.0
event_type 25,5,0.2,0.0
event_type 1,4,0.25,0.0
event_type 51,4,0.75,0.0
event_type 17,3,0.0,


In [43]:
event_type_unq["preprocess"] = event_type_unq.index.values

In [44]:
event_type_unq.head()

Unnamed: 0,event_type,PercTrain,Mode_Severity,preprocess
event_type 11,7888,0.388945,0.0,event_type 11
event_type 35,6615,0.407105,0.0,event_type 35
event_type 34,5927,0.406783,0.0,event_type 34
event_type 15,4395,0.392264,0.0,event_type 15
event_type 20,1458,0.38203,0.0,event_type 20


In [45]:
top_unchange = 33
event_type_unq["preprocess"].iloc[top_unchange:] = event_type_unq["Mode_Severity"].iloc[top_unchange:].apply(lambda x: "Remove" if pd.isnull(x) else "event_type others_%d"%int(x))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [50]:
event_type_unq.iloc[-15:]

Unnamed: 0,event_type,PercTrain,Mode_Severity,preprocess
event_type 53,17,0.294118,2.0,event_type others_2
event_type 9,14,0.214286,0.0,event_type others_0
event_type 19,14,0.285714,2.0,event_type others_2
event_type 31,10,0.5,1.0,event_type others_1
event_type 37,10,0.4,0.0,event_type others_0
event_type 12,6,0.333333,0.0,event_type others_0
event_type 25,5,0.2,0.0,event_type others_0
event_type 1,4,0.25,0.0,event_type others_0
event_type 51,4,0.75,0.0,event_type others_0
event_type 17,3,0.0,,Remove


In [51]:
event_type_unq["preprocess"].value_counts()

event_type others_0    10
event_type others_2     5
Remove                  4
event_type 10           1
event_type 27           1
event_type 54           1
event_type 26           1
event_type 50           1
event_type 28           1
event_type 47           1
event_type 8            1
event_type 43           1
event_type 42           1
event_type 34           1
event_type 35           1
event_type 21           1
event_type 20           1
event_type 23           1
event_type 45           1
event_type 46           1
event_type 11           1
event_type 30           1
event_type 40           1
event_type 29           1
event_type 14           1
event_type 15           1
event_type 18           1
event_type 32           1
event_type 22           1
event_type 2            1
event_type 24           1
event_type 5            1
event_type 6            1
event_type 7            1
event_type 13           1
event_type others_1     1
event_type 44           1
Name: preprocess, dtype: int64

In [52]:
event_type_unq

Unnamed: 0,event_type,PercTrain,Mode_Severity,preprocess
event_type 11,7888,0.388945,0.0,event_type 11
event_type 35,6615,0.407105,0.0,event_type 35
event_type 34,5927,0.406783,0.0,event_type 34
event_type 15,4395,0.392264,0.0,event_type 15
event_type 20,1458,0.38203,0.0,event_type 20
event_type 54,684,0.385965,0.0,event_type 54
event_type 13,582,0.424399,1.0,event_type 13
event_type 42,478,0.387029,0.0,event_type 42
event_type 44,466,0.381974,0.0,event_type 44
event_type 23,429,0.461538,0.0,event_type 23


In [53]:
event_type.head()

Unnamed: 0,id,event_type,fault_severity,location,source
0,6597,event_type 11,,location 1,test
1,8011,event_type 15,0.0,location 1,train
2,2597,event_type 15,,location 1,test
3,5022,event_type 15,,location 1,test
4,5022,event_type 11,,location 1,test


In [54]:
event_type = event_type.merge(event_type_unq[["preprocess"]] , left_on="event_type" , right_index=True)

In [55]:
event_type.head()

Unnamed: 0,id,event_type,fault_severity,location,source,preprocess
0,6597,event_type 11,,location 1,test,event_type 11
4,5022,event_type 11,,location 1,test,event_type 11
5,6852,event_type 11,,location 1,test,event_type 11
9,14838,event_type 11,,location 1,test,event_type 11
11,2588,event_type 11,0.0,location 1,train,event_type 11


In [56]:
event_type["preprocess"].value_counts()

event_type 11          7888
event_type 35          6615
event_type 34          5927
event_type 15          4395
event_type 20          1458
event_type 54           684
event_type 13           582
event_type 42           478
event_type 44           466
event_type 23           429
event_type 14           330
event_type 43           306
event_type 22           223
event_type 50           154
event_type 10           145
event_type 21           136
event_type others_2      85
event_type others_0      84
event_type 18            73
event_type 47            69
event_type 26            65
event_type 32            63
event_type 30            60
event_type 45            53
event_type 24            46
event_type 27            44
event_type 29            42
event_type 40            40
event_type 46            38
event_type 2             37
event_type 28            32
event_type 8             29
event_type 6             28
event_type 5             26
event_type 7             24
event_type others_1 

In [57]:
event_type_merge = event_type.pivot_table(values="event_type" , index = "id" , columns="preprocess" , aggfunc=lambda x: len(x) , fill_value=0)

In [195]:
event_type_merge.shape

(18552, 37)

In [197]:
event_type.pivot_table(values="event_type" , index = "id" , aggfunc=lambda x:len(x))

Unnamed: 0_level_0,event_type
id,Unnamed: 1_level_1
1,2
2,2
3,1
4,1
5,2
6,1
7,2
8,2
9,2
10,1


In [59]:
event_type_merge.head()

preprocess,Remove,event_type 10,event_type 11,event_type 13,event_type 14,event_type 15,event_type 18,event_type 2,event_type 20,event_type 21,...,event_type 47,event_type 5,event_type 50,event_type 54,event_type 6,event_type 7,event_type 8,event_type others_0,event_type others_1,event_type others_2
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [60]:
data = data.merge(event_type_merge , left_on="id" , right_index=True)

In [61]:
data.head()

Unnamed: 0,fault_severity,id,location,source,Remove,event_type 10,event_type 11,event_type 13,event_type 14,event_type 15,...,event_type 47,event_type 5,event_type 50,event_type 54,event_type 6,event_type 7,event_type 8,event_type others_0,event_type others_1,event_type others_2
0,1.0,14121,location 118,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.0,9320,location 91,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1.0,14394,location 152,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1.0,8218,location 931,train,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0.0,14804,location 120,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [62]:
data.shape

(18552, 41)

# log feature

In [63]:
log_feature.head()

Unnamed: 0,id,log_feature,volume
0,6597,feature 68,6
1,8011,feature 68,7
2,2597,feature 68,1
3,5022,feature 172,2
4,5022,feature 56,1


In [64]:
log_feature["log_feature"].value_counts()

feature 312    5267
feature 232    4754
feature 82     3472
feature 203    2823
feature 313    2145
feature 233    1901
feature 307    1597
feature 54     1573
feature 170    1526
feature 71     1514
feature 315    1495
feature 134    1419
feature 80     1336
feature 235    1294
feature 193    1160
feature 219    1152
feature 68     1093
feature 227    1080
feature 314     950
feature 201     902
feature 234     882
feature 73      868
feature 195     783
feature 301     707
feature 309     627
feature 55      564
feature 229     560
feature 273     491
feature 308     484
feature 368     462
               ... 
feature 16        1
feature 271       1
feature 175       1
feature 272       1
feature 263       1
feature 254       1
feature 257       1
feature 250       1
feature 252       1
feature 382       1
feature 386       1
feature 385       1
feature 96        1
feature 3         1
feature 144       1
feature 260       1
feature 262       1
feature 17        1
feature 379       1


In [65]:
log_feature.shape

(58671, 3)

In [67]:
log_feature = log_feature.merge(data[["id" , "fault_severity" , "source"]] , on = "id")

In [68]:
log_feature.head()

Unnamed: 0,id,log_feature,volume,fault_severity,source
0,6597,feature 68,6,,test
1,8011,feature 68,7,0.0,train
2,2597,feature 68,1,,test
3,5022,feature 172,2,,test
4,5022,feature 56,1,,test


In [69]:
log_feature_unq = pd.DataFrame(log_feature["log_feature"].value_counts())
log_feature_unq.head()

Unnamed: 0,log_feature
feature 312,5267
feature 232,4754
feature 82,3472
feature 203,2823
feature 313,2145


In [70]:
log_feature_unq["PercTrain"] = log_feature.pivot_table(values="source" , index = "log_feature" , aggfunc=lambda x: sum(x == "train")/float(len(x)))
log_feature_unq.head()

Unnamed: 0,log_feature,PercTrain
feature 312,5267,0.41143
feature 232,4754,0.408919
feature 82,3472,0.393433
feature 203,2823,0.393199
feature 313,2145,0.421445


In [71]:
log_feature_unq["Mode_Severity"] = log_feature.loc[log_feature["source"] == "train"].pivot_table(values="fault_severity" , index = "log_feature" , aggfunc=lambda x: mode(x).mode[0])
log_feature_unq.head()

Unnamed: 0,log_feature,PercTrain,Mode_Severity
feature 312,5267,0.41143,0.0
feature 232,4754,0.408919,0.0
feature 82,3472,0.393433,2.0
feature 203,2823,0.393199,2.0
feature 313,2145,0.421445,0.0


In [72]:
len(log_feature_unq)

386

In [74]:
log_feature_unq.iloc[100:130]

Unnamed: 0,log_feature,PercTrain,Mode_Severity
feature 318,44,0.477273,0.0
feature 66,42,0.238095,1.0
feature 132,42,0.404762,1.0
feature 83,40,0.5,2.0
feature 218,40,0.5,1.0
feature 240,39,0.333333,0.0
feature 188,38,0.263158,1.0
feature 284,38,0.526316,0.0
feature 239,38,0.5,0.0
feature 63,37,0.378378,0.0


In [75]:
log_feature_unq["preprocess"] = log_feature_unq.index.values

In [76]:
log_feature_unq["preprocess"].loc[log_feature_unq["PercTrain"] == 1] = np.nan

In [77]:
top_unchange = 128
log_feature_unq["preprocess"].iloc[top_unchange:] = log_feature_unq["Mode_Severity"].iloc[top_unchange:].apply(lambda x: "Remove" if pd.isnull(x) else "feature others_%d"%int(x))

In [78]:
log_feature_unq["preprocess"].value_counts()

feature others_0    111
feature others_1     64
Remove               55
feature others_2     28
feature 203           1
feature 83            1
feature 362           1
feature 81            1
feature 360           1
feature 309           1
feature 172           1
feature 202           1
feature 85            1
feature 171           1
feature 368           1
feature 179           1
feature 86            1
feature 207           1
feature 201           1
feature 206           1
feature 205           1
feature 204           1
feature 135           1
feature 134           1
feature 209           1
feature 133           1
feature 132           1
feature 289           1
feature 75            1
feature 74            1
                   ... 
feature 316           1
feature 318           1
feature 76            1
feature 354           1
feature 95            1
feature 94            1
feature 8             1
feature 167           1
feature 163           1
feature 160           1
feature 161     

In [79]:
log_feature_unq

Unnamed: 0,log_feature,PercTrain,Mode_Severity,preprocess
feature 312,5267,0.411430,0.0,feature 312
feature 232,4754,0.408919,0.0,feature 232
feature 82,3472,0.393433,2.0,feature 82
feature 203,2823,0.393199,2.0,feature 203
feature 313,2145,0.421445,0.0,feature 313
feature 233,1901,0.423987,0.0,feature 233
feature 307,1597,0.420789,0.0,feature 307
feature 54,1573,0.399237,1.0,feature 54
feature 170,1526,0.396461,1.0,feature 170
feature 71,1514,0.408851,0.0,feature 71


In [80]:
log_feature.head()

Unnamed: 0,id,log_feature,volume,fault_severity,source
0,6597,feature 68,6,,test
1,8011,feature 68,7,0.0,train
2,2597,feature 68,1,,test
3,5022,feature 172,2,,test
4,5022,feature 56,1,,test


In [81]:
log_feature = log_feature.merge(log_feature_unq[["preprocess"]] , left_on="log_feature" , right_index=True)

In [82]:
event_type.head()

Unnamed: 0,id,event_type,fault_severity,location,source,preprocess
0,6597,event_type 11,,location 1,test,event_type 11
4,5022,event_type 11,,location 1,test,event_type 11
5,6852,event_type 11,,location 1,test,event_type 11
9,14838,event_type 11,,location 1,test,event_type 11
11,2588,event_type 11,0.0,location 1,train,event_type 11


In [83]:
log_feature.head()

Unnamed: 0,id,log_feature,volume,fault_severity,source,preprocess
0,6597,feature 68,6,,test,feature 68
1,8011,feature 68,7,0.0,train,feature 68
2,2597,feature 68,1,,test,feature 68
23,6914,feature 68,11,0.0,train,feature 68
41,16416,feature 68,4,,test,feature 68


In [84]:
log_feature_merge = log_feature.pivot_table(values="volume" ,index = "id" , columns="preprocess" , aggfunc=np.sum ,fill_value=0)

In [85]:
log_feature_merge.head()

preprocess,Remove,feature 101,feature 103,feature 105,feature 109,feature 118,feature 132,feature 133,feature 134,feature 135,...,feature 82,feature 83,feature 85,feature 86,feature 87,feature 94,feature 95,feature others_0,feature others_1,feature others_2
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [87]:
log_feature_merge.sum()

preprocess
Remove                277
feature 101          4643
feature 103          5900
feature 105          2265
feature 109          3880
feature 118            43
feature 132            96
feature 133           117
feature 134          1885
feature 135           150
feature 153           166
feature 154           101
feature 155           391
feature 157            80
feature 160           359
feature 161            36
feature 163           291
feature 167           300
feature 170         11697
feature 171          2154
feature 172          4451
feature 179           793
feature 181           122
feature 182           166
feature 188            85
feature 191          1288
feature 193          5572
feature 195          6697
feature 196           636
feature 197           235
                    ...  
feature 44            334
feature 47            247
feature 51           8665
feature 52            362
feature 54          22217
feature 55           2406
feature 56           8829
f

In [88]:
log_feature_merge.sum().sum()

568246L

In [89]:
log_feature_merge.shape

(18552, 132)

In [90]:
data = data.merge(log_feature_merge , left_on="id" , right_index=True)

In [91]:
data.head()

Unnamed: 0,fault_severity,id,location,source,Remove_x,event_type 10,event_type 11,event_type 13,event_type 14,event_type 15,...,feature 82,feature 83,feature 85,feature 86,feature 87,feature 94,feature 95,feature others_0,feature others_1,feature others_2
0,1.0,14121,location 118,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.0,9320,location 91,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1.0,14394,location 152,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1.0,8218,location 931,train,0,0,1,0,0,1,...,12,0,0,0,0,0,0,0,0,0
4,0.0,14804,location 120,train,0,0,1,0,0,0,...,0,0,0,0,0,0,0,4,0,0


In [92]:
data.loc[data["source"] == "test"]

Unnamed: 0,fault_severity,id,location,source,Remove_x,event_type 10,event_type 11,event_type 13,event_type 14,event_type 15,...,feature 82,feature 83,feature 85,feature 86,feature 87,feature 94,feature 95,feature others_0,feature others_1,feature others_2
7381,,11066,location 481,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7382,,18000,location 962,test,0,0,1,0,0,1,...,20,0,0,0,0,0,0,0,0,0
7383,,16964,location 491,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7384,,4795,location 532,test,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
7385,,3392,location 600,test,0,0,0,0,0,1,...,6,0,0,0,0,0,0,0,0,0
7386,,3795,location 794,test,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
7387,,2881,location 375,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7388,,1903,location 638,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7389,,5245,location 690,test,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7390,,6726,location 893,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [94]:
data.dtypes

fault_severity      float64
id                    int64
location             object
source               object
Remove_x              int64
event_type 10         int64
event_type 11         int64
event_type 13         int64
event_type 14         int64
event_type 15         int64
event_type 18         int64
event_type 2          int64
event_type 20         int64
event_type 21         int64
event_type 22         int64
event_type 23         int64
event_type 24         int64
event_type 26         int64
event_type 27         int64
event_type 28         int64
event_type 29         int64
event_type 30         int64
event_type 32         int64
event_type 34         int64
event_type 35         int64
event_type 40         int64
event_type 42         int64
event_type 43         int64
event_type 44         int64
event_type 45         int64
                     ...   
feature 44            int64
feature 47            int64
feature 51            int64
feature 52            int64
feature 54          

## Resource Type

In [95]:
resource_type.head()

Unnamed: 0,id,resource_type
0,6597,resource_type 8
1,8011,resource_type 8
2,2597,resource_type 8
3,5022,resource_type 8
4,6852,resource_type 8


In [96]:
resource_type["resource_type"].value_counts()

resource_type 8     10268
resource_type 2      8918
resource_type 6       582
resource_type 7       498
resource_type 4       330
resource_type 9       190
resource_type 3       145
resource_type 10       73
resource_type 1        58
resource_type 5        14
Name: resource_type, dtype: int64

In [97]:
resource_type.shape

(21076, 2)

In [98]:
resource_type = resource_type.merge(data[["id" , "fault_severity" , "source"]] , on = "id")

In [99]:
resource_type.head()

Unnamed: 0,id,resource_type,fault_severity,source
0,6597,resource_type 8,,test
1,8011,resource_type 8,0.0,train
2,2597,resource_type 8,,test
3,5022,resource_type 8,,test
4,6852,resource_type 8,,test


In [100]:
resource_type_unq = pd.DataFrame(resource_type["resource_type"].value_counts())

In [101]:
resource_type_unq.head()

Unnamed: 0,resource_type
resource_type 8,10268
resource_type 2,8918
resource_type 6,582
resource_type 7,498
resource_type 4,330


In [102]:
resource_type_unq["PercTrain"] = resource_type.pivot_table(values = "source" , index = "resource_type" , aggfunc=lambda x: sum(x == "train")/float(len(x)))

In [103]:
resource_type_unq.head()

Unnamed: 0,resource_type,PercTrain
resource_type 8,10268,0.394527
resource_type 2,8918,0.401996
resource_type 6,582,0.424399
resource_type 7,498,0.451807
resource_type 4,330,0.436364


In [107]:
resource_type_unq["Mode_Severity"] = resource_type.loc[resource_type["source"] == "train"].pivot_table(values="fault_severity" , index="resource_type" , aggfunc=lambda x:mode(x).mode[0])
resource_type_unq.head()

Unnamed: 0,resource_type,PercTrain,Mode_Severity
resource_type 8,10268,0.394527,0.0
resource_type 2,8918,0.401996,0.0
resource_type 6,582,0.424399,1.0
resource_type 7,498,0.451807,0.0
resource_type 4,330,0.436364,0.0


In [108]:
resource_type.loc[resource_type["resource_type"] == "resource_type 5"]

Unnamed: 0,id,resource_type,fault_severity,source
5653,5475,resource_type 5,,test
5655,5915,resource_type 5,,test
5657,9989,resource_type 5,,test
6936,7378,resource_type 5,2.0,train
7064,9373,resource_type 5,2.0,train
7073,9677,resource_type 5,,test
7223,6322,resource_type 5,,test
10786,15655,resource_type 5,,test
10790,11683,resource_type 5,,test
10793,1616,resource_type 5,2.0,train


In [110]:
resource_type_merge = resource_type.pivot_table(values="source" , index = "id" , columns="resource_type" , aggfunc=lambda x:len(x), fill_value=0)

In [111]:
data = data.merge(resource_type_merge , left_on="id" , right_index=True)

In [112]:
data.head()

Unnamed: 0,fault_severity,id,location,source,Remove_x,event_type 10,event_type 11,event_type 13,event_type 14,event_type 15,...,resource_type 1,resource_type 10,resource_type 2,resource_type 3,resource_type 4,resource_type 5,resource_type 6,resource_type 7,resource_type 8,resource_type 9
0,1.0,14121,location 118,train,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,0.0,9320,location 91,train,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,1.0,14394,location 152,train,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,1.0,8218,location 931,train,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,1,0
4,0.0,14804,location 120,train,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,1,0


In [113]:
data.shape

(18552, 183)

## Severity type

In [114]:
severity_type.head()

Unnamed: 0,id,severity_type
0,6597,severity_type 2
1,8011,severity_type 2
2,2597,severity_type 2
3,5022,severity_type 1
4,6852,severity_type 1


In [115]:
severity_type.shape

(18552, 2)

In [118]:
severity_type = severity_type.merge(data[["id" , "fault_severity" , "source"]], on = "id")
severity_type.head()

Unnamed: 0,id,severity_type,fault_severity,source
0,6597,severity_type 2,,test
1,8011,severity_type 2,0.0,train
2,2597,severity_type 2,,test
3,5022,severity_type 1,,test
4,6852,severity_type 1,,test


In [119]:
severity_type_unq = pd.DataFrame(severity_type["severity_type"].value_counts())
severity_type_unq.head()

Unnamed: 0,severity_type
severity_type 2,8737
severity_type 1,8728
severity_type 4,1014
severity_type 5,65
severity_type 3,8


In [120]:
severity_type_unq["PercTrain"] = severity_type.pivot_table(values="source" , index = "severity_type" , aggfunc=lambda x:sum(x == "train")/float(len(x)))
severity_type_unq.head()

Unnamed: 0,severity_type,PercTrain
severity_type 2,8737,0.411011
severity_type 1,8728,0.386687
severity_type 4,1014,0.382643
severity_type 5,65,0.353846
severity_type 3,8,0.5


In [122]:
severity_type_unq["Mode_Severity"] = severity_type.loc[severity_type["source"] == "train"].pivot_table(values="fault_severity" , index="severity_type" , aggfunc=lambda x:mode(x).mode[0])
severity_type_unq.head()

Unnamed: 0,severity_type,PercTrain,Mode_Severity
severity_type 2,8737,0.411011,0.0
severity_type 1,8728,0.386687,0.0
severity_type 4,1014,0.382643,0.0
severity_type 5,65,0.353846,0.0
severity_type 3,8,0.5,0.0


In [123]:
severity_type.loc[severity_type["source"] == "train"].pivot_table(values="fault_severity" , index="severity_type" , aggfunc=lambda x:mode(x))

Unnamed: 0_level_0,fault_severity
severity_type,Unnamed: 1_level_1
severity_type 1,"([0.0], [1778.0])"
severity_type 2,"([0.0], [2652.0])"
severity_type 3,"([0.0], [4.0])"
severity_type 4,"([0.0], [338.0])"
severity_type 5,"([0.0], [12.0])"


In [125]:
severity_type_unq["Mode_Severity"].value_counts()

0.0    5
Name: Mode_Severity, dtype: int64

In [127]:
severity_type_unq.shape

(5, 3)

In [130]:
severity_type_merge = severity_type.pivot_table(values="source" , index="id" , columns="severity_type" , aggfunc=lambda x:len(x) , fill_value=0)


In [131]:
severity_type_merge.head()

severity_type,severity_type 1,severity_type 2,severity_type 3,severity_type 4,severity_type 5
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1,0,0,0,0
2,0,1,0,0,0
3,1,0,0,0,0
4,0,0,0,1,0
5,0,1,0,0,0


In [132]:
event_type_merge.head()

preprocess,Remove,event_type 10,event_type 11,event_type 13,event_type 14,event_type 15,event_type 18,event_type 2,event_type 20,event_type 21,...,event_type 47,event_type 5,event_type 50,event_type 54,event_type 6,event_type 7,event_type 8,event_type others_0,event_type others_1,event_type others_2
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [133]:
data = data.merge(severity_type_merge , left_on="id" , right_index=True)

In [134]:
data.head()

Unnamed: 0,fault_severity,id,location,source,Remove_x,event_type 10,event_type 11,event_type 13,event_type 14,event_type 15,...,resource_type 5,resource_type 6,resource_type 7,resource_type 8,resource_type 9,severity_type 1,severity_type 2,severity_type 3,severity_type 4,severity_type 5
0,1.0,14121,location 118,train,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,0.0,9320,location 91,train,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,1.0,14394,location 152,train,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,1.0,8218,location 931,train,0,0,1,0,0,1,...,0,0,0,1,0,1,0,0,0,0
4,0.0,14804,location 120,train,0,0,1,0,0,0,...,0,0,0,1,0,1,0,0,0,0


In [135]:
data.shape

(18552, 188)

In [136]:
pred_event = [x for x in data.columns if "event_type" in x]

In [137]:
pred_event

['event_type 10',
 'event_type 11',
 'event_type 13',
 'event_type 14',
 'event_type 15',
 'event_type 18',
 'event_type 2',
 'event_type 20',
 'event_type 21',
 'event_type 22',
 'event_type 23',
 'event_type 24',
 'event_type 26',
 'event_type 27',
 'event_type 28',
 'event_type 29',
 'event_type 30',
 'event_type 32',
 'event_type 34',
 'event_type 35',
 'event_type 40',
 'event_type 42',
 'event_type 43',
 'event_type 44',
 'event_type 45',
 'event_type 46',
 'event_type 47',
 'event_type 5',
 'event_type 50',
 'event_type 54',
 'event_type 6',
 'event_type 7',
 'event_type 8',
 'event_type others_0',
 'event_type others_1',
 'event_type others_2']

In [138]:
print event_type.shape

(31170, 6)


In [140]:
data[pred_event].sum().sum()

31164L

In [141]:
pred_feat = [x for x in data.columns if "feature" in x]

In [143]:
print log_feature["volume"].sum()

568246


In [145]:
data[pred_feat].sum().sum()

567969L

In [146]:
log_feature.shape

(58671, 6)

In [147]:
pred_res = [x for x in data.columns if "resource" in x]

In [148]:
resource_type.shape

(21076, 4)

In [150]:
data[pred_res].sum().sum()

21076L

In [151]:
pred_sev = [x for x in data.columns if "severity_type" in x]

In [152]:
severity_type.shape

(18552, 4)

In [154]:
data[pred_sev].sum().sum()

18552L

## Add count variables

In [156]:
location_counts = data["location"].value_counts()
data["location_counts"] = data["location"].apply(lambda x:location_counts[x])

In [157]:
len(location_counts)

1126

In [158]:
featvar = [x for x in data.columns if "feature " in x]

In [159]:
data["feature_count"] = data[featvar].apply(np.sum , axis = 1)
data["feature_count"].sum()

567969L

In [160]:
data["feature_count"].value_counts()

2       2290
4       1538
1       1512
3       1180
6        971
5        779
8        683
10       522
7        482
12       438
9        400
14       377
16       319
11       317
13       265
20       262
18       249
15       238
22       221
17       209
24       178
26       167
19       158
21       156
30       133
27       132
25       129
28       127
23       122
29       113
        ... 
898        1
674        1
658        1
530        1
450        1
434        1
184        1
195        1
269        1
279        1
1084       1
428        1
343        1
444        1
460        1
492        1
524        1
684        1
1036       1
1580       1
1465       1
1155       1
995        1
627        1
451        1
419        1
371        1
355        1
339        1
991        1
Name: feature_count, Length: 509, dtype: int64

In [161]:
data.head()

Unnamed: 0,fault_severity,id,location,source,Remove_x,event_type 10,event_type 11,event_type 13,event_type 14,event_type 15,...,resource_type 7,resource_type 8,resource_type 9,severity_type 1,severity_type 2,severity_type 3,severity_type 4,severity_type 5,location_counts,feature_count
0,1.0,14121,location 118,train,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,97,38
1,0.0,9320,location 91,train,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,98,316
2,1.0,14394,location 152,train,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,12,2
3,1.0,8218,location 931,train,0,0,1,0,0,1,...,0,1,0,1,0,0,0,0,69,22
4,0.0,14804,location 120,train,0,0,1,0,0,0,...,0,1,0,1,0,0,0,0,19,12


In [162]:
data[["location" , "location_counts"]]

Unnamed: 0,location,location_counts
0,location 118,97
1,location 91,98
2,location 152,12
3,location 931,69
4,location 120,19
5,location 664,11
6,location 640,11
7,location 122,134
8,location 263,5
9,location 613,110


In [163]:
help(np.sum)

Help on function sum in module numpy.core.fromnumeric:

sum(a, axis=None, dtype=None, out=None, keepdims=<class numpy._globals._NoValue>)
    Sum of array elements over a given axis.
    
    Parameters
    ----------
    a : array_like
        Elements to sum.
    axis : None or int or tuple of ints, optional
        Axis or axes along which a sum is performed.  The default,
        axis=None, will sum all of the elements of the input array.  If
        axis is negative it counts from the last to the first axis.
    
        .. versionadded:: 1.7.0
    
        If axis is a tuple of ints, a sum is performed on all of the axes
        specified in the tuple instead of a single axis or all the axes as
        before.
    dtype : dtype, optional
        The type of the returned array and of the accumulator in which the
        elements are summed.  The dtype of `a` is used by default unless `a`
        has an integer dtype of less precision than the default platform
        integer.  In 

In [164]:
data[featvar]

Unnamed: 0,feature 101,feature 103,feature 105,feature 109,feature 118,feature 132,feature 133,feature 134,feature 135,feature 153,...,feature 82,feature 83,feature 85,feature 86,feature 87,feature 94,feature 95,feature others_0,feature others_1,feature others_2
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,12,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,4,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,30,0,0,0,0,0,0,0,0,0


In [165]:
le = LabelEncoder()

In [166]:
data["location"] = le.fit_transform(data["location"])

In [167]:
data.head()

Unnamed: 0,fault_severity,id,location,source,Remove_x,event_type 10,event_type 11,event_type 13,event_type 14,event_type 15,...,resource_type 7,resource_type 8,resource_type 9,severity_type 1,severity_type 2,severity_type 3,severity_type 4,severity_type 5,location_counts,feature_count
0,1.0,14121,148,train,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,97,38
1,0.0,9320,1027,train,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,98,316
2,1.0,14394,186,train,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,12,2
3,1.0,8218,1051,train,0,0,1,0,0,1,...,0,1,0,1,0,0,0,0,69,22
4,0.0,14804,151,train,0,0,1,0,0,0,...,0,1,0,1,0,0,0,0,19,12


In [168]:
[x for x in data.columns if "Remove" in x]

['Remove_x', 'Remove_y']

In [169]:
data.drop(["Remove_x" , "Remove_y"] , axis = 1 , inplace = True)

In [170]:
train_mod = data.loc[data["source"] == "train"]
test_mod = data.loc[data["source"] == "test"]

In [173]:
train_mod.drop("source" , axis = 1 , inplace=True)
test_mod.drop(["source" , "fault_severity"] , axis = 1 , inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [178]:
print sum(train["id"] != train_mod["id"])
# print sum(test["id"] != test_mod["id"])

0


In [186]:
train_mod.to_csv("data/train_modified_1.csv" , index = False)
test_mod.to_csv("data/test_modified_1.csv" , index = False)

In [187]:
test.dtypes

id           int64
location    object
source      object
dtype: object

In [188]:
test_mod.dtypes

id                  int64
location            int64
event_type 10       int64
event_type 11       int64
event_type 13       int64
event_type 14       int64
event_type 15       int64
event_type 18       int64
event_type 2        int64
event_type 20       int64
event_type 21       int64
event_type 22       int64
event_type 23       int64
event_type 24       int64
event_type 26       int64
event_type 27       int64
event_type 28       int64
event_type 29       int64
event_type 30       int64
event_type 32       int64
event_type 34       int64
event_type 35       int64
event_type 40       int64
event_type 42       int64
event_type 43       int64
event_type 44       int64
event_type 45       int64
event_type 46       int64
event_type 47       int64
event_type 5        int64
                    ...  
feature 8           int64
feature 80          int64
feature 81          int64
feature 82          int64
feature 83          int64
feature 85          int64
feature 86          int64
feature 87  