In [2]:
########################
#
# Kaggle Competition : Telstra Network Disruptions (Wed 25 Nov 2015 – Mon 29 Feb 2016)
#
# Clustering KModes as Preprocessing -> NN on Individual Clusters
# Written by -> Ansh Gandhi
#
########################

import time
# time.clock() was deprecated in Python 3.3 and removed in 3.8. Use it only
# where it still exists, so that any other time.clock() reading in this file
# is taken from the same clock; otherwise fall back to time.perf_counter().
start_time = getattr(time, "clock", time.perf_counter)()

from kmodes import kmodes
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn import preprocessing
import pandas as pd
import numpy as np

# Number of KModes clusters; one classifier is trained per cluster.
clusterSize = 3

# Empty frame with the prediction schema; per-cluster results are added to it.
answer = pd.DataFrame(
    columns=["id", "predict_0", "predict_1", "predict_2", "class_lab"]
)

# Encoder used to turn the categorical columns into integer codes.
lb = LabelEncoder()

# 1) Run MultiLayerPerceptronClassifier on each Cluster.
def NNonCluster(clusterNumber):
    """Train an MLP on one KModes cluster and predict that cluster's test rows.

    Reads the module-level ``train_data``, ``train_data_target``,
    ``test_data``, ``test_data_id``, ``km_train`` and ``km_test``.

    Args:
        clusterNumber: Cluster label used to select the train and test rows.

    Returns:
        A DataFrame with columns ``id``, ``predict_0``, ``predict_1``,
        ``predict_2`` and ``class_lab``, one row per test row assigned to
        this cluster (empty when the cluster holds no test rows).
    """
    prob_columns = ["predict_0", "predict_1", "predict_2"]
    start_time_local = time.perf_counter()

    in_train = km_train.labels_ == clusterNumber
    in_test = km_test == clusterNumber
    data = train_data[in_train]
    data_target = train_data_target[in_train]
    test = test_data[in_test]
    test_id = test_data_id[in_test]
    print("ClusterNumber -> " + str(clusterNumber))

    # predict_proba() rejects an empty input, so skip clusters that received
    # no test rows instead of crashing the whole run.
    if len(test) == 0:
        return pd.DataFrame(columns=["id"] + prob_columns + ["class_lab"])

    mlpc = MLPClassifier(
        solver='adam',
        activation='logistic',
        hidden_layer_sizes=(6, 3,),
        alpha=1e-5,
        warm_start=True,
        random_state=1,
        max_iter=1000,
    )
    mlpc.fit(data, data_target)
    predic_prob = mlpc.predict_proba(test)
    print("\tIterations : "+str(mlpc.n_iter_))

    # predict_proba() yields one column per class present in this cluster's
    # training rows (mlpc.classes_), which can be fewer than three. Name the
    # columns from classes_ and fill the missing classes with probability 0.
    # NOTE(review): assumes the target labels are the integers 0, 1 and 2,
    # matching the predict_N column names - confirm against train.csv.
    seen_columns = ["predict_" + str(int(c)) for c in mlpc.classes_]
    predic_prob = pd.DataFrame(data=predic_prob, columns=seen_columns)
    predic_prob = predic_prob.reindex(columns=prob_columns, fill_value=0.0)

    # Rebuild the ids on a fresh 0..n-1 index so they line up row-by-row
    # with the probability frame in the column-wise concat below.
    test_id = pd.Series(data=np.array(test_id), index=None).to_frame(name="id")
    local_answer = pd.concat([test_id, predic_prob], axis=1)
    print("\tapplying...addClassLabel")
    local_answer = local_answer.apply(addClassLabel, axis=1)
    print(str("\t") + str(time.perf_counter() - start_time_local) + " seconds...")
    return local_answer


# 1) Adds the class label to a predicted dataframe row.
# 2) Whichever class has the highest probability is appended to the row.
def addClassLabel(x):
    """Return the row ``x`` extended with the index of its most probable class.

    Args:
        x: Row providing ``id``, ``predict_0``, ``predict_1`` and ``predict_2``.

    Returns:
        A Series indexed by ``id``, ``predict_0``, ``predict_1``,
        ``predict_2`` and ``class_lab``; on ties the lowest class index wins.
    """
    prob_columns = ["predict_0", "predict_1", "predict_2"]
    probabilities = [x[column] for column in prob_columns]
    # list.index() returns the first position holding the maximum value.
    winner = probabilities.index(np.max(probabilities))
    values = [x["id"]] + probabilities + [int(winner)]
    return pd.Series(values, index=["id"] + prob_columns + ["class_lab"])


# 1) Returns the mean probability of the Dataframe passed for each class.
def meanProbability(x):
    """Return the per-class mean of the predicted probabilities in ``x``.

    All three classes are averaged with the mean. Mixing a median in for one
    class would make the aggregated probabilities stop summing to 1 even when
    every input row does.

    Args:
        x: DataFrame with ``predict_0``, ``predict_1`` and ``predict_2``
            columns.

    Returns:
        A three-element list: the mean of ``predict_0``, ``predict_1`` and
        ``predict_2``, in that order.
    """
    return [np.mean(x[column]) for column in ("predict_0", "predict_1", "predict_2")]


# 1) The predicted DataFrame has redundant ids due to the merge operation of various datafiles.
# 2) For every unique id, this function takes the maximum vote of classes and creates a
#    single row, so the resulting DataFrame has unique ids.
def getMaximumVote(x):
    """Collapse all prediction rows of one id into a single majority-vote row.

    The winning class is the most frequent ``class_lab`` in ``x``. When
    several classes share the highest count, the winner is the tied class
    whose own rows give it the highest aggregated probability (as computed
    by ``meanProbability``); the first such class wins on equal confidence.

    Args:
        x: DataFrame holding the rows of a single id, with columns ``id``,
            ``predict_0``, ``predict_1``, ``predict_2`` and ``class_lab``.

    Returns:
        A Series indexed by ``id``, ``predict_0``, ``predict_1``,
        ``predict_2`` and ``class_lab``. The probabilities are aggregated
        over all rows when every row agrees on the class, otherwise over the
        rows that voted for the winning class.
    """
    columns = ["id", "predict_0", "predict_1", "predict_2", "class_lab"]
    group_id = x["id"].iloc[0]
    class_counts = x.class_lab.value_counts()

    # Unanimous vote: aggregate over every row of the group.
    if class_counts.size == 1:
        row = [group_id] + meanProbability(x) + [x["class_lab"].iloc[0]]
        return pd.Series(row, index=columns)

    # value_counts() sorts by descending count, so the classes tied for the
    # top count are exactly those whose count equals the first entry.
    tied_classes = class_counts[class_counts == class_counts.iloc[0]].index

    best_class = None
    best_rows = None
    best_confidence = None
    for candidate in tied_classes:
        candidate_rows = x[x.class_lab == candidate]
        # Score each candidate by the probability of ITS OWN class. Indexing
        # with the first class for every candidate (as before) made the first
        # class win practically every tie.
        confidence = meanProbability(candidate_rows)[int(candidate)]
        # Strict '>' keeps the earliest candidate on equal confidence.
        if best_confidence is None or confidence > best_confidence:
            best_class = candidate
            best_rows = candidate_rows
            best_confidence = confidence

    row = [group_id] + meanProbability(best_rows) + [best_class]
    return pd.Series(row, index=columns)



# Main pipeline: read -> merge -> cluster (KModes) -> encode -> per-cluster MLP
# -> majority vote per id -> write the submission file.
print("Reading files...")
# Inputs and the submission file live in the same directory.
DATA_DIR = '~/Documents/Datasets/Kaggle/Telstra/'
train_data = pd.read_csv(DATA_DIR + 'train.csv')
event_type = pd.read_csv(DATA_DIR + 'event_type.csv')
log_feature = pd.read_csv(DATA_DIR + 'log_feature.csv')
resource_type = pd.read_csv(DATA_DIR + 'resource_type.csv')
severity_type = pd.read_csv(DATA_DIR + 'severity_type.csv')
test_data = pd.read_csv(DATA_DIR + 'test.csv')

# Join Operation on datafiles (Inner Join)
print("Merging data...")
data_list = [event_type, log_feature, resource_type, severity_type]
for side_table in data_list:
    train_data = pd.merge(train_data, side_table, on='id')
    test_data = pd.merge(test_data, side_table, on='id')

# Keep the test ids for the submission; 'id' itself is not used as a feature.
train_data.drop(['id'], inplace=True, axis=1)
test_data_id = test_data.id
test_data.drop(['id'], inplace=True, axis=1)

train_data_target = train_data["fault_severity"]
train_data.drop(['fault_severity'], inplace=True, axis=1)

print("KMode...")
km_train = kmodes.KModes(n_clusters=clusterSize, init='Cao', n_init=10, verbose=0).fit(train_data)
km_test = km_train.predict(test_data)

#Encoding Categorical to Numerical
colnames = ["location","event_type","log_feature","resource_type","severity_type"]
for col in colnames:
    train_values = train_data[col].astype('str')
    test_values = test_data[col].astype('str')
    # Fit once on the union of both sets. Fitting train and test separately
    # gives the same category different integer codes in each set, so the
    # model would be evaluated on features that mean something else.
    lb.fit(pd.concat([train_values, test_values]))
    train_data[col] = lb.transform(train_values)
    test_data[col] = lb.transform(test_values)

# DataFrame.append() was removed in pandas 2.0; concatenate all per-cluster
# results in one step instead.
answer = pd.concat([NNonCluster(cluster) for cluster in range(clusterSize)],
                   ignore_index=True)

answer["class_lab"] = answer["class_lab"].astype("int")
print("applying...getMaximumVote")
answer = answer.groupby("id").apply(getMaximumVote)

answer.drop(['class_lab'], inplace=True, axis=1)
answer["id"] = answer["id"].astype("int")

print("Writing to file...")
# Pass the path to pandas (which expands '~') so the submission lands next to
# the inputs regardless of the current working directory.
answer.to_csv(DATA_DIR + 'submit.csv', header=True, index=False)
print("Success...")
# time.clock() no longer exists on Python 3.8+; read the legacy clock only
# where it is available, otherwise time.perf_counter().
elapsed = getattr(time, "clock", time.perf_counter)() - start_time
print(str(elapsed) + " seconds")

Reading files...
Merging data...
KMode...
ClusterNumber -> 0
	Iterations : 38
	applying...addClassLabel
	20.705961000000002 seconds...
ClusterNumber -> 1
	Iterations : 100
	applying...addClassLabel
	11.652249999999995 seconds...
ClusterNumber -> 2
	Iterations : 54
	applying...addClassLabel
	7.139673999999999 seconds...
applying...getMaximumVote
Writing to file...
Success...
64.50648899999999 seconds
