In [31]:
import pandas as pd

import numpy as np
from IPython import embed
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.multiclass import OneVsOneClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import Perceptron
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import confusion_matrix
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier



In [32]:
def load_prepared_data():
	df_train = pd.read_csv('./data/output/processed_train.csv', header=0)
	df_test = pd.read_csv('./data/output/processed_test.csv', header=0)
	features = list(set(df_train.columns) - {'Vote'})
	target = 'Vote'

	df_train_X = df_train[features]
	df_train_Y = df_train[target]
	df_test_X = df_test[features]
	df_test_Y = df_test[target]
# 	labels = {"0":"Blues","1":"Browns","2":"Greens","3":"Greys","4":"Oranges","5":"Pinks","6":"Purples","7":"Reds","8":"Whites","9":"Yellows" }
 	labels = ["Blues","Browns","Greens","Greys","Oranges","Pinks","Purples","Reds","Whites","Yellows"]
	return df_train_X, df_train_Y, df_test_X, df_test_Y, labels


In [33]:
df_train_X, df_train_Y, df_test_X, df_test_Y, labels = load_prepared_data()

train_val_data = pd.concat([df_train_X])
features = train_val_data.values
target = pd.concat([df_train_Y]).values




In [34]:
clf = SVC(kernel='linear')
scores = cross_val_score(clf, features, target, cv=15)
print "linear %s score: %f, std: %f" % (clf.__class__.__name__, np.mean(scores), np.std(scores))

linear SVC score: 0.894135, std: 0.016005


In [35]:
clf = LinearSVC(multi_class='ovr')
scores = cross_val_score(clf, features, target, cv=15)
print "%s OVR Score: %f, std: %f" % (clf.__class__.__name__,np.mean(scores), np.std(scores))

LinearSVC OVR Score: 0.900571, std: 0.014538


In [36]:
clf = LinearSVC(multi_class='crammer_singer')
scores = cross_val_score(clf, features, target, cv=15)
print "%s crammer_singer Score: %f, std: %f" % (clf.__class__.__name__, np.mean(scores), np.std(scores))

LinearSVC crammer_singer Score: 0.906420, std: 0.013568


In [37]:
svc = LinearSVC()
clf = OneVsOneClassifier(svc)
scores = cross_val_score(clf, features, target, cv=15)
print "%s Score: %f, std: %f" % (clf.__class__.__name__, np.mean(scores), np.std(scores))

OneVsOneClassifier Score: 0.919214, std: 0.014935


In [38]:
clf = GaussianNB()
scores = cross_val_score(clf, features, target, cv=15)
print "%s Score: %f" % (clf.__class__.__name__, np.mean(scores))

GaussianNB Score: 0.866318


In [39]:
all_scores = []
for splitter in range(2,20):
    clf = DecisionTreeClassifier(min_samples_split=splitter, random_state=0)
    scores = cross_val_score(clf, features, target, cv=20)
    score = np.mean(scores)
    all_scores.append(score)
    print "minimus splitter = %d, score = %f" % (splitter, score)
print "Best Splitter size: %d" % (np.argmax(all_scores) + 2)
print "%s with best splitter: %f" % (clf.__class__.__name__, all_scores[np.argmax(all_scores)])

clf = DecisionTreeClassifier(random_state=0)
scores = cross_val_score(clf, features, target, cv=20)
score = np.mean(scores)
print "%s Default score: %f"% (clf.__class__.__name__, score)

minimus splitter = 2, score = 0.925955
minimus splitter = 3, score = 0.926707
minimus splitter = 4, score = 0.929093
minimus splitter = 5, score = 0.930633
minimus splitter = 6, score = 0.928047
minimus splitter = 7, score = 0.929036
minimus splitter = 8, score = 0.929990
minimus splitter = 9, score = 0.928648
minimus splitter = 10, score = 0.928674
minimus splitter = 11, score = 0.929825
minimus splitter = 12, score = 0.928855
minimus splitter = 13, score = 0.925516
minimus splitter = 14, score = 0.925342
minimus splitter = 15, score = 0.924767
minimus splitter = 16, score = 0.923404
minimus splitter = 17, score = 0.923196
minimus splitter = 18, score = 0.922429
minimus splitter = 19, score = 0.922043
Best Splitter size: 5
DecisionTreeClassifier with best splitter: 0.930633
DecisionTreeClassifier Default score: 0.925955


In [40]:
all_scores = []
for splitter in range(2,20):
    clf = RandomForestClassifier(min_samples_split=splitter, random_state=0)
    scores = cross_val_score(clf, features, target, cv=20)
    score = np.mean(scores)
    all_scores.append(score)
    print "minimum splitter = %d, score = %f" % (splitter, score)
print "Best Splitter size: %d" % (np.argmax(all_scores) + 2)
print "%s with best splitter: %f" % (clf.__class__.__name__, all_scores[np.argmax(all_scores)])

clf = RandomForestClassifier(random_state=0)
scores = cross_val_score(clf, features, target, cv=20)
score = np.mean(scores)
print "%s Default score: %f"% (clf.__class__.__name__, score)

minimum splitter = 2, score = 0.946670
minimum splitter = 3, score = 0.948827
minimum splitter = 4, score = 0.951943
minimum splitter = 5, score = 0.947105
minimum splitter = 6, score = 0.947287
minimum splitter = 7, score = 0.946150
minimum splitter = 8, score = 0.945923
minimum splitter = 9, score = 0.949397
minimum splitter = 10, score = 0.944574
minimum splitter = 11, score = 0.946851
minimum splitter = 12, score = 0.943054
minimum splitter = 13, score = 0.946655
minimum splitter = 14, score = 0.945529
minimum splitter = 15, score = 0.947304
minimum splitter = 16, score = 0.947055
minimum splitter = 17, score = 0.946091
minimum splitter = 18, score = 0.942185
minimum splitter = 19, score = 0.946858
Best Splitter size: 4
RandomForestClassifier with best splitter: 0.951943
RandomForestClassifier Default score: 0.946670


In [41]:
all_scores = []
for n in range(2,20):
    clf = KNeighborsClassifier(n_neighbors=n)
    scores = cross_val_score(clf, features, target, cv=10)
    score = np.mean(scores)
    all_scores.append(score)
    print "minimum n_neighbors = %d, score = %f" % (n, score)
print "Best n_neighbors size: %d" % (np.argmax(all_scores) + 2)
print "KNeighborsClassifier with best N param: %f" % (all_scores[np.argmax(all_scores)])


minimum n_neighbors = 2, score = 0.905779
minimum n_neighbors = 3, score = 0.916074
minimum n_neighbors = 4, score = 0.912584
minimum n_neighbors = 5, score = 0.919978
minimum n_neighbors = 6, score = 0.916100
minimum n_neighbors = 7, score = 0.915899
minimum n_neighbors = 8, score = 0.913968
minimum n_neighbors = 9, score = 0.914135
minimum n_neighbors = 10, score = 0.912396
minimum n_neighbors = 11, score = 0.909670
minimum n_neighbors = 12, score = 0.910648
minimum n_neighbors = 13, score = 0.907146
minimum n_neighbors = 14, score = 0.906557
minimum n_neighbors = 15, score = 0.905389
minimum n_neighbors = 16, score = 0.902088
minimum n_neighbors = 17, score = 0.901114
minimum n_neighbors = 18, score = 0.899555
minimum n_neighbors = 19, score = 0.898974
Best n_neighbors size: 5
KNeighborsClassifier with best N param: 0.919978


In [42]:
clf = Perceptron(max_iter=300)
scores = cross_val_score(clf, features, target, cv=10)
print "%s Score: %f, std: %f" % (clf.__class__.__name__, np.mean(scores), np.std(scores))

Perceptron Score: 0.880244, std: 0.016177


In [43]:
clf = LinearDiscriminantAnalysis()
scores = cross_val_score(clf, features, target, cv=10)
print "%s Score: %f, std: %f" % (clf.__class__.__name__, np.mean(scores), np.std(scores))

LinearDiscriminantAnalysis Score: 0.847154, std: 0.016647


In [44]:
clf = RandomForestClassifier(random_state=0)
scores = cross_val_score(clf, features, target, cv=10)
print "%s Score: %f, std: %f" % (clf.__class__.__name__, np.mean(scores), np.std(scores))

RandomForestClassifier Score: 0.945082, std: 0.010452


In [45]:
clf = RandomForestClassifier(max_features=None, random_state=0)
scores = cross_val_score(clf, features, target, cv=10)
print "%s Score: %f, std: %f" % (clf.__class__.__name__, np.mean(scores), np.std(scores))

RandomForestClassifier Score: 0.945291, std: 0.009389


In [46]:
clf = MLPClassifier(verbose=0, activation='relu', hidden_layer_sizes=(50, 25, 10), 
                  random_state=0, max_iter=500, solver='sgd', 
                  learning_rate='invscaling', momentum=.9,
                  nesterovs_momentum=True, learning_rate_init=0.2)
scores = cross_val_score(clf, features, target, cv=10)
print "MLPClassifier Score: %f, std: %f" % (np.mean(scores), np.std(scores))


MLPClassifier Score: 0.808505, std: 0.086171


In [47]:
clf = DecisionTreeClassifier(min_samples_split=5, random_state=0)
pred = cross_val_predict(clf, features, target, cv=30)
print "***** %s *****" % clf.__class__.__name__
print classification_report(target, pred, target_names=labels, digits=5)

***** DecisionTreeClassifier *****
             precision    recall  f1-score   support

      Blues    0.40000   0.37037   0.38462        27
     Browns    0.89744   0.91418   0.90573      1072
     Greens    0.98905   0.99013   0.98959       912
      Greys    0.93696   0.94509   0.94101       346
    Oranges    0.88599   0.88312   0.88455       308
      Pinks    0.85062   0.83503   0.84275       491
    Purples    0.96595   0.95878   0.96235      1213
       Reds    0.93949   0.94249   0.94099       313
     Whites    0.73367   0.72277   0.72818       202
    Yellows    0.94048   0.93676   0.93861       253

avg / total    0.92301   0.92330   0.92312      5137



In [48]:
clf = KNeighborsClassifier(n_neighbors=5)
pred = cross_val_predict(clf, features, target, cv=30)
print "***** %s *****" % clf.__class__.__name__
print classification_report(target, pred, target_names=labels, digits=5)

***** KNeighborsClassifier *****
             precision    recall  f1-score   support

      Blues    0.50000   0.03704   0.06897        27
     Browns    0.86484   0.97295   0.91572      1072
     Greens    0.99232   0.99123   0.99177       912
      Greys    0.86479   0.88728   0.87589       346
    Oranges    0.89247   0.80844   0.84838       308
      Pinks    0.94203   0.79430   0.86188       491
    Purples    0.97930   0.97527   0.97728      1213
       Reds    0.84384   0.89776   0.86997       313
     Whites    0.82781   0.61881   0.70822       202
    Yellows    0.90647   0.99605   0.94915       253

avg / total    0.92093   0.92174   0.91814      5137



In [49]:
clf = RandomForestClassifier(min_samples_split=4, random_state=0)
pred = cross_val_predict(clf, features, target, cv=30)
print "***** %s *****" % clf.__class__.__name__
print classification_report(target, pred, target_names=labels, digits=5)

***** RandomForestClassifier *****
             precision    recall  f1-score   support

      Blues    0.92308   0.44444   0.60000        27
     Browns    0.89465   0.96642   0.92915      1072
     Greens    0.99561   0.99561   0.99561       912
      Greys    0.98802   0.95376   0.97059       346
    Oranges    0.93016   0.95130   0.94061       308
      Pinks    0.90562   0.82077   0.86111       491
    Purples    0.97042   0.97362   0.97202      1213
       Reds    0.95912   0.97444   0.96672       313
     Whites    0.93038   0.72772   0.81667       202
    Yellows    0.94382   0.99605   0.96923       253

avg / total    0.94784   0.94744   0.94616      5137



In [50]:
clf = RandomForestClassifier(min_samples_split=4, max_features=None, random_state=0)
pred = cross_val_predict(clf, features, target, cv=30)
print "***** %s *****" % clf.__class__.__name__
print classification_report(target, pred, target_names=labels, digits=5)

***** RandomForestClassifier *****
             precision    recall  f1-score   support

      Blues    0.54545   0.44444   0.48980        27
     Browns    0.90359   0.96175   0.93177      1072
     Greens    0.99780   0.99561   0.99671       912
      Greys    0.98489   0.94220   0.96307       346
    Oranges    0.90826   0.96429   0.93543       308
      Pinks    0.89805   0.84318   0.86975       491
    Purples    0.97107   0.96867   0.96987      1213
       Reds    0.97419   0.96486   0.96950       313
     Whites    0.86310   0.71782   0.78378       202
    Yellows    0.94553   0.96047   0.95294       253

avg / total    0.94437   0.94471   0.94389      5137



In [51]:
print "Training DecisionTreeClassifier"
k_fold = RepeatedStratifiedKFold(n_splits=10)
clf_tree = DecisionTreeClassifier(min_samples_split=5)
a = []
for train_indices, test_indices in k_fold.split(features, target):
    clf_tree.fit(features[train_indices], target[train_indices])
    a.append(clf_tree.score(features[test_indices],target[test_indices]))
    
print "training score, mean: %f"% (np.array(a).mean())    

Training DecisionTreeClassifier
training score, mean: 0.926039


In [52]:
print "Training KNeighborsClassifier"
k_fold = RepeatedStratifiedKFold(n_splits=10)
clf_knn = KNeighborsClassifier(n_neighbors=5)
a = []
for train_indices, test_indices in k_fold.split(features, target):
    clf_knn.fit(features[train_indices], target[train_indices])
    a.append(clf_knn.score(features[test_indices],target[test_indices]))
    
print "training score, mean: %f"% (np.array(a).mean())    

Training KNeighborsClassifier
training score, mean: 0.920734


In [53]:
print "Training RandomForestClassifier"
k_fold = RepeatedStratifiedKFold(n_splits=5, random_state=0)
clf_random_forest = RandomForestClassifier(min_samples_split=4, max_features=None, random_state=0)
a = []
for train_indices, test_indices in k_fold.split(features, target):
    clf_random_forest.fit(features[train_indices], target[train_indices])
    a.append(clf_random_forest.score(features[test_indices],target[test_indices]))
    
print "training score, mean: %f"% (np.array(a).mean())    

Training RandomForestClassifier
training score, mean: 0.942263


In [54]:
clf = clf_random_forest

In [55]:

features_test = df_test_X
target_test = df_test_Y
pred = clf.predict(features_test)

distribution = np.bincount(pred.astype('int64'))
most_common = np.argmax(distribution)

print "winner is party ## %s ##" % labels[most_common.astype('int')]


winner is party ## Purples ##


In [56]:
print "Vote distribution"
distribution = np.bincount(pred.astype('int64'))

for index,party in enumerate(distribution):
    print "%s, %f, %f"%(labels[index], distribution[index], distribution[index]/ float(target_test.size) * 100) + '%'


Vote distribution
Blues, 10.000000, 0.587199%
Browns, 400.000000, 23.487962%
Greens, 309.000000, 18.144451%
Greys, 94.000000, 5.519671%
Oranges, 104.000000, 6.106870%
Pinks, 167.000000, 9.806224%
Purples, 404.000000, 23.722842%
Reds, 104.000000, 6.106870%
Whites, 45.000000, 2.642396%
Yellows, 66.000000, 3.875514%


In [57]:
target_test_labled = target_test.map(lambda x: labels[int(x)])
pred_test_labled = pd.DataFrame(pred).applymap(lambda x: labels[int(x)])

print(classification_report(target_test_labled, pred_test_labled, target_names=labels))


             precision    recall  f1-score   support

      Blues       0.60      0.55      0.57        11
     Browns       0.91      0.97      0.94       377
     Greens       0.99      0.98      0.99       312
      Greys       0.98      0.93      0.95        99
    Oranges       0.87      0.93      0.90        97
      Pinks       0.92      0.92      0.92       167
    Purples       0.97      0.98      0.98       399
       Reds       0.95      0.93      0.94       106
     Whites       0.82      0.53      0.64        70
    Yellows       0.92      0.94      0.93        65

avg / total       0.94      0.94      0.94      1703



In [58]:

confusion_matrix(target_test_labled, pred_test_labled, labels=labels)

array([[  6,   0,   0,   0,   0,   0,   0,   0,   0,   5],
       [  0, 364,   2,   0,   0,   4,   1,   0,   6,   0],
       [  0,   0, 307,   0,   0,   3,   2,   0,   0,   0],
       [  0,   0,   0,  92,   7,   0,   0,   0,   0,   0],
       [  0,   0,   0,   2,  90,   0,   0,   5,   0,   0],
       [  0,  10,   0,   0,   0, 153,   2,   0,   2,   0],
       [  0,   2,   0,   0,   0,   4, 393,   0,   0,   0],
       [  0,   0,   0,   0,   7,   0,   0,  99,   0,   0],
       [  0,  24,   0,   0,   0,   3,   6,   0,  37,   0],
       [  4,   0,   0,   0,   0,   0,   0,   0,   0,  61]])

In [59]:
pred1 = pred_test_labled.values   
target1 = pd.DataFrame(target_test_labled).values

miss_vals = []
real_vals = []
toples = []

miss_count = 0
for i, j in enumerate(pred1):
    if pred1[i] != target1[i]:
        miss_vals.append(pred1[i][0])
        real_vals.append(target1[i][0])
        toples.append((pred1[i][0],target1[i][0]))
        miss_count = miss_count + 1


print "Total Wrong predictions %d out of %d, hit rate: %f"% (miss_count, target1.size, 100 - miss_count/float(target1.size) * 100) + '%'

Total Wrong predictions 101 out of 1703, hit rate: 94.069289%


In [60]:
pred_test_labled.to_csv("test_predictions.csv",header=['Vote'] ,index=False)