In [1]:
import pandas as pd

import numpy as np
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.multiclass import OneVsOneClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import Perceptron
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import confusion_matrix
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier

In [2]:
def load_prepared_data():
	df_train = pd.read_csv('./data/output/processed_train.csv', header=0)
	df_test = pd.read_csv('./data/output/processed_test.csv', header=0)
	features = list(set(df_train.columns) - {'Vote'})
	target = 'Vote'

	df_train_X = df_train[features]
	df_train_Y = df_train[target]
	df_test_X = df_test[features]
	df_test_Y = df_test[target]
# 	labels = {"0":"Blues","1":"Browns","2":"Greens","3":"Greys","4":"Oranges","5":"Pinks","6":"Purples","7":"Reds","8":"Whites","9":"Yellows" }
 	labels = ["Blues","Browns","Greens","Greys","Oranges","Pinks","Purples","Reds","Whites","Yellows"]
	return df_train_X, df_train_Y, df_test_X, df_test_Y, labels

In [3]:
df_train_X, df_train_Y, df_test_X, df_test_Y, labels = load_prepared_data()

train_val_data = pd.concat([df_train_X])
features = train_val_data.values
target = pd.concat([df_train_Y]).values

In [4]:
clf = SVC(kernel='linear')
scores = cross_val_score(clf, features, target, cv=15)
print "linear %s score: %f, std: %f" % (clf.__class__.__name__, np.mean(scores), np.std(scores))

linear SVC score: 0.879021, std: 0.013906


In [5]:
clf = LinearSVC(multi_class='ovr')
scores = cross_val_score(clf, features, target, cv=15)
print "%s OVR Score: %f, std: %f" % (clf.__class__.__name__,np.mean(scores), np.std(scores))

LinearSVC OVR Score: 0.868753, std: 0.012212


In [6]:
clf = LinearSVC(multi_class='crammer_singer')
scores = cross_val_score(clf, features, target, cv=15)
print "%s crammer_singer Score: %f, std: %f" % (clf.__class__.__name__, np.mean(scores), np.std(scores))

LinearSVC crammer_singer Score: 0.870310, std: 0.012343


In [7]:
clf = OneVsOneClassifier(LinearSVC())
scores = cross_val_score(clf, features, target, cv=15)
print "%s Score: %f, std: %f" % (clf.__class__.__name__, np.mean(scores), np.std(scores))

OneVsOneClassifier Score: 0.905737, std: 0.014711


In [8]:
clf = GaussianNB()
scores = cross_val_score(clf, features, target, cv=15)
print "%s Score: %f" % (clf.__class__.__name__, np.mean(scores))

GaussianNB Score: 0.863502


In [9]:
all_scores = []
for splitter in range(2,20):
    clf = DecisionTreeClassifier(min_samples_split=splitter, random_state=0)
    scores = cross_val_score(clf, features, target, cv=15)
    score = np.mean(scores)
    all_scores.append(score)
    print "minimum splitter = %d, score = %f" % (splitter, score)
print "Best Splitter size: %d" % (np.argmax(all_scores) + 2)
print "%s with best splitter: %f" % (clf.__class__.__name__, all_scores[np.argmax(all_scores)])

clf = DecisionTreeClassifier(random_state=0)
scores = cross_val_score(clf, features, target, cv=15)
score = np.mean(scores)
print "%s Default score: %f"% (clf.__class__.__name__, score)

minimum splitter = 2, score = 0.931029
minimum splitter = 3, score = 0.931037
minimum splitter = 4, score = 0.929864
minimum splitter = 5, score = 0.931403
minimum splitter = 6, score = 0.932374
minimum splitter = 7, score = 0.933361
minimum splitter = 8, score = 0.933549
minimum splitter = 9, score = 0.933537
minimum splitter = 10, score = 0.932958
minimum splitter = 11, score = 0.932960
minimum splitter = 12, score = 0.933149
minimum splitter = 13, score = 0.933525
minimum splitter = 14, score = 0.933131
minimum splitter = 15, score = 0.933141
minimum splitter = 16, score = 0.931596
minimum splitter = 17, score = 0.931994
minimum splitter = 18, score = 0.933155
minimum splitter = 19, score = 0.931619
Best Splitter size: 8
DecisionTreeClassifier with best splitter: 0.933549
DecisionTreeClassifier Default score: 0.931029


In [10]:
all_scores = []
for splitter in range(2,20):
    clf = RandomForestClassifier(min_samples_split=splitter, random_state=0)
    scores = cross_val_score(clf, features, target, cv=15)
    score = np.mean(scores)
    all_scores.append(score)
    print "minimum splitter = %d, score = %f" % (splitter, score)
print "Best Splitter size: %d" % (np.argmax(all_scores) + 2)
print "%s with best splitter: %f" % (clf.__class__.__name__, all_scores[np.argmax(all_scores)])

clf = RandomForestClassifier(random_state=0)
scores = cross_val_score(clf, features, target, cv=15)
score = np.mean(scores)
print "%s Default score: %f"% (clf.__class__.__name__, score)

minimum splitter = 2, score = 0.949628
minimum splitter = 3, score = 0.950242
minimum splitter = 4, score = 0.951995
minimum splitter = 5, score = 0.949627
minimum splitter = 6, score = 0.949261
minimum splitter = 7, score = 0.949468
minimum splitter = 8, score = 0.949853
minimum splitter = 9, score = 0.947952
minimum splitter = 10, score = 0.948108
minimum splitter = 11, score = 0.950255
minimum splitter = 12, score = 0.946556
minimum splitter = 13, score = 0.946952
minimum splitter = 14, score = 0.944035
minimum splitter = 15, score = 0.946767
minimum splitter = 16, score = 0.946762
minimum splitter = 17, score = 0.948711
minimum splitter = 18, score = 0.947346
minimum splitter = 19, score = 0.947522
Best Splitter size: 4
RandomForestClassifier with best splitter: 0.951995
RandomForestClassifier Default score: 0.949628


In [11]:
all_scores = []
for n in range(2,20):
    clf = KNeighborsClassifier(n_neighbors=n)
    scores = cross_val_score(clf, features, target, cv=15)
    score = np.mean(scores)
    all_scores.append(score)
    print "minimum k_neighbors = %d, score = %f" % (n, score)
print "Best n_neighbors size: %d" % (np.argmax(all_scores) + 2)
print "KNeighborsClassifier with best N param: %f" % (all_scores[np.argmax(all_scores)])

minimum k_neighbors = 2, score = 0.902213
minimum k_neighbors = 3, score = 0.918114
minimum k_neighbors = 4, score = 0.907826
minimum k_neighbors = 5, score = 0.916752
minimum k_neighbors = 6, score = 0.913081
minimum k_neighbors = 7, score = 0.912299
minimum k_neighbors = 8, score = 0.909593
minimum k_neighbors = 9, score = 0.910556
minimum k_neighbors = 10, score = 0.908232
minimum k_neighbors = 11, score = 0.906881
minimum k_neighbors = 12, score = 0.905926
minimum k_neighbors = 13, score = 0.903800
minimum k_neighbors = 14, score = 0.901478
minimum k_neighbors = 15, score = 0.900868
minimum k_neighbors = 16, score = 0.896038
minimum k_neighbors = 17, score = 0.896219
minimum k_neighbors = 18, score = 0.893895
minimum k_neighbors = 19, score = 0.890801
Best n_neighbors size: 3
KNeighborsClassifier with best N param: 0.918114


In [12]:
clf = Perceptron(max_iter=300)
scores = cross_val_score(clf, features, target, cv=10)
print "%s Score: %f, std: %f" % (clf.__class__.__name__, np.mean(scores), np.std(scores))

Perceptron Score: 0.812833, std: 0.049600


In [13]:
clf = LinearDiscriminantAnalysis()
scores = cross_val_score(clf, features, target, cv=10)
print "%s Score: %f, std: %f" % (clf.__class__.__name__, np.mean(scores), np.std(scores))

LinearDiscriminantAnalysis Score: 0.783866, std: 0.015864




In [14]:
clf = RandomForestClassifier(random_state=0)
scores = cross_val_score(clf, features, target, cv=10)
print "%s Score: %f, std: %f" % (clf.__class__.__name__, np.mean(scores), np.std(scores))

RandomForestClassifier Score: 0.947530, std: 0.008705


In [15]:
clf = MLPClassifier(verbose=0, activation='relu', hidden_layer_sizes=(50, 25, 10), 
                  random_state=0, max_iter=500, solver='sgd', 
                  learning_rate='invscaling', momentum=.9,
                  nesterovs_momentum=True, learning_rate_init=0.2)
scores = cross_val_score(clf, features, target, cv=10)
print "MLPClassifier Score: %f, std: %f" % (np.mean(scores), np.std(scores))



MLPClassifier Score: 0.695938, std: 0.102876


In [16]:
clf = DecisionTreeClassifier(min_samples_split=8, random_state=0)
pred = cross_val_predict(clf, features, target, cv=30)
print "***** %s *****" % clf.__class__.__name__
print classification_report(target, pred, target_names=labels, digits=5)



***** DecisionTreeClassifier *****
             precision    recall  f1-score   support

      Blues    0.46875   0.53571   0.50000        28
     Browns    0.90331   0.93340   0.91811      1081
     Greens    0.99473   0.99160   0.99316       952
      Greys    0.96970   0.96096   0.96531       333
    Oranges    0.89969   0.90252   0.90110       318
      Pinks    0.85972   0.85458   0.85714       502
    Purples    0.95703   0.94904   0.95302      1197
       Reds    0.93103   0.93987   0.93543       316
     Whites    0.78107   0.69841   0.73743       189
    Yellows    0.95473   0.93548   0.94501       248

avg / total    0.92977   0.92971   0.92958      5164



In [17]:
clf = KNeighborsClassifier(n_neighbors=3)
pred = cross_val_predict(clf, features, target, cv=30)
print "***** %s *****" % clf.__class__.__name__
print classification_report(target, pred, target_names=labels, digits=5)

***** KNeighborsClassifier *****
             precision    recall  f1-score   support

      Blues    0.26087   0.21429   0.23529        28
     Browns    0.88003   0.97040   0.92301      1081
     Greens    0.99260   0.98634   0.98946       952
      Greys    0.85373   0.85886   0.85629       333
    Oranges    0.85507   0.74214   0.79461       318
      Pinks    0.95561   0.81474   0.87957       502
    Purples    0.96787   0.98162   0.97470      1197
       Reds    0.82303   0.92722   0.87202       316
     Whites    0.90780   0.67725   0.77576       189
    Yellows    0.91304   0.93145   0.92216       248

avg / total    0.92102   0.92022   0.91853      5164



In [18]:
clf = RandomForestClassifier(min_samples_split=4, random_state=0)
pred = cross_val_predict(clf, features, target, cv=30)
print "***** %s *****" % clf.__class__.__name__
print classification_report(target, pred, target_names=labels, digits=5)

***** RandomForestClassifier *****
             precision    recall  f1-score   support

      Blues    0.86667   0.46429   0.60465        28
     Browns    0.90378   0.97317   0.93719      1081
     Greens    0.99579   0.99475   0.99527       952
      Greys    0.96429   0.97297   0.96861       333
    Oranges    0.94481   0.91509   0.92971       318
      Pinks    0.94889   0.85060   0.89706       502
    Purples    0.97828   0.97828   0.97828      1197
       Reds    0.94753   0.97152   0.95938       316
     Whites    0.89809   0.74603   0.81503       189
    Yellows    0.93893   0.99194   0.96471       248

avg / total    0.95278   0.95256   0.95149      5164



In [19]:
print "Estimating DecisionTreeClassifier"
k_fold = RepeatedStratifiedKFold(n_splits=10)
clf_tree = DecisionTreeClassifier(min_samples_split=8)
a = []
for train_indices, test_indices in k_fold.split(features, target):
    clf_tree.fit(features[train_indices], target[train_indices])
    a.append(clf_tree.score(features[test_indices],target[test_indices]))
    
print "training score, mean: %f"% (np.array(a).mean())

Estimating DecisionTreeClassifier
training score, mean: 0.927192


In [20]:
print "Estimating KNeighborsClassifier"
k_fold = RepeatedStratifiedKFold(n_splits=10)
clf_knn = KNeighborsClassifier(n_neighbors=5)
a = []
for train_indices, test_indices in k_fold.split(features, target):
    clf_knn.fit(features[train_indices], target[train_indices])
    a.append(clf_knn.score(features[test_indices],target[test_indices]))
    
print "training score, mean: %f"% (np.array(a).mean())

Estimating KNeighborsClassifier
training score, mean: 0.916012


In [21]:
print "Estimating RandomForestClassifier"
k_fold = RepeatedStratifiedKFold(n_splits=5, random_state=0)
clf_random_forest = RandomForestClassifier(min_samples_split=4, max_features=None, random_state=0)
a = []
for train_indices, test_indices in k_fold.split(features, target):
    clf_random_forest.fit(features[train_indices], target[train_indices])
    a.append(clf_random_forest.score(features[test_indices],target[test_indices]))
    
print "training score, mean: %f"% (np.array(a).mean())

Estimating RandomForestClassifier
training score, mean: 0.945433


In [22]:
features_test = df_test_X
target_test = df_test_Y

In [31]:
features_test.describe()

Unnamed: 0,Is_Most_Important_Issue_Other,Is_Most_Important_Issue_Financial,Yearly_IncomeK,Number_of_valued_Kneset_members,Is_Most_Important_Issue_Environment,Is_Most_Important_Issue_Military,Avg_Satisfaction_with_previous_vote,Will_vote_only_large_party,Avg_monthly_expense_when_under_age_21,Garden_sqr_meter_per_person_in_residancy_area,Overall_happiness_score,Is_Most_Important_Issue_Education,Is_Most_Important_Issue_Social,Is_Most_Important_Issue_Foreign_Affairs
count,1706.0,1706.0,1706.0,1706.0,1706.0,1706.0,1706.0,1706.0,1706.0,1706.0,1706.0,1706.0,1706.0,1706.0
mean,0.063892,0.189332,0.538825,0.434616,0.158851,0.062134,0.398813,-0.72626,0.398819,0.466526,0.620756,0.162954,0.146542,0.058617
std,0.244632,0.391887,0.094215,0.246032,0.365644,0.241469,0.088379,0.536724,0.088383,0.127887,0.110341,0.369432,0.353752,0.234975
min,0.0,0.0,0.303086,0.0,0.0,0.0,0.105105,-1.0,0.105105,0.107365,0.325936,0.0,0.0,0.0
25%,0.0,0.0,0.469703,0.272727,0.0,0.0,0.330104,-1.0,0.330104,0.363117,0.537594,0.0,0.0,0.0
50%,0.0,0.0,0.526256,0.454545,0.0,0.0,0.381287,-1.0,0.381287,0.454622,0.599437,0.0,0.0,0.0
75%,0.0,0.0,0.597343,0.636364,0.0,0.0,0.457536,-1.0,0.457536,0.546125,0.690365,0.0,0.0,0.0
max,1.0,1.0,0.85594,1.0,1.0,1.0,0.633064,1.0,0.633064,0.829079,0.912641,1.0,1.0,1.0


In [23]:
clf = RandomForestClassifier(min_samples_split=4, random_state=0)
clf.fit(features, target)

# clf = KNeighborsClassifier(n_neighbors=3)
# clf.fit(features, target)

# clf = DecisionTreeClassifier(min_samples_split=8)
# clf.fit(features, target)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=4,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [24]:
pred = clf.predict(features_test)

distribution = np.bincount(pred.astype('int64'))
most_common = np.argmax(distribution)

print "winner is party ## %s ##" % labels[most_common.astype('int')]

winner is party ## Purples ##


In [25]:
print "Vote distribution"
distribution = np.bincount(pred.astype('int64'))

for index,party in enumerate(distribution):
    print "%s, %f, %f"%(labels[index], distribution[index], distribution[index]/ float(target_test.size) * 100) + '%'

Vote distribution
Blues, 3.000000, 0.175850%
Browns, 399.000000, 23.388042%
Greens, 315.000000, 18.464244%
Greys, 102.000000, 5.978898%
Oranges, 95.000000, 5.568581%
Pinks, 144.000000, 8.440797%
Purples, 410.000000, 24.032825%
Reds, 118.000000, 6.916764%
Whites, 47.000000, 2.754982%
Yellows, 73.000000, 4.279015%


In [26]:
target_test_labled = target_test.map(lambda x: labels[int(x)])
pred_test_labled = pd.DataFrame(pred).applymap(lambda x: labels[int(x)])

print(classification_report(target_test_labled, pred_test_labled, target_names=labels))

             precision    recall  f1-score   support

      Blues       1.00      0.50      0.67         6
     Browns       0.89      0.97      0.93       365
     Greens       0.99      1.00      1.00       312
      Greys       0.93      0.92      0.93       103
    Oranges       0.87      0.89      0.88        93
      Pinks       0.93      0.85      0.89       158
    Purples       0.97      0.98      0.98       406
       Reds       0.97      0.96      0.96       119
     Whites       0.89      0.57      0.69        74
    Yellows       0.96      1.00      0.98        70

avg / total       0.94      0.94      0.94      1706



In [27]:
print labels
confusion_matrix(target_test_labled, pred_test_labled, labels=labels)

['Blues', 'Browns', 'Greens', 'Greys', 'Oranges', 'Pinks', 'Purples', 'Reds', 'Whites', 'Yellows']


array([[  3,   0,   0,   0,   0,   0,   0,   0,   0,   3],
       [  0, 355,   0,   0,   0,   2,   3,   0,   5,   0],
       [  0,   0, 312,   0,   0,   0,   0,   0,   0,   0],
       [  0,   0,   0,  95,   7,   0,   0,   1,   0,   0],
       [  0,   0,   0,   7,  83,   0,   0,   3,   0,   0],
       [  0,  19,   0,   0,   0, 134,   5,   0,   0,   0],
       [  0,   2,   3,   0,   0,   3, 398,   0,   0,   0],
       [  0,   0,   0,   0,   5,   0,   0, 114,   0,   0],
       [  0,  23,   0,   0,   0,   5,   4,   0,  42,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,  70]])

In [28]:
pred1 = pred_test_labled.values   
target1 = pd.DataFrame(target_test_labled).values

miss_vals = []
real_vals = []
toples = []

miss_count = 0
for i, j in enumerate(pred1):
    if pred1[i] != target1[i]:
        miss_vals.append(pred1[i][0])
        real_vals.append(target1[i][0])
        toples.append((pred1[i][0],target1[i][0]))
        miss_count = miss_count + 1


print "Total Wrong predictions %d out of %d, hit rate: %f"% (miss_count, target1.size, 100 - miss_count/float(target1.size) * 100) + '%'

Total Wrong predictions 100 out of 1706, hit rate: 94.138335%


In [29]:
pred_test_labled.to_csv("./data/output/test_predictions.csv",header=['Vote'] ,index=False)