In [52]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_validate, cross_val_score 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix,accuracy_score
import matplotlib.pyplot as plt
import seaborn as sn

In [58]:
labels = ['Accident', 'Cultural', 'Economic', 'Political', 'Research', 'Science','Social','Sport','State','World']
df = pd.read_csv('mydata.csv')
train_set = pd.DataFrame()


In [59]:
from __future__ import division
porpotion = dict()
for label in labels:
    porpotion[label] = len(df[df.label==label])
porpotion    

{'Accident': 112,
 'Cultural': 456,
 'Economic': 430,
 'Political': 309,
 'Research': 268,
 'Science': 238,
 'Social': 290,
 'Sport': 460,
 'State': 344,
 'World': 197}

In [60]:
frames = []
for label in labels:
    df_subset = df[df.label == label].sample(n=int(porpotion[label] * 0.7))
    df = df.drop(df_subset.index)
    frames.append(df_subset)
train_set = pd.concat(frames,ignore_index=True)    

In [61]:
df.reset_index(drop=True)
df.index = range(0,len(df))
train_set.index = range(0,len(train_set))

In [62]:
X_train = train_set.iloc[:,1:]
X_test = df.iloc[:,1:]

In [63]:
y_train = train_set.iloc[:,0]
y_test = df.iloc[:,0]

In [64]:
def plot_confusion_matrix(y_test, y_predict_test,labels):
    cm=confusion_matrix(y_test, y_predict_test,labels)
    df_cm = pd.DataFrame(cm, index = [i for i in labels],
                  columns = [i for i in labels])
    #plt.figure(figsize = (10,7))
    #sn.heatmap(df_cm, annot=True)
    #plt.show()
    return df_cm

In [65]:
#decision tree by spliting data two keep proportion of each class the same in test and train data set
clf = DecisionTreeClassifier(criterion='gini',max_depth=300, min_samples_split=2).fit(X_train, y_train)
y_predict_train = clf.predict(X_train)
y_predict_test = clf.predict(X_test)
print('Decision Tree')
print ('parameters: ' + 'criterion=gini,max_depth=300, min_samples_split=2')
print('Accuracy on Train set: ',accuracy_score(y_train, y_predict_train))
print('Accuracy on Test set: ',accuracy_score(y_test, y_predict_test))
plot_confusion_matrix(y_test, y_predict_test,labels)


Decision Tree
parameters: criterion=gini,max_depth=300, min_samples_split=2
('Accuracy on Train set: ', 0.93084370677731676)
('Accuracy on Test set: ', 0.45882352941176469)


Unnamed: 0,Accident,Cultural,Economic,Political,Research,Science,Social,Sport,State,World
Accident,12,3,2,3,1,1,3,1,8,0
Cultural,2,71,8,11,8,2,11,8,10,6
Economic,9,6,63,9,4,7,9,2,13,7
Political,2,17,9,25,13,1,6,2,5,13
Research,3,7,3,15,42,0,4,1,3,3
Science,2,7,4,2,1,45,2,0,7,2
Social,4,6,10,10,4,4,25,4,15,5
Sport,0,3,1,0,0,1,4,113,14,2
State,3,13,8,8,5,12,16,17,17,5
World,0,1,9,11,5,3,5,3,7,16


In [66]:
#decision tree by cross validation
x = pd.read_csv('mydata.csv')
X = x.iloc[:,1:]
y = x.iloc[:,0]
scores = cross_val_score(clf, X, y, cv=10, scoring='accuracy')
print scores.mean()

0.479982286815


In [67]:
#random forest by spliting data two keep proportion of each class the same in test and train data set
clf = RandomForestClassifier(max_depth=300,random_state=0, n_estimators=100, max_features ='sqrt', criterion='gini').fit(X_train, y_train)
y_predict_train = clf.predict(X_train)
y_predict_test = clf.predict(X_test)
print('\nRandom Forest')
print ('parameters: ' + 'max_depth=300,random_state=0, n_estimators=100, max_features =sqrt, criterion=gini')
print('Accuracy on Train set: ',accuracy_score(y_train, y_predict_train))
print('Accuracy on Test set: ',accuracy_score(y_test, y_predict_test))
cm = plot_confusion_matrix(y_test, y_predict_test,labels)
cm


Random Forest
parameters: max_depth=300,random_state=0, n_estimators=100, max_features =sqrt, criterion=gini
('Accuracy on Train set: ', 0.93084370677731676)
('Accuracy on Test set: ', 0.58502673796791449)


Unnamed: 0,Accident,Cultural,Economic,Political,Research,Science,Social,Sport,State,World
Accident,15,1,2,1,0,1,6,1,6,1
Cultural,0,103,5,7,5,1,4,3,6,3
Economic,2,4,101,4,4,1,4,1,6,2
Political,0,9,10,45,13,1,2,1,2,10
Research,0,5,6,6,56,1,3,1,2,1
Science,1,7,6,1,2,45,0,1,8,1
Social,1,12,10,13,3,2,28,4,12,2
Sport,0,2,0,1,1,0,1,129,4,0
State,2,17,20,12,4,11,9,17,11,1
World,0,5,7,20,5,0,1,3,5,14


In [68]:
#random forest by cross validation
scores = cross_val_score(clf, X, y, cv=10, scoring='accuracy')
print scores.mean()

0.603128556301


In [69]:
#svm by spliting data two keep proportion of each class the same in test and train data set
clf = svm.SVC(kernel='rbf', C= 100, gamma= 0.0001).fit(X_train, y_train)
y_predict_train = clf.predict(X_train)
y_predict_test = clf.predict(X_test)
print('\nSupport Vector Machine')
print('Accuracy on Train set: ',accuracy_score(y_train, y_predict_train))
print('Accuracy on Test set: ',accuracy_score(y_test, y_predict_test))
cm = plot_confusion_matrix(y_test, y_predict_test,labels)
cm


Support Vector Machine
('Accuracy on Train set: ', 0.90917473490087597)
('Accuracy on Test set: ', 0.54759358288770055)


Unnamed: 0,Accident,Cultural,Economic,Political,Research,Science,Social,Sport,State,World
Accident,14,2,2,1,0,2,7,1,5,0
Cultural,4,85,4,15,5,1,5,3,6,9
Economic,2,2,81,7,5,4,4,2,15,7
Political,0,14,7,36,10,0,9,0,8,9
Research,2,5,8,9,48,2,3,0,1,3
Science,1,7,8,2,1,42,3,1,6,1
Social,3,7,9,12,2,4,34,2,10,4
Sport,0,3,1,1,2,0,1,125,4,1
State,5,9,18,5,4,9,12,14,26,2
World,1,5,7,11,5,0,6,3,1,21


In [70]:
#svm by cross validation
scores = cross_val_score(clf, X, y, cv=10, scoring='accuracy')
print scores.mean()

0.580290429593


In [71]:
#MultinomialNB by spliting data two keep proportion of each class the same in test and train data set
clf = MultinomialNB().fit(X_train, y_train)
y_predict_train = clf.predict(X_train)
y_predict_test = clf.predict(X_test)
print('\nMultinomialNB Naive Bayes')
print('Accuracy on Train set: ',accuracy_score(y_train, y_predict_train))
print('Accuracy on Test set: ',accuracy_score(y_test, y_predict_test))
cm = plot_confusion_matrix(y_test, y_predict_test,labels)
cm


MultinomialNB Naive Bayes
('Accuracy on Train set: ', 0.61410788381742742)
('Accuracy on Test set: ', 0.57647058823529407)


Unnamed: 0,Accident,Cultural,Economic,Political,Research,Science,Social,Sport,State,World
Accident,29,0,1,0,0,1,1,1,0,1
Cultural,3,93,1,9,3,1,9,2,11,5
Economic,9,1,95,2,3,2,6,0,4,7
Political,2,11,4,48,2,0,5,0,1,20
Research,4,2,16,8,16,3,4,0,2,26
Science,6,2,4,1,1,52,2,0,2,2
Social,14,8,6,11,4,4,30,1,3,6
Sport,0,2,0,0,0,1,1,130,4,0
State,13,13,21,12,2,12,2,17,9,3
World,3,3,1,7,5,0,1,1,2,37


In [72]:
#knn by cross validatio
clf = KNeighborsClassifier(n_neighbors=9)
scores = cross_val_score(clf, X, y, cv=10, scoring='accuracy')
print scores.mean()

0.545829035342
