# Imports

In [1]:
from pathlib import Path

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

# Helper Functions

In [2]:
# Convenience function to output test items
def show_test_item(clf, i, text=True, classify=True, dataset=None, vec=None):
    """Print the actual (and optionally the predicted) category of one document.

    Parameters
    ----------
    clf : fitted classifier
        Object with a ``predict`` method; only used when ``classify`` is true.
    i : int
        Positional (``iloc``) index of the document inside ``dataset``.
    text : bool, default True
        If true, also print the document text after a blank separator.
    classify : bool, default True
        If true, append the label that ``clf`` predicts for the document.
    dataset : DataFrame, optional
        Frame with ``category`` and ``text`` columns. When omitted, the
        notebook-level ``test`` frame is used.
    vec : fitted vectorizer, optional
        Object with a ``transform`` method. When omitted, the notebook-level
        ``vectorizer`` is used.

    Returns
    -------
    None
        The result is printed, not returned.
    """
    # Fall back to the notebook global so existing calls keep working.
    if dataset is None:
        dataset = test

    item = dataset.iloc[i]  # look the row up once instead of three times
    line = '[' + str(i) + '] ' + 'actual: ' + item.category
    if classify:
        # Resolve the vectorizer only when it is actually needed.
        if vec is None:
            vec = vectorizer
        # predict() returns one label per input document; we pass exactly one.
        line += ' / predicted: ' + clf.predict(vec.transform([item.text]))[0]
    print(line)
    if text:
        print('\n')
        print(item.text)

# Loading documents

In [3]:
# Read documents from the JSON file.
# DATA_DIR is machine-specific: point it at the local copy of the corpus.
# (The previous variable name `dir` shadowed the Python builtin of that name.)
DATA_DIR = Path('d:/repository/daten/texte-politik-wirtschaft-sport/')
documents_path = DATA_DIR / 'texte.json'

# Fail early with a readable message. A missing file is one known cause of the
# opaque "ValueError: Expected object or value" from older pandas versions,
# which then try to parse the path string itself as JSON.
if not documents_path.is_file():
    raise FileNotFoundError('Document file not found: {}'.format(documents_path))

documents = pd.read_json(str(documents_path), orient='records', encoding='utf8')
# Spot-check three rows by position.
documents.iloc[[11, 100, 201]]

ValueError: Expected object or value

# Train Data and Test Data

In [4]:
# Per-category split: draw 80% of each category's rows for training, using the
# same seed for every draw; every row that was not drawn becomes test data.
random_state = 100
train_politik, train_wirtschaft, train_sport = (
    documents[documents['category'] == label].sample(frac=0.8, random_state=random_state)
    for label in ('politik', 'wirtschaft', 'sport')
)
train = pd.concat([train_politik, train_wirtschaft, train_sport])
# Dropping by index label assumes the index of `documents` is unique.
test = documents.drop(train.index)

# Vectorizing the Documents

In [5]:
# Bag-of-words vectorizer whose vocabulary is learned from the training texts only.
# Integer min_df / max_df are absolute document counts: a term is kept when it
# occurs in at least 10 and at most 200 training documents.
# The token pattern is a raw string: '\w' in a normal string literal is an
# invalid escape sequence and triggers a warning on newer Python versions.
vectorizer = CountVectorizer(min_df=10, max_df=200, token_pattern=r'\w+')
vectorizer.fit(train.text)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=200, max_features=None, min_df=10,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='\\w+', tokenizer=None,
        vocabulary=None)

# Training of Multinomial Naive Bayes Classifier

In [6]:
# Training labels and the matching term-count matrix of the training texts.
y = train['category']
X = vectorizer.transform(train['text'])
# Multinomial Naive Bayes with default parameters, fitted on the counts.
clfmnb = MultinomialNB().fit(X, y)

# Evaluation of Multinomial Naive Bayes Classifier

In [7]:
# Accuracy of the Naive Bayes model on the held-out documents.
actuals = test['category']
predicted_mnb = clfmnb.predict(vectorizer.transform(test['text']))
accuracy_score(y_true=actuals, y_pred=predicted_mnb)

0.83333333333333337

In [8]:
# One "actual / predicted" line per test document for Naive Bayes (text omitted).
for position in range(test.shape[0]):
    show_test_item(clfmnb, position, text=False)

[0] actual: politik / predicted: politik
[1] actual: politik / predicted: politik
[2] actual: politik / predicted: politik
[3] actual: politik / predicted: politik
[4] actual: politik / predicted: politik
[5] actual: politik / predicted: politik
[6] actual: politik / predicted: politik
[7] actual: politik / predicted: politik
[8] actual: politik / predicted: politik
[9] actual: politik / predicted: politik
[10] actual: politik / predicted: politik
[11] actual: politik / predicted: politik
[12] actual: politik / predicted: politik
[13] actual: politik / predicted: politik
[14] actual: politik / predicted: politik
[15] actual: politik / predicted: politik
[16] actual: politik / predicted: wirtschaft
[17] actual: politik / predicted: politik
[18] actual: politik / predicted: politik
[19] actual: politik / predicted: politik
[20] actual: wirtschaft / predicted: politik
[21] actual: wirtschaft / predicted: wirtschaft
[22] actual: wirtschaft / predicted: politik
[23] actual: wirtschaft / pre

# Training of Support Vector Machine Classifier

In [9]:
# Linear-kernel support vector classifier, trained on the same count matrix X
# and labels y as the Naive Bayes model.
clfsvm = SVC(
    kernel='linear',
    decision_function_shape='ovr',
    random_state=4711,
).fit(X, y)

# Evaluation of Support Vector Machine Classifier

In [10]:
# Accuracy of the SVM on the held-out documents (`actuals` holds the test labels).
predicted_svm = clfsvm.predict(vectorizer.transform(test['text']))
accuracy_score(y_true=actuals, y_pred=predicted_svm)

0.81666666666666665

In [11]:
# One "actual / predicted" line per test document for the SVM (text omitted).
for position in range(test.shape[0]):
    show_test_item(clfsvm, position, text=False)

[0] actual: politik / predicted: politik
[1] actual: politik / predicted: politik
[2] actual: politik / predicted: politik
[3] actual: politik / predicted: politik
[4] actual: politik / predicted: politik
[5] actual: politik / predicted: politik
[6] actual: politik / predicted: politik
[7] actual: politik / predicted: politik
[8] actual: politik / predicted: politik
[9] actual: politik / predicted: politik
[10] actual: politik / predicted: politik
[11] actual: politik / predicted: politik
[12] actual: politik / predicted: politik
[13] actual: politik / predicted: politik
[14] actual: politik / predicted: politik
[15] actual: politik / predicted: politik
[16] actual: politik / predicted: wirtschaft
[17] actual: politik / predicted: politik
[18] actual: politik / predicted: politik
[19] actual: politik / predicted: politik
[20] actual: wirtschaft / predicted: wirtschaft
[21] actual: wirtschaft / predicted: sport
[22] actual: wirtschaft / predicted: politik
[23] actual: wirtschaft / predi

# Comparison of results

In [12]:
# List the test documents that Naive Bayes misclassifies.
# Reuses `actuals` and `predicted_mnb` from the evaluation cell instead of
# re-vectorizing and re-classifying every document one at a time.
for i, (actual, nb_pred) in enumerate(zip(actuals, predicted_mnb)):
    if actual != nb_pred:
        print('NB: [{}] - actual: {}   predicted: {}'.format(i, actual, nb_pred))

NB: [16] - actual: politik   predicted: wirtschaft
NB: [20] - actual: wirtschaft   predicted: politik
NB: [22] - actual: wirtschaft   predicted: politik
NB: [24] - actual: wirtschaft   predicted: politik
NB: [32] - actual: wirtschaft   predicted: politik
NB: [34] - actual: wirtschaft   predicted: politik
NB: [35] - actual: wirtschaft   predicted: politik
NB: [36] - actual: wirtschaft   predicted: politik
NB: [40] - actual: sport   predicted: politik
NB: [44] - actual: sport   predicted: politik


In [13]:
# List the test documents that the SVM misclassifies.
# Reuses `actuals` and `predicted_svm` from the evaluation cells instead of
# re-vectorizing and re-classifying every document one at a time.
for i, (actual, svm_pred) in enumerate(zip(actuals, predicted_svm)):
    if actual != svm_pred:
        print('SVM: [{}] - actual: {}   predicted: {}'.format(i, actual, svm_pred))

SVM: [16] - actual: politik   predicted: wirtschaft
SVM: [21] - actual: wirtschaft   predicted: sport
SVM: [22] - actual: wirtschaft   predicted: politik
SVM: [24] - actual: wirtschaft   predicted: politik
SVM: [30] - actual: wirtschaft   predicted: politik
SVM: [34] - actual: wirtschaft   predicted: sport
SVM: [35] - actual: wirtschaft   predicted: politik
SVM: [40] - actual: sport   predicted: politik
SVM: [42] - actual: sport   predicted: politik
SVM: [44] - actual: sport   predicted: politik
SVM: [57] - actual: sport   predicted: wirtschaft


In [14]:
# List the test documents on which the two classifiers disagree.
# Reuses the batch predictions `predicted_mnb` and `predicted_svm` from the
# evaluation cells; the original made two extra transform/predict calls per row.
for i, (nb_pred, svm_pred) in enumerate(zip(predicted_mnb, predicted_svm)):
    if nb_pred != svm_pred:
        print('[{}] - nb: {}   svm: {}'.format(i, nb_pred, svm_pred))

[20] - nb: politik   svm: wirtschaft
[21] - nb: wirtschaft   svm: sport
[30] - nb: wirtschaft   svm: politik
[32] - nb: politik   svm: wirtschaft
[34] - nb: politik   svm: sport
[36] - nb: politik   svm: wirtschaft
[42] - nb: sport   svm: politik
[57] - nb: sport   svm: wirtschaft


In [None]:
# Show test item 44 in full (labels plus text) as classified by the SVM.
show_test_item(clfsvm, 44, text=True, classify=True)