In [13]:
import numpy as np
import os
from prepo import preprocessor
import fasttext
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score, precision_recall_fscore_support

In [2]:
# Get text from files and proccess them to word vector
path_relevant = 'dataset/2014/relevant/'
path_unrelevant = 'dataset/2014/unrelevant/'

token = []
y = []

for filename in os.listdir(path_relevant):
     with open (path_relevant + filename, "r") as f:
        text = f.read().replace(u'\xa0', ' ').replace('\n', ' ')
        token.append(preprocessor(text))
        y.append(1)

for filename in os.listdir(path_unrelevant):
     with open (path_unrelevant + filename, "r") as f:
        text = f.read().replace(u'\xa0', ' ').replace('\n', ' ')
        token.append(preprocessor(text))
        y.append(0)

In [3]:
bow_vectorizer = CountVectorizer(lowercase=False,
                                 tokenizer=lambda x: x,
                                 stop_words=None, 
                                 max_features=5000,
                                 ngram_range=(1, 1),
                                 binary=False)
text_vec = bow_vectorizer.fit_transform(token)

In [11]:
# Split the dataset to train set and test set
msk = np.random.rand(len(y)) < 0.75

train_x = text_vec[msk]
test_x = text_vec[~msk]

y = np.array(y)
train_y = y[msk]
test_y = y[~msk]

In [6]:
with open('train.txt', 'w') as trainf, open('test.txt', 'w') as testf:
    for i in range(len(token)):
        if msk[i]:
            trainf.write('__label__{} {}\n'.format(y[i], ' '.join(token[i])))
        else:
            testf.write('__label__{} {}\n'.format(y[i], ' '.join(token[i])))

In [7]:
classifier = fasttext.supervised('train.txt', 'model')
result = classifier.test('test.txt')
print('P@1:', result.precision)
print('R@1:', result.recall)
print('Number of examples:', result.nexamples)

P@1: 0.9699124726477024
R@1: 0.9699124726477024
Number of examples: 1828


In [17]:
names = ['LogisticRegression', 'SVC' , 'NB', 'DecisionTree', 'KNC']
classifers = [LogisticRegression(), SVC(), MultinomialNB(), DecisionTreeClassifier(), KNeighborsClassifier(n_neighbors=1)]

for name, classifer in zip(names, classifers):
    print(name)
    print('------------------')
    classifer.fit(train_x, train_y)
    preds_bow = classifer.predict(test_x)
    confusion = confusion_matrix(test_y, preds_bow)
    acc_bow = accuracy_score(test_y, preds_bow)
    precisions_bow, recalls_bow, f1_scores_bow, _ = precision_recall_fscore_support(test_y, preds_bow)
    print('confusion matrix:\n{}'.format( confusion))
    print('\naccuracy: {:.4}'.format(acc_bow))
    print("\n{:>1} {:>4} {:>4} {:>4}".format("", "prec", "rec", "F1"))
    for (idx, scores) in enumerate(zip(precisions_bow, recalls_bow, f1_scores_bow)):
        print("{:>1} {:.2f} {:.2f} {:.2f}".format(
            idx, scores[0], scores[1], scores[2]
        ))
    print('\n\n')

LogisticRegression
------------------
confusion matrix:
[[1793    2]
 [   9   42]]

accuracy: 0.994

  prec  rec   F1
0 1.00 1.00 1.00
1 0.95 0.82 0.88



SVC
------------------
confusion matrix:
[[1795    0]
 [  50    1]]

accuracy: 0.9729

  prec  rec   F1
0 0.97 1.00 0.99
1 1.00 0.02 0.04



NB
------------------
confusion matrix:
[[1359  436]
 [   3   48]]

accuracy: 0.7622

  prec  rec   F1
0 1.00 0.76 0.86
1 0.10 0.94 0.18



DecisionTree
------------------
confusion matrix:
[[1779   16]
 [   7   44]]

accuracy: 0.9875

  prec  rec   F1
0 1.00 0.99 0.99
1 0.73 0.86 0.79



KNC
------------------
confusion matrix:
[[1762   33]
 [   8   43]]

accuracy: 0.9778

  prec  rec   F1
0 1.00 0.98 0.99
1 0.57 0.84 0.68





In [None]:
# Train with MultinomialNB
classifier = MultinomialNB()
classifier.fit(train_x, train_y)

# Get prediction
preds_bow = classifier.predict(test_x)
to_print = [le.inverse_transform(pred) for pred in preds_bow ]
# print(to_print)

result = []
for name_id, p, pre, iy in zip(test_name_id, classifier.predict_proba(test_x), classifier.predict(test_x), test_y):
    result.append((p[0], name[name_id], pre, iy))

result.sort(reverse=True)
rank = 1
for res in result:
    print('{rank:6} Filename: {filename:30} True Class: {iy}'.format(rank=rank, filename=res[1], iy = res[3]))
#     print('{rank:6} Filename: {filename:30} P: {p: 20f} Class: {iy}'.format(rank=rank, filename=res[1], p=res[0], iy = res[3]))
    rank += 1
    
result_train = []
for name_id, p, pre, iy in zip(train_name_id, classifier.predict_proba(train_x), classifier.predict(train_x), train_y):
    result_train.append((p[0], name[name_id], pre, iy))

result_train.sort(reverse=True)
rank = 1
for res in result_train:
    print('{rank:6} Filename: {filename:30} True Class: {iy}'.format(rank=rank, filename=res[1], iy = res[3]))
#     print('{rank:6} Filename: {filename:30} P: {p: 20f} Class: {iy}'.format(rank=rank, filename=res[1], p=res[0], iy = res[3]))
    rank += 1  

In [None]:
print(train_x.shape)

In [None]:
def show_most_informative_features(vectorizer, clf, n=20):
    feature_names = vectorizer.get_feature_names()
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (coef_1, fn_1, coef_2, fn_2))

show_most_informative_features(bow_vectorizer, classifier, 1000)  