In [1]:
import numpy as np
import os
from sklearn.feature_extraction.text import CountVectorizer

In [5]:
# Preprocessing function: text -> tokens and word vectors
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

# Shared, module-level NLP helpers used by preprocessor().

# English Snowball stemmer, created once and reused for every token.
stemmer = SnowballStemmer("english")
# Stored as a set so the per-token membership test in preprocessor() is cheap.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be
# downloaded beforehand (nltk.download('stopwords')) -- confirm on a fresh setup.
stopwords_en = set(stopwords.words('english'))

# Verbose-mode regex listing, in priority order, the token shapes to extract.
# Note: preprocessor() lowercases the text before tokenizing, so the
# upper-case abbreviation alternative ([A-Z]\.) cannot match on that path.
__tokenization_pattern = r'''(?x)          # set flag to allow verbose regexps
        \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
      | (?:[A-Z]\.)+        # abbreviations, e.g. U.S.A.
      | \w+(?:-\w+)*        # words with optional internal hyphens
      | \.\.\.              # ellipsis
      | [][.,;"'?():_`-]    # these are separate tokens; includes ], [
    '''
# Regexp-based tokenizer built from the pattern above.
tokenizer = nltk.tokenize.regexp.RegexpTokenizer(__tokenization_pattern)

def preprocessor(text):
    """Convert raw text into a list of stemmed content words.

    The text is lowercased and split with the module-level ``tokenizer``.
    A token is kept only if it is purely alphabetic and is not in
    ``stopwords_en``; each kept token is stemmed with ``stemmer``.

    Args:
        text: The document as a single string.

    Returns:
        A list of ``str`` stems, in the order the tokens appear in ``text``.
    """
    lowered = text.lower()
    return [
        str(stemmer.stem(word))
        for word in tokenizer.tokenize(lowered)
        if word.isalpha() and word not in stopwords_en
    ]

# Bag-of-words vectorizer that is fed pre-tokenized documents (lists of stems
# produced by preprocessor()) instead of raw strings.
bow_vectorizer = CountVectorizer(lowercase = False, # documents are token lists; preprocessor() already lowercased the text
                                 tokenizer = lambda x: x, # identity: the tokens are already available
                                 stop_words = None, # stop-word removal is already done with the NLTK list
                                 max_features = 5000, # keep only the 5K most frequent terms
                                 ngram_range = (1, 1), # unigrams only
                                 binary = False) # keep raw term counts (binary = True would give 0/1 presence features)


In [15]:
# Read every document from disk, preprocess it into a list of stems, and
# build the bag-of-words matrix for the whole corpus.
# Expected layout: dataset/<year>/category<1|2>/<one file per document>
path_base = 'dataset/'
path_years = ['2014/', '2015/', '2016/']
path_category = 'category'

token = list()  # one list of stems per document, aligned with y
x = list()      # not filled in this cell; kept in case other cells reference it
y = list()      # category label ('1' or '2') per document, aligned with token

for year in path_years:
    for category in ['1', '2']:
        path = path_base + year + path_category + category +'/'
        # os.listdir() returns entries in arbitrary order; sorting makes the
        # document order (and therefore the row order of text_vec) reproducible.
        for filename in sorted(os.listdir(path)):
            file_path = os.path.join(path, filename)
            # Skip hidden entries and directories (e.g. .DS_Store,
            # .ipynb_checkpoints) so they are not read as documents.
            if filename.startswith('.') or not os.path.isfile(file_path):
                continue
            # NOTE(review): no explicit encoding is passed, so the platform
            # default is used -- confirm the corpus encoding and pass
            # encoding=... here if it differs.
            with open(file_path, "r") as f:
                # Normalize non-breaking spaces and newlines to plain spaces.
                text = f.read().replace(u'\xa0', ' ').replace('\n', ' ')
            token.append(preprocessor(text))
            y.append(category)

# Learn the vocabulary and produce the document-term count matrix.
text_vec = bow_vectorizer.fit_transform(token)

In [21]:
# Report how many documents (one label per document) were loaded.
print("{} documents".format(len(y)))

891 document


In [19]:
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score, precision_recall_fscore_support

# Repeated random-split evaluation of a Multinomial Naive Bayes classifier on
# the bag-of-words matrix: train on ~75% of the documents, test on the rest,
# and average the accuracy over N_RUNS independent splits.
N_RUNS = 50            # number of random train/test splits
TRAIN_FRACTION = 0.75  # expected share of documents in each training split
SEED = 0               # fixed seed so the reported accuracies are reproducible

# Encode the labels once, into a new name. Re-fitting the encoder on its own
# integer output (or overwriting y) would make le.classes_ lose the original
# labels and make this cell behave differently when it is re-run.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Local generator: seeded splits without touching numpy's global random state.
rng = np.random.RandomState(SEED)

accuracies = []  # accuracy of each run (avoids shadowing the builtin sum)
for _ in range(N_RUNS):
    # Split the dataset into a train set and a test set with a random mask.
    msk = rng.rand(len(y_encoded)) < TRAIN_FRACTION

    train_x = text_vec[msk]
    test_x = text_vec[~msk]
    train_y = y_encoded[msk]
    test_y = y_encoded[~msk]

    # Train with MultinomialNB
    classifier = MultinomialNB()
    classifier.fit(train_x, train_y)

    # Get predictions; inverse_transform expects a 1-D array, so decode the
    # whole prediction vector at once rather than one scalar at a time.
    preds_bow = classifier.predict(test_x)
    to_print = le.inverse_transform(preds_bow)
    # print(to_print)

    # Calculate metrics for this split
    confusion = confusion_matrix(test_y, preds_bow)
    acc_bow = accuracy_score(test_y, preds_bow)
    precisions_bow, recalls_bow, f1_scores_bow, supports_bow = (
        precision_recall_fscore_support(test_y, preds_bow)
    )
    accuracies.append(acc_bow)
    print("accuracy = {}".format(acc_bow))
print("\n\n\naverage accuracy = {}".format(np.mean(accuracies)))
#     print("{:>25} {:>4} {:>4} {:>4}".format("", "prec", "rec", "F1"))
#     for (idx, scores) in enumerate(zip(precisions_bow, recalls_bow, f1_scores_bow)):
#         print("{:>25} {:.2f} {:.2f} {:.2f}".format(
#             le.inverse_transform(idx), scores[0], scores[1], scores[2]
#         ))
#     print('confusion matrix:\n{}'.format( confusion) )

accuracy = 0.6946902654867256
accuracy = 0.6611570247933884
accuracy = 0.7027027027027027
accuracy = 0.6457399103139013
accuracy = 0.690677966101695
accuracy = 0.6946902654867256
accuracy = 0.6363636363636364
accuracy = 0.6308411214953271
accuracy = 0.6292682926829268
accuracy = 0.673728813559322
accuracy = 0.6325581395348837
accuracy = 0.6697247706422018
accuracy = 0.6483050847457628
accuracy = 0.6983471074380165
accuracy = 0.7103825136612022
accuracy = 0.6238938053097345
accuracy = 0.645
accuracy = 0.7090909090909091
accuracy = 0.6093023255813953
accuracy = 0.6504065040650406
accuracy = 0.6425339366515838
accuracy = 0.6267942583732058
accuracy = 0.6637554585152838
accuracy = 0.646551724137931
accuracy = 0.6796116504854369
accuracy = 0.6697247706422018
accuracy = 0.6303317535545023
accuracy = 0.6666666666666666
accuracy = 0.6583333333333333
accuracy = 0.6779661016949152
accuracy = 0.5964125560538116
accuracy = 0.6766169154228856
accuracy = 0.6198347107438017
accuracy = 0.6666666666666