In [1]:
import numpy as np
import os
from sklearn.feature_extraction.text import CountVectorizer

# Common prefix of the per-category corpus directories. The category id is
# appended directly to this string when the files are loaded, producing
# paths such as 'dataset/2014/category1/'.
path_base = 'dataset/2014/category'

In [2]:
# Preprocess function: text -> tokens and word vectors
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

# Shared NLTK resources used by preprocessor(): an English stop-word set,
# an English Snowball stemmer, and a regex-driven tokenizer.
stopwords_en = set(stopwords.words('english'))
stemmer = SnowballStemmer("english")

# Verbose-mode regex: alternatives are tried in the order listed, so numbers
# and abbreviations are matched before plain words and punctuation.
__tokenization_pattern = r'''(?x)          # set flag to allow verbose regexps
        \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
      | (?:[A-Z]\.)+        # abbreviations, e.g. U.S.A.
      | \w+(?:-\w+)*        # words with optional internal hyphens
      | \.\.\.              # ellipsis
      | [][.,;"'?():_`-]    # these are separate tokens; includes ], [
    '''
# nltk.tokenize re-exports RegexpTokenizer from nltk.tokenize.regexp.
tokenizer = nltk.tokenize.RegexpTokenizer(__tokenization_pattern)

def preprocessor(text):
    """Convert raw text into a list of stemmed tokens.

    The text is lowercased and split with the module-level ``tokenizer``.
    Only purely alphabetic tokens that are not in ``stopwords_en`` are kept,
    and each kept token is reduced to its stem with ``stemmer``.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        Stems of the retained tokens, in document order.
    """
    lowered = text.lower()
    return [
        str(stemmer.stem(word))
        for word in tokenizer.tokenize(lowered)
        # isalpha() drops numbers, punctuation and hyphenated tokens
        if word.isalpha() and word not in stopwords_en
    ]

# Bag-of-words vectorizer that is fed documents already converted to lists of
# stems by preprocessor(), so all of its own text handling is switched off.
bow_vectorizer = CountVectorizer(
    lowercase=False,                  # preprocessor() already lowercases the text
    tokenizer=lambda tokens: tokens,  # identity: each document is already a token list
    stop_words=None,                  # stop words were already removed with NLTK
    max_features=5000,                # keep the top 5K words by frequency
    ngram_range=(1, 1),               # unigrams only
    binary=False,                     # keep raw term counts, not 0/1 presence flags
)


In [3]:
# Read every document of each category, preprocess it into a list of stems,
# and turn the whole corpus into a bag-of-words count matrix.
token = list()  # one list of stems per document
x = list()      # not populated in this cell
y = list()      # category label ('1' or '2') per document, aligned with `token`

for category in ['1', '2']:
    path = path_base + category + '/'
    # os.listdir() returns entries in arbitrary order; sorting keeps the row
    # order of `text_vec` / `y` stable across runs and machines.
    for filename in sorted(os.listdir(path)):
        file_path = os.path.join(path, filename)
        if not os.path.isfile(file_path):
            # Skip sub-directories (e.g. .ipynb_checkpoints); open() would fail on them.
            continue
        # NOTE(review): the files are decoded with the platform default
        # encoding -- confirm the corpus encoding and pass encoding= explicitly.
        with open(file_path, "r") as f:
            # Normalise non-breaking spaces and newlines to plain spaces.
            text = f.read().replace(u'\xa0', ' ').replace('\n', ' ')
        # The file is closed before the (slower) preprocessing step runs.
        token.append(preprocessor(text))
        y.append(category)

# NOTE(review): the vocabulary is fitted on the full corpus, i.e. before any
# train/test split is made.
text_vec = bow_vectorizer.fit_transform(token)

In [4]:
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score, precision_recall_fscore_support

# Repeated random hold-out evaluation of a Multinomial Naive Bayes classifier
# on the bag-of-words matrix `text_vec` with labels `y`.
N_RUNS = 50            # number of random train/test splits to average over
TRAIN_FRACTION = 0.75  # expected share of documents placed in the training set
SEED = 0               # fixed seed so the reported accuracies are reproducible
rng = np.random.RandomState(SEED)

# Encode the labels once, into a new name. Refitting the encoder on an
# already-encoded `y` inside the loop would replace the original class labels
# in `le.classes_` and make this cell non-idempotent.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Running total of the per-run accuracies (deliberately not named `sum`, which
# would shadow the builtin for the rest of the session).
accuracy_total = 0.0
for run in range(N_RUNS):
    # Split the dataset into train and test sets with a random boolean mask.
    msk = rng.rand(len(y_encoded)) < TRAIN_FRACTION

    train_x = text_vec[msk]
    test_x = text_vec[~msk]
    train_y = y_encoded[msk]
    test_y = y_encoded[~msk]

    # Train with MultinomialNB
    classifier = MultinomialNB()
    classifier.fit(train_x, train_y)

    # Get predictions; inverse_transform expects a 1-d array, so decode the
    # whole prediction vector at once instead of one scalar at a time.
    preds_bow = classifier.predict(test_x)
    to_print = le.inverse_transform(preds_bow).tolist()
    # print(to_print)

    # Calculate accuracy and per-class metrics for this split.
    confusion = confusion_matrix(test_y, preds_bow)
    acc_bow = accuracy_score(test_y, preds_bow)
    precisions_bow, recalls_bow, f1_scores_bow, _support = precision_recall_fscore_support(test_y, preds_bow)
    accuracy_total += acc_bow
    print("accuracy = {}".format(acc_bow))
print("\n\n\naverage accuracy = {}".format(accuracy_total / N_RUNS))
# Optional per-class report; move inside the loop to print it for every run.
#     print("{:>25} {:>4} {:>4} {:>4}".format("", "prec", "rec", "F1"))
#     for (idx, scores) in enumerate(zip(precisions_bow, recalls_bow, f1_scores_bow)):
#         print("{:>25} {:.2f} {:.2f} {:.2f}".format(
#             le.inverse_transform([idx])[0], scores[0], scores[1], scores[2]
#         ))
#     print('confusion matrix:\n{}'.format( confusion) )

accuracy = 0.6567164179104478
accuracy = 0.6440677966101694
accuracy = 0.6981132075471698
accuracy = 0.8166666666666667
accuracy = 0.726027397260274
accuracy = 0.7045454545454546
accuracy = 0.6862745098039216
accuracy = 0.6607142857142857
accuracy = 0.7878787878787878
accuracy = 0.7605633802816901
accuracy = 0.75
accuracy = 0.7910447761194029
accuracy = 0.746031746031746
accuracy = 0.6
accuracy = 0.7397260273972602
accuracy = 0.7586206896551724
accuracy = 0.7619047619047619
accuracy = 0.6785714285714286
accuracy = 0.7936507936507936
accuracy = 0.6714285714285714
accuracy = 0.7450980392156863
accuracy = 0.7457627118644068
accuracy = 0.7068965517241379
accuracy = 0.6
accuracy = 0.65625
accuracy = 0.6964285714285714
accuracy = 0.6727272727272727
accuracy = 0.7
accuracy = 0.6470588235294118
accuracy = 0.7272727272727273
accuracy = 0.6935483870967742
accuracy = 0.6842105263157895
accuracy = 0.7333333333333333
accuracy = 0.7962962962962963
accuracy = 0.6551724137931034
accuracy = 0.640625
ac