In [1]:
import numpy as np
import os
from sklearn.feature_extraction.text import CountVectorizer

In [35]:
# Preproccess function：text -> token and word vector
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("english")
stopwords_en = set(stopwords.words('english'))

__tokenization_pattern = r'''(?x)          # set flag to allow verbose regexps
        \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
      | (?:[A-Z]\.)+        # abbreviations, e.g. U.S.A.
      | \w+(?:-\w+)*        # words with optional internal hyphens
      | \.\.\.              # ellipsis
      | [][.,;"'?():_`-]    # these are separate tokens; includes ], [
    '''
tokenizer = nltk.tokenize.regexp.RegexpTokenizer(__tokenization_pattern)

def preprocessor(text):
    stems = []
    tokens = tokenizer.tokenize(text.lower())
    for token in tokens:
        if token.isalpha() and token not in stopwords_en:
            stems.append(str(stemmer.stem(token)))
    return stems

bow_vectorizer = CountVectorizer(lowercase = False, 
                                 tokenizer = lambda x: x, # because we already have tokens available
                                 stop_words = None, ## stop words removal already done from NLTK
                                 max_features = 5000, ## pick top 5K words by frequency
                                 ngram_range = (1, 1), ## we want unigrams now
                                 binary = False) ## we want as binary/boolean features


In [39]:
# Get text from files and proccess them to word vector
path_base = 'dataset/'
path_years = ['2014/', '2015/', '2016/']
path_category = 'category'

token = list()
x = list()
y = list()
c1 = 0
c2 = 0

for year in path_years:
    for category in ['1', '2']:
        path = path_base + year + path_category + category +'/'
        for filename in os.listdir(path):
            with open (path + filename, "r") as f:
                text = f.read().replace(u'\xa0', ' ').replace('\n', ' ')
                token.append(preprocessor(text))
                y.append(category)
                if category == '1':
                    c1 += 1
                else:
                    c2 += 1
text_vec = bow_vectorizer.fit_transform(token)

In [42]:
print(len(y), 'documents')
print('category1:', c1, '\ncategory2:', c2)

891 documents
category1: 472 
category2: 419


In [51]:
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score, precision_recall_fscore_support

sum = 0
print("accuracy for 50 times:\n")
for i in range(50):
    # Split the dataset to train set and test set
    msk = np.random.rand(len(y)) < 0.75
    
    le = LabelEncoder()

    train_x = text_vec[msk]
    test_x = text_vec[~msk]

    y = le.fit_transform(y)
    train_y = y[msk]
    test_y = y[~msk]
    
    # Train with MultinomialNB
    
    classifier = MultinomialNB()
    classifier.fit(train_x, train_y)
    
    # Get prediction
    preds_bow = classifier.predict(test_x)
    to_print = [le.inverse_transform(pred) for pred in preds_bow ]
    # print(to_print)
    
    
    # Calculate accuracy
    confusion = confusion_matrix(test_y, preds_bow)
    acc_bow = accuracy_score(test_y, preds_bow)
    precisions_bow, recalls_bow, f1_scores_bow, _ = precision_recall_fscore_support(test_y, preds_bow)
    sum += acc_bow
    print(i+1, 'time')
    print('accuracy', acc_bow)
    print("{:>1} {:>4} {:>4} {:>4}".format("", "prec", "rec", "F1"))
    for (idx, scores) in enumerate(zip(precisions_bow, recalls_bow, f1_scores_bow)):
        print("{:>1} {:.2f} {:.2f} {:.2f}".format(
            le.inverse_transform(idx), scores[0], scores[1], scores[2]
        ))
    print()
#     print('confusion matrix:\n{}'.format( confusion) )
    
print("\n\n\naverage accuracy = {}".format(sum / 50))    
    

accuracy for 50 times:

1 time
accuracy 0.646766169154
  prec  rec   F1
0 0.77 0.51 0.61
1 0.58 0.81 0.68

2 time
accuracy 0.627192982456
  prec  rec   F1
0 0.70 0.54 0.61
1 0.58 0.72 0.64

3 time
accuracy 0.654166666667
  prec  rec   F1
0 0.61 0.68 0.65
1 0.70 0.63 0.66

4 time
accuracy 0.64
  prec  rec   F1
0 0.73 0.58 0.65
1 0.57 0.71 0.63

5 time
accuracy 0.617924528302
  prec  rec   F1
0 0.74 0.48 0.58
1 0.55 0.79 0.65

6 time
accuracy 0.636752136752
  prec  rec   F1
0 0.65 0.63 0.64
1 0.62 0.64 0.63

7 time
accuracy 0.623430962343
  prec  rec   F1
0 0.69 0.57 0.62
1 0.57 0.69 0.62

8 time
accuracy 0.657894736842
  prec  rec   F1
0 0.67 0.61 0.64
1 0.65 0.70 0.68

9 time
accuracy 0.668103448276
  prec  rec   F1
0 0.72 0.60 0.65
1 0.63 0.75 0.68

10 time
accuracy 0.651260504202
  prec  rec   F1
0 0.72 0.54 0.62
1 0.61 0.77 0.68

11 time
accuracy 0.593073593074
  prec  rec   F1
0 0.59 0.68 0.63
1 0.60 0.50 0.54

12 time
accuracy 0.635964912281
  prec  rec   F1
0 0.69 0.59 0.63
1 0.5