In [1]:
import numpy as np
import os
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Preproccess function：text -> token and word vector
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("english")
stopwords_en = set(stopwords.words('english'))

__tokenization_pattern = r'''(?x)          # set flag to allow verbose regexps
        \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
      | (?:[A-Z]\.)+        # abbreviations, e.g. U.S.A.
      | \w+(?:-\w+)*        # words with optional internal hyphens
      | \.\.\.              # ellipsis
      | [][.,;"'?():_`-]    # these are separate tokens; includes ], [
    '''
tokenizer = nltk.tokenize.regexp.RegexpTokenizer(__tokenization_pattern)

def preprocessor(text):
    stems = []
    tokens = tokenizer.tokenize(text.lower())
    for token in tokens:
        if token.isalpha() and token not in stopwords_en:
            stems.append(str(stemmer.stem(token)))
    return stems

bow_vectorizer = CountVectorizer(lowercase = False, 
                                 tokenizer = lambda x: x, # because we already have tokens available
                                 stop_words = None, ## stop words removal already done from NLTK
                                 max_features = 5000, ## pick top 5K words by frequency
                                 ngram_range = (1, 1), ## we want unigrams now
                                 binary = False) ## we want as binary/boolean features


In [59]:
# Get text from files and proccess them to word vector
path_base = 'dataset/'
path_years = ['2014/', '2015/', '2016/']
path_category = 'category'

token = []
name = []
x = list()
y = list()
c1 = 0
c2 = 0

for year in path_years:
    for category in ['1', '2']:
        path = path_base + year + path_category + category +'/'
        for filename in os.listdir(path):
            with open (path + filename, "r") as f:
                text = f.read().replace(u'\xa0', ' ').replace('\n', ' ')
                token.append(preprocessor(text))
                name.append(filename)
                y.append(category)
                if category == '1':
                    c1 += 1
                else:
                    c2 += 1
text_vec = bow_vectorizer.fit_transform(token)

TypeError: Required argument 'object' (pos 1) not found

In [58]:
print(len(y), 'documents')
print('category1:', c1, '\ncategory2:', c2)
print(name.shape)

891 documents
category1: 472 
category2: 419
()


In [65]:
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score, precision_recall_fscore_support

sum = 0
# print("accuracy for 50 times:\n")
# for i in range(50):

# Split the dataset to train set and test set
msk = np.random.rand(len(y)) < 0.75
le = LabelEncoder()

train_x = text_vec[msk]
test_x = text_vec[~msk]

y = le.fit_transform(y)
train_y = y[msk]
test_y = y[~msk]

name_id = np.arange(len(y))
train_name_id = name_id[msk]
test_name_id = name_id[~msk]
# Train with MultinomialNB

classifier = MultinomialNB()
classifier.fit(train_x, train_y)

# Get prediction
preds_bow = classifier.predict(test_x)
to_print = [le.inverse_transform(pred) for pred in preds_bow ]
# print(to_print)

for name_id, p in zip(test_name_id, classifier.predict_proba(test_x)):
    print(name[name_id], p)


IndexError: invalid index to scalar variable.

In [32]:
def show_most_informative_features(vectorizer, clf, n=20):
    feature_names = vectorizer.get_feature_names()
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (coef_1, fn_1, coef_2, fn_2))

show_most_informative_features(bow_vectorizer, classifier, 1000)  

	-11.9521	accur          		-4.4762	compani        
	-11.9521	accuraci       		-4.6132	australia      
	-11.9521	acid           		-4.8164	factiva        
	-11.9521	acquisdata     		-4.9037	million        
	-11.9521	acton          		-4.9133	market         
	-11.9521	acu            		-5.0187	group          
	-11.9521	adani          		-5.0494	said           
	-11.9521	adaval         		-5.0925	news           
	-11.9521	advers         		-5.1030	develop        
	-11.9521	advoc          		-5.1409	china          
	-11.9521	aerospac       		-5.1486	properti       
	-11.9521	airli          		-5.2010	sale           
	-11.9521	aisc           		-5.2247	invest         
	-11.9521	alcoa          		-5.2247	australian     
	-11.9521	alcohol        		-5.2801	share          
	-11.9521	alic           		-5.3162	industri       
	-11.9521	alinta         		-5.3227	project        
	-11.9521	alloy          		-5.3360	billion        
	-11.9521	almond         		-5.3591	ltd            
	-11.9521	analyz         		-5.4

In [None]:
# Calculate accuracy
confusion = confusion_matrix(test_y, preds_bow)
acc_bow = accuracy_score(test_y, preds_bow)
precisions_bow, recalls_bow, f1_scores_bow, _ = precision_recall_fscore_support(test_y, preds_bow)
sum += acc_bow
# print(i+1, 'time')
print('accuracy', acc_bow)
print("{:>1} {:>4} {:>4} {:>4}".format("", "prec", "rec", "F1"))
for (idx, scores) in enumerate(zip(precisions_bow, recalls_bow, f1_scores_bow)):
    print("{:>1} {:.2f} {:.2f} {:.2f}".format(
        le.inverse_transform(idx), scores[0], scores[1], scores[2]
    ))
#     print('confusion matrix:\n{}'.format( confusion) )
# print("\n\n\naverage accuracy = {}".format(sum / 50))    
    