In [1]:
import random
import string
import nltk
from nltk import re
import en_core_web_sm
from nltk.corpus import movie_reviews
from nltk.stem import WordNetLemmatizer
from spacy.lang.en.stop_words import STOP_WORDS

In [2]:
stemmer = WordNetLemmatizer()
stop_words = STOP_WORDS
digits = string.digits
verbose=0

In [3]:
def clean_word(raw_word,all_words):
    word=raw_word.lower()
    word=stemmer.lemmatize(word)
    if len(word)<=2 or any(map(str.isdigit, word)) or word in stop_words: 
        return None
    else:
        all_words.append(word)
        return word

In [4]:
def clean_review(review,all_words):
    for w in review[:]:
        word_res=clean_word(w,all_words)
        if word_res!=None:
            review[review.index(w)]=word_res
        else:
            review.remove(w)
    return review

In [5]:
def create_documents():
    documents=[]
    all_words=[]
    for category in movie_reviews.categories():
        for fileid in movie_reviews.fileids(category):
            review=list(movie_reviews.words(fileid))
            review=clean_review(review,all_words)
            documents.append((review, category))
    if verbose==1:
        print (len(documents))
        print(documents[1])
    return documents,all_words

# In[17]:

In [6]:
def get_most_common_words(feat_num,all_words):
    all_words = nltk.FreqDist(all_words)
    
    most_common_words=all_words.most_common(feat_num)
    if verbose==1:
        print(len(all_words))
        print(*most_common_words[:100], sep='\n')
        print(len(most_common_words))
    return most_common_words

In [7]:
def feature_selector(most_common_words):
    nlp = en_core_web_sm.load()
    word_features=list()
    for word in most_common_words:
        docs = nlp(word[0])
        if docs[0].pos_=="ADJ" or docs[0].pos_=="VERB" or docs[0].pos_=="ADV":
            word_features.append(docs[0].text)
    return word_features


In [8]:
def get_documents_feature_sets(documents,word_features):
    balanced_documents=[]
    for i in range(0,int(len(documents)/2)):
            balanced_documents.append(documents[i])
            balanced_documents.append(documents[i+int(len(documents)/2)])
    featuresets = [(find_features(rev,documents,word_features), category) for (rev, category) in balanced_documents]
    random.shuffle(featuresets)
    if verbose==1:
        print (len(featuresets[1][0].keys()))
        balanced_documents[:2]
    return featuresets

In [9]:
def find_features(document,documents,word_features):
    words = set(document)
    features_prob = {}
    for w in word_features:
        word_freq = document.count(w) / len(documents)
        features_prob[w] = word_freq  ## compute frequency
    return features_prob

In [10]:
def calculate_metrics(cm):
    TP=cm._confusion[1][1]
    FP=cm._confusion[1][0]
    TN=cm._confusion[0][0]
    FN=cm._confusion[0][1]

    Recall=TP/(TP+FN)
    Precision=TP/(TP+FP)
    accuracy=(TP+TN)/(TP+TN+FP+FN)
    F1score=2*(Recall * Precision) / (Recall + Precision)
    if verbose==1:
        print("Recall is - ",Recall)
        print("Precision is - ",Precision)
        print("accuracy is - ",accuracy)
        print("F1score is - ",F1score)
    return Recall,Precision,accuracy,F1score

In [11]:
def run_Naive_Bayes_classification_system(feat_num,all_words):
    most_common_words=get_most_common_words(feat_num,all_words)
    word_features=feature_selector(most_common_words)
    if verbose==1:
        print (word_features[:100])
        print(len(word_features))
    featuresets=get_documents_feature_sets(documents,word_features)
    if verbose==1:
        print(len(featuresets))

    # set that we'll train our classifier with
    separator=int(len(featuresets)*0.9)
    training_set = featuresets[:separator]
    # set that we'll test against.
    testing_set = featuresets[separator:]

    # In[18]:
    classifier = nltk.NaiveBayesClassifier.train(training_set)
    testing_set_content=[i[0] for i in testing_set]
    golden_label=[i[1] for i in testing_set]
    tested_label=classifier.classify_many(testing_set_content)
    cm = nltk.ConfusionMatrix(golden_label, tested_label)
    if verbose==1:
        print (cm)
        print("Classifier accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100)
        classifier.show_most_informative_features(50)

    Recall,Precision,accuracy,F1score=calculate_metrics(cm)
    all_words=[]
    return [Recall,Precision,accuracy,F1score]

In [12]:
def countAverageResults(results):
    Av_Recall=0
    Av_Precision=0
    Av_accuracy=0
    Av_F1score=0
    for result in results:
        Av_Recall+=result[0]/len(results)
        Av_Precision+=result[1]/len(results)
        Av_accuracy+=result[2]/len(results)
        Av_F1score+=result[3]/len(results)
    return {"Average Recall":"{:.3%}".format(Av_Recall), "Average Precision":"{:.3%}".format(Av_Precision), "Average accuracy":"{:.3%}".format(Av_accuracy), "Average F1 score":"{:.3%}".format(Av_F1score)}

In [13]:
def test_classification_system(tests, all_words,tests_num=3):
    for t in tests:
        i=0
        av_results=[]
        while i<tests_num:
            av_results.append(run_Naive_Bayes_classification_system(t,all_words))    
            i=i+1
        print({str(t)+" movie reviews":countAverageResults(av_results)})
           
        

In [14]:
documents,all_w = create_documents()
all_words=all_w
test_classification_system([1000,2000,4000,8000,10000,15000,20000],all_words)

{'1000 movie reviews': {'Average Recall': '81.010%', 'Average Precision': '75.068%', 'Average accuracy': '78.500%', 'Average F1 score': '77.851%'}}
{'2000 movie reviews': {'Average Recall': '86.071%', 'Average Precision': '68.037%', 'Average accuracy': '77.333%', 'Average F1 score': '75.992%'}}
{'4000 movie reviews': {'Average Recall': '83.851%', 'Average Precision': '71.870%', 'Average accuracy': '79.667%', 'Average F1 score': '77.252%'}}
{'8000 movie reviews': {'Average Recall': '83.138%', 'Average Precision': '74.506%', 'Average accuracy': '79.500%', 'Average F1 score': '78.413%'}}
{'10000 movie reviews': {'Average Recall': '83.999%', 'Average Precision': '72.730%', 'Average accuracy': '79.167%', 'Average F1 score': '77.931%'}}
{'15000 movie reviews': {'Average Recall': '83.498%', 'Average Precision': '72.222%', 'Average accuracy': '80.667%', 'Average F1 score': '77.436%'}}
{'20000 movie reviews': {'Average Recall': '81.820%', 'Average Precision': '77.455%', 'Average accuracy': '81.