In [1]:
import random
import string
import nltk
from nltk import re
import en_core_web_sm
from nltk.corpus import movie_reviews
from nltk.stem import WordNetLemmatizer
from spacy.lang.en.stop_words import STOP_WORDS

In [2]:
stemmer = WordNetLemmatizer()
stop_words = STOP_WORDS
digits = string.digits
global verbose #if the value is 1 - we output all information, if it is 0 - we output only results of average metrics` values
verbose=1

In [3]:
def clean_word(raw_word,all_words):
    word=raw_word.lower()
    word=stemmer.lemmatize(word)
    if len(word)<=2 or any(map(str.isdigit, word)) or word in stop_words: 
        return None
    else:
        all_words.append(word)
        return word

In [4]:
def clean_review(review,all_words):
    for w in review[:]:
        word_res=clean_word(w,all_words)
        if word_res!=None:
            review[review.index(w)]=word_res
        else:
            review.remove(w)
    return review

In [5]:
def create_documents():
    documents=[]
    all_words=[]
    for category in movie_reviews.categories():
        for fileid in movie_reviews.fileids(category):
            review=list(movie_reviews.words(fileid))
            review=clean_review(review,all_words)
            documents.append((review, category))
    if verbose==1:
        print (len(documents))
        print(documents[1])
    return documents,all_words

# In[17]:

In [6]:
def get_most_common_words(word_features_number,all_words):
    all_words = nltk.FreqDist(all_words)
    
    most_common_words=all_words.most_common(word_features_number)
    if verbose==1:
        print(len(all_words))
        print(*most_common_words[:100], sep='\n')
        print(len(most_common_words))
    return most_common_words

In [7]:
def feature_selector(most_common_words):
    nlp = en_core_web_sm.load()
    word_features=list()
    tags=["ADJ","ADP","ADV","AUX","CONJ","CCONJ","DET","INTJ","NOUN","NUM","PART","PRON","PROPN","PUNCT","SCONJ","SYM","VERB","X","SPACE"]
    for word in most_common_words:
        docs = nlp(word[0])
        if docs[0].pos_=="ADJ" or docs[0].pos_=="VERB" or docs[0].pos_=="ADV":
            word_features.append(docs[0].text)
#         else:
#             if docs[0].pos_ not in tags:
#                 print(docs[0].text,docs[0].pos_)
    return word_features


In [8]:
def get_documents_feature_sets(documents,word_features):
    balanced_documents=[]
    for i in range(0,int(len(documents)/2)):
            balanced_documents.append(documents[i])
            balanced_documents.append(documents[i+int(len(documents)/2)])
    featuresets = [(find_features(rev,documents,word_features), category) for (rev, category) in balanced_documents]
    random.shuffle(featuresets)
    if verbose==1:
        print (len(featuresets[1][0].keys()))
        balanced_documents[:2]
    return featuresets


In [9]:
def find_features(document,documents,word_features):
    words = set(document)
    features_prob = {}
    for w in word_features:
        word_freq = document.count(w) / len(documents)
        features_prob[w] = word_freq  ## compute frequency
    return features_prob

In [10]:
def calculate_metrics(cm):
    TP=cm._confusion[1][1]
    FP=cm._confusion[1][0]
    TN=cm._confusion[0][0]
    FN=cm._confusion[0][1]

    Recall=TP/(TP+FN)
    Precision=TP/(TP+FP)
    accuracy=(TP+TN)/(TP+TN+FP+FN)
    F1score=2*(Recall * Precision) / (Recall + Precision)
    if verbose==1:
        print("Recall is - ",Recall)
        print("Precision is - ",Precision)
        print("accuracy is - ",accuracy)
        print("F1score is - ",F1score)
    return Recall,Precision,accuracy,F1score

In [11]:
def run_Naive_Bayes_classification_system(word_features_number,all_words):
    most_common_words=get_most_common_words(word_features_number,all_words)
    word_features=feature_selector(most_common_words)
    if verbose==1:
        print (word_features[:100])
        print(len(word_features))
    feature_sets=get_documents_feature_sets(documents,word_features)
    if verbose==1:
        print(len(feature_sets))

    # set that we'll train our classifier with
    training_set_separator=int(len(feature_sets)*0.9)
    training_set = feature_sets[:training_set_separator]
    # set that we'll test against.
    testing_set = feature_sets[training_set_separator:]

    # In[18]:
    classifier = nltk.NaiveBayesClassifier.train(training_set)
    testing_set_content=[i[0] for i in testing_set]
    golden_label=[i[1] for i in testing_set]
    tested_label=classifier.classify_many(testing_set_content)
    cm = nltk.ConfusionMatrix(golden_label, tested_label)
    if verbose==1:
        print (cm)
        print("Classifier accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100)
        classifier.show_most_informative_features(50)

    recall,precision,accuracy,F1score=calculate_metrics(cm)
    all_words=[]
    return [recall,precision,accuracy,F1score]

In [12]:
def count_average_results(results):
    average_recall=0
    average_precision=0
    average_accuracy=0
    average_F1score=0
    for result in results:
        average_recall+=result[0]/len(results)
        average_precision+=result[1]/len(results)
        average_accuracy+=result[2]/len(results)
        average_F1score+=result[3]/len(results)
    return {"Average recall":"{:.3%}".format(average_recall), "Average Precision":"{:.3%}".format(average_precision), "Average accuracy":"{:.3%}".format(average_accuracy), "Average F1 score":"{:.3%}".format(average_F1score)}

In [20]:
def test_classification_system(tests, all_words,tests_number=5):
    for t in tests:
        i=0
        av_results=[]
        while i<tests_number:
            av_results.append(run_Naive_Bayes_classification_system(t,all_words))    
            i=i+1
        print({str(t)+" top-frequent words":count_average_results(av_results), "Tests quantity":tests_number})
           
        

In [14]:
documents,words = create_documents()
all_words=words

2000
(['happy', 'bastard', 'quick', 'movie', 'review', 'damn', 'bug', 'got', 'head', 'start', 'movie', 'starring', 'jamie', 'lee', 'curtis', 'baldwin', 'brother', 'william', 'time', 'story', 'crew', 'tugboat', 'come', 'deserted', 'russian', 'tech', 'ship', 'strangeness', 'kick', 'power', 'little', 'know', 'power', 'going', 'gore', 'bringing', 'action', 'sequence', 'virus', 'feel', 'like', 'movie', 'going', 'flash', 'substance', 'don', 'know', 'crew', 'middle', 'don', 'know', 'origin', 'took', 'ship', 'big', 'pink', 'flashy', 'thing', 'hit', 'mir', 'course', 'don', 'know', 'donald', 'sutherland', 'stumbling', 'drunkenly', 'hey', 'let', 'chase', 'people', 'robot', 'acting', 'average', 'like', 'curtis', 'likely', 'kick', 'work', 'halloween', 'sutherland', 'wasted', 'baldwin', 'acting', 'like', 'baldwin', 'course', 'real', 'star', 'stan', 'winston', 'robot', 'design', 'schnazzy', 'cgi', 'occasional', 'good', 'gore', 'shot', 'like', 'picking', 'brain', 'robot', 'body', 'turn', 'movie', 'pre

In [15]:
test_classification_system([3000],all_words,tests_number=1)

34261
('film', 11053)
('movie', 6977)
('character', 3879)
('like', 3789)
('time', 2979)
('scene', 2671)
('good', 2429)
('story', 2345)
('life', 1913)
('way', 1882)
('year', 1732)
('thing', 1661)
('doe', 1578)
('plot', 1574)
('come', 1510)
('little', 1505)
('know', 1494)
('people', 1470)
('man', 1404)
('bad', 1395)
('work', 1379)
('director', 1347)
('best', 1334)
('end', 1328)
('performance', 1317)
('don', 1304)
('new', 1292)
('look', 1278)
('doesn', 1277)
('action', 1260)
('actor', 1252)
('love', 1209)
('play', 1205)
('star', 1160)
('role', 1155)
('great', 1150)
('find', 1119)
('audience', 1079)
('big', 1064)
('world', 1061)
('want', 1037)
('day', 1024)
('think', 986)
('guy', 932)
('comedy', 928)
('better', 926)
('real', 915)
('seen', 910)
('going', 901)
('old', 887)
('isn', 871)
('fact', 853)
('set', 851)
('point', 848)
('funny', 840)
('actually', 837)
('long', 836)
('right', 834)
('minute', 831)
('woman', 831)
('effect', 822)
('lot', 814)
('script', 810)
('friend', 805)
('john', 802)

Experiments with different amount of top-frequent words for feature selection and training the Na—óve Bayes Classifier

In [21]:
verbose=0
test_classification_system([1000,2000,3000,4000,8000,10000,12000, 15000, 18000, 20000],all_words)

{'1000 top-frequent words': {'Average recall': '78.667%', 'Average Precision': '68.316%', 'Average accuracy': '75.400%', 'Average F1 score': '73.054%'}, 'Tests quantity': 5}
{'2000 top-frequent words': {'Average recall': '79.350%', 'Average Precision': '76.105%', 'Average accuracy': '78.900%', 'Average F1 score': '77.578%'}, 'Tests quantity': 5}
{'3000 top-frequent words': {'Average recall': '85.685%', 'Average Precision': '72.864%', 'Average accuracy': '79.700%', 'Average F1 score': '78.690%'}, 'Tests quantity': 5}
{'4000 top-frequent words': {'Average recall': '79.783%', 'Average Precision': '73.343%', 'Average accuracy': '77.500%', 'Average F1 score': '76.409%'}, 'Tests quantity': 5}
{'8000 top-frequent words': {'Average recall': '84.901%', 'Average Precision': '74.389%', 'Average accuracy': '80.800%', 'Average F1 score': '79.277%'}, 'Tests quantity': 5}
{'10000 top-frequent words': {'Average recall': '83.237%', 'Average Precision': '72.804%', 'Average accuracy': '78.700%', 'Average