In [2]:
import pandas as pd
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import nltk.collocations as collocations
import re
from sklearn.model_selection import KFold
from sklearn.metrics import precision_score, recall_score, f1_score
import pickle

In [3]:
# Read the tab-separated training file and keep, for every data row, only the
# phrase text and its sentiment label.
phrasedata = []
# `with` guarantees the handle is closed even if a line fails to parse.
with open('./movie_reviews/train.tsv', 'r') as f:
    for line in f:
        # skip the header row, which is the only one starting with "Phrase"
        if line.startswith('Phrase'):
            continue
        # strip the end-of-line character, then split the 4 tab-separated
        # columns: the first two are ids, the last two are phrase and sentiment
        phrasedata.append(line.strip().split('\t')[2:4])

In [4]:
import random

In [5]:
# Shuffle a copy with a fixed seed so the document order (and every K-fold
# split built on it later) is identical on each run, and so re-running this
# cell does not reshuffle data that was already shuffled.
phraselist = list(phrasedata)
random.Random(42).shuffle(phraselist)
# preview the first ten [phrase, sentiment] pairs
for phrase in phraselist[:10]:
    print(phrase)

['with the Waldo Salt Screenwriting award', '3']
['of Chan', '2']
['the border collie is funny', '3']
['quite one', '2']
['plain dull', '2']
["that 's at the center of the story", '2']
['flailing around', '1']
["with Herrmann quietly suggesting the sadness and obsession beneath Hearst 's forced avuncular chortles", '2']
['turn this fairly parochial melodrama into something really rather special .', '4']
['overly familiar', '1']


In [5]:
#%pip install nltk

In [6]:
import nltk
from nltk.corpus import stopwords
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Alec\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
# Tokenise each phrase and pair the token list with its sentiment as an int.
phrasedocs = [
    (nltk.word_tokenize(row[0]), int(row[1]))
    for row in phraselist
]

In [8]:
for phrase in phrasedocs[:10]:
    print(phrase)

(['with', 'the', 'Waldo', 'Salt', 'Screenwriting', 'award'], 3)
(['of', 'Chan'], 2)
(['the', 'border', 'collie', 'is', 'funny'], 3)
(['quite', 'one'], 2)
(['plain', 'dull'], 2)
(['that', "'s", 'at', 'the', 'center', 'of', 'the', 'story'], 2)
(['flailing', 'around'], 1)
(['with', 'Herrmann', 'quietly', 'suggesting', 'the', 'sadness', 'and', 'obsession', 'beneath', 'Hearst', "'s", 'forced', 'avuncular', 'chortles'], 2)
(['turn', 'this', 'fairly', 'parochial', 'melodrama', 'into', 'something', 'really', 'rather', 'special', '.'], 4)
(['overly', 'familiar'], 1)


In [9]:
# Lower-case every token so the word features are case-insensitive; the
# sentiment label is carried over unchanged.
docs = [
    ([token.lower() for token in entry[0]], entry[1])
    for entry in phrasedocs
]
# preview the first ten (tokens, label) pairs
for phrase in docs[:10]:
    print(phrase)

(['with', 'the', 'waldo', 'salt', 'screenwriting', 'award'], 3)
(['of', 'chan'], 2)
(['the', 'border', 'collie', 'is', 'funny'], 3)
(['quite', 'one'], 2)
(['plain', 'dull'], 2)
(['that', "'s", 'at', 'the', 'center', 'of', 'the', 'story'], 2)
(['flailing', 'around'], 1)
(['with', 'herrmann', 'quietly', 'suggesting', 'the', 'sadness', 'and', 'obsession', 'beneath', 'hearst', "'s", 'forced', 'avuncular', 'chortles'], 2)
(['turn', 'this', 'fairly', 'parochial', 'melodrama', 'into', 'something', 'really', 'rather', 'special', '.'], 4)
(['overly', 'familiar'], 1)


In [10]:
# Flatten every document's tokens into one list, count how often each token
# occurs, and report the vocabulary size.
all_words_list = []
for sent, cat in docs:
    all_words_list.extend(sent)
all_words = nltk.FreqDist(all_words_list)
print(len(all_words))

16539


In [11]:
# Keep the 1500 most frequent tokens as the unigram vocabulary; each item of
# word_items is a (word, count) pair and only the word is retained.
word_items = all_words.most_common(1500)
word_features = [item[0] for item in word_items]

In [12]:
def document_features(document, word_features):
    """Build presence features for one document.

    Args:
        document: iterable of tokens making up the document.
        word_features: iterable of vocabulary words to test for.

    Returns:
        dict mapping 'contains(<word>)' to True when <word> occurs in the
        document and False otherwise, one entry per word in word_features.
    """
    # a set makes each membership test constant-time
    present = set(document)
    return {
        'contains({})'.format(term): (term in present)
        for term in word_features
    }

In [13]:
len(docs) 

156060

In [28]:
import numpy as np
# Split docs into 100 contiguous, near-equal chunks using plain list slicing.
# Each doc is a (token_list, label) tuple with token lists of varying length,
# so np.array(docs) is a ragged conversion, which NumPy >= 1.24 rejects with a
# ValueError. Chunk sizes follow np.array_split: the first `remainder` chunks
# get one extra element.
chunk_size, remainder = divmod(len(docs), 100)
doc_chunks = []
chunk_start = 0
for chunk_index in range(100):
    chunk_stop = chunk_start + chunk_size + (1 if chunk_index < remainder else 0)
    doc_chunks.append(docs[chunk_start:chunk_stop])
    chunk_start = chunk_stop

  


In [15]:
len(doc_chunks)

100

In [35]:
doc_chunks[0][0][0]

['urgently']

In [27]:
docs[0]

(['urgently'], 2)

In [29]:
%%time
# Build the (features, label) pairs one chunk at a time; each entry of
# feature_set_chunks is the list of pairs for the matching chunk of documents.
feature_set_chunks = []
for chunk in doc_chunks:
    chunk_features = []
    for chunk_sent in chunk:
        # chunk_sent[0] is the token list, chunk_sent[1] the sentiment label
        chunk_features.append(
            (document_features(chunk_sent[0], word_features), chunk_sent[1])
        )
    feature_set_chunks.append(chunk_features)

Wall time: 2min 4s


In [36]:
len(feature_set_chunks)

100

In [38]:
feature_set_chunks[0][0]

({'contains(the)': False,
  'contains(,)': False,
  'contains(a)': False,
  'contains(of)': False,
  'contains(and)': False,
  'contains(to)': False,
  'contains(.)': False,
  "contains('s)": False,
  'contains(in)': False,
  'contains(is)': False,
  'contains(that)': False,
  'contains(it)': False,
  'contains(as)': False,
  'contains(with)': False,
  'contains(for)': False,
  'contains(its)': False,
  'contains(film)': False,
  'contains(an)': False,
  'contains(movie)': False,
  'contains(this)': False,
  'contains(but)': False,
  'contains(be)': False,
  'contains(you)': False,
  'contains(on)': False,
  "contains(n't)": False,
  'contains(by)': False,
  'contains(more)': False,
  'contains(his)': False,
  'contains(about)': False,
  'contains(one)': False,
  'contains(``)': False,
  'contains(not)': False,
  'contains(at)': False,
  'contains(or)': False,
  'contains(from)': False,
  'contains(than)': False,
  'contains(--)': False,
  'contains(all)': False,
  'contains(have)': Fa

In [39]:
import itertools
# Check to see if we get all document features from the original docs variable
len(list(itertools.chain.from_iterable(feature_set_chunks)))

156060

In [30]:
%%time
import itertools
featuresets = list(itertools.chain.from_iterable(feature_set_chunks))

Wall time: 12.8 s


In [31]:
# Release the chunked documents if they exist; otherwise report the missing
# name with the same text a NameError would carry.
if "doc_chunks" in globals():
    del doc_chunks
else:
    print("name 'doc_chunks' is not defined")

In [32]:
# Release large intermediates that are no longer needed. Each name is handled
# on its own: with a single try-block around several `del` statements, the
# first missing name aborts the block and the remaining names are never
# deleted.
for name_to_drop in ("feature_set_chunks", "all_words_list", "phrasedata", "phrasedocs"):
    if name_to_drop in globals():
        del globals()[name_to_drop]
    else:
        # same message a NameError from `del` would have printed
        print("name '{}' is not defined".format(name_to_drop))

name 'all_words_list' is not defined


In [14]:
# featuresets = [(document_features(d, word_features), c) for (d, c) in docs]

MemoryError: 

In [42]:
len(featuresets)

156060

In [43]:
featuresets[0]

({'contains(the)': False,
  'contains(,)': False,
  'contains(a)': False,
  'contains(of)': False,
  'contains(and)': False,
  'contains(to)': False,
  'contains(.)': False,
  "contains('s)": False,
  'contains(in)': False,
  'contains(is)': False,
  'contains(that)': False,
  'contains(it)': False,
  'contains(as)': False,
  'contains(with)': False,
  'contains(for)': False,
  'contains(its)': False,
  'contains(film)': False,
  'contains(an)': False,
  'contains(movie)': False,
  'contains(this)': False,
  'contains(but)': False,
  'contains(be)': False,
  'contains(you)': False,
  'contains(on)': False,
  "contains(n't)": False,
  'contains(by)': False,
  'contains(more)': False,
  'contains(his)': False,
  'contains(about)': False,
  'contains(one)': False,
  'contains(``)': False,
  'contains(not)': False,
  'contains(at)': False,
  'contains(or)': False,
  'contains(from)': False,
  'contains(than)': False,
  'contains(--)': False,
  'contains(all)': False,
  'contains(have)': Fa

In [33]:
import numpy as np
from sklearn.model_selection import KFold

In [34]:
def kfold_naive_bayes(feature_sets, splits=5):
    """Cross-validate an NLTK Naive Bayes classifier with K-fold splits.

    Args:
        feature_sets: indexable sequence of (feature_dict, label) pairs.
        splits: number of folds passed to KFold as n_splits.

    Returns:
        (classifier, scores): `classifier` is the model trained on the LAST
        fold's training portion only, and `scores` is the list of per-fold
        accuracies measured on each held-out portion.
    """
    kf = KFold(n_splits=splits)
    scores = []

    for train_idx, test_idx in kf.split(feature_sets):
        # Select the fold's rows straight from the sequence. This avoids
        # converting the whole feature list to a NumPy array twice per fold.
        train_data = [feature_sets[i] for i in train_idx]
        test_data = [feature_sets[i] for i in test_idx]

        classifier = nltk.NaiveBayesClassifier.train(train_data)
        scores.append(nltk.classify.accuracy(classifier, test_data))

    return classifier, scores

In [35]:
def score_model(true_labels, predictions):
    """Print and return macro-averaged precision, recall and F1.

    Args:
        true_labels: gold labels, one per sample.
        predictions: predicted labels, aligned with true_labels.

    Returns:
        (precision, recall, f_measure), each rounded to 4 decimal places.
    """
    # Built-in round() works whether the metric comes back as a NumPy scalar
    # or as a plain Python float; the `.round` method exists only on the former.
    precision = round(precision_score(true_labels, predictions, average='macro'), 4)
    recall = round(recall_score(true_labels, predictions, average='macro'), 4)
    f_measure = round(f1_score(true_labels, predictions, average='macro'), 4)
    print("Precision: ", precision)
    print("Recall: ", recall)
    print("F Measure: ", f_measure)
    return precision, recall, f_measure

In [52]:
%%time
uni_5fold_results = kfold_naive_bayes(featuresets, splits=5)

Wall time: 1h 5min 22s


In [1]:
np.mean(uni_5fold_results[1])

NameError: name 'np' is not defined

In [39]:
import sys

# List every public name in the namespace with its size in MB, and keep the
# (name, size_mb) pairs in var_mems for later inspection.
# Note: sys.getsizeof is shallow. For a container it reports the container
# object itself, not the objects it refers to, so these figures understate the
# real footprint of lists of dicts and similar structures.
var_mems = []
# snapshot first: the loop below binds new names while it runs
local_vars = list(locals().items())
for var, obj in local_vars:
    if var.startswith("_"):
        continue
    size_mb = round(sys.getsizeof(obj) / 1e6, 4)
    print(var, size_mb)
    var_mems.append((var, size_mb))

In 0.0004
Out 0.0002
get_ipython 0.0001
exit 0.0001
quit 0.0001
pd 0.0001
nltk 0.0001
plt 0.0001
sns 0.0001
np 0.0001
collocations 0.0001
re 0.0001
KFold 0.0011
precision_score 0.0001
recall_score 0.0001
f1_score 0.0001
pickle 0.0001
f 0.0002
line 0.0001
random 0.0001
phraselist 1.3208
phrase 0.0001
stopwords 0.0001
tokens 0.0001
docs 1.3208
lowerphrase 0.0001
all_words 0.5899
word_items 0.013
word_features 0.013
document_features 0.0001
chunk 0.0001
kfold_naive_bayes 0.0001
score_model 0.0001
uni_5_fold_model 0.0001
preds 1.3208
trues 1.3208
itertools 0.0001
featuresets 1.2529
uni_5fold_preds_labs 0.0001
pred 0.0
true 0.0
uni_5fold_scores 0.0001
sys 0.0001
var_mems 0.0001


In [53]:
with open("./models/uni_model_5fold.pickle", 'wb') as f:
    pickle.dump(uni_5fold_results[0], f)

In [36]:
# Load the saved unigram model. The context manager closes the file handle,
# which a bare open() passed to pickle.load never does.
# NOTE(review): pickle.load can execute arbitrary code; only load model files
# that were produced by this notebook.
with open("./models/uni_model_5fold.pickle", 'rb') as model_file:
    uni_5_fold_model = pickle.load(model_file)

In [37]:
%%time
uni_5fold_preds_labs = [(uni_5_fold_model.classify(features), label) for features, label in featuresets] 

Wall time: 33min 10s


In [38]:
# Separate the (prediction, label) pairs into two parallel lists.
trues, preds = [], []
for pred, true in uni_5fold_preds_labs:
    trues.append(true)
    preds.append(pred)

# From here on the name holds [true_labels, predictions] instead of pairs.
uni_5fold_preds_labs = [trues, preds]
uni_5fold_scores = score_model(trues, preds)

Precision:  0.4479
Recall:  0.4114
F Measure:  0.4173


In [45]:
# kf = KFold(n_splits = 5)
# sum = 0

# for train, test in kf.split(featuresets):
#     train_data = np.array(featuresets)[train]
#     test_data = np.array(featuresets)[test]
#     classifier = nltk.NaiveBayesClassifier.train(train_data)
#     sum += nltk.classify.accuracy(classifier, test_data)

# #storing the score in a variable 
# acc1 = sum/5
# print(acc1)

KeyboardInterrupt: 

In [None]:
from nltk.collocations import *
import re

#data cleaning and preprocessing
stopwords = nltk.corpus.stopwords.words('english')

def alpha(w):
    """Return True when `w` is non-empty and has no lowercase a-z letter.

    Used as a collocation word filter, where True means "drop this token".
    """
    return re.match('^[^a-z]+$', w) is not None

#creating bigram features for the corpus and applying cleaning steps
bigram_measures = nltk.collocations.BigramAssocMeasures()
# Feed the finder the tokenised documents themselves. `all_words` is a
# FreqDist, so iterating it yields each distinct word once and would pair up
# vocabulary entries rather than words that are adjacent in the text.
# from_documents also keeps a bigram from spanning two different phrases.
finder = BigramCollocationFinder.from_documents([sent for (sent, cat) in docs])
# drop bigrams containing a token with no lowercase letter (punctuation, digits)
finder.apply_word_filter(alpha)
# drop bigrams containing a stopword; a set gives constant-time lookups
stopword_set = set(stopwords)
finder.apply_word_filter(lambda w: w in stopword_set)
# rank the surviving bigrams by raw relative frequency
scored = finder.score_ngrams(bigram_measures.raw_freq)
scored[:10]

In [None]:
from nltk.collocations import *
import re

#data cleaning and preprocessing
stopwords = nltk.corpus.stopwords.words('english')

def alpha(w):
    """Tell whether `w` is a non-empty token without any lowercase a-z letter.

    Intended as a collocation word filter: a True result removes the token.
    """
    non_alpha = re.compile('^[^a-z]+$')
    return bool(non_alpha.match(w))

#creating bigram features for the corpus and applying cleaning steps
# NOTE(review): this cell repeats the previous cell verbatim; one of the two
# can be removed.
bigram_measures = nltk.collocations.BigramAssocMeasures()
# Feed the finder the tokenised documents themselves. `all_words` is a
# FreqDist, so iterating it yields each distinct word once and would pair up
# vocabulary entries rather than words that are adjacent in the text.
# from_documents also keeps a bigram from spanning two different phrases.
finder = BigramCollocationFinder.from_documents([sent for (sent, cat) in docs])
# drop bigrams containing a token with no lowercase letter (punctuation, digits)
finder.apply_word_filter(alpha)
# drop bigrams containing a stopword; a set gives constant-time lookups
stopword_set = set(stopwords)
finder.apply_word_filter(lambda w: w in stopword_set)
# rank the surviving bigrams by raw relative frequency
scored = finder.score_ngrams(bigram_measures.raw_freq)
scored[:10]

In [None]:
#extracting clean bigrams (no frequency information)
bigram_features = [bigram for (bigram, count) in scored[:2000]]
#printing the first 30 for confirmation
bigram_features[:30]

In [None]:
def bi_document_features(document, bigram_features):
    """Build bigram presence features for one document.

    Args:
        document: iterable of tokens making up the document.
        bigram_features: iterable of (word1, word2) tuples to test for.

    Returns:
        dict mapping 'contains(<bigram>)' to True when the two words occur
        consecutively in the document and False otherwise.
    """
    tokens = list(document)
    # Consecutive token pairs, held in a set so each lookup below is
    # constant-time instead of a scan over the whole document.
    document_bigrams = set(zip(tokens, tokens[1:]))
    features = {}
    for bigram in bigram_features:
        # True if the bigram occurs in the document, False if not
        features['contains({})'.format(bigram)] = (bigram in document_bigrams)
    return features

In [None]:
#applying the function to our documents
featuresets2 = [(bi_document_features(d, bigram_features), c) for (d, c) in docs]

#seeing the featureset for the first document
featuresets2[0]

In [None]:
#checking the length of the featureset
#it should be the same as for the unigram features, because we processed the same number of documents
#this is for verification
len(featuresets2)

In [None]:
#now we run the classifier to see if we get a good accuracy score
#all this is just to find out which feature is more informative to predict sentiment
#Naive Bayes classifier with 5-fold cross validation for training on sentiments using bigram features

# Reuse the shared cross-validation helper instead of repeating its loop; this
# also avoids rebinding the built-in `sum` as an accumulator.
classifier2, bigram_fold_scores = kfold_naive_bayes(featuresets2, splits=5)

# mean accuracy over the folds
acc2 = float(np.mean(bigram_fold_scores))
print(acc2)

In [None]:
def POS_features(document, word_features):
    """Build word-presence features plus part-of-speech counts for a document.

    The document is tagged with nltk.pos_tag, i.e. whatever tagger the
    installed NLTK uses by default (NOTE(review): an earlier comment called
    this the Stanford tagger; confirm for your install). The prefixes below
    assume Penn Treebank style tags.

    Args:
        document: list of tokens making up the document.
        word_features: iterable of vocabulary words to test for.

    Returns:
        dict with one 'contains(<word>)' boolean per vocabulary word, followed
        by integer counts under 'nouns', 'verbs', 'adjectives' and 'adverbs'.
    """
    # same presence features as the unigram model
    features = document_features(document, word_features)

    # Tag prefix -> feature name. Adverbs use 'RB' rather than 'R' so that the
    # particle tag 'RP' is not counted as an adverb.
    prefix_to_feature = (
        ('N', 'nouns'),
        ('V', 'verbs'),
        ('J', 'adjectives'),
        ('RB', 'adverbs'),
    )
    for _, feature_name in prefix_to_feature:
        features[feature_name] = 0

    for (word, tag) in nltk.pos_tag(document):
        for prefix, feature_name in prefix_to_feature:
            if tag.startswith(prefix):
                features[feature_name] += 1
                break  # the prefixes are mutually exclusive
    return features

In [None]:
POS_featuresets = [(POS_features(d, word_features), c) for (d, c) in docs]
print(len(POS_featuresets[0][0].keys()))

In [None]:
#now we run the classifier to see if we get a good accuracy score
#all this is just to find out which feature is more informative to predict sentiment
#Naive Bayes classifier with 5-fold cross validation for training on sentiments using unigram + POS-count features

# Reuse the shared cross-validation helper instead of repeating its loop; this
# also avoids rebinding the built-in `sum` as an accumulator.
classifier3, pos_fold_scores = kfold_naive_bayes(POS_featuresets, splits=5)

# mean accuracy over the folds
acc3 = float(np.mean(pos_fold_scores))
acc3