In [1]:
import random
from collections import Counter
from pathlib import Path
from pprint import pprint
import pickle

import nltk
# Fetch the NLTK resources requested here; raise_on_error=True makes a failed
# download stop the notebook instead of failing later in a less obvious place.
nltk.download('punkt', quiet=True, raise_on_error=True)
nltk.download('stopwords', quiet=True, raise_on_error=True)
# The stop words are only ever used for membership tests, so keep them in a
# frozenset (constant-time lookup, immutable) rather than converting back to a list.
nltk_stop_words = frozenset(nltk.corpus.stopwords.words('english'))
nltk_porter_stemmer = nltk.stem.PorterStemmer()

# Files -- every location is derived from the notebook's working directory,
# whose grandparent is taken as the project root.
ROOT_FOLDERPATH = Path.cwd().parent.parent
DATASET_FOLDERPATH = ROOT_FOLDERPATH.joinpath('data', 'raw')
NEG_DATASET_FILEPATH = DATASET_FOLDERPATH.joinpath('rt-polarity.neg')
POS_DATASET_FILEPATH = DATASET_FOLDERPATH.joinpath('rt-polarity.pos')
MODEL_FILEPATH = ROOT_FOLDERPATH.joinpath('model', 'poc-01--bag-of-words--naive-bayes.pickle')

# Labels -- class names attached to each review in the labelled dataset.
POS, NEG = 'POS', 'NEG'

## Load the data
##### Overview of the file format

In [2]:
# Peek at the first few raw lines of the negative-review file.
linecount = 5
with NEG_DATASET_FILEPATH.open() as dataset_file:
    # zip() stops as soon as either side is exhausted, so a file with fewer than
    # `linecount` lines yields what it has instead of raising StopIteration.
    # range() comes first so no extra line is consumed from the file.
    head = [line for _index, line in zip(range(linecount), dataset_file)]

# Each line keeps its trailing newline, so sep='\n' leaves a blank line between entries.
print(*head, sep='\n')

simplistic , silly and tedious . 

it's so laddish and juvenile , only teenage boys could possibly find it funny . 

exploitative and largely devoid of the depth or sophistication that would make watching such a graphic treatment of the crimes bearable . 

[garbus] discards the potential for pathological study , exhuming instead , the skewed melodrama of the circumstantial situation . 

a visually flashy but narratively opaque and emotionally vapid exercise in style and mystification . 



##### Load the dataset to memory

In [3]:
# Read both polarity files fully into memory: one list entry per raw line,
# trailing newline included.
# NOTE(review): the files are opened with the platform default encoding -- confirm
# it matches the encoding of the dataset files.
with NEG_DATASET_FILEPATH.open() as dataset_file:
    corpus_neg = list(dataset_file)

with POS_DATASET_FILEPATH.open() as dataset_file:
    corpus_pos = list(dataset_file)

pprint(corpus_neg[:5])

['simplistic , silly and tedious . \n',
 "it's so laddish and juvenile , only teenage boys could possibly find it "
 'funny . \n',
 'exploitative and largely devoid of the depth or sophistication that would '
 'make watching such a graphic treatment of the crimes bearable . \n',
 '[garbus] discards the potential for pathological study , exhuming instead , '
 'the skewed melodrama of the circumstantial situation . \n',
 'a visually flashy but narratively opaque and emotionally vapid exercise in '
 'style and mystification . \n']


## Pre-process the data
##### Pre-process dataset for the Bag-of-Words approach

In [4]:
def preprocess_line_to_tokens(line):
    """Turn one raw text line into a list of normalised, stemmed tokens.

    Steps, in order:
      1. tokenise the line with ``nltk.word_tokenize``;
      2. keep purely alphabetic tokens (drops punctuation, numbers, "'s", ...);
      3. lowercase the surviving tokens;
      4. drop tokens found in ``nltk_stop_words``;
      5. stem what is left with ``nltk_porter_stemmer``.

    Lowercasing happens *before* the stop-word check so that capitalised stop
    words in free-form input (e.g. "This" in "This movie was boring.") are
    removed just like their lowercase forms. For input that is already
    lowercase the result is unchanged.

    Args:
        line: A single line of text (str).

    Returns:
        list[str]: The stemmed tokens, in their original order.
    """
    tokens = nltk.word_tokenize(line)
    tokens = (token.lower() for token in tokens if token.isalpha())
    tokens = (token for token in tokens if token not in nltk_stop_words)
    tokens = (nltk_porter_stemmer.stem(token) for token in tokens)
    return list(tokens)

def preprocess_corpus(corpus):
    """Pre-process every line of ``corpus`` into its token list.

    Args:
        corpus: Iterable of raw text lines.

    Returns:
        list: One entry per input line, each being the result of
        ``preprocess_line_to_tokens`` for that line, in input order.
    """
    tokenized_lines = []
    for raw_line in corpus:
        tokenized_lines.append(preprocess_line_to_tokens(raw_line))
    return tokenized_lines

# Tokenise both corpora (negative first, then positive); each result is a list
# of token lists, one per review line.
tokens_dataset_neg, tokens_dataset_pos = map(preprocess_corpus, (corpus_neg, corpus_pos))

pprint(tokens_dataset_neg[:5])

[['simplist', 'silli', 'tediou'],
 ['laddish', 'juvenil', 'teenag', 'boy', 'could', 'possibl', 'find', 'funni'],
 ['exploit',
  'larg',
  'devoid',
  'depth',
  'sophist',
  'would',
  'make',
  'watch',
  'graphic',
  'treatment',
  'crime',
  'bearabl'],
 ['garbu',
  'discard',
  'potenti',
  'patholog',
  'studi',
  'exhum',
  'instead',
  'skew',
  'melodrama',
  'circumstanti',
  'situat'],
 ['visual',
  'flashi',
  'narr',
  'opaqu',
  'emot',
  'vapid',
  'exercis',
  'style',
  'mystif']]


##### Select the tokens used as features in the Bag-of-Words

In [5]:
# Vocabulary size: how many of the most frequent tokens are kept as features.
feature_tokens_count = 3_000
def get_all_tokens_from_dataset(tokens_dataset):
    """Flatten a list of token lists into one flat list of tokens.

    Args:
        tokens_dataset: Iterable of token lists (one per line).

    Returns:
        list: Every token of every line, in order of appearance
        (duplicates are kept).
    """
    flattened = []
    for line_tokens in tokens_dataset:
        flattened.extend(line_tokens)
    return flattened
# Pool the tokens of both classes and keep the most frequent ones as the
# bag-of-words vocabulary (ordered from most to least frequent).
# NOTE(review): frequencies are counted over the full corpus, i.e. before the
# train/test split made later in the notebook -- confirm this is intended.
tokens_neg = get_all_tokens_from_dataset(tokens_dataset_neg)
tokens_pos = get_all_tokens_from_dataset(tokens_dataset_pos)
all_tokens = [*tokens_neg, *tokens_pos]
token_frequencies = Counter(all_tokens)
feature_tokens = [
    token for token, _count in token_frequencies.most_common(feature_tokens_count)
]

print(feature_tokens[:10])

['film', 'movi', 'like', 'one', 'make', 'stori', 'charact', 'time', 'comedi', 'good']


##### Prepare the dataset to make it suitable for training a model

In [6]:
def tokens_to_model_input(tokens, vocabulary=None):
    """Convert a token list into the feature dict fed to the classifier.

    Every distinct token in ``tokens`` becomes a key; its value is ``True``
    when the token belongs to the vocabulary and ``False`` otherwise.

    Args:
        tokens: Iterable of (pre-processed) tokens for one line.
        vocabulary: Optional container of feature tokens to test membership
            against. Defaults to the notebook-level ``feature_tokens``, which
            keeps existing single-argument calls unchanged. Passing it
            explicitly lets the function be used with a vocabulary loaded
            from elsewhere (and a set makes each lookup constant-time).

    Returns:
        dict: Mapping ``token -> bool``.
    """
    if vocabulary is None:
        # Resolved at call time so the default always follows the current global.
        vocabulary = feature_tokens
    return {token: (token in vocabulary) for token in tokens}

# Label every tokenised review: positives first, then negatives. The order
# matters because it determines the result of the seeded shuffle below.
labelled_pos = [(tokens_to_model_input(tokens), POS) for tokens in tokens_dataset_pos]
labelled_neg = [(tokens_to_model_input(tokens), NEG) for tokens in tokens_dataset_neg]
dataset = labelled_pos + labelled_neg

# Fixed seed so the shuffle, and therefore the split, is reproducible.
random.seed(834)
random.shuffle(dataset)

# The first 80% of the shuffled examples are used for training, the rest for testing.
training_testing_split_ratio = 0.80
split_index = int(len(dataset) * training_testing_split_ratio)
training_dataset = dataset[:split_index]
testing_dataset = dataset[split_index:]

pprint(training_dataset[:3])

[({'affect': True,
   'famili': True,
   'friendship': True,
   'medit': True,
   'realist': True,
   'warm': True},
  'POS'),
 ({'edit': True,
   'flick': True,
   'potenti': True,
   'pretenti': True,
   'ruin': True,
   'terrif': True},
  'NEG'),
 ({'els': True, 'hey': True, 'need': True, 'shower': True}, 'POS')]


## Train and Evaluate the model

##### Train

In [7]:
# Fit a Naive Bayes classifier on the (feature dict, label) training pairs.
classifier = nltk.NaiveBayesClassifier.train(training_dataset)

classifier.show_most_informative_features(20)

# Persist the trained model. The target folder may not exist on a fresh
# checkout, so create it first instead of failing with FileNotFoundError.
MODEL_FILEPATH.parent.mkdir(parents=True, exist_ok=True)
with MODEL_FILEPATH.open('wb') as model_file:
    pickle.dump(classifier, model_file)

Most Informative Features
                    bore = True              NEG : POS    =     18.9 : 1.0
                 engross = True              POS : NEG    =     15.5 : 1.0
                   intim = True              POS : NEG    =     14.9 : 1.0
                  absorb = True              POS : NEG    =     14.2 : 1.0
                    wast = True              NEG : POS    =     13.1 : 1.0
               spielberg = True              POS : NEG    =     12.9 : 1.0
            refreshingli = True              POS : NEG    =     12.9 : 1.0
                    warm = True              POS : NEG    =     12.9 : 1.0
                    lame = True              NEG : POS    =     12.4 : 1.0
                 meander = True              NEG : POS    =     12.4 : 1.0
                  stupid = True              NEG : POS    =     11.9 : 1.0
                    dull = True              NEG : POS    =     11.8 : 1.0
                   urban = True              POS : NEG    =     11.6 : 1.0

##### Evaluate

In [8]:
# Score the classifier on the held-out split; the result is a fraction,
# shown below as a percentage with two decimals.
test_accuracy = nltk.classify.accuracy(classifier, testing_dataset)
print(f"Test accuracy: {test_accuracy * 100:.2f}%")

Test accuracy: 75.71%


## Deploy the model
##### Example

In [9]:
# Reload the classifier from disk to show that the saved artefact is usable.
# Caution: pickle.load can execute arbitrary code -- only load files you trust.
with MODEL_FILEPATH.open('rb') as model_file:
    classifier = pickle.load(model_file)

# Run the same pre-processing and feature extraction as for training, then classify.
example_line = "This movie was boring."
example_tokens = preprocess_line_to_tokens(example_line)
model_input = tokens_to_model_input(example_tokens)
classifier.classify(model_input)

'NEG'