In [1]:
import collections
import random
from collections import Counter

import nltk

from src import const
from src.io import get_corpus, Dataset, get_label_title
from src.processing import process_strings_to_token_lists, process_string_to_tokens

# Data preprocessing
##### Load the dataset to memory

In [2]:
# Load the corpus through the project helper. Later cells read its `.neg` and
# `.pos` collections of review strings.
corpus = get_corpus()

##### Transform and clean data

In [3]:
# Processed example: run the project tokenizer on a single sentence to preview
# what the cleaning step keeps (the recorded output shows three stemmed tokens).
process_string_to_tokens('I think this movie is great !')

['think', 'movi', 'great']

In [4]:
# Container for the processed corpus. Fields: `neg` (negative reviews),
# `pos` (positive reviews) and `all` (both together).
# The typename matches the name it is bound to, so reprs read `TokensList(...)`
# and pickle can locate the class by name.
TokensList = collections.namedtuple('TokensList', 'neg pos all')
# Tokenize each polarity on its own, then the concatenated corpus
# (negative reviews first, positive reviews second).
tokens_list = TokensList(
    neg=process_strings_to_token_lists(corpus.neg),
    pos=process_strings_to_token_lists(corpus.pos),
    all=process_strings_to_token_lists(corpus.neg + corpus.pos),
)

In [5]:
# Show result: the first three processed negative reviews, one token list per line.
print('\n'.join(str(review_tokens) for review_tokens in tokens_list.neg[:3]))

['simplist', 'silli', 'tediou']
['laddish', 'juvenil', 'teenag', 'boy', 'could', 'possibl', 'find', 'funni']
['exploit', 'larg', 'devoid', 'depth', 'sophist', 'would', 'make', 'watch', 'graphic', 'treatment', 'crime', 'bearabl']


# Feature extraction
##### Select most frequent tokens as features

In [6]:
# Vocabulary size: how many of the most frequent corpus tokens are kept as features.
feature_tokens_count = 3_000

def flatten_2d_array(matrix):
    """Concatenate the rows of a nested iterable into a single flat list.

    Args:
        matrix: Iterable whose items are themselves iterable (e.g. a list of
            token lists).

    Returns:
        A new list holding every inner element, in row order.
    """
    flattened = []
    for row in matrix:
        flattened.extend(row)
    return flattened

# Count every token across the whole corpus and keep the most frequent ones,
# ordered from most to least common.
feature_tokens = [
    token
    for token, _occurrences in Counter(flatten_2d_array(tokens_list.all)).most_common(feature_tokens_count)
]

# Show result
print(f"First 10 selected features ({len(feature_tokens)} in total):")
print('\n'.join(f"{rank}. {token}" for rank, token in enumerate(feature_tokens[:10], start=1)))

First 10 selected features (3000 in total):
1. film
2. movi
3. like
4. one
5. make
6. stori
7. charact
8. time
9. comedi
10. good


# Training 

### Prepare the input
##### Create a dataset with (x, y) pairs

In [7]:
def tokens_to_model_input(tokens, features=None):
    """Encode one document's tokens as the feature dict fed to the classifier.

    Every token of the document becomes a key; its value tells whether the
    token belongs to the selected feature vocabulary.

    Args:
        tokens: Iterable of hashable tokens for a single document.
        features: Optional collection of feature tokens to test membership
            against. Defaults to the notebook-level ``feature_tokens``. Pass a
            prebuilt ``set``/``frozenset`` to skip the per-call conversion.

    Returns:
        dict mapping each distinct token to ``True`` when it is a feature
        token and ``False`` otherwise (keys keep first-occurrence order).
    """
    if features is None:
        features = feature_tokens
    if not isinstance(features, (set, frozenset)):
        # Hash the vocabulary once so each lookup is O(1) rather than a linear
        # scan of the whole feature list for every token.
        features = frozenset(features)
    return {token: token in features for token in tokens}

# Pair each review's feature dict with its sentiment label title:
# all positive reviews first, followed by all negative reviews.
dataset_full = [
    (tokens_to_model_input(tokens), get_label_title(const.LABELS.POS))
    for tokens in tokens_list.pos
] + [
    (tokens_to_model_input(tokens), get_label_title(const.LABELS.NEG))
    for tokens in tokens_list.neg
]

# Show result
print(dataset_full[0])

({'rock': True, 'destin': True, 'centuri': True, 'new': True, 'conan': False, 'go': True, 'make': True, 'splash': True, 'even': True, 'greater': True, 'arnold': True, 'schwarzenegg': True, 'van': True, 'damm': False, 'steven': True, 'segal': False}, 'Positive')


##### Shuffle the dataset

In [8]:
# Shuffle the dataset
# Seed the global RNG first so the shuffle order, and therefore the train/test
# split taken from it, is the same on every fresh run.
random.seed(const.RANDOMNESS_SEED)
# NOTE: `random.shuffle` reorders `dataset_full` in place. Re-running only this
# cell permutes the already-shuffled list, so rebuild `dataset_full` first.
random.shuffle(dataset_full)

##### Split the dataset

In [9]:
# `const.DATASET_TEST_SPLIT_RATIO` is the share held out for testing, so its
# complement is the share used for *training*.
train_split_ratio = 1 - const.DATASET_TEST_SPLIT_RATIO
# Index of the first test example; everything before it is training data.
split_index = int(train_split_ratio * len(dataset_full))
dataset = Dataset(dataset_full[:split_index], dataset_full[split_index:])

print(f"Training dataset size: {len(dataset.trn)}")
print(f"Testing dataset size: {len(dataset.tst)}")

Training dataset size: 8524
Testing dataset size: 2132


### Run the training
##### Train the Naive Bayes Classifier

In [10]:
# Fit NLTK's Naive Bayes classifier on the training part of the split; each
# training example is a (feature dict, label title) pair.
classifier = nltk.NaiveBayesClassifier.train(dataset.trn)

##### Show most informative features

In [11]:
# Ask NLTK for up to 20 of the most informative features; the recorded output
# lists each with the ratio between the two labels.
classifier.show_most_informative_features(20)

Most Informative Features
                 mediocr = True           Negati : Positi =     15.8 : 1.0
                 refresh = True           Positi : Negati =     15.5 : 1.0
                    bore = True           Negati : Positi =     14.7 : 1.0
                   intim = True           Positi : Negati =     13.6 : 1.0
                    flat = True           Negati : Positi =     13.1 : 1.0
            refreshingli = True           Positi : Negati =     12.9 : 1.0
                  suppos = True           Negati : Positi =     12.4 : 1.0
                 realist = True           Positi : Negati =     12.2 : 1.0
                 engross = True           Positi : Negati =     12.2 : 1.0
                    dull = True           Negati : Positi =     12.0 : 1.0
                   appar = True           Negati : Positi =     11.8 : 1.0
                  stupid = True           Negati : Positi =     10.7 : 1.0
                   solid = True           Positi : Negati =     10.6 : 1.0

# Performance evaluation
##### Test set accuracy

In [12]:
# Score the classifier on the held-out test examples (never seen during training).
test_accuracy = nltk.classify.accuracy(classifier, dataset.tst)
# Report the accuracy as a percentage with two decimals.
print(f"Test set accuracy: {test_accuracy * 100:.2f}%")

Test set accuracy: 76.59%


# Deployment
##### Example prediction

In [13]:
def predict(movie_review_string):
    """Classify a raw review string with the trained classifier.

    The text is tokenized with ``process_string_to_tokens``, encoded with
    ``tokens_to_model_input`` and handed to ``classifier.classify``.

    Args:
        movie_review_string: Raw review text.

    Returns:
        The label that ``classifier`` predicts for the review.
    """
    review_tokens = process_string_to_tokens(movie_review_string)
    review_features = tokens_to_model_input(review_tokens)
    return classifier.classify(review_features)

# Try the model on one short example sentence.
input_text = "This movie was boring..."
predicted_label = predict(input_text)
print(f'Sentence "{input_text}" is classified as {predicted_label}')

Sentence "This movie was boring..." is classified as Negative
