In [1]:
import random
from collections import Counter
from pathlib import Path
import sys

import dill
import nltk

# Put the folder two levels above the current working directory on sys.path
# so that the project-local `src` package can be imported below.
# NOTE(review): assumes that folder is the repository root — confirm if the
# notebook is moved or the kernel is started from a different directory.
ROOT_FOLDERPATH = Path.cwd().parent.parent
sys.path.append(str(ROOT_FOLDERPATH))
from src.preparation.load_data import load_raw_corpus_posneg

# Paths for files
# File the trained model bundle is written to and later read back from.
MODEL_FILEPATH = ROOT_FOLDERPATH / 'model' / 'poc01.dill'

# Class labels used as the classifier's target values
POS, NEG = 'POS', 'NEG'

# Human-readable name for each class label
LABEL_NAMES = {POS: "Positive", NEG: "Negative"}

# Tunable experiment settings
RANDOMNESS_SEED = 834  # seed applied to the RNG before the dataset is shuffled
DATASET_TEST_SPLIT_RATIO = 0.2  # fraction of the dataset held out for testing

# Data preprocessing
##### Load the dataset into memory

In [2]:
# Load the raw positive and negative corpora through the project's loader.
# NOTE(review): presumably each corpus is an iterable of text lines — confirm
# against src.preparation.load_data.
corpus_pos, corpus_neg = load_raw_corpus_posneg()

##### Transform and clean data

In [3]:
# Fetch the NLTK resources needed for stop-word removal and tokenization.
# raise_on_error=True makes a failed download fail here instead of surfacing
# later as a missing-resource error.
nltk.download('stopwords', quiet=True, raise_on_error=True)
# A frozenset gives constant-time membership checks. The stop-word filter runs
# once per token, so keeping the words in a list would cost a linear scan each time.
nltk_stop_words = frozenset(nltk.corpus.stopwords.words('english'))
# NOTE(review): newer NLTK releases may look for 'punkt_tab' rather than 'punkt'
# when tokenizing — verify against the installed version.
nltk.download('punkt', quiet=True, raise_on_error=True)
nltk_porter_stemmer = nltk.stem.PorterStemmer()

def line_to_preprocessed_tokens(line):
    """Turn one line of raw text into a list of cleaned, stemmed tokens.

    Each token produced by the NLTK word tokenizer is lower-cased, dropped if
    it contains any non-alphabetic character or is an English stop word, and
    finally reduced to its Porter stem.

    Args:
        line: Text to tokenize.

    Returns:
        List of stemmed tokens in their original order (possibly empty).
    """
    preprocessed = []
    for raw_token in nltk.word_tokenize(line):
        word = raw_token.lower()  # Ensure lower-case characters
        if not word.isalpha():
            continue  # Skip words with non-alphabet characters
        if word in nltk_stop_words:
            continue  # Skip stop words
        preprocessed.append(nltk_porter_stemmer.stem(word))  # Stemming
    return preprocessed

# Show result: sanity check of the preprocessing on a sample sentence
line_to_preprocessed_tokens('I think this movie is great !')

['think', 'movi', 'great']

In [4]:
def preprocess_corpus(corpus):
    """Preprocess every line of a corpus, discarding lines left without tokens.

    Args:
        corpus: Iterable of raw text lines.

    Returns:
        List of non-empty token lists, one per retained line, in corpus order.
    """
    tokens_per_line = []
    for line in corpus:
        line_tokens = line_to_preprocessed_tokens(line)
        if line_tokens:  # Keep only lines that still have tokens
            tokens_per_line.append(line_tokens)
    return tokens_per_line

# Preprocess each class separately so the token lists can be labelled per class later.
tokens_dataset_neg = preprocess_corpus(corpus_neg)
tokens_dataset_pos = preprocess_corpus(corpus_pos)

In [5]:
# Show result: the first three preprocessed negative lines, one per row
print(*tokens_dataset_neg[0:3], sep='\n')

['simplist', 'silli', 'tediou']
['laddish', 'juvenil', 'teenag', 'boy', 'could', 'possibl', 'find', 'funni']
['exploit', 'larg', 'devoid', 'depth', 'sophist', 'would', 'make', 'watch', 'graphic', 'treatment', 'crime', 'bearabl']


# Feature extraction
##### Select most frequent tokens as features

In [6]:
# Vocabulary size: how many of the most frequent tokens are kept as features
feature_tokens_count = 3000

def get_all_tokens_from_dataset(tokens_dataset):
    """Flatten a dataset of token lists into one list of tokens.

    Args:
        tokens_dataset: Iterable of token sequences (one per line).

    Returns:
        A single list with every token, keeping line order and in-line order.
    """
    flattened = []
    for line_tokens in tokens_dataset:
        flattened.extend(line_tokens)
    return flattened

tokens_neg = get_all_tokens_from_dataset(tokens_dataset_neg)
tokens_pos = get_all_tokens_from_dataset(tokens_dataset_pos)
all_tokens = tokens_neg + tokens_pos

# Rank tokens by their frequency over both classes and keep the most common ones
token_frequencies = Counter(all_tokens)
feature_tokens = [
    token
    for token, _count in token_frequencies.most_common(feature_tokens_count)
]

# Show result
print(f"First 10 selected features ({len(feature_tokens)} in total):")
# enumerate(..., start=1) yields the 1-based rank directly and avoids naming the
# loop variable `id`, which shadows the built-in function of the same name.
print(*[f"{rank}. {token}" for rank, token in enumerate(feature_tokens[:10], start=1)], sep='\n')

First 10 selected features (3000 in total):
1. film
2. movi
3. like
4. one
5. make
6. stori
7. charact
8. time
9. comedi
10. good


# Training 

### Prepare the input
##### Create a dataset with (x, y) pairs

In [7]:
# Hashed snapshot of the selected feature tokens. Membership is tested once per
# token of every line, so a set lookup replaces a scan of the whole feature list.
# Re-run this cell if `feature_tokens` is recomputed, otherwise the snapshot is stale.
feature_token_set = frozenset(feature_tokens)

def tokens_to_model_input(tokens):
    """Build the classifier's feature dict for one preprocessed line.

    Args:
        tokens: Iterable of preprocessed tokens from a single line.

    Returns:
        Dict mapping each distinct token of the line to True when it is one of
        the selected feature tokens and to False otherwise. Tokens that do not
        occur in the line are not present in the dict.
    """
    return {token: (token in feature_token_set) for token in tokens}

# Pair every line's feature dict with its class label: positives first, then negatives.
dataset = []
dataset.extend((tokens_to_model_input(tokens), POS) for tokens in tokens_dataset_pos)
dataset.extend((tokens_to_model_input(tokens), NEG) for tokens in tokens_dataset_neg)

# Show result: the first (features, label) pair of the unshuffled dataset
print(dataset[0])

({'rock': True, 'destin': True, 'centuri': True, 'new': True, 'conan': False, 'go': True, 'make': True, 'splash': True, 'even': True, 'greater': True, 'arnold': True, 'schwarzenegg': True, 'van': True, 'damm': False, 'steven': True, 'segal': False}, 'POS')


##### Shuffle the dataset

In [8]:
# Shuffle the dataset
# Seeding the global RNG first makes the resulting order reproducible.
# NOTE: random.shuffle works in place, so re-running only this cell reshuffles
# the already-shuffled list; re-run from the dataset-creation cell to get the
# same order again.
random.seed(RANDOMNESS_SEED)
random.shuffle(dataset)

##### Split the dataset

In [9]:
# DATASET_TEST_SPLIT_RATIO is the held-out fraction, so its complement is the
# share of the dataset used for training (named accordingly to avoid confusion).
training_split_ratio = 1 - DATASET_TEST_SPLIT_RATIO
# int() truncates, so any fractional example goes to the testing part.
split_index = int(training_split_ratio * len(dataset))
training_dataset, testing_dataset = dataset[:split_index], dataset[split_index:]

print(f"Training dataset size: {len(training_dataset)}")
print(f"Testing dataset size: {len(testing_dataset)}")

Training dataset size: 8524
Testing dataset size: 2132


### Run the training
##### Train the Naive Bayes Classifier

In [10]:
# Fit NLTK's Naive Bayes classifier on the (features, label) training pairs
classifier = nltk.NaiveBayesClassifier.train(training_dataset)

##### Save the model

In [11]:
def text_to_model_input(input_text):
    """Convert raw text into the feature dict expected by the classifier.

    Args:
        input_text: Raw text to classify.

    Returns:
        Feature dict built from the text's preprocessed tokens.
    """
    preprocessed_tokens = line_to_preprocessed_tokens(input_text)
    return tokens_to_model_input(preprocessed_tokens)

# Create the target folder first so that a missing 'model' directory does not
# make the save fail with FileNotFoundError.
MODEL_FILEPATH.parent.mkdir(parents=True, exist_ok=True)

# Store the classifier together with its text-to-features function so that
# both can be read back from the same file.
with MODEL_FILEPATH.open('wb') as file:
    dill.dump({
        'classifier': classifier,
        'text_to_model_input': text_to_model_input
    }, file)

# Report success only after the file has been closed.
print("Saved.")

Saved.


##### Show most informative features

In [12]:
# Print up to 20 of the trained classifier's most informative features
classifier.show_most_informative_features(20)

Most Informative Features
                 mediocr = True              NEG : POS    =     15.8 : 1.0
                 refresh = True              POS : NEG    =     15.5 : 1.0
                    bore = True              NEG : POS    =     14.7 : 1.0
                   intim = True              POS : NEG    =     13.6 : 1.0
                    flat = True              NEG : POS    =     13.1 : 1.0
            refreshingli = True              POS : NEG    =     12.9 : 1.0
                  suppos = True              NEG : POS    =     12.4 : 1.0
                 realist = True              POS : NEG    =     12.2 : 1.0
                 engross = True              POS : NEG    =     12.2 : 1.0
                    dull = True              NEG : POS    =     12.0 : 1.0
                   appar = True              NEG : POS    =     11.8 : 1.0
                  stupid = True              NEG : POS    =     10.7 : 1.0
                   solid = True              POS : NEG    =     10.6 : 1.0

# Performance evaluation
##### Test set accuracy

In [13]:
# Share of held-out examples for which the classifier predicts the true label
test_accuracy = nltk.classify.accuracy(classifier, testing_dataset)
print(f"Test set accuracy: {test_accuracy * 100:.2f}%")

Test set accuracy: 76.59%


# Deployment
##### Example prediction

In [14]:
# Sample sentence for the end-to-end prediction demo
input_text = "This movie was boring..."

def predict(input_text):
    """Classify a piece of text using the model bundle stored on disk.

    The bundle is read from MODEL_FILEPATH on every call; it supplies both the
    classifier and the function that converts text into its input features.

    Args:
        input_text: Raw text to classify.

    Returns:
        The label the loaded classifier assigns to the text.
    """
    # dill.load can execute arbitrary code: only load model files you trust.
    with MODEL_FILEPATH.open('rb') as model_file:
        saved_bundle = dill.load(model_file)

    loaded_classifier = saved_bundle['classifier']
    to_model_input = saved_bundle['text_to_model_input']
    return loaded_classifier.classify(to_model_input(input_text))

# Classify the sample sentence and print the human-readable label name
print(f'Sentence "{input_text}" is classified as {LABEL_NAMES[predict(input_text)]}')

Sentence "This movie was boring..." is classified as Negative
