In [None]:
import random
from collections import Counter
from pathlib import Path
from pprint import pprint
import pickle

import nltk

# Filesystem layout, resolved relative to the current working directory
# (the project root is taken to be two levels above it)
ROOT_FOLDERPATH = Path.cwd().parent.parent
DATASET_FOLDERPATH = ROOT_FOLDERPATH.joinpath('data', 'raw')
NEG_DATASET_FILEPATH = DATASET_FOLDERPATH.joinpath('rt-polarity.neg')
POS_DATASET_FILEPATH = DATASET_FOLDERPATH.joinpath('rt-polarity.pos')
MODEL_FILEPATH = ROOT_FOLDERPATH.joinpath('model', 'poc-01--bag-of-words--naive-bayes.pickle')

# Class labels attached to every sample
POS, NEG = 'POS', 'NEG'

# Seed for the random number generator, so that shuffling is repeatable
RANDOMNESS_SEED = 834

# Data preprocessing
##### Preview the raw file format

In [None]:
# Number of leading lines of the negative-reviews file to preview
linecount = 3
with NEG_DATASET_FILEPATH.open() as file:
    # zip() stops as soon as either side is exhausted, so a file with fewer
    # than `linecount` lines yields a shorter preview instead of raising
    # StopIteration out of next().
    head = [line for _index, line in zip(range(linecount), file)]
# Each line keeps its own trailing newline, so the preview is double-spaced
print(*head, sep='\n')

##### Load the dataset to memory

In [None]:
# Read each corpus fully into memory as a list of lines (line endings are kept)
with NEG_DATASET_FILEPATH.open() as neg_file:
    corpus_neg = list(neg_file)

with POS_DATASET_FILEPATH.open() as pos_file:
    corpus_pos = list(pos_file)

##### Transform and clean data

In [None]:
# Fetch the NLTK resources used by the preprocessing below; fail loudly if a
# required one cannot be downloaded.
nltk.download('stopwords', quiet=True, raise_on_error=True)
nltk.download('punkt', quiet=True, raise_on_error=True)
# NLTK >= 3.9 loads the tokenizer data used by word_tokenize from 'punkt_tab'
# rather than 'punkt'. Best-effort on purpose (no raise_on_error): older NLTK
# releases work with 'punkt' alone.
nltk.download('punkt_tab', quiet=True)

# Only ever used for membership tests, so keep it as a frozenset: lookups are
# constant-time instead of a linear scan over a list for every token.
nltk_stop_words = frozenset(nltk.corpus.stopwords.words('english'))
nltk_porter_stemmer = nltk.stem.PorterStemmer()

def line_to_preprocessed_tokens(line):
    """Turn one line of raw text into a list of cleaned, stemmed tokens.

    Pipeline: word tokenization -> drop tokens containing non-alphabetic
    characters -> lowercase -> drop stop words -> Porter stemming.

    Args:
        line: Raw text to preprocess.

    Returns:
        List of stemmed tokens in their original order; empty when nothing
        survives the filters.
    """
    tokens = nltk.word_tokenize(line)
    tokens = (token for token in tokens if token.isalpha())  # Remove words with non-alphabet characters
    # Lowercase BEFORE the stop-word check: the membership test is
    # case-sensitive, so capitalised stop words such as "I" or "This" would
    # otherwise slip through and end up as features.
    tokens = (token.lower() for token in tokens)
    tokens = (token for token in tokens if token not in nltk_stop_words)  # Remove stop words
    tokens = (nltk_porter_stemmer.stem(token) for token in tokens)  # Stemming
    return list(tokens)

# Smoke test: the cell output shows the preprocessed tokens of a sample sentence
line_to_preprocessed_tokens('I think this movie is great !')

In [None]:
def preprocess_corpus(corpus):
    """Preprocess every line of a corpus into a token list.

    Args:
        corpus: Iterable of raw text lines.

    Returns:
        List with one token list per line, in corpus order. Lines that are
        left with no tokens after preprocessing are skipped.
    """
    preprocessed = []
    for line in corpus:
        line_tokens = line_to_preprocessed_tokens(line)
        if line_tokens:  # Skip lines that ended up empty
            preprocessed.append(line_tokens)
    return preprocessed

# Run the same preprocessing over both corpora (negative first, then positive)
tokens_dataset_neg, tokens_dataset_pos = [
    preprocess_corpus(raw_corpus) for raw_corpus in (corpus_neg, corpus_pos)
]

In [None]:
# Smoke test: show the token lists of the first three negative samples, one per line
print('\n'.join(map(str, tokens_dataset_neg[:3])))

# Feature extraction
##### Select most frequent tokens as features

In [None]:
# Size of the feature vocabulary: how many of the most frequent tokens are kept
feature_tokens_count = 3000

def get_all_tokens_from_dataset(tokens_dataset):
    """Flatten a dataset of token lists into a single list of tokens.

    Args:
        tokens_dataset: Iterable of token lists (one list per line).

    Returns:
        All tokens in order of appearance, duplicates included.
    """
    flattened = []
    for line_tokens in tokens_dataset:
        flattened.extend(line_tokens)
    return flattened

# Flatten each class into one long token list
tokens_neg = get_all_tokens_from_dataset(tokens_dataset_neg)
tokens_pos = get_all_tokens_from_dataset(tokens_dataset_pos)

# Rank tokens by frequency over both classes combined and keep the top ones
all_tokens = [*tokens_neg, *tokens_pos]
token_counts = Counter(all_tokens)
feature_tokens = [token for token, _count in token_counts.most_common(feature_tokens_count)]

print(f"First 10 selected features ({len(feature_tokens)} in total):")
numbered_preview = [f"{rank}. {token}" for rank, token in enumerate(feature_tokens[:10], start=1)]
print(*numbered_preview, sep='\n')

# Training 

### Prepare the input
##### Create a dataset with (x, y) pairs

In [None]:
def tokens_to_model_input(tokens):
    """Build the feature dictionary the classifier consumes for one document.

    Every distinct token of the document becomes a key; its value is True
    when the token is one of the selected `feature_tokens`, False otherwise.

    Args:
        tokens: Iterable of preprocessed tokens of a single document.

    Returns:
        Dict mapping each token to a bool.
    """
    return {token: (token in feature_tokens) for token in tokens}

# Build the labelled dataset as a list of (features, label) pairs:
# positive samples first, then negative ones.
dataset = []
dataset.extend((tokens_to_model_input(tokens), POS) for tokens in tokens_dataset_pos)
dataset.extend((tokens_to_model_input(tokens), NEG) for tokens in tokens_dataset_neg)

# Smoke test: show the first (features, label) pair.
# `first_data_element` was previously never assigned, so this cell raised NameError.
first_data_element = dataset[0] if dataset else None
print(first_data_element)

##### Shuffle the dataset

In [None]:
# Shuffle the dataset in place. The global `random` generator is seeded first
# so that a fresh run always produces the same order.
# NOTE: the shuffle mutates `dataset`, so re-running this cell without first
# rebuilding `dataset` shuffles the already-shuffled list and gives a different order.
random.seed(RANDOMNESS_SEED)
random.shuffle(dataset)

##### Split the dataset

In [None]:
# Share of the samples that goes to the training set; the remainder is held out for testing
training_testing_split_ratio = 0.80
split_index = int(len(dataset) * training_testing_split_ratio)

# Everything before the split index trains the model, everything from it onwards tests it
training_dataset = dataset[:split_index]
testing_dataset = dataset[split_index:]

print(f"Training dataset size: {len(training_dataset)}")
print(f"Testing dataset size: {len(testing_dataset)}")

### Run the training
##### Train the Naive Bayes Classifier

In [None]:
# Fit NLTK's Naive Bayes classifier on the training split
classifier = nltk.NaiveBayesClassifier.train(training_dataset)

##### Save the model

In [None]:
# The target folder may not exist yet (e.g. on a fresh checkout); create it so
# that the trained model is not lost to a FileNotFoundError.
MODEL_FILEPATH.parent.mkdir(parents=True, exist_ok=True)
with MODEL_FILEPATH.open('wb') as file:
    pickle.dump(classifier, file)
# Report success only after the file has been written and closed
print("Saved.")

##### Show most informative features

In [None]:
# Print the 20 features the trained classifier reports as most informative
classifier.show_most_informative_features(20)

# Performance evaluation

##### Test set accuracy

In [None]:
# Score the classifier on the held-out testing split
test_accuracy = nltk.classify.accuracy(classifier, testing_dataset)
# Shown as a percentage with two decimals
print(f"Test set accuracy: {test_accuracy * 100:.2f}%")

# Deployment
##### Example prediction

In [None]:
# Example sentence to classify
input_text = "This movie was boring..."


def predict(input_text):
    """Classify a piece of text with the model stored at MODEL_FILEPATH.

    The text goes through the same preprocessing and feature extraction as
    the training data, and the classifier is re-loaded from disk on every call.

    Args:
        input_text: Raw text to classify.

    Returns:
        The label returned by the stored classifier for this text.
    """
    preprocessed_tokens = line_to_preprocessed_tokens(input_text)
    features = tokens_to_model_input(preprocessed_tokens)
    # SECURITY: pickle.load can execute arbitrary code contained in the file;
    # only load model files that come from a trusted source.
    with MODEL_FILEPATH.open('rb') as model_file:
        loaded_classifier = pickle.load(model_file)
    return loaded_classifier.classify(features)


print(f'Sentence "{input_text}" is classified as "{predict(input_text)}"')