In [23]:
import itertools
import nltk
nltk.download('words')
from nltk.corpus import stopwords, words
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import numpy as np
import random
import re
from tqdm import tqdm

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\peleg\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [2]:
PATH_TO_POS = "../../text/twitter-datasets/train_pos.txt"
PATH_TO_NEG = "../../text/twitter-datasets/train_neg.txt"
proportion_train = 0.8

## Score computation

In [3]:
def get_confusion_matrix(ys, preds):
    matrix = [[0, 0], [0, 0]]
    for y,pred in zip(ys, preds):
        if pred == y:
            if y == 1:
                matrix[0][0] += 1
            else:
                matrix[1][1] += 1
        else:
            if y == 1:
                matrix[1][0] += 1
            else:
                matrix[0][1] += 1
    return matrix

In [4]:
def compute_accuracy(matrix):
    corr_guesses = matrix[0][0] + matrix[1][1]
    total_guesses = matrix[0][0] + matrix[0][1] + matrix[1][0] + matrix[1][1]
    return corr_guesses / total_guesses
def compute_precision(matrix):
    true_pos = matrix[0][0]
    false_pos = matrix[0][1]
    return true_pos / (true_pos+false_pos)
def compute_recall(matrix):
    true_pos = matrix[0][0]
    false_neg = matrix[1][0]
    return true_pos / (true_pos+false_neg)
def compute_fscore(matrix):
    p = compute_precision(matrix)
    r = compute_recall(matrix)
    return 2 * (p*r) / (p+r)

## Data retrieving

In [5]:
def get_data_from_file(filename, proportion=None, isTraining=False, value=None, shuffle=False):
    """We assume here that (proportion != None) <=> local testing. 
    Please be sure to verify this before using the function.
    The shuffle part is only used for the local testing phase, 
    where selecting different subsets to train/validate our model can be of influence"""
    with open(filename, "r") as file:
        content = file.read()
        content_lines = content.split("\n")
        if shuffle:
            random.shuffle(content_lines)
        if proportion != None:
            # Here is the processing of training data during the local testing phase
            temp1_x = content_lines[:int(len(content_lines) * proportion)]
            temp1_y = [value] * int(len(content_lines) * proportion)
            temp2_x = content_lines[int(len(content_lines) * proportion):]
            temp2_y = [value] * int(len(content_lines) * (1-proportion))
            return temp1_x, temp1_y, temp2_x, temp2_y
        if isTraining:
            # Here is the processing of training data during the real prediction phase
            temp_x = content_lines[:]
            temp_y = [value] * len(content_lines)
            return temp_x, temp_y
        # Here is the processing of new data for the real prediction phase
        temp_ids = []
        temp_xs = []
        for i in range(len(content_lines)):
            if "," in content_lines[i]:
                entrySplitted = re.split(",", content_lines[i], 1)
                temp_ids.append(entrySplitted[0])
                temp_xs.append(entrySplitted[1])
        return temp_ids, temp_xs 

## Preprocessing pipeline

In [6]:
correct_words = words.words()
stop_words = stopwords.words()
ps = PorterStemmer()

In [25]:
# def clean_sentence(val, depth=0):
#     """1. Remove non alphanumerical characters
#        2. Remove stop words
#        3. Error correction
#        4. Stemming"""
#     sentence = val
#     if depth > 0:
#         regex = re.compile('([^\s\w]|_)+')
#         sentence = regex.sub('', val).lower()
#         if depth > 1:
#             sentence = word_tokenize(sentence)
#             for word in list(sentence):
#                 if word in stop_words:
#                     sentence.remove(word)
#                 elif depth > 2:
#                     # words_list = [
#                     #     (jaccard_distance(set(ngrams(word, 2)), set(ngrams(w, 2))),w)
#                     #     for w in correct_words if w[0]==word[0]
#                     # ]
#                     try:
#                         word_best = sorted(words_list, key = lambda val:val[0])[0][1]
#                     except:
#                         word_best = word
#                     if depth > 3 :
#                         sentence[sentence.index(word)] = ps.stem(word_best)
#                     else:
#                         sentence[sentence.index(word)] = word_best
#             sentence = " ".join(sentence)
#     return sentence

In [24]:
def clean_sentence(val, steps=[0]):
    """
        0. Nothing
        1. Remove non alphanumerical characters
        2. Remove stop words
        3. Stemming
    """
    sentence = val
    if 0 in steps:
        return sentence
    if 1 in steps:
        regex = re.compile('([^\s\w]|_)+')
        sentence = regex.sub('', val).lower()
    sentence = word_tokenize(sentence)
    if 2 in steps:
        for word in sentence:
            if word in stop_words:
                sentence.remove(word)
            elif 3 in steps:
                sentence[sentence.index(word)] = ps.stem(word)
    elif 3 in steps:
        for word in sentence:
            sentence[sentence.index(word)] = ps.stem(word)
    sentence = " ".join(sentence)
    return sentence

In [26]:
def clean_data(data, depth=0):
    for i in tqdm(range(len(data))):
        data[i] = clean_sentence(data[i], depth)
    return data

## N-gram model

In [9]:
def form_ngrams(words, n):
    ngrams = []
    number_ngrams = len(words)-n+1
    for i in range(number_ngrams):
        ngram = ""
        for j in range(n):
            ngram += words[i+j]
            if j != n-1:
                ngram += " "
        ngrams.append(ngram)
    return ngrams

In [10]:
def likelihood_ngrams_table_constructor(xs, ys, n=2):
    table = {}
    for x,y in zip(xs, ys):
        list_words = x.split(" ")
        list_ngrams = form_ngrams(list_words, n=n)
        for ngram in list_ngrams:
            if ngram not in table:
                table[ngram] = [3, 1, 1]
                if y == 1:
                    table[ngram][1] += 1
                else:
                    table[ngram][2] += 1
            else:
                table[ngram][0] += 1
                if y == 1:
                    table[ngram][1] += 1
                else:
                    table[ngram][2] += 1
    return table

In [11]:
def get_proba_ngram(sentence, table, n):
    """Please make sure that the table have been generated with the same n."""
    proba_pos = 1
    proba_neg = 1
    list_words = sentence.split(" ")
    list_ngrams = form_ngrams(list_words, n=n)
    for ngram in list_ngrams:
        if ngram in table:
            proba_pos *= table[ngram][1]/table[ngram][0]
            proba_neg *= table[ngram][2]/table[ngram][0]
        else:
            proba_pos *= 0.5
            proba_neg *= 0.5
    return proba_pos, proba_neg

In [12]:
def get_data_from_sentences(sentences, ns, tables, classes=None):
    xs = []
    ys = []
    if classes == None:
        for sentence in sentences:
            probas = []
            for n in ns:
                probas += get_proba_ngram(sentence, tables[n], n=n)
            xs.append(probas)
        return np.array(xs)
    for sentence,y in zip(sentences, classes):
        probas = []
        for n in ns:
            probas += get_proba_ngram(sentence, tables[n], n=n)
        xs.append(probas)
        ys.append(y)
    return np.array(xs), np.array(ys)

## Entries formatting

In [28]:
train_sentences_x_raw = []
train_sentences_y = []
test_sentences_x_raw = []
test_y = []

temp_train_x, temp_train_y, temp_test_x, temp_test_y = get_data_from_file(
    PATH_TO_POS, 
    proportion=proportion_train,
    isTraining=True,
    value=1,
    shuffle=False
)
train_sentences_x_raw += temp_train_x
train_sentences_y += temp_train_y
test_sentences_x_raw += temp_test_x
test_y += temp_test_y
temp_train_x, temp_train_y, temp_test_x, temp_test_y = get_data_from_file(
    PATH_TO_NEG, 
    proportion=proportion_train,
    isTraining=True,
    value=0,
    shuffle=False
)
train_sentences_x_raw += temp_train_x
train_sentences_y += temp_train_y
test_sentences_x_raw += temp_test_x
test_y += temp_test_y

## Inputs computing

In [32]:
steps_pool = [0, 1, 2, 3]
steps =[]
for l in range(0, len(steps_pool)+1):
    for subset in itertools.combinations(steps_pool, l):
        steps.append(subset)
for subset in list(steps):
    if (0 in subset and len(subset)>1) or len(subset) == 0:
        steps.remove(subset)
print(steps)

[(0,), (1,), (2,), (3,), (1, 2), (1, 3), (2, 3), (1, 2, 3)]


In [33]:
likelihood_tables = {}

for i,curr_steps in enumerate(steps):
    likelihood_tables[i] = likelihood_ngrams_table_constructor(
        clean_data(train_sentences_x_raw[:], curr_steps),
        train_sentences_y,
        n=2
    )

100%|██████████| 160000/160000 [00:00<00:00, 2342540.43it/s]
100%|██████████| 160000/160000 [00:17<00:00, 9054.52it/s] 
100%|██████████| 160000/160000 [03:09<00:00, 844.04it/s] 
100%|██████████| 160000/160000 [00:56<00:00, 2836.27it/s]
100%|██████████| 160000/160000 [02:03<00:00, 1296.40it/s]
100%|██████████| 160000/160000 [00:47<00:00, 3377.36it/s]
100%|██████████| 160000/160000 [03:33<00:00, 747.85it/s]
100%|██████████| 160000/160000 [02:26<00:00, 1089.57it/s]


In [34]:
test_datas_x = {}

for i,curr_steps in enumerate(steps):
    test_datas_x[i] = get_data_from_sentences(
        clean_data(test_sentences_x_raw[:], curr_steps),
        ns = [2],
        tables = {2: likelihood_tables[i]}
    )

100%|██████████| 40002/40002 [00:00<00:00, 1581567.13it/s]
100%|██████████| 40002/40002 [00:04<00:00, 9261.92it/s]
100%|██████████| 40002/40002 [00:46<00:00, 856.66it/s] 
100%|██████████| 40002/40002 [00:13<00:00, 2952.20it/s]
100%|██████████| 40002/40002 [00:31<00:00, 1279.04it/s]
100%|██████████| 40002/40002 [00:11<00:00, 3371.46it/s]
100%|██████████| 40002/40002 [00:55<00:00, 724.44it/s]
100%|██████████| 40002/40002 [00:36<00:00, 1086.47it/s]


In [35]:
test_data_y = np.array(test_y)

## Evaluator

In [36]:
def evaluator_bigram(xs):
    preds = []
    for x in xs:
        probaPos = x[0]
        probaNeg = x[1]
        if probaPos>probaNeg:
            preds.append(1)
        else:
            preds.append(0)
    return np.array(preds)

## Evaluation

In [37]:
preds = {}

for d in range(len(steps)):
    preds[d] = evaluator_bigram(test_datas_x[d])

## Results

In [39]:
print(
    "0: No preprocessing\n"
    "1: Non alphanumeric characters removal\n"
    "2: 1 + Stop words removal\n"
    "3: 2 + Stemming"
)

for i,curr_steps in enumerate(steps):
    matrix = get_confusion_matrix(test_data_y, preds[i])
    print(
        ("Depth {}\n"
        # "{}\n"
        # "{}\n"
        "acc={:.3f}, prec={:.3f}, rec={:.3f}, fscore={:.3f}").format(
            curr_steps,
            # matrix[0],
            # matrix[1],
            compute_accuracy(matrix),
            compute_precision(matrix),
            compute_recall(matrix),
            compute_fscore(matrix)
        )
    )

0: No preprocessing
1: Non alphanumeric characters removal
2: 1 + Stop words removal
3: 2 + Stemming
Depth (0,)
acc=0.799, prec=0.766, rec=0.860, fscore=0.810
Depth (1,)
acc=0.789, prec=0.763, rec=0.837, fscore=0.798
Depth (2,)
acc=0.766, prec=0.720, rec=0.869, fscore=0.788
Depth (3,)
acc=0.785, prec=0.740, rec=0.879, fscore=0.804
Depth (1, 2)
acc=0.758, prec=0.743, rec=0.791, fscore=0.766
Depth (1, 3)
acc=0.788, prec=0.764, rec=0.834, fscore=0.798
Depth (2, 3)
acc=0.765, prec=0.720, rec=0.867, fscore=0.787
Depth (1, 2, 3)
acc=0.756, prec=0.740, rec=0.790, fscore=0.764
