In [179]:
import random
import re

In [180]:
# Training data files. In the data-loading cell of this notebook, lines read
# from PATH_TO_POS are labelled 1 and lines read from PATH_TO_NEG are labelled -1.
PATH_TO_POS = "../text/twitter-datasets/train_pos.txt"
PATH_TO_NEG = "../text/twitter-datasets/train_neg.txt"

In [181]:
# Passed as `proportion` to get_data_from_file: fraction of each file's lines
# that goes to the training split; the remaining lines form the local test split.
proportion_train = 0.8

# Obtaining the data from the files

## General function to get data from a file for this project

In [182]:
def get_data_from_file(filename, proportion=None, isTraining=False, value=None, shuffle=False):
    """Read a dataset file and return its lines in the shape the current phase needs.

    The file is read as text and split on "\\n". The empty string produced by
    a final newline is dropped so it is never treated as a sample.

    What is returned depends on the arguments:

    * ``proportion`` is not None (local testing phase): returns
      ``(train_x, train_y, test_x, test_y)``. The first
      ``int(len(lines) * proportion)`` lines are the training split, all the
      remaining lines are the test split, and each label list holds ``value``
      once per line of its split. ``isTraining`` is ignored in this mode.
    * ``proportion`` is None and ``isTraining`` is true (training data for the
      real prediction phase): returns ``(xs, ys)`` where ``xs`` is a copy of
      all lines and ``ys`` holds ``value`` once per line.
    * otherwise (new data for the real prediction phase): every line is
      expected to look like ``<id>,<text>``. Returns ``(ids, texts)``; only
      the first comma separates the id from the text, and lines without any
      comma are skipped.

    When ``shuffle`` is true the lines are shuffled with ``random.shuffle``
    before any of the above happens (this uses the global ``random`` state, so
    seed it beforehand if the split has to be reproducible).
    """
    with open(filename, "r") as file:
        content_lines = file.read().split("\n")

    # A file ending with "\n" yields one trailing "" after the split; it is an
    # artefact of the split, not a sample, so it must not receive a label.
    if content_lines and content_lines[-1] == "":
        content_lines.pop()

    if shuffle:
        random.shuffle(content_lines)

    if proportion is not None:
        # Local testing phase: split once and derive both label lists from the
        # actual slice lengths. Computing the test size as
        # int(len * (1 - proportion)) can disagree with the slice because of
        # float rounding (e.g. 1 - 0.8 == 0.19999999999999996).
        split_index = int(len(content_lines) * proportion)
        train_part = content_lines[:split_index]
        test_part = content_lines[split_index:]
        return train_part, [value] * len(train_part), test_part, [value] * len(test_part)

    if isTraining:
        # Real prediction phase, training data: every line gets the same label.
        return content_lines[:], [value] * len(content_lines)

    # Real prediction phase, new data: "<id>,<text>" lines.
    temp_ids = []
    temp_xs = []
    for line in content_lines:
        identifier, separator, text = line.partition(",")
        if separator:  # lines without a comma carry no (id, text) pair
            temp_ids.append(identifier)
            temp_xs.append(text)
    return temp_ids, temp_xs

## Actual retrieval of the data

In [183]:
# Accumulators for the local split: samples and labels for training and testing.
train_x, train_y, test_x, test_y = [], [], [], []

# Both files are split the same way; only the path and the label differ
# (1 for the positive file, -1 for the negative one).
for source_path, label_value in ((PATH_TO_POS, 1), (PATH_TO_NEG, -1)):
    temp_train_x, temp_train_y, temp_test_x, temp_test_y = get_data_from_file(
        source_path,
        proportion=proportion_train,
        isTraining=True,
        value=label_value,
        shuffle=False
    )
    train_x.extend(temp_train_x)
    train_y.extend(temp_train_y)
    test_x.extend(temp_test_x)
    test_y.extend(temp_test_y)


### Archive

Same thing as above, but without using a function. We can fall back on this should the function introduce too many errors.

In [184]:
# with open(PATH_TO_POS, "r") as file:
#     content = file.read()
#     content_lines = content.split("\n")
#     train_x += content_lines[:int(len(content_lines) * proportion_train)]
#     train_y += [1] * int(len(content_lines) * proportion_train)
#     test_x += content_lines[int(len(content_lines) * proportion_train):]
#     test_y += [1] * int(len(content_lines) * (1-proportion_train))

# with open(PATH_TO_NEG, "r") as file:
#     content = file.read()
#     content_lines = content.split("\n")
#     train_x += content_lines[:int(len(content_lines) * proportion_train)]
#     train_y += [-1] * int(len(content_lines) * proportion_train) 
#     test_x += content_lines[int(len(content_lines) * proportion_train):]
#     test_y += [-1] * int(len(content_lines) * (1-proportion_train))

# Constructing the tables giving each element's likelihood of being positive or negative

## For each n-gram

### First form the n-grams for each sentence (the sentence is already split into a list of words)

In [185]:
def form_ngrams(words, n):
    """Return every run of ``n`` consecutive items of ``words`` as one string.

    The items of each run are joined with a single space, and the runs are
    listed in order of their starting position. There are
    ``len(words) - n + 1`` runs, so the result is empty when ``words`` holds
    fewer than ``n`` items.
    """
    window_count = len(words) - n + 1
    return [
        " ".join(words[start + offset] for offset in range(n))
        for start in range(window_count)
    ]

### Build the table

In [186]:
def likelihood_table_constructor(table, xs, ys, n=2):
    """Rebuild ``table`` in place with per-n-gram counts from labelled sentences.

    ``table`` is emptied first. Each sentence of ``xs`` is split on single
    spaces and turned into n-grams of size ``n`` with ``form_ngrams``. Every
    n-gram maps to a list ``[total, positive, negative]``:

    * an n-gram seen for the first time starts at ``[3, 1, 1]``;
    * each later occurrence adds one to ``total``;
    * every occurrence (first one included) adds one to ``positive`` when the
      sentence label equals 1, and to ``negative`` for any other label.

    Nothing is returned; the result lives in ``table``.
    """
    table.clear()
    for sentence, label in zip(xs, ys):
        for gram in form_ngrams(sentence.split(" "), n=n):
            counts = table.get(gram)
            if counts is None:
                # First sighting: the initial total of 3 already accounts for
                # this occurrence, so only the class counter moves below.
                counts = table[gram] = [3, 1, 1]
            else:
                counts[0] += 1
            counts[1 if label == 1 else 2] += 1

### Use the previous function to build the table, with the correct parameters

In [187]:
# One likelihood table per n-gram size: unigrams, bigrams and trigrams.
words, bigrams, trigrams = {}, {}, {}

# The position of each table in the tuple (counted from 1) is its n-gram size.
for ngram_size, ngram_table in enumerate((words, bigrams, trigrams), start=1):
    likelihood_table_constructor(
        ngram_table,
        train_x,
        train_y,
        n=ngram_size
    )

## Archives

### A less general approach than the one above. If it is used, the archived classifiers must be used too.

### Populate a words dictionary without using the function. We can fall back on this should the function introduce too many errors

In [188]:
# words = {}
# for x,y in zip(train_x, train_y):
#     list_words = x.split(" ")
#     for w in list_words:
#         if w not in words:
#             words[w] = [3, 1, 1]
#             if y == 1:
#                 words[w][1] += 1
#             else:
#                 words[w][2] += 1
#         else:
#             words[w][0] += 1
#             if y == 1:
#                 words[w][1] += 1
#             else:
#                 words[w][2] += 1

### Same as above, but for bigrams. Should also be covered by the function.

In [189]:
# bigrams = {}
# for x,y in zip(train_x, train_y):
#     list_words = x.split(" ")
#     list_bigrams = form_ngrams(list_words, n=2)
#     for b in list_bigrams:
#         if b not in bigrams:
#             bigrams[b] = [3, 1, 1]
#             if y == 1:
#                 bigrams[b][1] += 1
#             else:
#                 bigrams[b][2] += 1
#         else:
#             bigrams[b][0] += 1
#             if y == 1:
#                 bigrams[b][1] += 1
#             else:
#                 bigrams[b][2] += 1

# Basic classifiers for a sentence

## Classifier based on one type of n-gram only

In [190]:
def classifier_ngrams(sentence, table, n=2):
    """Classify ``sentence`` as 1 or -1 from the n-gram counts in ``table``.

    ``table`` MUST have been built with the same ``n``; this is not checked
    here, so verify it before calling.

    The sentence is split on single spaces and turned into n-grams with
    ``form_ngrams``. Two running products start at 1. For an n-gram present in
    ``table`` they are multiplied by ``entry[1] / entry[0]`` and
    ``entry[2] / entry[0]`` respectively; for an n-gram absent from ``table``
    both are multiplied by 0.5. Returns 1 when the first product is greater
    than or equal to the second (ties included), otherwise -1.
    """
    score_pos = 1
    score_neg = 1
    for gram in form_ngrams(sentence.split(" "), n=n):
        if gram in table:
            factor_pos = table[gram][1] / table[gram][0]
            factor_neg = table[gram][2] / table[gram][0]
        else:
            # Unknown n-gram: same factor on both sides.
            factor_pos = factor_neg = 0.5
        score_pos *= factor_pos
        score_neg *= factor_neg
    return 1 if score_pos >= score_neg else -1

## Archives

Same thing as for the other archives: we can fall back on this should the functions crash.

### 1. Classifier based on words only

In [191]:
# def classifier_words(sentence):
#     proba_pos = 1
#     proba_neg = 1
#     list_words = sentence.split(" ")
#     for w in list_words:
#         if w in words:
#             proba_pos *= words[w][1]/words[w][0]
#             proba_neg *= words[w][2]/words[w][0]
#         else:
#             proba_pos *= 0.5
#             proba_neg *= 0.5
#     if proba_pos >= proba_neg:
#         return 1
#     return -1

### 2. Classifier based on bigrams only

In [192]:
# def classifier_bigrams(sentence):
#     proba_pos = 1
#     proba_neg = 1
#     list_words = sentence.split(" ")
#     list_bigrams = form_ngrams(list_words, n=2)
#     for b in list_bigrams:
#         if b in bigrams:
#             proba_pos *= bigrams[b][1]/bigrams[b][0]
#             proba_neg *= bigrams[b][2]/bigrams[b][0]
#         else:
#             proba_pos *= 0.5
#             proba_neg *= 0.5
#     if proba_pos >= proba_neg:
#         return 1
#     return -1

# Test zone

## 1. Confusion Matrix

In [193]:
def get_confusion_matrix(xs, ys, classifier, table, n):
    """Count the outcomes of ``classifier`` over labelled samples in a 2x2 matrix.

    ``classifier`` is called as ``classifier(x, table, n=n)`` for each sample
    and its result is compared with the sample's label. The returned matrix is
    laid out as follows:

    * ``matrix[0][0]``: label is 1 and the result equals the label;
    * ``matrix[1][1]``: label is not 1 and the result equals the label;
    * ``matrix[1][0]``: label is 1 and the result differs from the label;
    * ``matrix[0][1]``: label is not 1 and the result differs from the label.
    """
    matrix = [[0, 0], [0, 0]]
    for sample, label in zip(xs, ys):
        is_correct = classifier(sample, table, n=n) == label
        # The column is picked by the true label; a correct result lands on
        # the diagonal, an incorrect one on the other row of that column.
        column = 0 if label == 1 else 1
        row = column if is_correct else 1 - column
        matrix[row][column] += 1
    return matrix

In [194]:
# Evaluate each single-n-gram classifier on the local test split, pairing every
# table with the n it was built with.
confusion_matrix_words = get_confusion_matrix(test_x, test_y, classifier_ngrams, words, 1)
confusion_matrix_bigrams = get_confusion_matrix(test_x, test_y, classifier_ngrams, bigrams, 2)
confusion_matrix_trigrams = get_confusion_matrix(test_x, test_y, classifier_ngrams, trigrams, 3)

# Print the matrices row by row, with an empty line between two matrices.
for position, current_matrix in enumerate(
    (confusion_matrix_words, confusion_matrix_bigrams, confusion_matrix_trigrams)
):
    if position > 0:
        print("")
    for matrix_row in current_matrix:
        print(matrix_row)

[14932, 5346]
[5068, 14654]

[17293, 5319]
[2707, 14681]

[16605, 5843]
[3395, 14157]


## 2. F-score

In [195]:
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def compute_precision(matrix):
    """Return the precision read from a 2x2 confusion matrix.

    Uses ``matrix[0][0]`` as the true positives and ``matrix[0][1]`` as the
    false positives: precision = TP / (TP + FP). Returns 0.0 instead of
    raising ZeroDivisionError when TP + FP is zero.
    """
    true_pos = matrix[0][0]
    false_pos = matrix[0][1]
    return _safe_ratio(true_pos, true_pos + false_pos)


def compute_recall(matrix):
    """Return the recall read from a 2x2 confusion matrix.

    Uses ``matrix[0][0]`` as the true positives and ``matrix[1][0]`` as the
    false negatives: recall = TP / (TP + FN). Returns 0.0 instead of raising
    ZeroDivisionError when TP + FN is zero.
    """
    true_pos = matrix[0][0]
    false_neg = matrix[1][0]
    return _safe_ratio(true_pos, true_pos + false_neg)


def compute_fscore(matrix):
    """Return the F-score, 2 * p * r / (p + r), of a 2x2 confusion matrix.

    ``p`` and ``r`` come from ``compute_precision`` and ``compute_recall``.
    Returns 0.0 instead of raising ZeroDivisionError when both are zero.
    """
    p = compute_precision(matrix)
    r = compute_recall(matrix)
    return _safe_ratio(2 * (p * r), p + r)

In [196]:
# One report line per classifier: the line template and the matrix it summarises.
score_reports = (
    ("Words only: precision={:.3f}, recall={:.3f}, f-score={:.3f}", confusion_matrix_words),
    ("Bigrams only: precision={:.3f}, recall={:.3f}, f-score={:.3f}", confusion_matrix_bigrams),
    ("Trigrams only: precision={:.3f}, recall={:.3f}, f-score={:.3f}", confusion_matrix_trigrams),
)
for report_template, report_matrix in score_reports:
    print(report_template.format(
        compute_precision(report_matrix),
        compute_recall(report_matrix),
        compute_fscore(report_matrix)
    ))

Words only: precision=0.736, recall=0.747, f-score=0.741
Bigrams only: precision=0.765, recall=0.865, f-score=0.812
Trigrams only: precision=0.740, recall=0.830, f-score=0.782
