In [1]:
import math
from matplotlib import pyplot as plt
import numpy as np
import random
import re
from tqdm import tqdm

In [2]:
# Training tweets: one file loaded with label 1, the other with label -1.
PATH_TO_POS = "../text/twitter-datasets/train_pos_full.txt"
PATH_TO_NEG = "../text/twitter-datasets/train_neg_full.txt"
# Tweets to classify; parsed downstream as "<id>,<text>" lines.
PATH_TO_TEST = "../text/twitter-datasets/test_data.txt"
# Destination of the generated "Id,Prediction" CSV file.
PATH_TO_SUB = "./submission.csv"

# Obtaining the data from the files

## General function to get data from a file for this project

In [3]:
def get_data_from_file(filename, proportion=None, isTraining=False, value=None, shuffle=False):
    """Read a tweet file and return its lines in the shape required by the caller.

    The mode is selected in this order:

    * ``proportion is not None`` (assumed to mean local testing): the lines are
      cut at index ``int(len(lines) * proportion)`` and the function returns
      ``(first_x, first_y, second_x, second_y)``. Each label list has exactly
      the same length as its text list and contains only ``value``.
    * ``isTraining`` is true (real prediction phase, training data): returns
      ``(xs, ys)`` where every line is labelled ``value``.
    * otherwise (real prediction phase, data to classify): every line that
      contains a comma is split at its first comma into an id and a text;
      lines without a comma are skipped. Returns ``(ids, texts)``.

    ``shuffle`` shuffles the lines with ``random.shuffle`` before any of the
    above; it is only meant for local testing, where the choice of the
    train/validation subsets can influence the result.
    """
    with open(filename, "r") as file:
        content_lines = file.read().split("\n")

    # A file terminated by a newline produces one trailing empty string, which
    # is not a tweet: drop it so it is neither labelled nor shuffled into the data.
    if content_lines and content_lines[-1] == "":
        content_lines.pop()

    if shuffle:
        random.shuffle(content_lines)

    if proportion is not None:
        # Training data during the local testing phase.
        cut = int(len(content_lines) * proportion)
        first_x = content_lines[:cut]
        second_x = content_lines[cut:]
        # Labels are sized from the slices themselves so that texts and labels
        # can never get out of step because of float truncation.
        return first_x, [value] * len(first_x), second_x, [value] * len(second_x)

    if isTraining:
        # Training data during the real prediction phase.
        return content_lines, [value] * len(content_lines)

    # New data for the real prediction phase.
    temp_ids = []
    temp_xs = []
    for line in content_lines:
        if "," in line:
            # Only the first comma separates the id; the text may contain commas.
            entry_id, entry_text = line.split(",", 1)
            temp_ids.append(entry_id)
            temp_xs.append(entry_text)
    return temp_ids, temp_xs

## Actual retrieval of the data

In [4]:
# Full labelled training set: the positive tweets (label 1) come first,
# followed by the negative tweets (label -1).
train_x = []
train_y = []

for source_path, label in ((PATH_TO_POS, 1), (PATH_TO_NEG, -1)):
    temp_train_x, temp_train_y = get_data_from_file(
        source_path,
        isTraining=True,
        value=label,
        shuffle=False
    )
    train_x.extend(temp_train_x)
    train_y.extend(temp_train_y)


# Constructing the tables giving the likelihood of being a positive or a negative element

## For each n-gram

### First form the n-grams for each sentence (the sentence is already split into a list of words)

In [5]:
def form_ngrams(words, n):
    """Return every run of ``n`` consecutive words, joined by single spaces.

    The runs are listed in order of their starting position. When ``words``
    holds fewer than ``n`` items the result is an empty list.
    """
    window_count = len(words) - n + 1
    return [
        " ".join(words[start + offset] for offset in range(n))
        for start in range(window_count)
    ]

### Build the table

In [6]:
def likelihood_table_constructor(xs, ys, n=2):
    """Count, for every n-gram found in ``xs``, its occurrences per class.

    Each sentence is split on single spaces and cut into n-grams with
    ``form_ngrams``. The returned dict maps an n-gram to a three-item list
    ``[total, positive, negative]``. The counters start from pseudo-counts
    (``total=2``, ``positive=1``, ``negative=1``); every occurrence then adds
    one to ``total`` and one to ``positive`` when the sentence label equals
    ``1``, or to ``negative`` for any other label.
    """
    table = {}
    for sentence, label in zip(xs, ys):
        for ngram in form_ngrams(sentence.split(" "), n=n):
            counts = table.setdefault(ngram, [2, 1, 1])
            counts[0] += 1
            # Slot 1 holds the positive count, slot 2 the negative count.
            counts[1 if label == 1 else 2] += 1
    return table

### Use the previous function to build the table, with the correct parameters

In [7]:
# N-gram orders used by the model: unigrams, bigrams and trigrams.
ns = [1, 2, 3]

# One likelihood table per n-gram order, keyed by that order.
tables = {}
for n in ns:
    tables[n] = likelihood_table_constructor(train_x, train_y, n=n)

# Basic classifiers for a sentence

## Classifier based on multiple ngrams

In [8]:
def classifier_ngrams_mean(sentence, tables, ns, coeffs):
    """Classify ``sentence`` from a weighted combination of per-order n-gram scores.

    For every order ``n`` in ``ns`` the sentence (split on single spaces) is cut
    into n-grams, and two products are built: one of the ``positive / total``
    ratios and one of the ``negative / total`` ratios read from ``tables[n]``.
    An n-gram missing from the table contributes ``0.5`` to both products.
    The per-order products are then combined with the matching weights of
    ``coeffs``.

    Returns ``1`` when the combined positive score is greater than or equal to
    the negative one, ``-1`` otherwise.
    """
    tokens = sentence.split(" ")

    # One (positive product, negative product) pair per n-gram order.
    per_order_scores = []
    for n in ns:
        pos_product = 1
        neg_product = 1
        for ngram in form_ngrams(tokens, n=n):
            counts = tables[n].get(ngram)
            if counts is None:
                # Unseen n-gram: no evidence for either class.
                pos_product *= 0.5
                neg_product *= 0.5
            else:
                total = counts[0]
                pos_product *= counts[1] / total
                neg_product *= counts[2] / total
        per_order_scores.append((pos_product, neg_product))

    pos_score = 0
    neg_score = 0
    for (pos_product, neg_product), weight in zip(per_order_scores, coeffs):
        pos_score += weight * pos_product
        neg_score += weight * neg_product

    return 1 if pos_score >= neg_score else -1

# Prediction and submission file creation

In [9]:
# Neither proportion nor isTraining is given, so the file is parsed as
# "<id>,<text>" lines and returned as two parallel lists.
test_ids, test_xs = get_data_from_file(PATH_TO_TEST)

In [10]:
def predict_write(tables, ns, xs=test_xs, classifier=classifier_ngrams_mean,
                  ids=test_ids, coeffs=None):
    """Classify every sentence of ``xs`` and write the results to ``PATH_TO_SUB``.

    Parameters:
        tables: likelihood tables, passed through to ``classifier``.
        ns: n-gram orders, passed through to ``classifier``.
        xs: sentences to classify (defaults to the loaded test sentences).
        classifier: callable invoked as ``classifier(x, tables, ns, coeffs)``.
        ids: identifiers written next to the predictions, one per sentence
            (defaults to the loaded test ids). Pass it together with a custom
            ``xs`` so that rows stay aligned.
        coeffs: one weight per n-gram order; when omitted, the weights
            ``[(1-0.802)/2, 0.802, (1-0.802)/2]`` are used, which assumes
            three orders.

    Raises:
        ValueError: when the number of ids differs from the number of
            predictions. The check runs before the output file is opened, so
            an existing submission is not overwritten by a misaligned one.

    The output file starts with the header ``Id,Prediction`` followed by one
    ``<id>,<prediction>`` line per sentence.
    """
    if coeffs is None:
        coeffs = [(1-0.802)/2, 0.802, (1-0.802)/2]

    # Predictions
    test_preds = [classifier(x, tables, ns, coeffs) for x in xs]

    # zip() would silently truncate to the shorter sequence and produce an
    # incomplete or misaligned submission, so fail loudly instead.
    if len(ids) != len(test_preds):
        raise ValueError(
            "got {} ids for {} predictions".format(len(ids), len(test_preds))
        )

    # Writing
    with open(PATH_TO_SUB, "w") as file:
        file.write("Id,Prediction\n")
        for entry_id, pred in zip(ids, test_preds):
            file.write("{},{}\n".format(entry_id, pred))

In [11]:
# Classify the loaded test sentences with the default classifier and write the submission file.
predict_write(tables, ns)