In [130]:
import numpy as np
import pandas as pd
import random
import re

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from sklearn.preprocessing import StandardScaler    
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

# Get the data

In [170]:
def get_data_from_file(filename, proportion=None, isTraining=False, value=None, shuffle=False):
    """We assume here that (proportion != None) <=> local testing. 
    Please be sure to verify this before using the function.
    The shuffle part is only used for the local testing phase, 
    where selecting different subsets to train/validate our model can be of influence"""
    with open(filename, "r") as file:
        content = file.read()
        content_lines = content.split("\n")
        for i in range(len(content_lines)):
            if content_lines[len(content_lines)-1-i] == "" or content_lines[len(content_lines)-1-i] == " ":
                del content_lines[len(content_lines)-1-i]
        if shuffle:
            random.shuffle(content_lines)
        if proportion != None:
            # Here is the processing of training data during the local testing phase
            temp1_x = content_lines[:int(len(content_lines) * proportion)]
            temp1_y = [value] * int(len(content_lines) * proportion)
            temp2_x = content_lines[int(len(content_lines) * proportion):]
            temp2_y = [value] * (int(len(content_lines) * (1-proportion)) + 1)
            return temp1_x, temp1_y, temp2_x, temp2_y
        if isTraining:
            # Here is the processing of training data during the real prediction phase
            temp_x = content_lines[:]
            temp_y = [value] * len(content_lines)
            return temp_x, temp_y
        # Here is the processing of new data for the real prediction phase
        temp_ids = []
        temp_xs = []
        for i in range(len(content_lines)):
            if "," in content_lines[i]:
                entrySplitted = re.split(",", content_lines[i], 1)
                temp_ids.append(entrySplitted[0])
                temp_xs.append(entrySplitted[1])
        return temp_ids, temp_xs        

In [171]:
PATH_TO_POS = "../../text/twitter-datasets/train_pos.txt"
PATH_TO_NEG = "../../text/twitter-datasets/train_neg.txt"
PATH_TO_TEST = "../../text/twitter-datasets/test_data.txt"
PATH_TO_SUB = "./submission.csv"

In [172]:
proportion_train = 0.8

In [173]:
train_x = []
train_y = []
test_x = []
test_y = []

temp_train_x, temp_train_y, temp_test_x, temp_test_y = get_data_from_file(
    PATH_TO_POS, 
    proportion=proportion_train,
    isTraining=True,
    value=1,
    shuffle=False
)
train_x += temp_train_x
train_y += temp_train_y
test_x += temp_test_x
test_y += temp_test_y
print("{} ?= {}".format(
    len(test_x),
    len(test_y)
))
temp_train_x, temp_train_y, temp_test_x, temp_test_y = get_data_from_file(
    PATH_TO_NEG, 
    proportion=proportion_train,
    isTraining=True,
    value=0,
    shuffle=False
)
train_x += temp_train_x
train_y += temp_train_y
test_x += temp_test_x
test_y += temp_test_y
print("{} ?= {}".format(
    len(test_x),
    len(test_y)
))

20000 ?= 19999
40000 ?= 39998


# Get the probas

## Formatting the inputs dictionary

In [135]:
ns = [1, 2, 3]          # We use n-grams with 1<=n<=3

In [136]:
def form_ngrams(words, n):
    ngrams = []
    number_ngrams = len(words)-n+1
    for i in range(number_ngrams):
        ngram = ""
        for j in range(n):
            ngram += words[i+j]
            if j != n-1:
                ngram += " "
        ngrams.append(ngram)
    return ngrams

In [137]:
def likelihood_ngrams_table_constructor(xs, ys):
    table = {}
    for x,y in zip(xs, ys):
        for ngram in x:
            if ngram not in table:
                table[ngram] = [3, 1, 1]
                if y == 1:
                    table[ngram][1] += 1
                else:
                    table[ngram][2] += 1
            else:
                table[ngram][0] += 1
                if y == 1:
                    table[ngram][1] += 1
                else:
                    table[ngram][2] += 1
    return table

In [138]:
inputs_ngrams = {
    "train":{
        "y": train_y
    },
    "test":{
        "y": test_y
    }
}

for n in ns:
    inputs_ngrams["train"][n] = []
    inputs_ngrams["test"][n] = []
    for entry in train_x:
        inputs_ngrams["train"][n].append(form_ngrams(entry.split(" "), n))
    for entry in test_x:
        inputs_ngrams["test"][n].append(form_ngrams(entry.split(" "), n))

## Computing the tables

In [139]:
tables = {}
for n in ns:
    tables[n] = likelihood_ngrams_table_constructor(
        inputs_ngrams["train"][n],
        inputs_ngrams["train"]["y"]
    )

## Computing the probas

In [140]:
def get_proba_ngram(list_ngrams, table):
    """Please make sure that the table have been generated with the same n."""
    proba_pos = 1
    proba_neg = 1
    for ngram in list_ngrams:
        if ngram in table:
            proba_pos *= table[ngram][1]/table[ngram][0]
            proba_neg *= table[ngram][2]/table[ngram][0]
        else:
            proba_pos *= 0.5
            proba_neg *= 0.5
    return proba_pos, proba_neg

In [141]:
inputs_probas = {
    "train":[],
    "test":[],
    "y":{
        "train":[],
        "test":[]
    }
}
for n in ns:
    for i,entry in enumerate(inputs_ngrams["train"][n]):
        if n == ns[0]:
            inputs_probas["train"].append([])
        temp = get_proba_ngram(inputs_ngrams["train"][n][i], tables[n])
        inputs_probas["train"][i].append(temp[0])
        inputs_probas["train"][i].append(temp[1])
    for i,entry in enumerate(inputs_ngrams["test"][n]):
        if n == ns[0]:
            inputs_probas["test"].append([])
        temp = get_proba_ngram(inputs_ngrams["test"][n][i], tables[n])
        inputs_probas["test"][i].append(temp[0])
        inputs_probas["test"][i].append(temp[1])
for i,y in enumerate(inputs_ngrams["train"]["y"]):
    inputs_probas["y"]["train"].append(y)
for i,y in enumerate(inputs_ngrams["test"]["y"]):
    inputs_probas["y"]["test"].append(y)

# Form the datasets

In [142]:
X_train = pd.DataFrame(data=inputs_probas["train"])
y_train = pd.Series(data=inputs_probas["y"]["train"])
X_test = pd.DataFrame(data=inputs_probas["test"])
y_test = pd.Series(data=inputs_probas["y"]["test"])

# Neural Network

## Create

### Rescaling data

In [143]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

### Hyperparameters

In [144]:
EPOCHS = 50
BATCH_SIZE = 64
LEARNING_RATE = 0.001

### Data classes

In [145]:
class TrainData(Dataset):
    
    def __init__(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data
        
    def __getitem__(self, index):
        return self.X_data[index], self.y_data[index]
        
    def __len__ (self):
        return len(self.X_data)


train_data = TrainData(
    torch.FloatTensor(X_train), 
    torch.FloatTensor(y_train)
)

In [146]:
class TestData(Dataset):
    
    def __init__(self, X_data):
        self.X_data = X_data
        
    def __getitem__(self, index):
        return self.X_data[index]
        
    def __len__ (self):
        return len(self.X_data)
    

test_data = TestData(torch.FloatTensor(X_test))

In [147]:
train_loader = DataLoader(
    dataset=train_data, 
    batch_size=BATCH_SIZE, 
    shuffle=True
)
test_loader = DataLoader(dataset=test_data, batch_size=1)

### Neural Network class

In [148]:
class BinaryClassification(nn.Module):
    def __init__(self):
        super(BinaryClassification, self).__init__()        # Number of input features is 6.
        self.layer_1 = nn.Linear(6, 64) 
        self.layer_2 = nn.Linear(64, 64)
        self.layer_out = nn.Linear(64, 1) 
        
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.1)
        self.batchnorm1 = nn.BatchNorm1d(64)
        self.batchnorm2 = nn.BatchNorm1d(64)
        
    def forward(self, inputs):
        x = self.relu(self.layer_1(inputs))
        x = self.batchnorm1(x)
        x = self.relu(self.layer_2(x))
        x = self.batchnorm2(x)
        x = self.dropout(x)
        x = self.layer_out(x)
        
        return x

In [149]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [150]:
model = BinaryClassification()
model.to(device)
print(model)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

BinaryClassification(
  (layer_1): Linear(in_features=6, out_features=64, bias=True)
  (layer_2): Linear(in_features=64, out_features=64, bias=True)
  (layer_out): Linear(in_features=64, out_features=1, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.1, inplace=False)
  (batchnorm1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


### Accuracy function

In [151]:
def binary_acc(y_pred, y_test):
    y_pred_tag = torch.round(torch.sigmoid(y_pred))

    correct_results_sum = (y_pred_tag == y_test).sum().float()
    acc = correct_results_sum/y_test.shape[0]
    acc = torch.round(acc * 100)
    
    return acc

## Train

In [152]:
model.train()
for e in range(1, EPOCHS+1):
    epoch_loss = 0
    epoch_acc = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        
        y_pred = model(X_batch)
        
        loss = criterion(y_pred, y_batch.unsqueeze(1))
        acc = binary_acc(y_pred, y_batch.unsqueeze(1))
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        

    print(f'Epoch {e+0:03}: | Loss: {epoch_loss/len(train_loader):.5f} | Acc: {epoch_acc/len(train_loader):.3f}')

Epoch 001: | Loss: 0.24543 | Acc: 87.742
Epoch 002: | Loss: 0.21866 | Acc: 89.431
Epoch 003: | Loss: 0.18534 | Acc: 91.350
Epoch 004: | Loss: 0.17793 | Acc: 91.617
Epoch 005: | Loss: 0.17353 | Acc: 91.839
Epoch 006: | Loss: 0.17629 | Acc: 91.965
Epoch 007: | Loss: 0.16438 | Acc: 92.395
Epoch 008: | Loss: 0.16744 | Acc: 92.173
Epoch 009: | Loss: 0.17042 | Acc: 92.282
Epoch 010: | Loss: 0.17702 | Acc: 91.730
Epoch 011: | Loss: 0.16841 | Acc: 92.336
Epoch 012: | Loss: 0.16428 | Acc: 92.478
Epoch 013: | Loss: 0.16768 | Acc: 92.082
Epoch 014: | Loss: 0.15516 | Acc: 92.933
Epoch 015: | Loss: 0.16196 | Acc: 92.623
Epoch 016: | Loss: 0.24140 | Acc: 88.040
Epoch 017: | Loss: 0.22150 | Acc: 89.478
Epoch 018: | Loss: 0.20086 | Acc: 90.703
Epoch 019: | Loss: 0.23972 | Acc: 88.256
Epoch 020: | Loss: 0.24872 | Acc: 87.894
Epoch 021: | Loss: 0.24788 | Acc: 87.845
Epoch 022: | Loss: 0.23396 | Acc: 88.598
Epoch 023: | Loss: 0.24866 | Acc: 87.681
Epoch 024: | Loss: 0.25143 | Acc: 87.574
Epoch 025: | Los

# Testing zone

In [153]:
y_pred_list = []
model.eval()
with torch.no_grad():
    for X_batch in test_loader:
        X_batch = X_batch.to(device)
        y_test_pred = model(X_batch)
        y_test_pred = torch.sigmoid(y_test_pred)
        y_pred_tag = torch.round(y_test_pred)
        y_pred_list.append(y_pred_tag.cpu().numpy())
y_pred_list = [a.squeeze().tolist() for a in y_pred_list]

In [154]:
confusion_matrix(y_test, y_pred_list)

ValueError: Found input variables with inconsistent numbers of samples: [40000, 40002]

In [None]:
print(classification_report(y_test, y_pred_list))