#This notebook contains the CNN used by Team Phlyers for the RMI shared task at VarDial2020. It's an adaptation of the CNN presented in Butnaru and Ionescu (2019), the paper in which the MOROCO corpus was first presented.

#The first few blocks are needed to set up the directory.

In [None]:
# Load the Google Colab Drive helper (this import only works inside a Colab runtime).
from google.colab import drive

# Mount Google Drive under /content/drive so the notebook can reach its files.
# This will prompt for authorization.
drive.mount('/content/drive')



In [None]:
%cd /content/drive/My Drive/Colab Notebooks


#This block loads the data.

In [None]:
from __future__ import unicode_literals, print_function, division
from io import open
from collections import Counter
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import DataLoader, Dataset, IterableDataset
from sklearn.metrics import f1_score
import string 

all_categories = []  # label of every training line (one entry per line, not de-duplicated)
training = []        # preprocessed text of every training line, parallel to all_categories

# Load the data and preprocess.
# Every line is tab-separated: the first field is the text, the last field is the label.
# The `with` block closes the file handle once the corpus has been read.
with open('data/train.txt', encoding='utf-8', errors='ignore') as train_file:
    for line in train_file:
        fields = line.split('\t')
        # Strip trailing whitespace and any U+202C characters surrounding the label.
        category = fields[-1].rstrip().strip('\u202c')
        all_categories.append(category)
        # Remove the $NE$ placeholder and lowercase the text.
        sentence = fields[0].replace('$NE$', '').lower()
        training.append(sentence)

# These are letters that appear more than 50 times in the corpus. The others are excluded.
all_letters = 'cumaspnetfârşidvoljgzţăbîxwhșțkyкуинсайдертябгqхоéàпвылшǎцáзфьмжщчãöü̦̆ю̧ȋэç'

n_letters = len(all_letters)
print('Characters:', all_letters)
print('# of characters:', n_letters)
print('# of sentences:', len(training))
print('# of labels:', len(set(all_categories)))

# Map every character to a 1-based index; index 0 stays free for characters
# that are not in the alphabet. These indices select the rows of the one-hot tensors.
dic_letters = {letter: position for position, letter in enumerate(all_letters, start=1)}


#Hyper-parameters

learning_rate = 0.0001  # step size handed to the Adam optimizer
num_epochs = 20  # number of passes over the training data
batch_size = 10  # number of sentences per mini-batch produced by the DataLoader



# Helper functions

# Get the letter index
def letterToIndex(letter):
    """Return the index stored for `letter` in `dic_letters`, or 0 when it is not listed."""
    return dic_letters.get(letter, 0)

# One-hot vectors

#Turns a single line into a tensor
def lineToTensor(line):
    """One-hot encode a single line.

    Returns a float tensor of shape (n_letters + 1, 5000). Column `i` holds a 1 in
    the row given by `letterToIndex` for the i-th character; characters beyond the
    first 5000 are ignored and unused columns stay all-zero.
    """
    clipped = line[:5000]
    encoded = torch.zeros(n_letters + 1, 5000)
    if clipped:
        # Set all (row, column) pairs in one indexed assignment.
        rows = torch.tensor([letterToIndex(symbol) for symbol in clipped])
        columns = torch.arange(len(clipped))
        encoded[rows, columns] = 1
    return encoded

#Turns a batch of lines into a batch of tensors
def linesToTensors(lines):
    """One-hot encode a batch of lines.

    Args:
        lines: iterable of strings (e.g. the sentence list yielded by the DataLoader).

    Returns:
        Float tensor of shape (number of lines, n_letters + 1, 5000). For line `b`,
        column `i` holds a 1 in the row given by `letterToIndex` for its i-th
        character; characters beyond the first 5000 are ignored.

    The batch dimension follows the input, not the global `batch_size`, so a
    short (or empty) batch is no longer padded with all-zero samples and a longer
    one no longer overflows the tensor.
    """
    lines = list(lines)
    tensor = torch.zeros(len(lines), n_letters + 1, 5000)
    for batch, line in enumerate(lines):
        clipped = line[:5000]
        if not clipped:
            continue  # nothing to mark; also avoids indexing with an empty tensor
        rows = torch.tensor([letterToIndex(letter) for letter in clipped])
        columns = torch.arange(len(clipped))
        tensor[batch, rows, columns] = 1
    return tensor

#Turns categories into tensors
def categoriesToTensors(categories):
    """Convert a batch of label strings into a LongTensor of class ids.

    Args:
        categories: sequence of label strings.

    Returns:
        1-D tensor of dtype torch.long with one class id per label.

    Raises:
        ValueError: if a label does not occur in `all_categories`.

    `all_categories` holds one label per training line, so the position of a
    label's first occurrence is not a valid class id unless the first lines
    happen to carry distinct labels. Ids are therefore assigned densely
    (0, 1, ...) in order of first appearance, which gives the same ids as before
    whenever the old lookup produced valid ones.
    """
    # dict.fromkeys de-duplicates while keeping first-occurrence order.
    label_ids = {label: idx for idx, label in enumerate(dict.fromkeys(all_categories))}
    try:
        ids = [label_ids[label] for label in categories]
    except KeyError as missing:
        # Keep the exception type that list.index raised for unknown labels.
        raise ValueError('%r is not in list' % (missing.args[0],))
    return torch.tensor(ids, dtype=torch.long)

#Turns tensors into labels
def categoryFromOutput(output):
    """Return the entry of `all_categories` at the top-scoring index of `output`.

    For a batched output only the first row is looked at.
    """
    _, best_index = torch.topk(output.data, 1)  # .data: plain tensor behind a Variable
    winner = best_index[0]
    return all_categories[winner]



#This is a class based on PyTorch's Dataset class; it is wrapped in a DataLoader to batch and shuffle the training data.

In [5]:
class MyClass(Dataset):
    """Dataset that pairs each sentence with the label at the same position."""

    def __init__(self, training, labels):
        # Keep references to the two parallel sequences; nothing is copied.
        self.labels = labels
        self.training = training

    def __getitem__(self, idx):
        """Return the (sentence, label) pair stored at position `idx`."""
        sentence = self.training[idx]
        label = self.labels[idx]
        return sentence, label

    def __len__(self):
        """Number of samples, taken from the sentence sequence."""
        return len(self.training)

# Wrap the training sentences and their labels in the Dataset defined above.
dataset = MyClass(training, all_categories)
# shuffle=True reorders the samples every epoch; drop_last=True discards a final
# incomplete batch, so every batch holds exactly batch_size items.
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=True)


#This is the CNN adapted from Butnaru and Ionescu (2019).

In [None]:
######################################################################
# Creating the Network
# ====================


class Net(nn.Module):
    """Character-level CNN: three Conv1d/threshold/max-pool stages plus three linear layers.

    Args:
        in_channels: number of rows of the one-hot input. Default 77.
        seq_len: input length in characters. Default 5000.
        n_classes: number of output classes. Default 2.

    With the defaults every layer has exactly the previously hard-coded size
    (77 input channels, 23424 flattened features, 2 outputs). Submodule names and
    their order are unchanged, so existing state dicts still load.

    `forward` returns raw logits of shape (batch, n_classes). `nn.CrossEntropyLoss`
    applies log-softmax itself, so feeding it already-softmaxed values squashes the
    loss and its gradients. The arg-max is unaffected; use
    `torch.softmax(logits, dim=1)` when probabilities are needed.
    """

    def __init__(self, in_channels=77, seq_len=5000, n_classes=2):
        super(Net, self).__init__()
        flat_features = 128 * self._conv_output_length(seq_len)
        if flat_features <= 0:
            raise ValueError('seq_len=%d is too short for the three conv/pool stages' % seq_len)
        # nn.Threshold(threshold=1e-6, value=0) passes x through when x > 1e-6 and outputs 0 otherwise.
        self.conv1 = nn.Sequential(nn.Conv1d(in_channels, 128, kernel_size=7), nn.Threshold(threshold=0.000001, value=0), nn.MaxPool1d(3, stride=3))
        self.conv2 = nn.Sequential(nn.Conv1d(128, 128, kernel_size=7), nn.Threshold(threshold=0.000001, value=0), nn.MaxPool1d(3, stride=3))
        self.conv3 = nn.Sequential(nn.Conv1d(128, 128, kernel_size=3), nn.Threshold(threshold=0.000001, value=0), nn.MaxPool1d(3, stride=3))
        self.fc1 = nn.Sequential(nn.Linear(flat_features, 1000), nn.Threshold(threshold=0.000001, value=0), nn.Dropout())
        #For these two layers they do not specify the size. This was fine-tuned by us.
        self.fc2 = nn.Sequential(nn.Linear(1000, 1000), nn.Threshold(threshold=0.000001, value=0), nn.Dropout())
        self.fc3 = nn.Linear(1000, n_classes)

    @staticmethod
    def _conv_output_length(seq_len):
        """Length left after the three (unpadded conv, max-pool 3/3) stages."""
        length = seq_len
        for kernel_size in (7, 7, 3):
            # An unpadded convolution removes kernel_size - 1 positions; the pool keeps floor(L / 3).
            length = (length - kernel_size + 1) // 3
        return length

    def forward(self, x):
        """Map a (batch, in_channels, seq_len) one-hot tensor to (batch, n_classes) logits."""
        out = self.conv1(x)
        out = self.conv2(out)
        out = self.conv3(out)
        out = out.reshape(out.size(0), -1)  # flatten channels x positions for the linear layers
        out = self.fc1(out)
        out = self.fc2(out)
        return self.fc3(out)

model = Net()
# Use the GPU when one is available; fall back to the CPU instead of failing
# on a runtime without CUDA.
model = model.to('cuda' if torch.cuda.is_available() else 'cpu')
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

print('The CNN is ready.')
print('Device:', next(model.parameters()).device)


#This is the training phase. 


In [None]:
total_step = len(training)  # number of training sentences
loss_list = []              # loss of every mini-batch, in training order

# Send each batch to the device the model already lives on, not a hard-coded one.
device = next(model.parameters()).device

for epoch in range(num_epochs):
    # Training mode keeps dropout active, even if an evaluation cell switched
    # the model to eval mode before this cell was (re-)run.
    model.train()

    predicted_labels = []
    correct_labels = []

    for sentences, category in dataloader:
        tensors = linesToTensors(sentences).to(device)
        labels = categoriesToTensors(category).to(device)

        # The outputs are already on the model's device; no extra copy is needed.
        outputs = model(tensors)

        loss = criterion(outputs, labels)
        loss_list.append(loss.item())

        # Backprop and perform Adam optimisation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Collect predictions and gold labels for the epoch-level F-score
        _, predicted = torch.max(outputs.detach(), 1)
        predicted_labels.extend(predicted.tolist())
        correct_labels.extend(labels.tolist())

    # f1_score takes (y_true, y_pred) in that order.
    print('Training. Epoch-', epoch, 'F-score:', f1_score(correct_labels, predicted_labels, average='macro'))


torch.save(model.state_dict(), 'model_saved')
  


#Results on the development set

In [None]:
from sklearn.metrics import f1_score
import string 

def categoryFromOutput(output):
    """Look up the `all_categories` entry at the top-scoring index of `output`.

    Only the first row is used when `output` is batched.
    """
    # .data gives the plain tensor; topk(1) returns (values, indices).
    indices = output.data.topk(1)[1]
    return all_categories[indices[0]]

# Evaluate on whichever device the model already lives on.
device = next(model.parameters()).device

# NOTE(review): the training cell saves its weights as 'model_saved', while this
# cell loads 'trained_cnn_model' — confirm this is the intended checkpoint.
model.load_state_dict(torch.load('trained_cnn_model', map_location=device))

dev = []

# Load the data and preprocess (same decoding and label clean-up as the training data).
with open('data/dev-source.txt', encoding='utf-8', errors='ignore') as dev_file:
    for line in dev_file:
        fields = line.split('\t')
        category = fields[-1].rstrip().strip('\u202c')
        text = fields[0].replace('$NE$', '')
        # These markers are upper-case, so they must be removed BEFORE lowercasing;
        # replacing them after .lower() can never match.
        for marker in ('FOTO', 'VIDEO', 'LIVE'):
            text = text.replace(marker, '')
        dev.append((text.lower(), category))

# Class ids in order of first appearance in the training labels.
label_to_index = {label: idx for idx, label in enumerate(dict.fromkeys(all_categories))}

predicted = []
correct = []

total = len(dev)

# Dropout must be disabled for inference; remember the current mode so it can be
# restored afterwards and a later training run is not affected.
was_training = model.training
model.eval()

with torch.no_grad():  # no gradients are needed for evaluation
    for i, (sentence, category) in enumerate(dev):
        tensor = lineToTensor(sentence).unsqueeze(0).to(device)  # add the batch dimension
        outputs = model(tensor)

        _, prediction = torch.max(outputs, 1)
        # Store plain ints so f1_score receives flat label lists.
        predicted.append(prediction.item())
        correct.append(label_to_index[category])

        if i % 1000 == 0:
            # f1_score takes (y_true, y_pred) in that order.
            print(i, 'out of', total, 'F-score:', f1_score(correct, predicted, average="macro"))

model.train(was_training)

print(f1_score(correct, predicted, average="macro"))