In [124]:
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import f1_score, classification_report
from sklearn.metrics import confusion_matrix
import random
import torch
import torch.nn as nn
import hashlib
from tqdm import tqdm

In [125]:
random.seed(1234)
np.random.seed(1234)
torch.manual_seed(1234)

<torch._C.Generator at 0x20c927e3d30>

In [15]:
import tarfile
import requests

FOLDER = "datasets/"
PATH = "sentences.tar.bz2"
URL = "https://downloads.tatoeba.org/exports/" + PATH

def download_dataset():
    req = requests.get(URL, stream=True)

    if req.status_code != 200:
        print("Failed to download file, got status: " + req.status_code)
        req.close()
    else:
        with open(FOLDER + PATH, "wb") as fd:
            bytes = 0
            for chunk in req.iter_content(chunk_size=65536):
                fd.write(chunk)
                bytes += len(chunk)
                print(f"Downloading: {bytes} bytes written to {PATH}")
        print(f"{PATH} downloaded.")
        req.close()

def extract_dataset():
    with tarfile.open(FOLDER + PATH, "r:bz2") as tar:
        tar.extractall(FOLDER)
        print(f"{PATH} extracted to '{FOLDER}'.")


In [16]:
#download_dataset()
#extract_dataset()

Downloading: 65536 bytes written to sentences.tar.bz2
Downloading: 131072 bytes written to sentences.tar.bz2
Downloading: 196608 bytes written to sentences.tar.bz2
Downloading: 262144 bytes written to sentences.tar.bz2
Downloading: 327680 bytes written to sentences.tar.bz2
Downloading: 393216 bytes written to sentences.tar.bz2
Downloading: 458752 bytes written to sentences.tar.bz2
Downloading: 524288 bytes written to sentences.tar.bz2
Downloading: 589824 bytes written to sentences.tar.bz2
Downloading: 655360 bytes written to sentences.tar.bz2
Downloading: 720896 bytes written to sentences.tar.bz2
Downloading: 786432 bytes written to sentences.tar.bz2
Downloading: 851968 bytes written to sentences.tar.bz2
Downloading: 917504 bytes written to sentences.tar.bz2
Downloading: 983040 bytes written to sentences.tar.bz2
Downloading: 1048576 bytes written to sentences.tar.bz2
Downloading: 1114112 bytes written to sentences.tar.bz2
Downloading: 1179648 bytes written to sentences.tar.bz2
Download

In [111]:
dataset = open(FOLDER + 'sentences.csv', encoding='utf-8').read().strip().split('\n')

In [112]:
# Clean the data

dataset = list(map(lambda x: tuple(map(str.strip, x.split('\t'))), dataset))

In [113]:
from collections import Counter
counter = Counter(map(lambda x: x[1], dataset))

In [114]:
# Limit the amount of languages and data to use to limit computational costs

MAX_LANGS = 6
MAX_DOCS  = 50000

langs = [lang for (lang, _) in counter.most_common(MAX_LANGS)]

In [116]:
from collections import defaultdict

items_by_lang = defaultdict(list)

for item in dataset:
    lang = item[1]
    items_by_lang[lang].append(item)

dataset = [item for lang in langs for item in items_by_lang[lang][:MAX_DOCS]]

In [58]:
random.shuffle(dataset)

In [66]:
# N-grams to allow bag-of-words

def ngrams(sentence, n=1, lc=True):
    if lc:
        sentence = sentence.lower()
    length = len(sentence)

    return [sentence[i:i + n] for i in range(length - n + 1)] if n > 0 else []

In [69]:
def all_ngrams(sentence, max_ngram=3, lc=True):
    all_ngrams = []

    for i in range(1, max_ngram + 1):
        all_ngrams += [ngrams(sentence, n=i, lc=lc)]

    return all_ngrams

In [75]:
MAX_CHARS = 521
MAX_BIGRAMS = 1031
MAX_TRIGRAMS = 1031

In [76]:
# for debugging purposes

def reproducible_hash(string):

    h = hashlib.md5(string.encode("utf-8"), usedforsecurity=False)
    return int.from_bytes(h.digest()[0:8], 'big', signed=True)

In [77]:
MAXES = [MAX_CHARS, MAX_BIGRAMS, MAX_TRIGRAMS]

In [130]:
# use hash trick to limit space usage

def hash_ngrams(ngrams, modulos):
   #return [list(map(lambda x: reproducible_hash(x) % modulos[i], ngrams[i])) for i in range(len(modulos))]
   return [ [ reproducible_hash(string) % modulo for string in ngram ] for ngram, modulo in zip(ngrams, modulos) ]

In [127]:
# use relative frequencies instead of true embeddings (computationally costly)

def calc_rel_freq(codes):
    length = len(codes)
    counts = Counter(codes)

    for key in counts:
        counts[key] /= length

    return counts

In [131]:
MAX_SHIFT = []
for i in range(len(MAXES)):
    MAX_SHIFT += [sum(MAXES[:i])]

In [132]:
def shift_keys(dicts, MAX_SHIFT):
    return { key + shift: val for ngram, shift in zip(dicts, MAX_SHIFT) for (key, val) in ngram.items() }

In [133]:
def build_freq_dict(sentence, MAXES=MAXES, MAX_SHIFT=MAX_SHIFT):
    hngrams = hash_ngrams(all_ngrams(sentence), MAXES)
    fhcodes = map(calc_rel_freq, hngrams)
    return shift_keys(fhcodes, MAX_SHIFT)

In [134]:
dataset_num = []
for datapoint in tqdm(dataset):
    dataset_num += [list(datapoint) + [build_freq_dict(datapoint[2])]]

 61%|██████    | 182202/300000 [01:11<00:43, 2686.01it/s]

In [85]:
X_cat = [ dataset[3] for dataset in dataset_num ]

In [87]:
vectorizer = DictVectorizer(sparse=False)

X = vectorizer.fit_transform(X_cat)

In [88]:
y_cat = [ dataset[1] for dataset in dataset_num ]

In [89]:
y_symbols = set(y_cat)

idx2lang = {}
lang2idx = {}

for idx, lang in enumerate(y_symbols):
    idx2lang[idx]  = lang
    lang2idx[lang] = idx

In [91]:
y = [ lang2idx[lang] for lang in y_cat ]

In [101]:
training_examples = int(X.shape[0] * 0.8)

X_train = X[:training_examples, :]
y_train = y[:training_examples]

X_val = X[training_examples:, :]
y_val = y[training_examples:]

In [96]:
# very simple pytorch module, but good enough for task

class Model(nn.Module):
    def __init__(self, input_dim):
        super(Model, self).__init__()

        self.fc1 = nn.Linear(input_dim, 50)
        self.fc2 = nn.Linear(50, 6)

    def forward(self, x):
        x = self.fc1(x)
        x = self.fc2(x)
        return x   

In [123]:
input_dim = X.shape[1]
model = Model(input_dim)

In [100]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001) 
# NAdam probably not needed due to simplicity

In [122]:
X_train = torch.Tensor(X_train)
y_train = torch.LongTensor(y_train)

X_val = torch.Tensor(X_val)
y_val = torch.LongTensor(y_val)

In [106]:
from torch.utils.data import TensorDataset, DataLoader

dataset_tensor = TensorDataset(X_train, y_train)
dataloader = DataLoader(dataset_tensor, batch_size=32, shuffle=True)

In [107]:
model.train()

Model(
  (fc1): Linear(in_features=2230, out_features=50, bias=True)
  (fc2): Linear(in_features=50, out_features=6, bias=True)
)

In [108]:
for epoch in range(5):
    loss_train = 0

    for X_batch, y_batch in dataloader:
        optimizer.zero_grad()
        
        outputs = model(X_batch)
        loss = loss_fn(outputs, y_batch)
        
        loss.backward()
        optimizer.step()
        
        loss_train += loss.item()
    print(loss_train)

487.6147324163576
39.17166730403005
21.963762875254417
14.687231717943519
8.839342773388944


In [117]:
Y_val_pred_logits = model(X_val)

In [118]:
Y_val_pred_proba = torch.softmax(Y_val_pred_logits, dim = 1)

In [119]:
y_val_pred = torch.argmax(Y_val_pred_proba, dim = 1)

In [120]:
# Validate data...

print(classification_report(y_val, y_val_pred, target_names=y_symbols))
print('Micro F1:', f1_score(y_val, y_val_pred, average='micro'))
print('Macro F1', f1_score(y_val, y_val_pred, average='macro'))

              precision    recall  f1-score   support

         tur       1.00      1.00      1.00     10044
         eng       1.00      1.00      1.00      9927
         ita       1.00      1.00      1.00      9990
         rus       1.00      1.00      1.00      9932
         epo       1.00      1.00      1.00     10087
         kab       1.00      1.00      1.00     10020

    accuracy                           1.00     60000
   macro avg       1.00      1.00      1.00     60000
weighted avg       1.00      1.00      1.00     60000

Micro F1: 0.9985
Macro F1 0.9985017697777007


In [121]:
confusion_matrix(y_val, y_val_pred)

array([[10027,     6,     4,     0,     4,     3],
       [    2,  9912,     3,     0,     6,     4],
       [    1,     8,  9969,     0,     7,     5],
       [    0,     0,     1,  9931,     0,     0],
       [    4,     3,    10,     0, 10067,     3],
       [    6,     5,     3,     0,     2, 10004]], dtype=int64)