# Init Colab

In [None]:
# Shell escape: print the GPU, driver and CUDA version visible to this runtime.
!nvidia-smi

Sat Mar 19 14:21:28 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 511.65       Driver Version: 511.65       CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ... WDDM  | 00000000:09:00.0  On |                  N/A |
| 30%   43C    P0    33W / 170W |    990MiB /  8192MiB |      2%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Init

In [None]:
## for data
import os
import numpy as np

## for plotting
import matplotlib.pyplot as plt

## for processing
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

nltk.download('wordnet')
nltk.download('universal_tagset')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('omw-1.4')

## for word embedding
import gensim
import gensim.downloader as gensim_api

## for model 
import torch
import torch.nn as nn

##Other
import time
import random
import json
from typing import Dict
from torch.utils.data import Dataset, DataLoader

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\orlan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\orlan\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\orlan\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\orlan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\orlan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\orlan\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already u

In [None]:
# Show the names of every pretrained embedding set the gensim downloader offers.
print([*gensim_api.info()['models']])

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [None]:
# Name of the pretrained embedding set to fetch through the gensim downloader.
gensim_vocab_name = "glove-wiki-gigaword-50"

In [None]:
# Fetch the embedding set chosen in `gensim_vocab_name`.
loaded_word_vocab = gensim_api.load(gensim_vocab_name)

In [None]:
def seed_all(seed: int = 42):
    """Seed every random number generator used by the notebook.

    Seeds PyTorch (CPU and CUDA), NumPy's global generator and Python's
    ``random`` module with the same value, then switches cuDNN to
    deterministic mode with auto-tuning disabled.

    Args:
        seed: Value passed to each generator.
    """
    print("[ Using Seed : ", seed, " ]")

    # Same seed for all libraries, applied in a fixed order.
    seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed_all,
        torch.cuda.manual_seed,
        np.random.seed,
        random.seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

In [None]:
# Seed all random number generators with the default seed (42).
seed_all()

[ Using Seed :  42  ]


In [None]:
# Working directory; every other path is derived from it.
temp_path = "."
out_path, data_path = (os.path.join(temp_path, folder) for folder in ("out", "data"))
model_p = temp_path

# One JSON-lines file per split, all inside the data folder.
train_path, dev_path, test_path = (
    os.path.join(data_path, split + ".jsonl") for split in ("train", "dev", "test")
)

# WordClassificationDataset

In [None]:
class WCDataset(Dataset):
    """Sentence-classification dataset read from a JSON-lines file.

    Every record is tokenised, POS-tagged, optionally stop-word filtered and
    lemmatised, then converted to word and POS index tensors that are
    right-padded to ``max_len``. Records that yield no tokens are skipped,
    so the dataset can be shorter than the input file.

    Each item is a tuple ``({'word_indx': tensor, 'pos_indx': tensor},
    label_index, record_id)``.
    """

    # Index given to test records and to labels missing from
    # get_class_labels(); it lies outside the 0-14 range of the real classes.
    UNKNOWN_LABEL = 15

    def __init__(self,
                 input_file: str,
                 vocab,
                 pos_vocab,
                 sw_filter=False,
                 lemming=True,
                 lowercase=True,
                 max_len=100,
                 device="cpu",
                 test=False):
        """Load ``input_file`` and encode every record.

        Args:
            input_file: Path of the JSON-lines file to read.
            vocab: Mapping word -> index; must contain "<UNK>" and "<PAD>".
            pos_vocab: Mapping POS tag -> index; must contain "<PAD>".
            sw_filter: Drop English stop words when True.
            lemming: Lemmatise every kept word when True.
            lowercase: Lowercase the text before tokenising when True.
            max_len: Number of indices each encoded sentence is cut/padded to.
            device: Stored on the instance; not used by the class itself.
            test: When True the records' 'label' field is not read.
        """
        self.input_file = input_file
        self.lowercase = lowercase
        self.lemming = lemming
        self.sw_filter = sw_filter
        self.device = device
        self.encoded_data = []
        self.class_label = self.get_class_labels()
        self.vocab = vocab
        self.test = test
        self.pos_vocab = pos_vocab
        # Built once here instead of once per sentence; the set gives
        # constant-time stop-word membership tests.
        self._lemmatizer = WordNetLemmatizer()
        self._stop_words = frozenset(stopwords.words("english"))
        self.data = self.load_jsonl(self.input_file)
        self.init_structures(self.data, max_len)

    def init_structures(self, sentences, max_len) -> None:
        """Encode ``sentences`` and append the results to ``encoded_data``."""
        for d in sentences:
            data = self.getwords(d)
            words_idx, pos_idx = self.index_words(data, max_len)

            if not self.test:
                label_idx = self.class_label.get(d['label'], self.UNKNOWN_LABEL)
            else:
                label_idx = self.UNKNOWN_LABEL
            # A sentence with no tokens cannot be encoded; skip it.
            if len(words_idx) == 0:
                continue

            encoded = {
                'word_indx': self._pad(words_idx, max_len, self.vocab["<PAD>"]),
                'pos_indx': self._pad(pos_idx, max_len, self.pos_vocab["<PAD>"]),
            }
            self.encoded_data.append((encoded, label_idx, d['id']))

    @staticmethod
    def _pad(indices, max_len, pad_value):
        """Return ``indices`` as a tensor right-padded with ``pad_value`` to ``max_len``."""
        tensor = torch.tensor(indices)
        missing = max(0, max_len - tensor.shape[0])
        return nn.ConstantPad1d((0, missing), pad_value)(tensor)

    def getwords(self, data):
        """
          Returns the words (cleaned and filtered) of the sentence in
          ``data['text']`` as ``{'words': [(word, pos), ...]}``.
        """
        text = data['text']
        if self.lowercase:
            text = text.lower()

        # Raw string: same pattern as before without the invalid "\-" escape.
        text = re.sub(r'[:;!@#$()\-&<>/,.]', '', text)

        tagged = nltk.pos_tag(nltk.word_tokenize(text), tagset='universal')
        out_s = []

        for word, pos in tagged:
            # The stop-word list is lowercase, so compare case-insensitively.
            if self.sw_filter and word.lower() in self._stop_words:
                continue

            if self.lemming:
                word = self._lemmatizer.lemmatize(word, pos=self.get_wordnet_pos(pos))
            out_s.append((word, pos))
        return {'words': out_s}

    def index_words(self, data: Dict, max_len: int):
        """Map the (word, pos) pairs in ``data['words']`` to index lists.

        Words missing from ``vocab`` become "<UNK>"; tags missing from
        ``pos_vocab`` become its "<UNK>" entry (``None`` if there is none).
        At most ``max_len`` pairs are converted.

        Returns:
            ``(word_indices, pos_indices)`` as two lists of equal length.
        """
        idxs = []
        pos_idxs = []
        for word, pos in data['words']:
            if len(idxs) == max_len:
                break
            if word in self.vocab:
                idxs.append(self.vocab[word])
            else:
                idxs.append(self.vocab["<UNK>"])
            pos_idxs.append(self.pos_vocab.get(pos, self.pos_vocab.get('<UNK>')))
        return idxs, pos_idxs

    @staticmethod
    def decode_class(index):
        """Return the class name for ``index``, or ``None`` if there is none."""
        # get_class_labels is a function and must be called before .items().
        for key, val in WCDataset.get_class_labels().items():
            if val == index:
                return key
        return None

    @staticmethod
    def get_class_labels() -> Dict:
        """Return the mapping class name -> class index (0 to 14)."""
        return {"business": 0, "crime": 1, "culture/arts": 2, "education": 3, "entertainment": 4,
                "environment": 5, "food/drink": 6, "home/living": 7, "media": 8, "politics": 9, "religion": 10,
                "sci/tech": 11, "sports": 12, "wellness": 13, "world": 14}

    @staticmethod
    def get_class_from_index(x: int):
        """Return the class name for index ``x``, or "" if there is none."""
        # Compared with ==, so ``x`` may also be a zero-dimensional tensor.
        for name, idx in WCDataset.get_class_labels().items():
            if idx == x:
                return name
        return ""

    @staticmethod
    def load_jsonl(input_path):
        """Read ``input_path`` and return one parsed JSON object per non-blank line."""
        data = []
        with open(input_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.rstrip('\r\n')
                # Blank lines (e.g. a trailing newline) are not records.
                if not line.strip():
                    continue
                data.append(json.loads(line))
        print('Loaded {} records from {}'.format(len(data), input_path))
        return data

    @staticmethod
    def get_wordnet_pos(treebank_tag):
        """Translate a POS tag into the matching WordNet constant (noun by default)."""
        if treebank_tag.startswith('ADJ'):
            return wordnet.ADJ
        elif treebank_tag.startswith('VERB'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('ADV'):
            return wordnet.ADV
        else:
            return wordnet.NOUN

    def __len__(self):
        # returns the number of samples in our dataset
        return len(self.encoded_data)

    def getData(self):
        """Return the list of encoded samples."""
        return self.encoded_data

    def __getitem__(self, idx):
        return self.encoded_data[idx]

    def collate_fn(self, data):
        """Stack a list of samples into ``((words, pos), labels, ids)`` tensors."""
        words = torch.stack([e[0]["word_indx"] for e in data])
        posses = torch.stack([e[0]["pos_indx"] for e in data])

        y = torch.tensor([e[1] for e in data])

        ids = torch.tensor([e[2] for e in data])
        return (words, posses), y, ids

In [None]:
def create_pos_embs(emb_size):
    """Build a fixed random embedding vector for every POS tag.

    The vector of the tag at position ``i`` is drawn from a generator seeded
    with ``i``, so the result is the same on every call. A private
    ``RandomState`` is used for each tag so that NumPy's global generator
    (seeded elsewhere in the notebook) is left untouched.

    Args:
        emb_size: Length of each embedding vector.

    Returns:
        ``(embeddings, index)`` where ``embeddings`` is a float array of
        shape ``(number_of_tags, emb_size)`` and ``index`` maps each tag to
        its row.
    """
    # NOTE(review): these look like Universal Dependencies tag names; confirm
    # they match what the tagger emits, otherwise tags fall back to "<UNK>".
    pos_list = ["ADJ", "ADP", "PUNCT", "ADV", "AUX", "SYM", "INTJ", "CCONJ", "X", "NOUN", "DET", "PROPN", "NUM", "VERB",
                "PART", "PRON", "SCONJ", "<UNK>", "<PAD>"]
    pos_index = {tag: row for row, tag in enumerate(pos_list)}
    pos_embs = [np.random.RandomState(row).rand(emb_size) for row in range(len(pos_list))]
    return np.array(pos_embs, dtype=float), pos_index

In [None]:
class WCParams:
    """Hyper-parameters, vocabularies and a CSV summary of the run.

    Everything is a class attribute, evaluated once when this class statement
    runs. Running it also assigns zero vectors for "<UNK>" and "<PAD>" into
    ``loaded_word_vocab`` (``word_vocab`` is the same object).
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Pre-processing options forwarded to the datasets.
    lemming = True
    sw_filter = True
    max_sentence_len = 75

    word_vocab_name = gensim_vocab_name
    word_vocab = loaded_word_vocab

    pos_vocab_embs, pos_vocab_index = create_pos_embs(10)

    word_embedding_dim = word_vocab.vector_size
    pos_embedding_dim = len(pos_vocab_embs[0])

    word_vocab["<UNK>"] = np.zeros(word_embedding_dim)
    word_vocab["<PAD>"] = np.zeros(word_embedding_dim)

    # Measured after the special tokens are in place, so the value is the same
    # on the first and on every later run of this cell.
    word_vocab_size = len(word_vocab)

    num_classes = 15
    hidden_dims = [word_embedding_dim, num_classes]

    epochs = 20
    batch_size = 64
    learning_rate = 1e-3 #default 0.001
    weight_decay = 1e-4
    dropout = 0.15

    # Two-line CSV (header, values) describing this configuration.
    csv = ",".join(
        ["device", "lemming", "sw_filter", "max_sentence_len", "word_vocab", "word_vocab_size", "word_embedding_dim", "pos_embedding_dim",
         "num_classes", "hidden_dims", "epochs", "batch_size", "learning_rate", "weight_decay", "dropout"])
    csv += "\n"
    # str(hidden_dims) contains commas; it is quoted so the row keeps the same
    # number of columns as the header.
    csv += ",".join(
        [str(device), str(lemming), str(sw_filter), str(max_sentence_len), str(word_vocab_name), str(word_vocab_size), str(word_embedding_dim),
         str(pos_embedding_dim), str(num_classes), '"{}"'.format(hidden_dims), str(epochs), str(batch_size), str(learning_rate),
         str(weight_decay), str(dropout)])

# Model

In [None]:
class WCModel(nn.Module):
    """Bag-of-embeddings sentence classifier.

    A sentence is represented by the mean of its word embeddings and passed
    through a stack of ``BatchNorm1d -> Linear`` layers whose sizes come from
    ``params.hidden_dims``. Dropout and LeakyReLU are applied between layers
    only; the last ``Linear`` returns raw logits, which is what
    ``nn.CrossEntropyLoss`` expects (it applies log-softmax itself).
    """

    def __init__(self, params: "WCParams"):
        """Build the embeddings and the feed-forward stack.

        Args:
            params: Object providing ``hidden_dims``, ``dropout``,
                ``word_vocab.vectors`` and ``pos_vocab_embs``.
        """
        super().__init__()
        self.hidden_dim = params.hidden_dims
        self.params = params

        # torch.tensor copies, so fine-tuning (freeze=False) never writes into
        # the arrays held by ``params``.
        self.word_embedding = nn.Embedding.from_pretrained(
            torch.tensor(self.params.word_vocab.vectors, dtype=torch.float32), freeze=False)
        # Kept so saved checkpoints still load; forward() does not use it.
        self.pos_embedding = nn.Embedding.from_pretrained(
            torch.tensor(self.params.pos_vocab_embs, dtype=torch.float32), freeze=False)

        self.layers = []
        n_linear = len(self.hidden_dim) - 1
        for i in range(n_linear):
            self.layers += [
                nn.BatchNorm1d(self.hidden_dim[i]),
                nn.Linear(self.hidden_dim[i], self.hidden_dim[i + 1]),
            ]
            # No dropout/activation after the output layer: logits must reach
            # the loss unchanged. Positions of earlier layers are unchanged.
            if i < n_linear - 1:
                self.layers += [
                    nn.Dropout(params.dropout),
                    nn.LeakyReLU(),
                ]
        self.sequential = nn.Sequential(*self.layers)

    def forward(self, x):
        """Return class logits for a batch.

        Args:
            x: Pair ``(word_indices, pos_indices)``; only the first is used.
        """
        word_out = self.word_embedding(x[0])

        # Mean over dimension 1 of the embedded words (padding included).
        sentence_emb = torch.mean(word_out, 1)

        out = self.sequential(sentence_emb)
        return out

    def predict(self, x):
        """Switch to eval mode and return the arg-max class index per sample."""
        self.eval()

        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            logits = self.forward(x)

        preds = logits.argmax(1)

        return preds

# Define Train-Eval functions

In [None]:
def train(model, train_dataloader, optimizer, criterion, device):
    """Run one optimisation pass over ``train_dataloader``.

    Args:
        model: Module called with the pair ``(x[0], x[1])`` of each batch.
        train_dataloader: Iterable of ``(x, y, sentence_id)`` batches.
        optimizer: Optimiser stepped once per batch.
        criterion: Loss called as ``criterion(model_output, y)``.
        device: Device the inputs and targets are moved to.

    Returns:
        ``(mean loss per batch, fraction of correctly classified samples)``.
    """
    model.train()

    running_loss = 0
    n_correct = 0
    n_seen = 0

    for inputs, targets, _ in train_dataloader:
        inputs = (inputs[0].to(device), inputs[1].to(device))
        targets = targets.to(device)

        optimizer.zero_grad()
        scores = model(inputs)
        batch_loss = criterion(scores, targets)

        # Accuracy is measured on the outputs computed before this step.
        hits = scores.argmax(1) == targets
        n_correct += torch.sum(hits).item()
        n_seen += targets.size(0)

        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()

    mean_loss = running_loss / len(train_dataloader)
    accuracy = n_correct / n_seen
    return mean_loss, accuracy

def evaluate(model, iterator, criterion, device):
    """Measure loss and accuracy of ``model`` on ``iterator`` without training.

    Args:
        model: Module called with the pair ``(x[0], x[1])`` of each batch.
        iterator: Iterable of ``(x, y, sentence_id)`` batches.
        criterion: Loss called as ``criterion(model_output, y)``.
        device: Device the inputs and targets are moved to.

    Returns:
        ``(mean loss per batch, fraction of correctly classified samples)``.
    """
    correct = 0
    total = 0
    epoch_loss = 0

    model.eval()

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for (x, y, sentence_id) in iterator:
            x = (x[0].to(device), x[1].to(device))
            y = y.to(device)

            y_pred = model(x)

            loss = criterion(y_pred, y)

            top = y_pred.argmax(1)
            correct += torch.sum(top == y).item()
            total += y.size(0)

            epoch_loss += loss.item()

    return epoch_loss / len(iterator), correct / total

# Init Params Dataloaders and Model

In [None]:
## Instantiate the parameter container (the model itself is built in a later cell)
params = WCParams()

In [None]:
# Encode the training split once; batches are reshuffled on every pass.
train_dataset = WCDataset(
    train_path,
    params.word_vocab.key_to_index,
    params.pos_vocab_index,
    lemming=params.lemming,
    sw_filter=params.sw_filter,
    max_len=params.max_sentence_len,
    device=params.device,
)
train_dataloader = DataLoader(
    train_dataset,
    batch_size=params.batch_size,
    shuffle=True,
    collate_fn=train_dataset.collate_fn,
)

Loaded 186282 records from .\data\train.jsonl


In [None]:
# Encode the development split; its order is kept fixed (no shuffling).
dev_dataset = WCDataset(
    dev_path,
    params.word_vocab.key_to_index,
    params.pos_vocab_index,
    lemming=params.lemming,
    sw_filter=params.sw_filter,
    max_len=params.max_sentence_len,
    device=params.device,
)
dev_dataloader = DataLoader(
    dev_dataset,
    batch_size=params.batch_size,
    collate_fn=dev_dataset.collate_fn,
)

Loaded 6844 records from .\data\dev.jsonl


In [None]:
# Build the classifier on the configured device.
model = WCModel(params)
model = model.to(params.device)

# Adam with weight decay, trained against a cross-entropy objective.
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=params.learning_rate,
    weight_decay=params.weight_decay,
)
criterion = nn.CrossEntropyLoss()

# Train



In [None]:
# Per-epoch history, used by the plots below.
losses = {"train": [], "val": []}
accuracies = {"train": [], "val": []}
best_valid_loss = float('inf')

for epoch in range(params.epochs):
    train_loss, train_acc = train(model, train_dataloader, optimizer, criterion, params.device)
    valid_loss, valid_acc = evaluate(model, dev_dataloader, criterion, params.device)

    losses["train"].append(train_loss)
    losses["val"].append(valid_loss)
    accuracies["train"].append(train_acc)
    accuracies["val"].append(valid_acc)

    # Keep only the weights with the lowest validation loss seen so far.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), os.path.join(temp_path,'model.ckpt'))

    # Report error rates (100 - accuracy %) rather than accuracies.
    print(" #### EPOCH {} ####".format(epoch + 1))
    print(f'\tTrain Loss: {train_loss:.3f} | Train Err: {100 - (train_acc * 100):.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Err: {100 - (valid_acc * 100):.2f}%')

## Plot train graph

In [None]:
# Loss curves; titled and labelled so the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(losses["train"], label="train")
ax.plot(losses["val"], label="val")
ax.set(title="Loss per epoch", xlabel="epoch (0-based)", ylabel="mean batch loss")
ax.legend()
plt.show()

In [None]:
# Accuracy curves; titled and labelled so the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(accuracies["train"], label="train")
ax.plot(accuracies["val"], label="val")
ax.set(title="Accuracy per epoch", xlabel="epoch (0-based)", ylabel="accuracy")
ax.legend()
plt.show()

Save parameters

In [None]:

# Write the two-line CSV summary of the run next to the checkpoint.
out_file = os.path.join(temp_path, "model_params.csv")
with open(out_file, "w") as f:
    print(params.csv, end="", file=f)

# Prediction Section

In [None]:
model_p = "."
prediction_path = os.path.join(model_p, "predictions")
# map_location lets a checkpoint saved on a GPU be restored on a CPU-only
# runtime (and the other way round).
model.load_state_dict(torch.load(os.path.join(model_p, 'model.ckpt'), map_location=params.device))

<All keys matched successfully>

Init test Dataloader

In [None]:
# test=True: the records' 'label' field is not read.
test_dataset = WCDataset(test_path, params.word_vocab.key_to_index, params.pos_vocab_index, lemming=params.lemming,
                         sw_filter=params.sw_filter, max_len=params.max_sentence_len, device=params.device, test=True)
# Collate with the test dataset's own method so this cell does not depend on
# the dev dataset having been built.
test_dataloader = DataLoader(test_dataset, batch_size=params.batch_size, collate_fn=test_dataset.collate_fn)

Loaded 6849 records from .\data\test.jsonl


Generate *predictions_dev.tsv* and *predictions_test.tsv*

In [None]:
predict_path = os.path.join(prediction_path, "predictions_dev.tsv")
# The output folder is not created anywhere else in the notebook.
os.makedirs(prediction_path, exist_ok=True)

# Predict first and write afterwards, so a failure during inference does not
# leave an empty file behind. One "id<TAB>class name" line per sample.
rows = []
with torch.no_grad():
    for (x, _, sentence_id) in dev_dataloader:
        x = (x[0].to(params.device), x[1].to(params.device))
        y_pred = model.predict(x)
        for sid, class_idx in zip(sentence_id.tolist(), y_pred.tolist()):
            rows.append("{}\t{}\n".format(int(sid), WCDataset.get_class_from_index(class_idx)))

with open(predict_path, "w") as f:
    f.writelines(rows)

In [None]:
predict_path = os.path.join(prediction_path, "predictions_test.tsv")
# The output folder is not created anywhere else in the notebook.
os.makedirs(prediction_path, exist_ok=True)

# Predict first and write afterwards, so a failure during inference does not
# leave an empty file behind. One "id<TAB>class name" line per sample.
rows = []
with torch.no_grad():
    for (x, _, sentence_id) in test_dataloader:
        x = (x[0].to(params.device), x[1].to(params.device))
        y_pred = model.predict(x)
        for sid, class_idx in zip(sentence_id.tolist(), y_pred.tolist()):
            rows.append("{}\t{}\n".format(int(sid), WCDataset.get_class_from_index(class_idx)))

with open(predict_path, "w") as f:
    f.writelines(rows)

Confirm evaluation with scorer.py

In [None]:
! python3 scorer.py --prediction_file ./predictions/predict_dev.tsv --gold_file ./gold/gold_dev.tsv

{'err_rate': '23.55'}
