# HW2

In [1]:
from gensim import downloader

vec_num = 50
GLOVE_PATH = f'glove-twitter-{vec_num}'
glove_twitter = downloader.load(GLOVE_PATH)


## Model 1: GloVe

In [20]:
import numpy as np

def open_and_split_file(file_path):
    with open(file_path) as f:
        lines = f.readlines()
        words_str = []
        words = []
        tags = []
        for line in lines:
            try:
                word, tag = line.rstrip().split("\t")
                word = word.lower()
                if (word not in glove_twitter):
                    words.append(np.zeros(vec_num))
                else:
                    words.append(glove_twitter[word])
                tags.append(0 if tag == "O" else 1)
                words_str.append(word)

            except:
                continue
    return words, tags, words_str

In [68]:
train_words, train_labels, train_words_str = open_and_split_file("/home/student/hw2/NER_task_in_NLP/data/train.tagged")

In [69]:
dev_words, dev_labels, dev_words_str = open_and_split_file("/home/student/hw2/NER_task_in_NLP/data/dev.tagged")

In [70]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=9)
knn.fit(train_words, train_labels)
y_pred = knn.predict(dev_words)

from sklearn.metrics import f1_score
f1_score(dev_labels, y_pred)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


0.5860995850622407

## Model 2: Feed Forward

In [6]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
from torch.utils.data import Dataset
from torch.optim import Adam
from sklearn.metrics import f1_score

In [7]:
class CustomDataset(Dataset):
    def __init__(self, path):
        # Create a path-to-label dictionary
        with open(path) as f:
            lines = f.readlines()
            words = []
            tags = []
            for line in lines:
                try:
                    word, tag = line.rstrip().split("\t")
                    word = word.lower()
                    if (word not in glove_twitter):
                        words.append(np.zeros(vec_num))
                    else:
                        words.append(glove_twitter[word])
                    tags.append(0 if tag == "O" else 1)

                except:
                    continue
        self.words = words
        self.tags = tags

    def __len__(self):
        return len(self.words)

    def __getitem__(self, index):
        word = self.words[index]
        tag = self.tags[index]
        word = torch.FloatTensor(word).squeeze()
        data = {"word": word, "labels": tag}
        return data

In [8]:
train_dataset = CustomDataset(path="/home/student/hw2/NER_task_in_NLP/data/train.tagged")
dev_dataset = CustomDataset(path="/home/student/hw2/NER_task_in_NLP/data/dev.tagged")
datasets = {"train": train_dataset, "dev": dev_dataset}

In [9]:
class FeedForwardNN(nn.Module):
    def __init__(self, vec_dim, num_classes, hidden_dim=100):
        super(FeedForwardNN, self).__init__()
        self.first_layer = nn.Linear(vec_dim, hidden_dim)
        self.second_layer = nn.Linear(hidden_dim, num_classes)
        self.activation = nn.ReLU()
        self.loss = nn.CrossEntropyLoss()

    def forward(self, word, labels=None):
        x = self.first_layer(word)
        x = self.activation(x)
        x = self.second_layer(x)
        if labels is None:
            return x, None
        loss = self.loss(x, labels)
        return x, loss

In [82]:
from sklearn.metrics import accuracy_score, f1_score
from torch.utils.data import DataLoader

def train(model, data_sets, optimizer, num_epochs: int, batch_size=16):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    data_loaders = {"train": DataLoader(data_sets["train"], batch_size=batch_size, shuffle=True),
                    "dev": DataLoader(data_sets["dev"], batch_size=batch_size, shuffle=False)}
    model.to(device)

    best_acc = 0.0

    for epoch in range(num_epochs):
        model.train()

        for batch in data_loaders['train']:
            batch_size = 0
            for k, v in batch.items():
                batch[k] = v.to(device)
                batch_size = v.shape[0]

            optimizer.zero_grad()
            _, loss = model(**batch)
            loss.backward()  # The important part
            optimizer.step()
                
    # Now use the dev dataset to evaluate the model.
    model.eval()
    predictions = torch.tensor([])
    tags = torch.tensor([])
    for batch in data_loaders['dev']:
        batch_size = 0
        for k, v in batch.items():
            batch[k] = v.to(device)
            batch_size = v.shape[0]

        optimizer.zero_grad()    
        with torch.no_grad():
            outputs, _ = model(**batch) 
            pred = outputs.argmax(dim=-1).clone().detach().cpu()
            predictions = torch.cat((predictions, pred), 0)
        tags = torch.cat((tags, (batch["labels"].clone().detach().cpu())), 0)

    score = f1_score(tags, predictions)
    print(f'F1 score: {score}')
                
    # with open('model.pkl', 'rb') as f:
    #     model = torch.load(f)
    return model

In [11]:
# Hyper-parameters that we can change:
# hidden_dim: the dimension of the hidden layer
# num_epochs: the number of epochs to train the model
# learning_rate: the learning rate of the optimizer (Adam) - find out more about it in the documentation.
model = FeedForwardNN(vec_num, 2, hidden_dim=int(vec_num*2))
optimizer = Adam(params=model.parameters())
model = train(model=model, data_sets=datasets, optimizer=optimizer, num_epochs=15)
print()

  word = torch.FloatTensor(word).squeeze()


F1 score: 0.5751898734177215



## Model 3: LSTM

In [71]:
from torch.autograd import Variable
X_train_tensors = Variable(torch.Tensor(np.array(train_words)))
X_dev_tensors = Variable(torch.Tensor(np.array(dev_words)))

y_train_tensors = Variable(torch.Tensor(np.array(train_labels)))
y_dev_tensors = Variable(torch.Tensor(np.array(dev_labels)))

In [72]:
X_train_tensors = torch.reshape(X_train_tensors, (X_train_tensors.shape[0], 1, X_train_tensors.shape[1]))
X_dev_tensors = torch.reshape(X_dev_tensors, (X_dev_tensors.shape[0], 1, X_dev_tensors.shape[1]))

In [115]:
class LSTM(nn.Module):
    def __init__(self, num_classes, input_size, hidden_size, hidden2_size, num_stacked_layers, seq_length):
        super(LSTM, self).__init__()
        self.num_classes = num_classes  # We have 2 classes, binary.
        self.input_size = input_size  # The number of expected features in the input x.
        self.hidden_size = hidden_size  # number of features in hidden state.
        self.num_stacked_layers = num_stacked_layers
        self.seq_length = seq_length  # Number of words in each timestamp.

        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, batch_first=True)  # lstm
        self.layer1 =  nn.Linear(hidden_size, hidden2_size)  # Layer 1 in the LSTM
        self.layer2 = nn.Linear(hidden2_size, num_classes)  # Layer 2 in the LSTM

        self.relu = nn.ReLU()
        self.softmax = nn.Softmax()

        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        self.loss = nn.CrossEntropyLoss()

    
    def forward(self, word, labels=None):
        print(word.size())
        word = word.unsqueeze(0)
        h_0 = Variable(torch.zeros(self.num_stacked_layers, word.size(0), self.hidden_size)).to(device)  # Short term memory.
        c_0 = Variable(torch.zeros(self.num_stacked_layers, word.size(0), self.hidden_size)).to(device)  # Long term memory.
        
        # Propagate input through LSTM
        output, (hn, cn) = self.lstm(word, (h_0, c_0))  # Perform lstm with relation to input, hidden, and internal state
        hn = hn.view(-1, self.hidden_size)  # Reshaping the data for Dense layer next
        out = self.relu(hn)
        out = self.layer1(out)  # First Dense
        out = self.relu(out)  # Activation function - Relu
        out = self.layer2(out)  # Second layer
        out = self.softmax(out) # Activation function - Softmax
        # pred = outputs.argmax(dim=-1).clone().detach().cpu()
        if labels is None:
            return out, None
        loss = self.loss(out, labels)
        return out, loss

In [116]:
num_epochs = 15
learning_rate = 0.001

num_stacked_layers = 1  # Number of stacked lstm layers, in this model, we do not stack layers.
num_classes = 2  # Number of output classes

model = LSTM(num_classes, vec_num, int(vec_num/2), vec_num*2, num_stacked_layers, X_train_tensors.shape[1])  # Initiate the model

In [117]:
cross_entropy = nn.CrossEntropyLoss()  # Cross entropy loss
optimizer = Adam(model.parameters(), lr=learning_rate)  # Adam optimizer

In [121]:
model = train(model=model, data_sets=datasets, optimizer=optimizer, num_epochs=15)
print()

In [None]:
# Training the model
for epoch in range(num_epochs):
    outputs = model.forward(X_train_tensors)  # Executing forward pass
    optimizer.zero_grad()  # Caluclate the gradient, manually setting to 0
    
    # Obtain the loss function
    loss = cross_entropy(outputs, y_train_tensors)
    loss.backward()  # Calculates the value of the loss function
    
    optimizer.step()  # Improve from loss, i.e backpropagation.
    print("Epoch: %d, loss: %1.5f" % (epoch, loss.item()))

model.eval()
predictions = torch.tensor([])
tags = torch.tensor([])
for batch in data_loaders['dev']:
    batch_size = 0
    for k, v in batch.items():
        batch[k] = v.to(device)
        batch_size = v.shape[0]

    optimizer.zero_grad()    
    with torch.no_grad():
        outputs, _ = model(**batch) 
        pred = outputs.argmax(dim=-1).clone().detach().cpu()
        predictions = torch.cat((predictions, pred), 0)
    tags = torch.cat((tags, (batch["labels"].clone().detach().cpu())), 0)

score = f1_score(tags, predictions)
print(f'F1 score: {score}')

#### Old

In [32]:
# Initialize the embedding matrix according to the glove from earlier.
embedding_mat = np.array(train_words)
embedding_mat = torch.tensor(embedding_mat)

In [33]:
class MyNet(nn.Module):
    def __init__(self, embedding_mat, embedding_dim, hidden_dim=50, tag_dim=2):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.word_embedding = nn.Embedding.from_pretrained(embedding_mat, freeze=True)
        self.lstm = nn.LSTM(self.embedding_dim, self.hidden_dim, batch_first=True)
        self.hidden2tag = nn.Sequential(nn.ReLU(),
                                        nn.Linear(self.hidden_dim, tag_dim))
        self.loss_fn = nn.NLLLoss()
    

    def forward(self, word, sentence_len, tags=None):
        embeds = self.word_embedding(word)  # TODO: Check if this is the correct way to use the embedding
        lstm_out, _ = self.lstm(embeds.view(1, -1, self.embedding_dim))
        tag_space = self.hidden2tag(lstm_out[range(1), sentence_len - 1, :])
        tag_score = F.softmax(tag_space, dim=1)
        if tags is None:
            return tag_score, None
        loss = self.loss_fn(tag_score, tags)
        return tag_score, loss

In [34]:
def padding_(sentences, seq_len):
    features = np.zeros((len(sentences), seq_len), dtype=int)
    for ii, review in enumerate(sentences):
        if len(review) != 0:
            features[ii, :len(review)] = np.array(review)[:seq_len]
    return features

In [35]:
class ReviewsDataSet(Dataset):
    def __init__(self, sentences, sentences_lens, y):
        self.X = sentences
        self.X_lens = sentences_lens
        self.y = y

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, item):
        return self.X[item], self.X_lens[item], self.y[item]

In [59]:
from tqdm import tqdm
def train(model, device, optimizer, train_dataset, val_dataset):
    accuracies = []
    for phase in ["train", "validation"]:
        if phase == "train":
            model.train(True)
        else:
            model.train(False) #or model.evel()
        correct = 0.0
        count = 0
        accuracy = None
        dataset = train_dataset if phase == "train" else val_dataset
        t_bar = tqdm(dataset)
        for sentence, lens, tags in t_bar:
            sentence = sentence.type(torch.IntTensor)
            lens = lens.type(torch.FloatTensor)
            tags = tags.type(torch.FloatTensor)
            if phase == "train":
                tag_scores, loss = model(sentence.to(device), lens.to(device), tags.to(device))
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
            else:
                with torch.no_grad():
                    tag_scores, _ = model(sentence.to(device), lens.to(device), tags.to(device))
            correct += (tag_scores.argmax(1).to("cpu") == tags).sum()
            count += len(tags)
            accuracy = correct/count
            t_bar.set_description(f"{phase} accuracy: {accuracy:.2f}")
        accuracies += [accuracy]
    return accuracies

In [55]:
def tokenize(x_train, x_val):
    word2idx = {"[PAD]": 0, "[UNK]": 1}
    idx2word = ["[PAD]", "[UNK]"]
    for sent in x_train:
        for word in sent.split():
            if word not in word2idx:
                word2idx[word] = len(word2idx)
                idx2word.append(word)

    final_list_train, final_list_test = [], []
    for sent in x_train:
        final_list_train.append([word2idx[word] for word in sent.split()])
    for sent in x_val:
        final_list_test.append([word2idx[word] if word in word2idx else word2idx['[UNK]'] for word in sent.split()])
    return final_list_train, final_list_test, word2idx, idx2word

In [60]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("device:", device)

# train_data = pd.Datafram({"word" : train_words, "tag": train_labels})
# test_data = pd.Datafram({"word" : dev_words, "tag": dev_labels})
x_train, y_train = train_words_str, train_labels# train_data["word"].values, train_data["tag"].values
x_test, y_test = dev_words_str, dev_labels# test_data["word"].values, test_data["tag"].values
n_classes = max(y_test) + 1

# x_train, x_test, y_train, y_test = train_test_split(X, y, stratify=y)
x_train, x_test, word2idx, idx2word = tokenize(x_train, x_test)
vocab_size = len(word2idx)
train_sentence_lens = [min(len(s), 500) for s in x_train]
test_sentence_lens = [min(len(s), 500) for s in x_test]

x_train_pad = padding_(x_train, 500)
x_test_pad = padding_(x_test, 500)

print(x_train_pad.shape, x_test_pad.shape)

train_dataset = ReviewsDataSet(x_train_pad, train_sentence_lens, y_train)
test_dataset = ReviewsDataSet(x_test_pad, test_sentence_lens, y_test)

train_dataloader = DataLoader(train_dataset, batch_size=64)
test_dataloader = DataLoader(test_dataset, batch_size=64)

model = MyNet(embedding_mat, vec_num, tag_dim=n_classes)
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=8e-3)

best_accuracy = 0
best_epoch = None
for epoch in range(1000):
    print(f"\n -- Epoch {epoch} --")
    train_accuracy, val_accuracy = train(model, device, optimizer, train_dataloader, test_dataloader)
    if val_accuracy>best_accuracy:
        best_accuracy = val_accuracy
        best_epoch = epoch
    if epoch - best_epoch == 3:
        break
print(f"best accuracy: {best_accuracy:.2f} in epoch {best_epoch}")

device: cuda
(62730, 500) (15733, 500)

 -- Epoch 0 --


  0%|          | 0/981 [00:00<?, ?it/s]


RuntimeError: Input and parameter tensors are not the same dtype, found input tensor with Double and parameter tensor with Float

## Old code

In [None]:
def create_glove_vector_old(word, data):
    temp_word = False
    if word not in glove_twitter.key_to_index:
        # print(f"{word} not an existing word in the model")
        # if you dont have this word - just skip it
        # return False
        
        if word.startswith("http"):
            # all links in train data are tagged O
            data.append(np.zeros(vec_dim))
            return
        
        try:
            # check if word is a number
            float(word)
            data.append(np.zeros(vec_dim))
            return
        except:
            pass
    
        # try stemming
        for i in range(min(len(word), 5)):
            if word[i:] in glove_twitter.key_to_index:
                temp_word = word[i:]
                break
            for j in range(1, min(len(word), 5)):
                if word[i:-j] in glove_twitter.key_to_index:
                    temp_word = word[i:-j]
                    break

    else:
        temp_word = word


    if temp_word:
        vec = glove_twitter[temp_word]
        data.append(vec)

    else:
        data.append(np.zeros(vec_dim))
        # print(word)
    

In [None]:
with open("/home/student/hw2/NER_task_in_NLP/data/train.tagged") as f:
    lines = f.readlines()
    words_count = 0
    prev_word = ''
    for i, line in enumerate(lines):
        try:
            word, tag = line.rstrip().split("\t")
            words_count += 1
            if  (word.startswith('http')):
                # print(tag)
                if tag != 'O': #and lines[i+1].rstrip().split('\t')[0] == ':':
                    print("yes")
                    print(word)
            #if line[i-1] != "\t\n" and "@" in word:#  \
            #and line[i-1].rstrip().split("\t")[0] == "RT" and line[i+1].rstrip().split("\t")[0] == ":":
                # print(line[i-1])
                #pass

        except:
            # print(line.rstrip().split("\t"))
            # # print(line)
            # print([line])
            # ! the line is blank representing \t\n
            # TODO thingy for end of sentence
            prev_word = ''
            continue
        prev_word = word