# HW2

In [69]:
from gensim import downloader

vec_num = 100
GLOVE_PATH = f'glove-twitter-{vec_num}'
glove_twitter = downloader.load(GLOVE_PATH)




## Model 1: GloVe

In [70]:
import numpy as np

def open_and_split_file(file_path):
    with open(file_path) as f:
        lines = f.readlines()
        words_str = []
        words = []
        tags = []
        for line in lines:
            try:
                word, tag = line.rstrip().split("\t")
                word = word.lower()
                if (word not in glove_twitter):
                    words.append(np.zeros(vec_num))
                else:
                    words.append(glove_twitter[word])
                tags.append(0 if tag == "O" else 1)
                words_str.append(word)

            except:
                continue
    return words, tags

In [71]:
train_words, train_labels = open_and_split_file("/home/student/hw2/NER_task_in_NLP/data/train.tagged")

In [72]:
dev_words, dev_labels = open_and_split_file("/home/student/hw2/NER_task_in_NLP/data/dev.tagged")

In [73]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=9)
knn.fit(train_words, train_labels)
y_pred = knn.predict(dev_words)

from sklearn.metrics import f1_score
f1_score(dev_labels, y_pred)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


0.5422804146208401

## Model 2: Feed Forward

In [74]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
from torch.utils.data import Dataset
from torch.optim import Adam
from sklearn.metrics import f1_score

In [75]:
class CustomDataset(Dataset):
    def __init__(self, path):
        # Create a path-to-label dictionary
        with open(path) as f:
            lines = f.readlines()
            words = []
            tags = []
            for line in lines:
                try:
                    word, tag = line.rstrip().split("\t")
                    word = word.lower()
                    if (word not in glove_twitter):
                        words.append(np.zeros(vec_num))
                    else:
                        words.append(glove_twitter[word])
                    tags.append(0 if tag == "O" else 1)

                except:
                    continue
        self.words = words
        self.tags = tags

    def __len__(self):
        return len(self.words)

    def __getitem__(self, index):
        word = self.words[index]
        tag = self.tags[index]
        word = torch.FloatTensor(word).squeeze()
        data = {"word": word, "labels": tag}
        return data

In [76]:
train_dataset = CustomDataset(path="/home/student/hw2/NER_task_in_NLP/data/train.tagged")
dev_dataset = CustomDataset(path="/home/student/hw2/NER_task_in_NLP/data/dev.tagged")
datasets = {"train": train_dataset, "dev": dev_dataset}

In [77]:
class FeedForwardNN(nn.Module):
    def __init__(self, vec_dim, num_classes, hidden_dim=100):
        super(FeedForwardNN, self).__init__()
        self.first_layer = nn.Linear(vec_dim, hidden_dim)
        self.second_layer = nn.Linear(hidden_dim, num_classes)
        self.activation = nn.ReLU()
        self.loss = nn.CrossEntropyLoss()

    def forward(self, word, labels=None):
        x = self.first_layer(word)
        x = self.activation(x)
        x = self.second_layer(x)
        if labels is None:
            return x, None
        loss = self.loss(x, labels)
        return x, loss

In [78]:
from sklearn.metrics import accuracy_score, f1_score
from torch.utils.data import DataLoader

def train(model, data_sets, optimizer, num_epochs: int, batch_size=16):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    data_loaders = {"train": DataLoader(data_sets["train"], batch_size=batch_size, shuffle=True),
                    "dev": DataLoader(data_sets["dev"], batch_size=batch_size, shuffle=False)}
    model.to(device)

    best_acc = 0.0

    for epoch in range(num_epochs):
        model.train()

        for batch in data_loaders['train']:
            batch_size = 0
            for k, v in batch.items():
                batch[k] = v.to(device)
                batch_size = v.shape[0]

            optimizer.zero_grad()
            _, loss = model(**batch)
            loss.backward()  # The important part
            optimizer.step()
                
    # Now use the dev dataset to evaluate the model.
    model.eval()
    predictions = torch.tensor([])
    tags = torch.tensor([])
    for batch in data_loaders['dev']:
        batch_size = 0
        for k, v in batch.items():
            batch[k] = v.to(device)
            batch_size = v.shape[0]

        optimizer.zero_grad()    
        with torch.no_grad():
            outputs, _ = model(**batch) 
            pred = outputs.argmax(dim=-1).clone().detach().cpu()
            predictions = torch.cat((predictions, pred), 0)
        tags = torch.cat((tags, (batch["labels"].clone().detach().cpu())), 0)

    score = f1_score(tags, predictions)
    print(f'F1 score: {score}')
                
    # with open('model.pkl', 'rb') as f:
    #     model = torch.load(f)
    return model

In [79]:
# Hyper-parameters that we can change:
# hidden_dim: the dimension of the hidden layer
# num_epochs: the number of epochs to train the model
# learning_rate: the learning rate of the optimizer (Adam) - find out more about it in the documentation.
model = FeedForwardNN(vec_num, 2, hidden_dim=int(vec_num*2))
optimizer = Adam(params=model.parameters())
model = train(model=model, data_sets=datasets, optimizer=optimizer, num_epochs=15)
print()

F1 score: 0.5759109311740891



## Model 3: LSTM

In [80]:
from torch.autograd import Variable
X_train_tensors = Variable(torch.Tensor(np.array(train_words)))
X_dev_tensors = Variable(torch.Tensor(np.array(dev_words)))

y_train_tensors = Variable(torch.Tensor(np.array(train_labels)))
y_dev_tensors = Variable(torch.Tensor(np.array(dev_labels)))

In [81]:
X_train_tensors = torch.reshape(X_train_tensors, (X_train_tensors.shape[0], 1, X_train_tensors.shape[1]))
X_dev_tensors = torch.reshape(X_dev_tensors, (X_dev_tensors.shape[0], 1, X_dev_tensors.shape[1]))

In [82]:
''' This code was inspired by the following source: https://cnvrg.io/pytorch-lstm/
    Because we thought it was more suitable for our uses than the one studied in class. '''
        
class LSTM(nn.Module):
    def __init__(self, num_classes, input_size, hidden_size, hidden2_size, num_stacked_layers, seq_length):
        super(LSTM, self).__init__()
        self.num_classes = num_classes  # We have 2 classes, binary.
        self.input_size = input_size  # The number of expected features in the input x.
        self.hidden_size = hidden_size  # number of features in hidden state.
        self.num_stacked_layers = num_stacked_layers
        self.seq_length = seq_length  # Number of words in each timestamp.

        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, batch_first=True)  # lstm
        self.layer1 =  nn.Linear(hidden_size, hidden2_size)  # Layer 1 in the LSTM
        self.layer2 = nn.Linear(hidden2_size, num_classes)  # Layer 2 in the LSTM

        self.relu = nn.ReLU()
        self.softmax = nn.Softmax()

        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        self.loss = nn.CrossEntropyLoss()

    
    def forward(self, word, labels=None):
        word = word.unsqueeze(1)
        h_0 = Variable(torch.zeros(self.num_stacked_layers, word.size(0), self.hidden_size)).to(self.device)  # Short term memory.
        c_0 = Variable(torch.zeros(self.num_stacked_layers, word.size(0), self.hidden_size)).to(self.device)  # Long term memory.
        
        # Propagate input through LSTM
        output, (hn, cn) = self.lstm(word, (h_0, c_0))  # Perform lstm with relation to input, hidden, and internal state
        hn = hn.view(-1, self.hidden_size)  # Reshaping the data for Dense layer next
        out = self.relu(hn)
        out = self.layer1(out)  # First Dense
        out = self.relu(out)  # Activation function - Relu
        out = self.layer2(out)  # Second layer
        out = self.softmax(out) # Activation function - Softmax
        # pred = outputs.argmax(dim=-1).clone().detach().cpu()
        if labels is None:
            return out, None
        loss = self.loss(out, labels)
        return out, loss

In [93]:
num_epochs = 15
learning_rate = 0.001

num_stacked_layers = 1  # Number of stacked lstm layers, in this model, we do not stack layers.
num_classes = 2  # Number of output classes

lstm = LSTM(num_classes, vec_num, int(vec_num/2), vec_num*2, num_stacked_layers, X_train_tensors.shape[1])  # Initiate the model

In [94]:
cross_entropy = nn.CrossEntropyLoss()  # Cross entropy loss
optimizer = Adam(lstm.parameters(), lr=learning_rate)  # Adam optimizer

In [95]:
lstm = train(model=lstm, data_sets=datasets, optimizer=optimizer, num_epochs=15)
print()

  out = self.softmax(out) # Activation function - Softmax


F1 score: 0.574353448275862



## Model 4: Competition

In [126]:
import numpy as np

def open_split_file_and_calc_weights(file_path):
    with open(file_path) as f:
        lines = f.readlines()
        words_str = []
        words = []
        tags = []
        word2idx = {"null" : 0}
        weights = [np.zeros(vec_num)]
        for line in lines:
            try:
                vector_added = False
                word, tag = line.rstrip().split("\t")
                word = word.lower()
                words_str.append(word)
                tags.append(0 if tag == "O" else 1)
            except:
                continue

        for w_idx, word in enumerate(words_str):
            if word not in word2idx.keys():
                vector_added = True
                word2idx[word] = len(word2idx.keys())

            if (word not in glove_twitter):
                stemmed_words = list()
                # try stemming
                for i in range(min(len(word), 5)):
                    if word[i:] in glove_twitter:
                        stemmed_words.append(glove_twitter[word[i:]])
                    for j in range(1, min(len(word), 5)):
                        if word[i:-j] in glove_twitter:
                            stemmed_words.append(glove_twitter[word[i:-j]])
                average_on_stemmed = np.mean(np.array(stemmed_words), axis=0) if stemmed_words else np.zeros(vec_num)

                window_words = list()
                for i in range(1, 3):
                    if len(words) > i:
                        window_words.append(words[-i])
                    if w_idx < len(words_str) - i:
                        if words_str[w_idx + i] in glove_twitter:
                            window_words.append(glove_twitter[words_str[w_idx + i]])
                average_on_window = np.mean(np.array(window_words), axis=0) if window_words else np.zeros(vec_num)
        
                words.append(np.mean(np.array([average_on_stemmed, average_on_window]), axis=0))
            else:
                words.append(glove_twitter[word])

            if vector_added:
                weights.append(words[-1])
    return words, tags, torch.from_numpy(np.array(weights)), word2idx

In [127]:
def open_and_split_dev_test_file(file_path, word2idx, tags=True):
    with open(file_path) as f:
        lines = f.readlines()
        words = []
        tags = []
        for line in lines:
            try:
                if tags:
                    word, tag = line.rstrip().split("\t")
                else:
                    word = line.rstrip()
                    if word == '':
                        raise
                
                word = word.lower()
                if word in word2idx.keys():
                    words.append(word2idx[word])
                else:
                    temp_word = None
                    for j in range(1, min(len(word), 5)):
                        if temp_word:
                            break
                        if word[:-j] in word2idx.keys():
                            temp_word = word[:-j]
                            break
                        for i in range(min(len(word), 5)):
                            if word[i:-j] in word2idx.keys():
                                temp_word = word[i:-j]
                                break
                    words.append(word2idx[temp_word] if temp_word else word2idx["null"])

                tags.append(0 if tag == "O" else 1)

            except:
                continue
    return words, tags

In [128]:
train_words, train_labels, weights_matrix, train_word2idx = open_split_file_and_calc_weights("/home/student/hw2/NER_task_in_NLP/data/train.tagged")
dev_words, dev_labels = open_and_split_dev_test_file("/home/student/hw2/NER_task_in_NLP/data/dev.tagged", train_word2idx)

In [129]:
# first we want to create a lstm which will generate new vector representations for the words
class LSTM_Vectorizer(nn.Module):
    def __init__(self, num_classes, weights_matrix, input_size, hidden_size, hidden2_size, num_stacked_layers):
        super(LSTM_Vectorizer, self).__init__()
        self.num_classes = num_classes  # We have 2 classes, binary.
        self.input_size = input_size  # The number of expected features in the input x.
        self.hidden_size = hidden_size  # number of features in hidden state.
        self.num_stacked_layers = num_stacked_layers

        num_embeddings, embedding_dim = weights_matrix.size()  # TODO: Enter the new and improved glove matrix.
        self.vocab_size = num_embeddings
        self.emb_layer = nn.Embedding(num_embeddings, embedding_dim)
        self.emb_layer.load_state_dict({'weight': weights_matrix})

        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, batch_first=True)  # lstm
        self.layer1 =  nn.Linear(hidden_size, hidden2_size)  # Layer 1 in the LSTM
        self.layer2 = nn.Linear(hidden2_size, num_classes)  # Layer 2 in the LSTM

        self.relu = nn.ReLU()
        self.softmax = nn.Softmax()

        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        self.loss = nn.CrossEntropyLoss()

    
    def forward(self, word, labels=None):
        word = self.emb_layer(word)
        h_0 = Variable(torch.zeros(self.num_stacked_layers, word.size(0), self.hidden_size)).to(self.device)  # Short term memory.
        c_0 = Variable(torch.zeros(self.num_stacked_layers, word.size(0), self.hidden_size)).to(self.device)  # Long term memory.
        
        # Propagate input through LSTM
        output, (hn, cn) = self.lstm(word, (h_0, c_0))  # Perform lstm with relation to input, hidden, and internal state
        hn = hn.view(-1, self.hidden_size)  # Reshaping the data for Dense layer next
        out = self.relu(hn)
        out = self.layer1(out)  # First Dense
        out = self.relu(out)  # Activation function - Relu
        out = self.layer2(out)  # Second layer
        out = self.softmax(out) # Activation function - Softmax
        # pred = outputs.argmax(dim=-1).clone().detach().cpu()
        if labels is None:
            return out, None
        loss = self.loss(out, labels)
        return out, loss

In [130]:
class CompetativeDataset(Dataset):
    def __init__(self, words, labels):
        # Create a path-to-label dictionary
        self.words = words
        self.tags = labels

    def __len__(self):
        return len(self.words)

    def __getitem__(self, index):
        word = self.words[index]
        tag = self.tags[index]
        word = torch.FloatTensor(word).squeeze()
        data = {"word": word, "labels": tag}
        return data

In [135]:
train_dataset = CompetativeDataset(words=train_words, labels=train_labels)
dev_dataset = CompetativeDataset(words=dev_words, labels=dev_labels)
datasets = {"train": train_dataset, "dev": dev_dataset}

In [132]:
num_epochs = 15
learning_rate = 0.001

num_stacked_layers = 1  # Number of stacked lstm layers, in this model, we do not stack layers.
num_classes = 2  # Number of output classes

lstm_model = LSTM_Vectorizer(num_classes, weights_matrix, vec_num, int(vec_num/2), vec_num*2, num_stacked_layers)  # Initiate the model

In [133]:
cross_entropy = nn.CrossEntropyLoss()  # Cross entropy loss
optimizer = Adam(lstm.parameters(), lr=learning_rate)  # Adam optimizer

In [134]:
lstm = train(model=lstm, data_sets=datasets, optimizer=optimizer, num_epochs=15)
print()

  out = self.softmax(out) # Activation function - Softmax


IndexError: list index out of range

## Old

In [86]:
def create_glove_vector_old(word, data):
    temp_word = False
    if word not in glove_twitter.key_to_index:
        # print(f"{word} not an existing word in the model")
        # if you dont have this word - just skip it
        # return False
        
        if word.startswith("http"):
            # all links in train data are tagged O
            data.append(np.zeros(vec_dim))
            return
        
        try:
            # check if word is a number
            float(word)
            data.append(np.zeros(vec_dim))
            return
        except:
            pass
    
        # try stemming
        for i in range(min(len(word), 5)):
            if word[i:] in glove_twitter.key_to_index:
                temp_word = word[i:]
                break
            for j in range(1, min(len(word), 5)):
                if word[i:-j] in glove_twitter.key_to_index:
                    temp_word = word[i:-j]
                    break

    else:
        temp_word = word


    if temp_word:
        vec = glove_twitter[temp_word]
        data.append(vec)

    else:
        data.append(np.zeros(vec_dim))
        # print(word)
    

In [87]:
with open("/home/student/hw2/NER_task_in_NLP/data/train.tagged") as f:
    lines = f.readlines()
    words_count = 0
    prev_word = ''
    for i, line in enumerate(lines):
        try:
            word, tag = line.rstrip().split("\t")
            words_count += 1
            if  (word.startswith('http')):
                # print(tag)
                if tag != 'O': #and lines[i+1].rstrip().split('\t')[0] == ':':
                    print("yes")
                    print(word)
            #if line[i-1] != "\t\n" and "@" in word:#  \
            #and line[i-1].rstrip().split("\t")[0] == "RT" and line[i+1].rstrip().split("\t")[0] == ":":
                # print(line[i-1])
                #pass

        except:
            # print(line.rstrip().split("\t"))
            # # print(line)
            # print([line])
            # ! the line is blank representing \t\n
            # TODO thingy for end of sentence
            prev_word = ''
            continue
        prev_word = word