# Sample Code for Testing Saved Model
This file provides a sample for testing the saved model. Make the necessary changes so that we can test your CNN/RNN model with this file. If you developed an RNN model, rename this file to *test_rnn*.

## Load test data
The sample below loads the dev set for testing. In the real marking, however, the markers will load a held-out test set.

In [1]:
# read data
import pandas as pd

# Load the evaluation set into a DataFrame. The evaluation function defined
# later in this file reads its 'Sent1', 'Sent2' and 'SimScore' columns.
dev_data = pd.read_csv('cw2_dev.csv')
# Bare expression: the notebook renders the frame as the cell output.
dev_data

Unnamed: 0.1,Unnamed: 0,Sent1,Sent2,SimScore
0,0,A man with a hard hat is dancing.,A man wearing a hard hat is dancing.,1.00
1,1,A young child is riding a horse.,A child is riding a horse.,0.95
2,2,A man is feeding a mouse to a snake.,The man is feeding a mouse to the snake.,1.00
3,3,A woman is playing the guitar.,A man is playing guitar.,0.48
4,4,A woman is playing the flute.,A man is playing a flute.,0.55
...,...,...,...,...
2995,2995,"The professor introduced the artists , and the...",The professor introduced the artists .,0.62
2996,2996,The doctors supported the judges .,The doctors supported the tourists and the jud...,0.68
2997,2997,The secretary knew the manager .,The secretary knew the manager danced .,0.37
2998,2998,The professors next to the president recommend...,The president recommended the professors .,0.29


## Load Embeddings
Clearly specify the embeddings your implementation requires. Also provide the link for downloading the embeddings. 

In [2]:
# load pre-trained glove embeddings
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
import numpy as np

# Dimensionality of the word vectors. The embedding name and the temporary
# file name below are derived from it, so changing this single value keeps
# every name in this cell consistent.
word_vec_dim = 300
embd_name = 'glove.6B.{}d'.format(word_vec_dim)
# Project page from which the GloVe embeddings can be downloaded.
link_to_embd = 'https://nlp.stanford.edu/projects/glove/'

# Below is a sample to load the glove embeddings. ADJUST the code according to the
# embedding you want to use.
# NOTE: this is a machine-specific absolute path; point it at your local copy
# of the GloVe text file.
path_of_downloaded_files = "/Users/user/Desktop/NLP/glove.6B.{}d.txt".format(word_vec_dim)
# NOTE(review): datapath() is a gensim test-data helper kept from the original
# sample; it is expected to return an absolute path unchanged - confirm.
glove_file = datapath(path_of_downloaded_files)
# Temporary file that receives the word2vec-format copy of the embeddings.
word2vec_glove_file = get_tmpfile("{}.txt".format(embd_name))
# NOTE(review): glove2word2vec is deprecated in gensim 4.x, where
# KeyedVectors.load_word2vec_format(glove_file, binary=False, no_header=True)
# reads the GloVe file directly. Consider switching once the gensim version
# used for marking is confirmed.
glove2word2vec(glove_file, word2vec_glove_file)
word_vectors = KeyedVectors.load_word2vec_format(word2vec_glove_file)



  glove2word2vec(glove_file, word2vec_glove_file)


## Provide Functions Needed for Evaluation
All functions used to run and evaluate your model should be provided. 

In [3]:
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from nltk.tokenize import word_tokenize

def get_sent_word_vecs(word_vectors, sent_words, largest_len):
    """Build the padded, transposed embedding matrix of one tokenised sentence.

    Every token found in ``word_vectors`` contributes its vector; any other
    token contributes the module-level ``oov_vec``. Zero vectors of length
    ``word_vec_dim`` (also module-level) are then appended until there are
    ``largest_len`` rows; nothing is appended when the sentence already has
    that many tokens or more.

    Args:
        word_vectors: mapping that supports ``in`` and ``[]`` lookups by token.
        sent_words: sequence of tokens of one sentence.
        largest_len: number of positions to pad the sentence up to.

    Returns:
        A numpy array holding the stacked vectors, transposed so that the
        embedding dimension comes first and the token position second.
    """
    rows = [
        word_vectors[token] if token in word_vectors else oov_vec
        for token in sent_words
    ]
    # The generator builds padding vectors lazily, so nothing is created
    # (and ``word_vec_dim`` is not read) when no padding is required.
    n_missing = largest_len - len(sent_words)
    rows.extend([0.] * word_vec_dim for _ in range(n_missing))
    return np.array(np.asarray(rows).T)

def build_mini_batch(sent_list, word_vectors):
    """Turn a batch of raw sentences into one padded embedding array.

    Each sentence is lower-cased and tokenised with ``word_tokenize``; every
    sentence is then padded to the longest sentence of the batch by
    ``get_sent_word_vecs``.

    Args:
        sent_list: iterable of sentence strings (a list or a pandas Series).
        word_vectors: token-to-vector mapping forwarded to
            ``get_sent_word_vecs``.

    Returns:
        A numpy array with one entry per sentence, laid out as
        (batch, embedding dimension, longest sentence length). For an empty
        ``sent_list`` an empty array is returned, whose ``shape[0]`` is 0.
    """
    tokenized_sents = [word_tokenize(ss.lower()) for ss in sent_list]
    # max(..., default=0) rather than np.max: np.max raises ValueError on an
    # empty list, which made the caller's empty-batch check unreachable.
    largest_len = max((len(tokens) for tokens in tokenized_sents), default=0)
    text_vecs = [
        get_sent_word_vecs(word_vectors, tokens, largest_len)
        for tokens in tokenized_sents
    ]
    return np.array(text_vecs)

def evaluate_trained_model(trained_model, dev_data):
    """Print the mean squared error of ``trained_model`` on ``dev_data``.

    Both sentences of every pair are encoded with ``trained_model``; the
    cosine similarity of the two encodings is the predicted score, which is
    compared with the gold score in the 'SimScore' column.

    The function reads the module-level ``batch_size`` and ``word_vectors``
    and needs ``torch`` / ``torch.nn as nn`` to be imported before it is
    called.

    Args:
        trained_model: encoder module; it is called on the float tensor built
            from ``build_mini_batch`` and must return one vector per sentence.
        dev_data: table with 'Sent1', 'Sent2' and 'SimScore' columns that
            supports ``len()`` (a pandas DataFrame in this notebook).

    Raises:
        AssertionError: if the number of predictions differs from the number
            of gold labels.
    """
    dev_docs1 = dev_data['Sent1']
    dev_docs2 = dev_data['Sent2']
    dev_labels = dev_data['SimScore']

    # Built once instead of once per batch; the default dim=1 compares the
    # two encodings along the feature axis.
    cos_sim = nn.CosineSimilarity()
    dev_predictions = []

    with torch.no_grad():  # let pytorch know that no gradient should be computed
        # Evaluate the model that was passed in. The original code used the
        # global ``model`` here and silently ignored the argument.
        trained_model.eval()
        for idx in range(0, len(dev_data), batch_size):
            x_data1 = build_mini_batch(dev_docs1[idx:idx+batch_size], word_vectors)
            x_data2 = build_mini_batch(dev_docs2[idx:idx+batch_size], word_vectors)
            # to avoid empty batch
            if x_data1.shape[0] == 0 or x_data2.shape[0] == 0:
                continue
            x_tensor1 = torch.tensor(x_data1, dtype=torch.float)
            x_tensor2 = torch.tensor(x_data2, dtype=torch.float)

            # Force a (pairs, features) layout: an encoder that squeezes away
            # the batch axis for a single-sentence batch would otherwise break
            # the dim=1 cosine similarity below.
            n_pairs = x_tensor1.shape[0]
            y_pred1 = trained_model(x_tensor1).cpu().detach().reshape(n_pairs, -1)
            y_pred2 = trained_model(x_tensor2).cpu().detach().reshape(n_pairs, -1)

            # reshape(-1) keeps a one-element list for a batch of one pair;
            # squeeze() produced a bare float there, which cannot be added to
            # a list.
            dev_predictions += cos_sim(y_pred1, y_pred2).reshape(-1).tolist()

    assert len(dev_labels) == len(dev_predictions)
    squared_errors = [np.square(ts-ps) for (ts, ps) in zip(dev_labels, dev_predictions)]
    print('MSE of the method on the dev set:', np.mean(squared_errors))

## Provide Your Model
You should provide the implementation of your encoder model below.

In [4]:
import torch
import torch.nn as nn
import numpy as np

class CNN(nn.Module):
    """Convolutional sentence encoder with max-over-time pooling.

    One ``Conv1d`` per filter size is applied to the input. Each convolution
    is followed by ReLU and a max over the whole sequence axis; the pooled
    features of all filters are concatenated, passed through dropout and
    projected by a linear layer.

    The sub-module attribute names ``convs`` and ``fc`` determine the
    parameter names of the state dict, so they must stay unchanged for saved
    weights to load.

    Args:
        embd_dim: size of the word vectors, i.e. the input channel count.
        filter_size_list: kernel width of each convolution.
        filter_num_list: number of output channels of each convolution; must
            have the same length as ``filter_size_list``.
        out_dim: size of the returned sentence representation.
        dp_rate: dropout probability applied before the linear layer.
    """

    def __init__(self, embd_dim, filter_size_list, filter_num_list, out_dim, dp_rate=0.5):
        super().__init__()
        self.embd_dim = embd_dim
        assert len(filter_size_list) == len(filter_num_list)
        self.output_dim = out_dim
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dp_rate)
        # Plain int: nn.Linear documents an int for in_features, whereas
        # np.sum would hand it a numpy integer.
        self.fc = nn.Linear(int(sum(filter_num_list)), out_dim)
        self.convs = self.build_convs(filter_size_list, filter_num_list)

    def build_convs(self, f_sizes, f_nums):
        """Return a ModuleList with one Conv1d per (filter size, filter count) pair."""
        convs = nn.ModuleList()
        for fs, fn in zip(f_sizes, f_nums):
            # Padding of fs-1 on both sides keeps the convolution valid even
            # for sequences shorter than the kernel.
            convs.append(nn.Conv1d(self.embd_dim, fn, fs, padding=fs - 1))
        return convs

    def get_conv_output(self, input_matrix, conv):
        """Apply ``conv``, ReLU and max-over-time pooling to ``input_matrix``.

        Args:
            input_matrix: tensor of shape (batch, embd_dim, seq_len).
            conv: one of the convolutions built by ``build_convs``.

        Returns:
            Tensor of shape (batch, conv output channels, 1).
        """
        # step 1: compute convolution
        assert input_matrix.shape[1] == self.embd_dim
        conv_output = conv(input_matrix)
        # step 2: pass through an activation function
        conv_relu = self.relu(conv_output)
        # step 3: max-over-time pooling; a kernel covering the whole sequence
        # axis leaves exactly one value per channel
        return nn.functional.max_pool1d(conv_relu, kernel_size=conv_relu.shape[2])

    def forward(self, all_text_vectors):
        """Encode a batch of sentences.

        Args:
            all_text_vectors: tensor of shape (batch, embd_dim, seq_len).

        Returns:
            Tensor of shape (batch, out_dim), including for a batch of one.
        """
        # Concatenating the pooled outputs directly keeps the result on the
        # device and dtype of the input, instead of seeding the concatenation
        # with an empty CPU tensor.
        pooled = [self.get_conv_output(all_text_vectors, cv) for cv in self.convs]
        cnn_repr = torch.cat(pooled, dim=1)
        # Squeeze only the pooled axis: a bare squeeze() also removed the
        # batch axis whenever the batch held a single sentence.
        after_dp = self.dropout(cnn_repr.squeeze(2))
        # The projection is returned without any further activation.
        return self.fc(after_dp)

## Run and Evaluate Model
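The code below creates an instance of the model, loads the saved weights, and tests the model. The weights are read from `best_cnn.state_dict`, the file opened in the code below; the original sample file, `sample_model.state_dict`, is generated by running cw2_sample.ipynb.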

In [11]:
import pickle

# Load the saved file.
# NOTE: pickle.load can execute arbitrary code while unpickling; only open
# files that come from a trusted source.
with open('best_cnn.state_dict','rb') as ff:
    saved_info = pickle.load(ff)
    
# Extract the information from the saved file. The file is expected to hold
# a mapping with the keys 'oov_vec' and 'model_state_dict'.
# oov_vec is read as a module-level name by get_sent_word_vecs for tokens
# that are missing from the embeddings.
oov_vec = saved_info['oov_vec']
saved_model_state = saved_info['model_state_dict']

# Create the model, load the saved weights, and test the model.
# NOTE(review): the filter configuration presumably has to match the one the
# saved weights were trained with - confirm against the training notebook.
filter_sizes = [2,3,4]
filter_nums = [100]*len(filter_sizes)
dropout_rate = 0
# batch_size is read as a module-level name by evaluate_trained_model.
batch_size = 50
# The output dimension is set to word_vec_dim, the same as the input embedding size.
model = CNN(word_vec_dim, filter_sizes, filter_nums, word_vec_dim, dropout_rate)
model.load_state_dict(saved_model_state) 
evaluate_trained_model(model, dev_data)

MSE of the method on the dev set: 0.03424460890371057
