## [COM4513-6513] Assignment 2: Text Classification with a Feedforward Network


In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import re
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import random
from time import localtime, strftime
from scipy.stats import spearmanr,pearsonr
import zipfile
import gc

# fixing random seed for reproducibility
random.seed(123)
np.random.seed(123)


## Transform Raw texts into training and development data

#### 1. load data

In [2]:
train_data=pd.read_csv('./data_topic/train.csv',header=None,names=["label","text"])
dev_data=pd.read_csv('./data_topic/dev.csv',header=None,names=["label","text"])
test_data=pd.read_csv('./data_topic/test.csv',header=None,names=["label","text"])
train_data.head()

Unnamed: 0,label,text
0,1,Reuters - Venezuelans turned out early\and in ...
1,1,Reuters - South Korean police used water canno...
2,1,Reuters - Thousands of Palestinian\prisoners i...
3,1,AFP - Sporadic gunfire and shelling took place...
4,1,AP - Dozens of Rwandan soldiers flew into Suda...


#### 2. Make the raw texts into lists and their corresponding labels into  np.arrays:

In [3]:
def creat_list_array(data_text,data_label):
    x=data_text.tolist()
    y=np.array(data_label)
    return x,y
def lower_F(data):
    lower_list=[]
    for i in range(len(data)):
        lower_data=str.lower(data[i])
        lower_list.append(lower_data)
    return lower_list

# Transform train data
data_train_x_raw,data_train_y=creat_list_array(train_data['text'],train_data['label']) #len 2400
# Transform validation data
data_dev_x_raw,data_dev_y=creat_list_array(dev_data['text'],dev_data['label']) #len 150
# Transform test data
data_test_x_raw,data_test_y=creat_list_array(test_data['text'],test_data['label']) #len 900

# lower data
data_train_x=lower_F(data_train_x_raw)
data_dev_x=lower_F(data_dev_x_raw)
data_test_x=lower_F(data_test_x_raw)


In [5]:
stop_words = ['a','in','on','at','and','or', 
              'to', 'the', 'of', 'an', 'by', 
              'as', 'is', 'was', 'were', 'been', 'be', 
              'are','for', 'this', 'that', 'these', 'those', 'you', 'i', 'if',
             'it', 'he', 'she', 'we', 'they', 'will', 'have', 'has',
              'do', 'did', 'can', 'could', 'who', 'which', 'what',
              'but', 'not', 'there', 'no', 'does', 'not', 'so', 've', 'their',
             'his', 'her', 'they', 'them', 'from', 'with', 'its']

# tokenise, create unigrams, using stop-words
def tokenise(data,token_pattern,stop_words):
    token_data=[]
    token_list=re.findall(token_pattern,data)
    for word in token_list:
        if word not in stop_words:
            token_data.append(word)
    return token_data

# based on the tokenised data(unigrams), create bigrams or trigrams
def ngrams_generate(data,n):
    result_list=[]
    ngrams = zip(*[data[i:] for i in range(n)])
    for ngram in ngrams:
        result_list.append((ngram))
    return result_list

# extract ngrams function
def extract_ngrams(x_raw,ngram_range=(1,3),token_pattern=r'\b[A-Za-z][A-Za-z]+\b',stop_words=[],vocab=set()):
    # tokenise data
    token_data=tokenise(x_raw,token_pattern=token_pattern,stop_words=stop_words)
    # create ngrams list which save ngrams result
    result_ngrams=[]
    result_vocab=[]
    # Extract ngrams based on the ngram_range
    if ngram_range == 1:
        result_ngrams = token_data
    elif ngram_range[0]==1:
        result_ngrams=token_data
        for i in range(ngram_range[0],ngram_range[1]):
            ngrams=ngrams_generate(token_data,i+1)
            result_ngrams=result_ngrams+ngrams
    else:
        result_ngrams=ngrams_generate(token_data,ngram_range[0])
        for i in range(ngram_range[0],ngram_range[1]):
            ngrams=ngrams_generate(token_data,ngram_range[0]+1)
            result_ngrams=result_ngrams+ngrams
    # Extract specific vocab based on the vocab set()
    if len(vocab)==0:
        return result_ngrams
    else:
        for word in vocab:
            if word in result_ngrams:
                result_vocab.append(word)
        return result_vocab
    
# Extract ngrams on the complete data set, for dev and test sets
def extract_ngrams_for_test(X_data,ngram_range,stop_words=stop_words):
    # Extract ngrams from raw data
    ngrams_list_without_Ded=[]
    for i in range(len(X_data)):
        ngrams_data=extract_ngrams(X_data[i],ngram_range=ngram_range,stop_words=stop_words)
        ngrams_list_without_Ded.append(ngrams_data)
    return ngrams_list_without_Ded

In [6]:
def get_vocab(X_raw, ngram_range=(1,3), token_pattern=r'\b[A-Za-z][A-Za-z]+\b', min_df=0, keep_topN=0, stop_words=[]):
    ngrams_list = []
    ngrams_list_without_Ded = []
    for i in range(len(X_raw)):
        ngrams_data=extract_ngrams(X_raw[i],ngram_range=ngram_range,stop_words=stop_words)
        ngrams_list_without_Ded.append(ngrams_data)
        # Deduplication
        ngrams_data_Ded=sorted(set(ngrams_data),key=ngrams_data.index)
        ngrams_list.append(ngrams_data_Ded)
    
    # create vocab dictionary
    vocab_dict = {}
    for i in range(len(ngrams_list)):
        for word in ngrams_list[i]:
            if word in vocab_dict:
                vocab_dict[word]+=1
            else:
                vocab_dict[word]=1
                
    # keep ngrams with a minimun df
    for word in list(vocab_dict.keys()):
        if vocab_dict[word] < min_df:
            del vocab_dict[word]
    
    # sorted then keep only topN
    vocab_sorted=sorted(vocab_dict.items(),key=lambda item:item[1],reverse=True)
    if keep_topN == 0:
        vocab_topN = vocab_sorted
    else:
        vocab_topN = vocab_sorted[:keep_topN]
        
    vocab = []
    for i in range(len(vocab_topN)):
        vocab.append(vocab_topN[i][0])
        
    return vocab,vocab_topN,ngrams_list_without_Ded

In [None]:
# get vocab

In [7]:
# extract vocab, df——train, train token data
vocab_all,df_tr,ngrams_without_Ded_tr=get_vocab(data_train_x,ngram_range=(1),  
                                            min_df=0,keep_topN=0, stop_words=stop_words)
# extract dev token
ngrams_without_Ded_dev=extract_ngrams_for_test(data_dev_x,ngram_range=(1),stop_words=stop_words)
# extract test token
ngrams_without_Ded_test=extract_ngrams_for_test(data_test_x,ngram_range=(1),stop_words=stop_words)

In [8]:
vocab = vocab_all[:3000]

In [9]:
print(len(vocab))
print()
print(random.sample(vocab,100))
print()
print(df_tr[:10])

3000

['report', 'system', 'led', 'appealed', 'starts', 'republic', 'costs', 'brian', 'narrower', 'blamed', 'kuwait', 'october', 'official', 'credit', 'vijay', 'drew', 'lanka', 'numbers', 'jimmy', 'retail', 'congolese', 'ap', 'outfielder', 'missed', 'wondering', 'kosuke', 'press', 'one', 'believes', 'missiles', 'protect', 'began', 'because', 'opposition', 'fireworks', 'failed', 'outlook', 'minister', 'oakland', 'looks', 'gatumba', 'ditch', 'finished', 'backpacker', 'former', 'violent', 'increasing', 'bundled', 'buyers', 'hearing', 'previously', 'medley', 'invitational', 'decades', 'blow', 'percent', 'boy', 'customer', 'successive', 'leg', 'consortium', 'gunfire', 'dark', 'often', 'medics', 'occupied', 'owen', 'delhi', 'plotting', 'champions', 'remaining', 'iraqi', 'finances', 'blair', 'geneva', 'conference', 'karzai', 'raw', 'surging', 'forecast', 'remote', 'law', 'homeless', 'wake', 'trial', 'imported', 'billions', 'protesters', 'arabia', 'prescription', 'shares', 'republican', 'atlan

In [10]:
def create_dict(vocab):
    id_word_dict = {}
    word_id_dict = {}
    for i in range(len(vocab)):
        id_word_dict[i] = vocab[i]
        word_id_dict[vocab[i]] = i
    return id_word_dict,word_id_dict
id_word_dict,word_id_dict = create_dict(vocab)

In [None]:
# convert index

In [11]:
def create_index(data,word_id_dict):
    X_uni_tr = data
    X_tr = []
    for i in range(len(X_uni_tr)):
        list_a = []
        for word in X_uni_tr[i]:
            if word in word_id_dict:
                word_id = word_id_dict[word]
            else:
                pass
            list_a.append(word_id)
        X_tr.append(list_a)
    return X_uni_tr,X_tr
# represent train set
X_uni_tr,X_tr = create_index(ngrams_without_Ded_tr,word_id_dict)

In [12]:
X_uni_tr[0]

['reuters',
 'rwandan',
 'troops',
 'airlifted',
 'sunday',
 'sudan',
 'darfur',
 'first',
 'foreign',
 'force',
 'mandated',
 'protect',
 'observers',
 'monitoring',
 'cease',
 'fire',
 'between',
 'sudanese',
 'government',
 'rebels',
 'troubled',
 'western',
 'region']

In [13]:
X_tr[0]

[0,
 1011,
 758,
 28,
 208,
 1103,
 1367,
 29,
 308,
 816,
 262,
 1586,
 2704,
 108,
 759,
 35,
 172,
 175,
 493,
 701,
 97,
 4,
 1221,
 2203,
 173,
 10,
 63]

In [None]:
# transform dev and test dataset

In [14]:
X_uni_dev,X_dev = create_index(ngrams_without_Ded_dev,word_id_dict)
X_uni_test,X_test = create_index(ngrams_without_Ded_test,word_id_dict)

In [None]:
# network weight

In [15]:
def network_weights(vocab_size=1000, embedding_dim=300, 
                    hidden_dim=[], num_classes=3, init_val = 0.5):
    dict_num_list = hidden_dim
    dict_num_list.insert(0,vocab_size)
    dict_num_list.insert(1,embedding_dim)
    dict_num_list.append(num_classes)
    W={}
    for i in range(len(dict_num_list)):
        if i == len(dict_num_list)-1:
            break
        else:
            np.random.seed(2020)
            W[i] = np.random.uniform(-init_val,init_val,(dict_num_list[i],dict_num_list[i+1])).astype('float32')
    return W
    

In [None]:
# example 1

In [16]:
W = network_weights(vocab_size=5,embedding_dim=10,hidden_dim=[], num_classes=2)

print('W_emb:', W[0].shape)
print('W_out:', W[1].shape)

W_emb: (5, 10)
W_out: (10, 2)


In [None]:
# example 2

In [2]:
W = network_weights(vocab_size=3,embedding_dim=4,hidden_dim=[2], num_classes=2)
                    
print('W_emb:', W[0].shape)
print('W_h1:', W[1].shape)
print('W_out:', W[2].shape)

NameError: name 'network_weights' is not defined

In [None]:
# softmax

In [19]:
def softmax(z):
    sig = (np.exp(z).T/np.sum(np.exp(z),axis=1)).T
    return sig

In [None]:
# compute loss

In [20]:
def categorical_loss(y, y_preds):
    loss = -np.log(y_preds[y])
    #l2_regularization = (alpha/2)*(np.sum(np.square(weights)))
    return loss

In [21]:
# example for 5 classes

y = 2 #true label
y_preds = softmax(np.array([[-2.1,1.,0.9,-1.3,1.5]]))[0]

print('y_preds: ',y_preds)
print('loss:', categorical_loss(y, y_preds))

y_preds:  [0.01217919 0.27035308 0.24462558 0.02710529 0.44573687]
loss: 1.40802648485675


In [None]:
# relu and relu_derivative

In [22]:
#def relu(z):
#    return z*(z>0)

def relu(z):
    return np.maximum(z, 0)

def relu_derivative(z):
    return (z>0)*1

In [None]:
# drop out

In [23]:
def dropout_mask(size, dropout_rate):
    dropout_vec = np.ones(size)
    num = int(size*dropout_rate)
    dropout_vec[:num] = 0
    np.random.shuffle(dropout_vec)
    return dropout_vec

In [103]:
print(dropout_mask(10, 0.2))
print(dropout_mask(10, 0.3))

[1. 1. 1. 1. 1. 1. 0. 0. 1. 1.]
[1. 1. 1. 0. 1. 1. 0. 1. 0. 1.]


In [None]:
#forward pass

In [25]:
def forward_pass(x, W, dropout_rate=0.2):
    out_vals = {}
    h_vecs = []
    a_vecs = []
    dropout_vecs = []
    
    x_vecs = [W[0][x_num] for x_num in x]
    # h0 = 1/x * sum.(list_vec)
    h0 = np.expand_dims(1/len(x)*np.sum(x_vecs,axis = 0).T,axis = 0) # (,4) -> (1,4)
    a0 = relu(h0)
    d0 = dropout_mask(a0.shape[1],dropout_rate)
    #output_0 = a0*d0
    output_0 = (a0*d0)/dropout_rate
    
    # add h, a, dropout array to list
    h_vecs.append(h0.squeeze())
    a_vecs.append(a0.squeeze())
    dropout_vecs.append(d0.squeeze())
    
    if len(W) == 2:
        y = softmax(output_0@W[1])
    else:
        output = output_0
        for i in range(len(W)):
            h = output@W[i+1]
            a = relu(h)
            d = dropout_mask(a.shape[1],dropout_rate)
            #output = a*d
            output = (a*d)/dropout_rate
            # add h, a, dropout array to list
            h_vecs.append(h.squeeze())
            a_vecs.append(a.squeeze())
            dropout_vecs.append(d.squeeze())
            
            if i == len(W)-3:
                break
        y = softmax(output@W[len(W)-1])
    
    # output result to dictionary
    out_vals['h'] = h_vecs
    out_vals['a'] = a_vecs
    out_vals['dropout_vecs'] = dropout_vecs
    out_vals['y'] = y.squeeze()
   
    return out_vals
    

In [None]:
# example

In [26]:
W = network_weights(vocab_size=3,embedding_dim=4,hidden_dim=[5], num_classes=2)
 
for i in range(len(W)):
    print('Shape W'+str(i), W[i].shape)

print()
print(forward_pass([2,1], W, dropout_rate=0.5))

Shape W0 (3, 4)
Shape W1 (4, 5)
Shape W2 (5, 2)

{'h': [array([ 0.09953883, -0.31317306, -0.29131782,  0.05019794], dtype=float32), array([ 0.01674634, -0.02840193,  0.00616702, -0.0377309 , -0.01809771])], 'a': [array([0.09953883, 0.        , 0.        , 0.05019794], dtype=float32), array([0.01674634, 0.        , 0.00616702, 0.        , 0.        ])], 'dropout_vecs': [array([0., 0., 1., 1.]), array([0., 1., 1., 1., 0.])], 'y': array([0.50036991, 0.49963009])}


In [None]:
# one hot x and y

In [28]:
def one_hot_x(x,vocab_len):
    result = []
    for i in range(vocab_len):
        if i in x:
            a = 1
            result.append(a)
        else:
            a = 0
            result.append(a)
    return result

def one_hot_y(y,class_num):
    a=np.eye(class_num+1)[y]
    return np.delete(a,0,axis=0)

In [None]:
# backward pass

In [29]:
# 1 
def backward_pass(x, y, W, out_vals, dropout_rate, lr=0.001, freeze_emb=False):
    W_new = W.copy()
    W_num = len(W)
    
    W_first = W[0].shape
    vocab_size = W_first[0]
    
    W_last = W[list(W.keys())[-1]].shape
    class_num = W_last[1]
    
    x_onehot = one_hot_x(x,vocab_size)
    y_onehot = one_hot_y(y,class_num)
    p = out_vals['y']
    
    # first
    g = np.expand_dims(p - y,axis = 0)
    out = np.expand_dims(out_vals['a'][W_num-2] * out_vals['dropout_vecs'][W_num-2],axis = 0)
    dw = out.T @ g # [5,2]
    W_new[W_num-1] = W_new[W_num-1] - lr*dw
    d_out = ((g @ W[W_num-1].T) * out_vals['dropout_vecs'][W_num-2])/dropout_rate
    #d_out = (g @ W[W_num-1].T) * out_vals['dropout_vecs'][W_num-2]
    
    if W_num == 2:
        if freeze_emb == False:
            g0 = d_out * relu_derivative(out_vals['h'][0])
            dw0 = np.expand_dims(np.array(x_onehot),axis = 1) @ g0
            W_new[0] = W_new[0] - lr*dw0

        elif freeze_emb == True:
            pass
    else:
        for i in range(W_num-2):
            g = d_out * relu_derivative(out_vals['h'][W_num-2-i])
            out = np.expand_dims(out_vals['a'][W_num-3-i] * out_vals['dropout_vecs'][W_num-3-i],axis = 0)
            dw = out.T @ g
            W_new[W_num-2-i] = W_new[W_num-2-i] - lr*dw
            d_out = ((g @ W[W_num-2-i].T)*out_vals['dropout_vecs'][W_num-3-i])/dropout_rate
            #d_out = (g @ W[W_num-2-i].T)*out_vals['dropout_vecs'][W_num-3-i]
            
        if freeze_emb == False:
            g0 = d_out * relu_derivative(out_vals['h'][0])
            dw0 = np.expand_dims(np.array(x_onehot),axis = 1) @ g0
            W_new[0] = W_new[0] - lr*dw0

        elif freeze_emb == True:
            pass
        
    return W_new




In [85]:
# 2
def backward_pass(x, y, W, out_vals, dropout_rate, lr=0.001, freeze_emb=False):
    W_num = len(W)
    W_first = W[0].shape
    vocab_size = W_first[0]
    
    W_last = W[list(W.keys())[-1]].shape
    class_num = W_last[1]
    
    x_onehot = one_hot_x(x,vocab_size)
    y_onehot = one_hot_y(y,class_num)
    p = out_vals['y']
    
    # first
    g = np.expand_dims(p - y,axis = 0)
    out = np.expand_dims(out_vals['a'][W_num-2] * out_vals['dropout_vecs'][W_num-2],axis = 0)
    dw = out.T @ np.float32(g) # [5,2]
    W[W_num-1] = W[W_num-1] - lr*dw
    d_out = ((g @ W[W_num-1].T) * out_vals['dropout_vecs'][W_num-2])/dropout_rate
    #d_out = (g @ W[W_num-1].T) * out_vals['dropout_vecs'][W_num-2]
    
    if W_num == 2:
        if freeze_emb == False:
            g0 = d_out * relu_derivative(out_vals['h'][0])
            dw0 = np.expand_dims(np.array(x_onehot),axis = 1) @ g0
            W[0] = W[0] - lr*dw0

        elif freeze_emb == True:
            pass
    else:
        for i in range(W_num-2):
            g = d_out * relu_derivative(out_vals['h'][W_num-2-i])
            out = np.expand_dims(out_vals['a'][W_num-3-i] * out_vals['dropout_vecs'][W_num-3-i],axis = 0)
            dw = out.T @ g
            W[W_num-2-i] = W[W_num-2-i] - lr*dw
            d_out = ((g @ W[W_num-2-i].T)*out_vals['dropout_vecs'][W_num-3-i])/dropout_rate
            #d_out = (g @ W[W_num-2-i].T)*out_vals['dropout_vecs'][W_num-3-i]
            
        if freeze_emb == False:
            g0 = d_out * relu_derivative(out_vals['h'][0])
            dw0 = np.expand_dims(np.array(x_onehot),axis = 1) @ g0
            W[0] = W[0] - lr*dw0

        elif freeze_emb == True:
            pass
        
    return W

In [91]:
# 3
def backward_pass(x, y, W, out_vals,lr=0.001, freeze_emb=False):
    W_num = len(W)
    
    #y_array = np.zeros(W[list(W.keys())[-1]].shape[-1])
    y_array = np.zeros([W[W_num-1].shape[1]])
    y_array[y-1] = 1
    temp = out_vals['a'][-1]*out_vals['dropout_vecs'][-1]
    current_loss = np.float32((out_vals['y'] - y_array))
    last_layer_gradient = np.dot(temp.reshape([W[W_num-1].shape[0],1]),current_loss.reshape([1,W[W_num-1].shape[1]]))
    W[W_num-1] = W[W_num-1] - lr*last_layer_gradient
    current_loss = np.dot(W[W_num-1],current_loss).reshape([W[W_num-1].shape[0],1])
    layercount = len(W)-1
    
    while layercount>1 and freeze_emb ==False:
        current_loss = current_loss*relu_derivative(out_vals['h'][layercount-1]).reshape([W[layercount].shape[0],1])
        W_grad = np.dot((out_vals['a'][layercount-2]*out_vals['dropout_vecs'][layercount-2]).reshape([W[layercount-1].shape[0],1]),W[layercount-2])
        W[layercount-1] = W[layercount-1] - lr*W_grad
        current_loss = np.dot(W[layercount-1],current_loss).reshape([W[layercount-1].shape[0],1])
        layercount = layercount-1
    if freeze_emb ==False:
        x_array = np.zeros([W[0].shape[0],1])
        x_array[x] = 1
        current_loss = current_loss*relu_derivative(out_vals['h'][0]).reshape([W[0].shape[1],1])
        W_grad = np.dot(x_array,current_loss.T)
        W[0] = W[0] - lr*W_grad
    return W
        
        
        
        

In [None]:
#SGD

In [30]:
def compute_loss(data, y_true, W, dropout_rate):
    loss_result = 0.0
    for i in range(len(data)):
        x = data[i]
        prob_dict = forward_pass(x, W, dropout_rate)
        prob_pred = prob_dict['y']
        loss = categorical_loss(y_true[i]-1,prob_pred)
        loss_result += loss
        
    return loss_result/(len(data))

def randmoise_data(data,label):
    index=[i for i in range(len(data))]
    data_array = np.array(data)
    random.shuffle(index)
    data1=data_array[index]
    label1=label[index]
    return data1,label1

In [86]:
def SGD(X_tr, Y_tr, W, X_dev=[], Y_dev=[], lr=0.001, 
        dropout=0.2, epochs=5, tolerance=0.001, freeze_emb=False, print_progress=True):
    
    cur_loss_tr = 1.
    cur_loss_dev = 1.
    training_loss_history = []
    validation_loss_history = []
    
    W_curr = W
    for i in range(epochs):
        cur_loss_dev = compute_loss(X_dev,Y_dev,W_curr,dropout_rate=dropout)
        if print_progress==True:
            validation_loss_history.append(cur_loss_dev) 
            
        #shuffle train dataset
        X_tr_ran,Y_tr_ran = randmoise_data(X_tr,Y_tr)
        for n in range(len(X_tr_ran)):
            out_vals = forward_pass(X_tr_ran[n], W_curr, dropout_rate=dropout)
            W_curr = backward_pass(X_tr_ran[n], Y_tr_ran[n], W_curr, out_vals,dropout_rate=dropout,lr=lr,freeze_emb=freeze_emb)
            
        cur_loss_tr = compute_loss(X_tr_ran, Y_tr_ran, W_curr, dropout_rate=dropout)
        if print_progress==True:
            training_loss_history.append(cur_loss_tr) 
            
        print('Epoch:{0}|Training Loss{1}|Validation Loss{2}'.format(i,cur_loss_tr,cur_loss_dev))
        
        # if diff smaller than tolerance then break iteration
        if i==0:
            pass
        else:
            #diff = previous validation loss − current validation loss
            diff=validation_loss_history[-2]-cur_loss_dev
            if abs(diff)<tolerance:
                break
    return W_curr, training_loss_history, validation_loss_history




In [92]:
# 2
def SGD(X_tr, Y_tr, W, X_dev=[], Y_dev=[], lr=0.001, 
        dropout=0.2, epochs=5, tolerance=0.001, freeze_emb=False, print_progress=True):
    
    cur_loss_tr = 1.
    cur_loss_dev = 1.
    training_loss_history = []
    validation_loss_history = []
    
    W_curr = W
    for i in range(epochs):
        cur_loss_dev = compute_loss(X_dev,Y_dev,W_curr,dropout_rate=dropout)
        if print_progress==True:
            validation_loss_history.append(cur_loss_dev) 
            
        #shuffle train dataset
        X_tr_ran,Y_tr_ran = randmoise_data(X_tr,Y_tr)
        for n in range(len(X_tr_ran)):
            out_vals = forward_pass(X_tr_ran[n], W_curr, dropout_rate=dropout)
            W_curr = backward_pass(X_tr_ran[n], Y_tr_ran[n], W_curr, out_vals,lr=lr,freeze_emb=freeze_emb)
            
        cur_loss_tr = compute_loss(X_tr_ran, Y_tr_ran, W_curr, dropout_rate=dropout)
        if print_progress==True:
            training_loss_history.append(cur_loss_tr) 
            
        print('Epoch:{0}|Training Loss{1}|Validation Loss{2}'.format(i,cur_loss_tr,cur_loss_dev))
        
        # if diff smaller than tolerance then break iteration
        if i==0:
            pass
        else:
            #diff = previous validation loss − current validation loss
            diff=validation_loss_history[-2]-cur_loss_dev
            if abs(diff)<tolerance:
                break
    return W_curr, training_loss_history, validation_loss_history

In [94]:
X = X_tr[:2000]
Y = data_train_y[:2000]
X_d = X_dev[:20]
Y_d = data_dev_y[0:20]
W = network_weights(vocab_size=len(vocab),embedding_dim=100,hidden_dim=[], num_classes=3)

W_1, training_loss_history, validation_loss_history = SGD(X, Y, W, X_d,Y_d,lr=0.001,\
                                                        dropout=0.2, epochs=30, tolerance=0.0001, freeze_emb=False, print_progress=True)

Epoch:0|Training Loss1.092803670097112|Validation Loss1.2372955300402892
Epoch:1|Training Loss1.023120550919523|Validation Loss1.0463838767055296
Epoch:2|Training Loss0.9572358353754123|Validation Loss0.9753945066596079
Epoch:3|Training Loss0.9014522964248384|Validation Loss1.0380198981911153
Epoch:4|Training Loss0.8518479853579349|Validation Loss0.9625816109792835
Epoch:5|Training Loss0.8199724129508498|Validation Loss0.928956299699012
Epoch:6|Training Loss0.7786143492002672|Validation Loss0.8306734188246467
Epoch:7|Training Loss0.7597428097309898|Validation Loss0.834575478181543
Epoch:8|Training Loss0.7259993507176548|Validation Loss0.8012532578251269
Epoch:9|Training Loss0.6920639036467784|Validation Loss0.8093084969666376
Epoch:10|Training Loss0.6758387704606004|Validation Loss0.7010034529768256
Epoch:11|Training Loss0.6522882392637098|Validation Loss0.6497622190511457
Epoch:12|Training Loss0.6263531233808146|Validation Loss0.680780040507636
Epoch:13|Training Loss0.6118465197824523

In [98]:
W = network_weights(vocab_size=len(vocab),embedding_dim=100,hidden_dim=[], num_classes=3)

W_1, training_loss_history, validation_loss_history=SGD(X_tr, data_train_y, W, X_dev,data_dev_y,lr=0.001, 
        dropout=0.3, epochs=30, tolerance=0.001, freeze_emb=False, print_progress=True)

Epoch:0|Training Loss1.0795401952919148|Validation Loss1.1982577008409576
Epoch:1|Training Loss1.0176519518334153|Validation Loss1.1155264588119949
Epoch:2|Training Loss0.9679001260696922|Validation Loss1.1119778865634027
Epoch:3|Training Loss0.9189293604277594|Validation Loss1.0759394717601192
Epoch:4|Training Loss0.8748979904090465|Validation Loss1.0375396082431818
Epoch:5|Training Loss0.8489015709036999|Validation Loss1.009369252884199
Epoch:6|Training Loss0.809540582485053|Validation Loss0.9699869347801869
Epoch:7|Training Loss0.7726917124228524|Validation Loss0.9736472342373607
Epoch:8|Training Loss0.7462820452891763|Validation Loss0.9744335607976861


Now you are ready to train and evaluate you neural net. First, you need to define your network using the `network_weights` function followed by SGD with backprop:

In [9]:
W = network_weights(vocab_size=len(vocab),embedding_dim=300,hidden_dim=[], num_classes=3)

for i in range(len(W)):
    print('Shape W'+str(i), W[i].shape)

W, loss_tr, dev_loss = SGD(X_tr, Y_tr,
                            W,
                            X_dev=X_dev, 
                            Y_dev=Y_dev,
                            lr=0.001, 
                            dropout=0.2,
                            freeze_emb=False,
                            tolerance=0.01,
                            epochs=100)


Plot the learning process:

In [None]:
def plt_loss(y1,y2,epochs):
    x=np.linspace(1,epochs,epochs)
    plt.title('Training Monitoring')
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    #plt.ylim(0.2, 0.7)
    plt.plot(x, y1, color="blue", linewidth=1.0, linestyle="-",label='training loss')
    plt.plot(x, y2, color="red", linewidth=1.0, linestyle="-",label='validation loss')
    plt.legend(loc='upper left', bbox_to_anchor=(0.6, 0.95))
    plt.show()

In [None]:
plt_loss(training_loss_history, validation_loss_history,30)

Compute accuracy, precision, recall and F1-Score:

In [97]:
preds_te = [np.argmax(forward_pass(x, W_1, dropout_rate=0.0)['y'])+1 for x,y in zip(X_test,data_test_y)]
print('Accuracy:', accuracy_score(data_test_y,preds_te))
print('Precision:', precision_score(data_test_y,preds_te,average='macro'))
print('Recall:', recall_score(data_test_y,preds_te,average='macro'))
print('F1-Score:', f1_score(data_test_y,preds_te,average='macro'))

Accuracy: 0.3333333333333333
Precision: 0.1111111111111111
Recall: 0.3333333333333333
F1-Score: 0.16666666666666666


  del sys.path[0]
  del sys.path[0]


### Discuss how did you choose model hyperparameters ? 

# Use Pre-trained Embeddings

Now re-train the network using GloVe pre-trained embeddings. You need to modify the `backward_pass` function above to stop computing gradients and updating weights of the embedding matrix.

Use the function below to obtain the embedding martix for your vocabulary.

In [32]:
def get_glove_embeddings(f_zip, f_txt, word2id, emb_size=300):
    
    w_emb = np.zeros((len(word2id), emb_size))
    
    with zipfile.ZipFile(f_zip) as z:
        with z.open(f_txt) as f:
            for line in f:
                line = line.decode('utf-8')
                word = line.split()[0]
                     
                if word in vocab:
                    emb = np.array(line.strip('\n').split()[1:]).astype(np.float32)
                    w_emb[word2id[word]] +=emb
    return w_emb

In [33]:
w_glove = get_glove_embeddings("glove.840B.300d.zip","glove.840B.300d.txt",word2id)

First, initialise the weights of your network using the `network_weights` function. Second, replace the weigths of the embedding matrix with `w_glove`. Finally, train the network by freezing the embedding weights: 

In [None]:
X_test
data_test_y

In [38]:
preds_te = [np.argmax(forward_pass(x, W, dropout_rate=0.0)['y'])+1 for x,y in zip(X_test,data_test_y)]
print('Accuracy:', accuracy_score(data_test_y,preds_te))
print('Precision:', precision_score(data_test_y,preds_te,average='macro'))
print('Recall:', recall_score(data_test_y,preds_te,average='macro'))
print('F1-Score:', f1_score(data_test_y,preds_te,average='macro'))

Accuracy: 0.33555555555555555
Precision: 0.3465338791854702
Recall: 0.33555555555555555
F1-Score: 0.32819034409358355


### Discuss how did you choose model hyperparameters ? 

# Extend to support deeper architectures (Bonus)

Extend the network to support back-propagation for more hidden layers. You need to modify the `backward_pass` function above to compute gradients and update the weights between intermediate hidden layers. Finally, train and evaluate a network with a deeper architecture. 

In [13]:
preds_te = [np.argmax(forward_pass(x, W, dropout_rate=0.0)['y']) for x,y in zip(X_te,Y_te)]
print('Accuracy:', accuracy_score(Y_te,preds_te))
print('Precision:', precision_score(Y_te,preds_te,average='macro'))
print('Recall:', recall_score(Y_te,preds_te,average='macro'))
print('F1-Score:', f1_score(Y_te,preds_te,average='macro'))

## Full Results

Add your final results here:

| Model | Precision  | Recall  | F1-Score  | Accuracy
|:-:|:-:|:-:|:-:|:-:|
| Average Embedding  |   |   |   |   |
| Average Embedding (Pre-trained)  |   |   |   |   |
| Average Embedding (Pre-trained) + X hidden layers (BONUS)   |   |   |   |   |
