# Santatriniaina Avotra Randrianambinina

In this second part of the lab, we will implement a language identifier trained on the same data, but using Logistic Regression instead of Naive Bayes.

In [1]:
import io, sys, math
import numpy as np
from collections import defaultdict

This function is used to build the dictionary, or vocabulary, which is a mapping from strings (or words) to integers (or indices). This will allow to build vector representations of documents. 

In [2]:
def build_dict(filename, threshold=1):
    fin = io.open(filename, 'r', encoding='utf-8')
    word_dict, label_dict = {}, {}
    counts = defaultdict(lambda: 0)
    for line in fin:
        tokens = line.split()
        label = tokens[0]

        if not label in label_dict:
            label_dict[label] = len(label_dict)

        for w in tokens[1:]:
            counts[w] += 1
            
    for k, v in counts.items():
        if v > threshold:
            word_dict[k] = len(word_dict)
    return word_dict, label_dict

This function is used to load the training dataset, and build vector representations of the training examples. In particular, a document or sentence is represented as a bag of words. Each example correspond to a sparse vector ` x` of dimension `V`, where `V` is the size of the vocabulary. The element `j` of the vector `x` is the number of times the word `j` appears in the document.

In [3]:
def load_data(filename, word_dict, label_dict):
    fin = io.open(filename, 'r', encoding='utf-8')
    data = []
    dim = len(word_dict)
    for line in fin:
        tokens = line.split()
        label = tokens[0]

        yi = label_dict[label]
        xi = np.zeros(dim)
        for word in tokens[1:]:
            if word in word_dict:
                wid = word_dict[word]
                xi[wid] += 1.0
        data.append((yi, xi))
    return data

In [4]:
word_dict1, label_dict1 = build_dict("./data/train1.txt")

In [5]:
mydata1=load_data("./data/train1.txt", word_dict1, label_dict1)
mydata1[:10]

[(0, array([1., 1., 1., ..., 0., 0., 0.])),
 (0, array([0., 0., 0., ..., 0., 0., 0.])),
 (1, array([0., 0., 0., ..., 0., 0., 0.])),
 (0, array([0., 0., 0., ..., 0., 0., 0.])),
 (2, array([0., 0., 0., ..., 0., 0., 0.])),
 (3, array([0., 0., 0., ..., 0., 0., 0.])),
 (2, array([0., 0., 0., ..., 0., 0., 0.])),
 (4, array([0., 0., 0., ..., 0., 0., 0.])),
 (5, array([0., 0., 0., ..., 0., 0., 0.])),
 (2, array([0., 0., 0., ..., 0., 0., 0.]))]

In [6]:
word_dict2, label_dict2 = build_dict("./data/train2.txt")

In [7]:
mydata2=load_data("./data/train2.txt", word_dict2, label_dict2)
mydata2[:10]

[(0, array([1., 1., 1., ..., 0., 0., 0.])),
 (0, array([0., 0., 0., ..., 0., 0., 0.])),
 (1, array([0., 0., 0., ..., 0., 0., 0.])),
 (0, array([0., 0., 0., ..., 0., 0., 0.])),
 (2, array([0., 0., 0., ..., 0., 0., 0.])),
 (3, array([0., 0., 0., ..., 0., 0., 0.])),
 (2, array([0., 0., 0., ..., 0., 0., 0.])),
 (4, array([0., 0., 0., ..., 0., 0., 0.])),
 (5, array([0., 0., 0., ..., 0., 0., 0.])),
 (2, array([0., 0., 0., ..., 0., 0., 0.]))]

First, let's implement the softmax function. Don't forget numerical stability!

In [8]:
def softmax(x):
    ### FILL CODE
    z=x.max()
    return np.exp(x-z)/np.sum(np.exp(x-z))

Now, let's implement the main training loop, by using stochastic gradient descent. The function will iterate over the examples of the training set. For each example, we will first compute the loss, before computing the gradient and performing the update.

In [9]:
def sgd(w, data, niter):
    nlabels, dim = w.shape
    eta_t=.2
    for iter in range(niter):
        ### FILL CODE
        train_loss = 0.0
        for yi, xi in data:
            o = softmax(np.dot(w, xi))
            loss = -np.log(o[yi])/len(o)
            train_loss += loss
            target = np.zeros(nlabels)
            target[yi] = 1.0
        
            grad = (target - o).reshape(nlabels, 1) @ xi.reshape(1, -1)
            
            w = w+ eta_t * grad
            
        print("Iter: "+ str(iter)+" ---> "+" Loss: " + str(train_loss/len(data)))
        
    return w

The next function will predict the most probable label corresponding to example `x`, given the trained classifier `w`.

In [10]:
def predict(w, x):
    ## FILL CODE
    prediction = np.argmax(softmax(np.dot(w, x)))
    return prediction

Finally, this function will compute the accuracy of a trained classifier `w` on a validation set.

In [11]:
def compute_accuracy(w, valid_data):
    ## FILL CODE
    accuracy = 0.0
    N=len(valid_data)
    for yi, xi in valid_data:
        y_pred = predict(w, xi)
        if y_pred == yi:
            accuracy += 1.0
    return accuracy/N

In [12]:
print("")
print("** Logistic Regression **")
print("")

word_dict, label_dict = build_dict("./data/train1.txt")
train_data = load_data("./data/train1.txt", word_dict, label_dict)
valid_data = load_data("./data/valid1.txt", word_dict, label_dict)

nlabels = len(label_dict)
dim = len(word_dict)
w = np.zeros([nlabels, dim])
w = sgd(w, train_data, 15)
print("")
print("Validation accuracy: %.3f" % compute_accuracy(w, valid_data))
print("")


** Logistic Regression **

Iter: 0 --->  Loss: 0.06712482701556115
Iter: 1 --->  Loss: 0.03185184780525946
Iter: 2 --->  Loss: 0.024371179618695994
Iter: 3 --->  Loss: 0.020471371127610764
Iter: 4 --->  Loss: 0.01798955594228999
Iter: 5 --->  Loss: 0.016244737399564453
Iter: 6 --->  Loss: 0.014940822860363065
Iter: 7 --->  Loss: 0.013925236941765305
Iter: 8 --->  Loss: 0.013110303297851646
Iter: 9 --->  Loss: 0.012441530785925177
Iter: 10 --->  Loss: 0.01188302878914523
Iter: 11 --->  Loss: 0.011410032586141167
Iter: 12 --->  Loss: 0.011004796796148809
Iter: 13 --->  Loss: 0.01065420778727414
Iter: 14 --->  Loss: 0.010348326769859702

Validation accuracy: 0.930



In [13]:
print("")
print("** Logistic Regression **")
print("")

word_dict, label_dict = build_dict("./data/train1.txt")
train_data = load_data("./data/train2.txt", word_dict, label_dict)
valid_data = load_data("./data/valid2.txt", word_dict, label_dict)

nlabels = len(label_dict)
dim = len(word_dict)
w = np.zeros([nlabels, dim])
w = sgd(w, train_data, 15)
print("")
print("Validation accuracy: %.3f" % compute_accuracy(w, valid_data))
print("")


** Logistic Regression **

Iter: 0 --->  Loss: 0.030828269664962958
Iter: 1 --->  Loss: 0.018644590408633328
Iter: 2 --->  Loss: 0.016470928182644003
Iter: 3 --->  Loss: 0.015358124170525646
Iter: 4 --->  Loss: 0.014659626854255826
Iter: 5 --->  Loss: 0.01417326269338114
Iter: 6 --->  Loss: 0.01381225784242773
Iter: 7 --->  Loss: 0.013532442681054375
Iter: 8 --->  Loss: 0.013308693619250682
Iter: 9 --->  Loss: 0.013125614024081913
Iter: 10 --->  Loss: 0.012973218360321982
Iter: 11 --->  Loss: 0.012844547160444317
Iter: 12 --->  Loss: 0.012734550659137726
Iter: 13 --->  Loss: 0.012639534012968698
Iter: 14 --->  Loss: 0.012556728041093949

Validation accuracy: 0.940

