# Fenosoa Randrianjatovo


The goal of this lab is to implement a language identifier (LID).

Our first model will be based on Naive Bayes.

In [14]:
import io, sys, math, re
import numpy as np
from collections import defaultdict

The next function is used to load the data. Each line of the data consist of a label (corresponding to a language), followed by some text, written in that language. Here is an example of data:

```__label__de Zur Namensdeutung gibt es mehrere Varianten.```


In [6]:
def load_data(filename):
    fin = io.open(filename, 'r', encoding='utf-8')
    data = []
    for line in fin:
        tokens = line.split()
        data.append((tokens[0], tokens[1:]))
    return data

You can now try loading the first dataset `train1.txt` and look what examples look like.

In [10]:
data = load_data("./data/train1.txt")
print(data[7], "\n"*2, data[1], "\n"*2, data[9])

('__label__en', ['My', 'fathers', "don't", 'speak', 'Dutch.']) 

 ('__label__de', ['Tom', 'ist', 'an', 'Kunst', 'völlig', 'uninteressiert.']) 

 ('__label__ru', ['Она', 'думала,', 'что', 'он', 'переночует', 'у', 'неё.'])


Next, we will start implementing the Naive Bayes method. This technique is based on word counts, and we thus need to start by implementing a function to count the words and labels of our training set.

`n_examples` is the total number of examples

`n_words_per_label` is the total number of words for a given label

`label_counts` is the number of times a given label appears in the training data

`word_counts` is the number of times a word appears with a given label

In [85]:
def count_words(data):
    n_examples = 0
    n_words_per_label = defaultdict(lambda: 0)
    label_counts = defaultdict(lambda: 0)
    word_counts = defaultdict(lambda: defaultdict(lambda: 0.0))

    

    for ex_index, example in enumerate(data):
        label, sentence = example
        ## FILL CODE
        # return sentence
        n_examples += 1
        label_counts[label] += 1
        
        for index , words in enumerate(sentence):
            # if index>10 and len(data[ex_index][1])>40:
                # return n_examples, label_counts, word_counts, n_words_per_label
            n_words_per_label[label] += 1
            word_counts[label][words] += 1.0
          
    return {'label_counts': label_counts, 
            'word_counts': word_counts, 
            'n_examples': n_examples, 
            'n_words_per_label': n_words_per_label}


# count_words(data)
# if "__main__"==__name__ :
#     count_words(data)


Next, using the word and label counts from the previous function, we can implement the prediction function.

Here, `mu` is a regularization parameter (Laplace smoothing), and `sentence` is the list of words corresponding to the test example.

In [72]:
def predict(sentence, mu, label_counts, word_counts, n_examples, n_words_per_label):
    best_label = None
    best_score = float('-inf')

    for label in word_counts.keys():
        score = 0.0
        ## FILE CODE
        for words in sentence:
            V=len(word_counts[label])
            score += np.log((mu + word_counts[label][words] ) / (mu*V + n_words_per_label[label]))

        if  best_score< score:
            best_score = score
            best_label = label

    return best_label

The next function will be used to evaluate the Naive Bayes model on a validation set. It computes the accuracy for a particular regularization parameter `mu`.

In [73]:
def compute_accuracy(valid_data, mu, counts):
    accuracy = 0.0
    N= len(valid_data)
    label_counts=counts["label_counts"]
    word_counts=counts["word_counts"] 
    n_examples=counts["n_examples"]
    n_words_per_label=counts["n_words_per_label"]
    for label, sentence in valid_data:
        ## FILL CODE
        prediction = predict(sentence, mu, label_counts, word_counts, n_examples, n_words_per_label)
        if prediction == label:
            accuracy += 1.0
            
    accuracy/=N
    return accuracy*100

In [81]:
print("")
print("** Naive Bayes **")
print("")

mu = 1.0
train_data = load_data("./data/train1.txt")
valid_data = load_data("./data/valid1.txt")
counts = count_words(train_data)
print("Validation accuracy: %.2f%s" % (compute_accuracy(valid_data, mu, counts),"%"))


** Naive Bayes **

Validation accuracy: 91.50%


In [82]:
print("")
print("** Naive Bayes **")
print("")

mu = 1.0
train_data = load_data("./data/train2.txt")
valid_data = load_data("./data/valid2.txt")
counts = count_words(train_data)
print("Validation accuracy: %.2f%s" % (compute_accuracy(valid_data, mu, counts),"%"))


** Naive Bayes **

Validation accuracy: 96.60%
