# AMMI NLP - Part 1
## Lab 1: Introduction to text classification 


### Section 1: Text Classification with Naive Bayes Classifier 

In this part you'll implement naive Bayes classifier to classify the text. 
you need to build a model that predicts the langauge of the text given the words of the text 

In [1]:
import io, sys, math, re
from collections import defaultdict
import numpy as np

In [2]:
def load_data(filename):
    '''
    Parameters:
    filename (string): path to file to be read
    
    Return: 
    List of tuples (explained in first question)
    '''
    fin = io.open(filename, 'r', encoding='utf-8')
    data = []
    for line in fin:
        tokens = line.split()
        data.append((tokens[0], tokens[1:]))
    return data

In [3]:
data = load_data("train1.txt")
data[0]
# Tuple 

('__label__deu',
 ['Ich', 'würde', 'alles', 'tun,', 'um', 'dich', 'zu', 'beschützen.'])

In [4]:
def count_words(data):
    '''
    Parameters:
    
    data is  list of [(label, words), (label, worlds), ......]
    list of tuples in the shape (string, [list of strings]) )
    
    Returns: 
    
    This function should return a dictionary containing the following:
    { 
    # label_counts (python dictionary): 
         {label:  no. of times the label appeared },
    # word_counts  (dictionary of dictionaries): 
         {label: {word: no. of times this word appeared with this label }},
    # label_total (int): 
        total number of labels. (size of train data),
    # word_total  (python dictionary) total number of words (from the entire corupus) of the particular label:
          {label: no.of words}
          
          }
    
    '''
    label_total = 0
    word_total = defaultdict(lambda: 0)
    label_counts = defaultdict(lambda: 0)
    word_counts = defaultdict(lambda: defaultdict(lambda: 0.0))

    for example in data:
        label, sentence = example
        ## FILL CODE


    return {'label_counts': label_counts, 
            'word_counts': word_counts, 
            'label_total': label_total, 
            'word_total': word_total}

In [5]:
def predict(sentence, mu, label_counts, word_counts, label_total, word_total):
    '''
     Parameters: 
        sentence (string): sentence to be classified
        mu (positive real number): Laplace Smoothing hyperparameter
        ** The other parameters introduced in the count_words function
    
    Returns:
    best_label (string): the label that has the highest score. 
    
    Implement the function to predict the best label for the given sentence using Naive Bayes algorithm 
    
    '''
    best_label = None
    best_score = float('-inf')

    for label in word_counts.keys():
        score = 0.0
        ## FILL CODE

    return best_label


In [6]:
def compute_accuracy(valid_data, mu, counts):
    '''
    Parameters:
    valid_data (list of tuples): returned value of load_data function 
    mu (positive real): Laplace smoothing hyper-parameter
    counts (dictionary of dictionaries): return value of count_words_function
    
    Returns: 
    accuracy (float): the accuracy of the Naive Bayes classifier
    '''
    accuracy = 0.0
    for label, sentence in valid_data:
         ## FILL CODE
            pass 

    return accuracy


In [7]:
print("")
print("** Naive Bayes **")
print("")

mu = 1.0
train_data = load_data("train1.txt")
valid_data = load_data("valid1.txt")
counts = count_words(train_data)

print("Validation accuracy: %.3f" % compute_accuracy(valid_data, mu, counts))
print("")



** Naive Bayes **

Validation accuracy: 0.000



## Section 2: Softmax Classification of Text 

In [8]:
def build_dict(filename, threshold=1):
    fin = io.open(filename, 'r', encoding='utf-8')
    word_dict, label_dict = {}, {}
    counts = defaultdict(lambda: 0)
    for line in fin:
        tokens = line.split()
        label = tokens[0]

        if not label in label_dict:
            label_dict[label] = len(label_dict)

        for w in tokens[1:]:
            counts[w] += 1
            
    for k, v in counts.items():
        if v > threshold:
            word_dict[k] = len(word_dict)
    return word_dict, label_dict

In [9]:
word_dict, label_dict = build_dict("train1.txt")

In [10]:
def load_data(filename, word_dict, label_dict):
    fin = io.open(filename, 'r', encoding='utf-8')
    data = []
    dim = len(word_dict)
    for line in fin:
        tokens = line.split()
        label = tokens[0]

        yi = label_dict[label]
        xi = np.zeros(dim)
        for word in tokens[1:]:
            if word in word_dict:
                wid = word_dict[word]
                xi[wid] += 1.0
        data.append((yi, xi))
    return data

In [11]:
d = load_data("train1.txt", word_dict, label_dict)

In [12]:
def softmax(x):
    ## FILL CODE
    m = x.max()
    y = np.exp(x - m)
    return y / np.sum(y)
    #return None

In [25]:
def sgd(w, data, niter):
    nlabels, dim = w.shape
    for iter in range(niter):
        ## FILL CODE
        train_loss = 0.0
        for yi, xi in train_data:
            # We compute the prediction of model and loss
            prediction = softmax(np.dot(w, xi))
            train_loss += -math.log(prediction[yi])
            # We compute the gradient w.r.t. to w
            target = np.zeros(nlabels)
            target[yi] = 1.0
            error = prediction - target
            gradient = error.reshape((nlabels, 1)) * xi.reshape((1, dim))
            # We apply the gradient step
            w = w - 0.5 * gradient
        print("Iter: %02d    Loss: %.4f" % (iter, train_loss / len(data)))
        
    return w

In [26]:
def predict(w, x):
    ## FILL CODE
    return np.argmax(softmax(np.dot(w, x)))
    #return None

In [27]:
def compute_accuracy(w, valid_data):
    ## FILL CODE
    accuracy = 0.0
    for yi, xi in valid_data:
        yp = predict(w, xi)
        if yp == yi:
            accuracy += 1.0
    return accuracy / len(valid_data)
    #return 0.0

In [28]:
print("")
print("** Logistic Regression **")
print("")

word_dict, label_dict = build_dict("train1.txt")
train_data = load_data("train1.txt", word_dict, label_dict)
valid_data = load_data("valid1.txt", word_dict, label_dict)

nlabels = len(label_dict)
dim = len(word_dict)
w = np.zeros([nlabels, dim])
w = sgd(w, train_data, 5)
print("")
print("Validation accuracy: %.3f" % compute_accuracy(w, valid_data))
print("")


** Logistic Regression **

Iter: 00    Loss: 0.4906
Iter: 01    Loss: 0.2052
Iter: 02    Loss: 0.1557
Iter: 03    Loss: 0.1313
Iter: 04    Loss: 0.1167

Validation accuracy: 0.933

