In [2]:
#Imports
import json
import numpy
import sklearn 
#Base Naive Bayes classifier:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from evaluation import compute_macro_f1_score
from evaluation import compute_micro_f1_score

In [3]:
def _load_json(path):
    """Read the UTF-8 encoded JSON file at ``path`` and return its parsed content."""
    with open(path, 'r', encoding='utf-8') as fp:
        return json.load(fp)


# `json` is already imported in the imports cell at the top of the notebook.
train_data = _load_json('data/train.json')
print(len(train_data))

valid_old = _load_json('data/valid.json')
valid_new = _load_json('data/valid_new.json')

800003


## Setting up the predicted variable (language)

In [4]:
# Distinct language labels seen in the training data, as a list.
# The labels are sorted so that the label -> index mapping is the same on every
# run: iterating a set of strings directly can yield a different order after
# each kernel restart, which would silently reshuffle the indices.
languages = sorted({example['langid'] for example in train_data})

# langind: language label -> integer index (0 .. number of languages - 1).
langind = {lang: idx for idx, lang in enumerate(languages)}

# langname: the inverse lookup, integer index -> language label.
langname = {idx: lang for lang, idx in langind.items()}

### Applying a simple Naive Bayes classifier as a starting point

In [5]:
def my_tokenize(text):
    """Tokenize ``text``; the single shared tokenizer so every step splits text the same way.

    Deliberately simple for now: the text is lowercased and then split on
    runs of whitespace. Returns the tokens as a list of strings (an empty
    list for empty or whitespace-only input).
    """
    lowered = text.lower()
    return lowered.split()

def get_vocab(data):
    """Return the set of distinct tokens occurring in ``data``.

    ``data`` is a sequence of records; each record's ``'text'`` entry (the
    string whose language is to be determined) is tokenized with
    ``my_tokenize`` and every resulting token is added to the vocabulary.
    """
    return {
        token
        for record in data
        for token in my_tokenize(record['text'])
    }


vocab = get_vocab(train_data)

In [6]:
# Assign every word in the vocabulary a unique integer index.
# The words are sorted first so a given word receives the same index on every
# run; iterating the vocabulary set directly can produce a different order
# after each kernel restart.
wordind = {word: idx for idx, word in enumerate(sorted(vocab))}

In [7]:
## Build a matrix with one row per vocabulary word and one column per language.
## Each entry holds how often that word occurs in texts labelled with that language.
def get_freq_matrix(data, wordind, langind):
    """Count token occurrences per language.

    Parameters
    ----------
    data : sequence of records with ``'text'`` and ``'langid'`` entries.
    wordind : mapping from token to row index; looked up as ``wordind[token]``,
        so every token produced by ``my_tokenize`` must be present as a key.
    langind : mapping from language label to column index.

    Returns
    -------
    numpy array of shape ``(len(wordind), len(langind))`` holding the counts.
    """
    counts = numpy.zeros((len(wordind), len(langind)))
    for record in data:
        for token in my_tokenize(record['text']):
            # Row lookup first, then column lookup, one increment per token occurrence.
            counts[wordind[token], langind[record['langid']]] += 1
    return counts


train_matrix = get_freq_matrix(train_data, wordind, langind)

In [9]:
def get_multinomialNB(train_dict_x, train_y):
    """Fit a ``MultinomialNB`` classifier with default settings and return it.

    ``train_dict_x`` is the training feature matrix and ``train_y`` the
    matching labels; both are passed straight to ``MultinomialNB.fit``.
    """
    # fit() hands back the estimator itself, so creation and training can be chained.
    return MultinomialNB().fit(train_dict_x, train_y)
# Raw training texts and their language labels, in matching order.
X_train = [example['text'] for example in train_data]
y_train = [example['langid'] for example in train_data]

# Feature extraction steps: token counts (using the shared tokenizer),
# followed by TF-IDF re-weighting of those counts.
vec1 = CountVectorizer(tokenizer=my_tokenize)
vec = TfidfTransformer()

# Chain counts -> TF-IDF -> Multinomial Naive Bayes into a single estimator.
# Fitting the pipeline fits both vectorizer steps on X_train as well, so no
# separate fit_transform calls on vec1 / vec are needed.
naivebayes = make_pipeline(vec1, vec, MultinomialNB())
naivebayes.fit(X_train, y_train)



In [11]:
# Split each validation set into parallel lists: raw input texts (X) and
# gold language labels (y). The texts stay unvectorized here.
X_valid_old = [example['text'] for example in valid_old]
y_valid_old = [example['langid'] for example in valid_old]

X_valid_new = [example['text'] for example in valid_new]
y_valid_new = [example['langid'] for example in valid_new]

In [12]:
def run_tests(clf):
    """Print accuracy, micro-F1 and macro-F1 of ``clf`` on three data splits.

    Each printed line lists the scores in the order: old validation set,
    new validation set, training set. The splits are read from the notebook
    globals ``X_valid_old``/``y_valid_old``, ``X_valid_new``/``y_valid_new``
    and ``X_train``/``y_train``; ``clf`` only needs a ``predict`` method.
    """
    splits = (
        (X_valid_old, y_valid_old),
        (X_valid_new, y_valid_new),
        (X_train, y_train),
    )
    # Predict once per split; every metric below reuses the same predictions.
    scored = [(gold, clf.predict(texts)) for texts, gold in splits]

    metrics = (
        ("accuracies: ", accuracy_score),
        ("micro F1 scores: ", compute_micro_f1_score),
        ("macro F1 scores: ", compute_macro_f1_score),
    )
    for label, metric in metrics:
        print(label, *[metric(gold, predicted) for gold, predicted in scored])
    

In [13]:
# Report accuracy, micro-F1 and macro-F1 of the fitted Naive Bayes pipeline;
# each line lists old validation, new validation and training scores in that order.
run_tests(naivebayes)

accuracies:  0.96149 0.7686017988552739 0.9697038636105114
micro F1 scores:  0.9365013933088199 0.6552984165651644 0.9500167044062512
macro F1 scores:  0.7468627061170287 0.5594827395976023 0.7768699200593804
