# Uralic Language Identification Task - VarDial2021 - Part 5

This notebook contains the code developed by Team Phlyers for Tracks 1 and 2 of the ULI shared task at VarDial2021.

The first few blocks mount Google Drive and switch to the project directory.

In [None]:
# Load the Colab helper that exposes Google Drive to this runtime
from google.colab import drive

# Mount Drive under /content/drive so the project files can be reached by path.
# This will prompt for authorization.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
%cd /content/drive/My Drive/Colab Notebooks/ULI-VarDial2021

/content/drive/My Drive/Colab Notebooks/ULI-VarDial2021


This block defines the two classifiers we decided to use for the task: an SVM classifier to distinguish between 'target' and 'non-target' languages, and an NB classifier to distinguish among the 'target' languages.

In [None]:
import json
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler


def svm(train, eval):
    """Fit an SGD-trained linear classifier on character n-gram TF-IDF features and score it.

    Args:
        train: sequence of ``(label, sentence)`` pairs used for fitting.
        eval: sequence of ``(label, sentence)`` pairs used for scoring
            (the name shadows the builtin; it is kept for existing callers).

    Returns:
        list: the predicted label of every item in ``eval``, in the same order.

    Side effects:
        Prints the size of the training matrix, the macro-averaged F1 score
        and the per-label F1 scores on ``eval``.
    """
    # Separate texts from labels for both splits
    train_texts = [text for _, text in train]
    train_labels = [label for label, _ in train]
    eval_texts = [text for _, text in eval]
    eval_labels = [label for label, _ in eval]

    # Character 3- and 4-grams inside word boundaries, capped at 100000 features
    tfidf = TfidfVectorizer(analyzer='char_wb', ngram_range=(3,4), max_features=100000)
    # with_mean=False: variance scaling only, no centering of the sparse matrix
    variance_scaler = StandardScaler(with_mean=False)
    train_matrix = variance_scaler.fit_transform(tfidf.fit_transform(train_texts))

    n_rows, n_columns = train_matrix.shape
    print('Rows x: ' + str(n_rows))
    print('Columns x: ' + str(n_columns))
    print('Labels y: ' + str(len(train_labels)))

    # SGDClassifier with its default loss, as in the original "SVM" setup
    classifier = SGDClassifier(max_iter=7000)
    classifier.fit(train_matrix, train_labels)

    # Push the evaluation texts through the already-fitted vectorizer and scaler
    eval_matrix = variance_scaler.transform(tfidf.transform(eval_texts))
    predictions = classifier.predict(eval_matrix)

    # Report the macro-averaged F1, then the F1 of each label
    macro_f1 = f1_score(eval_labels, predictions, average='macro')
    print('F1_score:')
    print(macro_f1)
    print('F1_score per category:')
    print(f1_score(eval_labels, predictions, average=None))
    return list(predictions)

def mnb(train, submission, alpha, range, min):
    """Train a Multinomial Naive Bayes model on character n-gram TF-IDF features and label ``submission``.

    Args:
        train: sequence of ``(label, sentence)`` pairs used for fitting.
        submission: iterable of raw sentences to label.
        alpha: forwarded to ``MultinomialNB`` as its ``alpha`` argument.
        range: forwarded to ``TfidfVectorizer`` as ``ngram_range``
            (the name shadows the builtin; it is kept for existing callers).
        min: forwarded to ``TfidfVectorizer`` as ``min_df``
            (the name shadows the builtin; it is kept for existing callers).

    Returns:
        list: one predicted label per sentence of ``submission``, in order;
        an empty list when ``submission`` is empty.

    Side effects:
        Prints the size of the training matrix and progress messages.
    """
    # Materialise once so that any iterable is accepted and emptiness can be tested
    submission = list(submission)
    if not submission:
        # scikit-learn's input validation rejects a zero-row matrix, so an empty
        # submission used to crash only after the whole model had been trained.
        print('Empty submission: nothing to predict.')
        return []

    # Vectorize training set
    vectorizer = TfidfVectorizer(analyzer='char_wb', ngram_range=range, min_df=min, sublinear_tf=True)
    X_train = vectorizer.fit_transform([sentence for key,sentence in train])
    y_train = [key for key,sentence in train]
    print('Rows x: ' + str(X_train.shape[0]))
    print('Columns x: ' + str(X_train.shape[1]))
    print('Labels y: ' + str(len(y_train)))
    # Train a Naive Bayes classifier
    model = MultinomialNB(alpha=alpha)
    model.fit(X_train, y_train)
    print('Model fitted')
    # Vectorize the sentences to label with the vocabulary learned on the training set
    X_test = vectorizer.transform(submission)
    print('Test set vectorized.')
    # Predict
    ypred = model.predict(X_test)
    print('Predictions have been made.')
    return list(ypred)



This block loads the data.

In [None]:
# The corpus is stored in a dictionary in json format
# Dictionary format: {category:{language:[list of texts]}}
# Read it explicitly as UTF-8 so decoding does not depend on the platform default.
with open('data.json', encoding='utf-8') as f:
  data = json.load(f)

# Dataset is a flat list of tuples (category, lang, sentence)
dataset = [
  (category, lang, sentence)
  for category, languages in data.items()
  for lang, sentences in languages.items()
  for sentence in sentences
]

print("Length of the dataset:")
print(len(dataset))

# Shuffle with a fixed seed so the train/eval split (and every score computed
# from it) is the same after a kernel restart. A private Random instance
# leaves the global random state untouched.
SPLIT_SEED = 42
random.Random(SPLIT_SEED).shuffle(dataset)

# Split the data in a train and test set: the first fifth is held out for evaluation
n_eval = len(dataset)//5
train_list = dataset[n_eval:]
eval_list = dataset[:n_eval]


# (category, sentence) pairs for the target / non-target classifier.
# NOTE: `eval` shadows the builtin; the name is kept because later cells use it.
training = [(category, sentence) for category, _, sentence in train_list]
eval = [(category, sentence) for category, _, sentence in eval_list]


Length of the dataset:
1391043


This block runs the SVM on the data to single out 'target' languages.

In [None]:
# Stage 1: fit the category classifier on the training pairs and keep its
# predicted category for every evaluation pair, in the order of `eval`.
y_eval_pred = svm(training, eval)


Rows x: 1112835
Columns x: 100000
Labels y: 1112835
F1_score:
0.9954597740791868
F1_score per category:
[0.99580082 0.99511873]


This block runs the MNB on the singled out sentences.

In [None]:
# (language, sentence) pairs of the training split, restricted to the target category "UR"
training_ural = [
  (language, sentence)
  for category, language, sentence in train_list
  if category == "UR"
]

# Sentences of the eval split that the first-stage classifier labelled "UR", in eval order
eval_ural = [
  sentence
  for predicted, (_, _, sentence) in zip(y_eval_pred, eval_list)
  if predicted == "UR"
]

# Gold labels: the language for "UR" sentences; every other sentence keeps its
# category label (the original note refers to it as the default label 'NA')
true_labels = [
  language if category == "UR" else category
  for category, language, _ in eval_list
]

# Stage 2: label the singled-out sentences with the MNB
# (alpha, then the character n-gram range, then min_df)
y_ural_predict = mnb(training_ural, eval_ural, 0.0000001, (3,5), 0.000001)



Rows x: 517288
Columns x: 1539060
Labels y: 517288
Model fitted
Test set vectorized.
Predictions have been made.


Combine the predictions of the two classifiers, and calculate the macro and micro F1 scores.

In [None]:
# The MNB produced one prediction per sentence that the SVM labelled "UR", in the
# same order, so the two stages must agree on how many such sentences there are.
n_target = sum(1 for prediction in y_eval_pred if prediction == "UR")
assert n_target == len(y_ural_predict), "SVM and MNB predictions are misaligned"

# Read the MNB predictions through an iterator rather than pop(0): the list is
# left intact (the cell can be re-run) and each step is O(1).
ural_predictions = iter(y_ural_predict)

predicted_labels = []
for prediction in y_eval_pred:
  if prediction == "UR":
    # if the prediction of the SVM classifier is that of a target language, retrieve the language label from the MNB predictions
    predicted_labels.append(next(ural_predictions))
  else:
    # if the prediction of the SVM classifier is that of a non-target language, keep the SVM's own label
    predicted_labels.append(prediction)

print("Macro F1:")
print(f1_score(true_labels, predicted_labels, average="macro"))
print("Micro F1:")
print(f1_score(true_labels, predicted_labels, average="micro"))


Macro F1:
0.9141107943083907
Micro F1:
0.9893029675638372
