# Uralic Language Identification Task - VarDial2021 - Part 6

This notebook contains the code developed by Team Phlyers for Tracks 1 and 2 of the ULI shared task at VarDial2021.

The first few blocks are needed to set up the directory.

In [None]:
# Import Colab's Google Drive helper; this module is only available when the
# notebook runs inside Google Colab.
from google.colab import drive

# Mount Google Drive at /content/drive. This will prompt for authorization
# the first time it runs in a session.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Switch to the project folder on the mounted Drive: the relative paths used
# below ('data.json', 'test.txt' and the output file) are resolved against it.
%cd /content/drive/My Drive/Colab Notebooks/ULI-VarDial2021

/content/drive/My Drive/Colab Notebooks/ULI-VarDial2021


This block defines the two classifiers we decided to use for the task: an SVM classifier to distinguish between 'target' and 'non-target' languages, and an NB classifier to distinguish among the 'target' languages.

In [None]:
import json
import random
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler

def svm(train, test, random_state=None):
    """Label sentences with a linear classifier trained by SGD.

    Sentences are represented as TF-IDF weighted character n-grams (3- and
    4-grams taken inside word boundaries, at most 100,000 features), scaled to
    unit variance, and classified with ``SGDClassifier`` (whose default hinge
    loss makes it a linear SVM).

    Args:
        train: iterable of ``(label, sentence)`` pairs used for training.
        test: iterable of sentences to classify.
        random_state: optional seed forwarded to ``SGDClassifier``. SGD
            shuffles the training data, so results vary between runs unless a
            seed is given. The default ``None`` keeps the original unseeded
            behaviour.

    Returns:
        A list with one predicted label per test sentence, in input order.
        An empty list is returned (without training) when ``test`` is empty.
    """
    # Materialise both inputs: `train` is traversed twice below, which would
    # silently yield no labels if a one-shot iterator were passed in.
    train = list(train)
    test = list(test)

    # Nothing to classify: skip the expensive training step, and avoid the
    # error scikit-learn raises when asked to predict on zero samples.
    if len(test) == 0:
        return []

    train_labels = [label for label, _ in train]
    train_sentences = [sentence for _, sentence in train]

    # Vectorize the training set
    vectorizer = TfidfVectorizer(analyzer='char_wb', ngram_range=(3, 4), max_features=100000)
    # with_mean=False: centering would turn the sparse matrix into a dense one
    scaler = StandardScaler(with_mean=False)
    X_train = scaler.fit_transform(vectorizer.fit_transform(train_sentences))
    print('Rows x: ' + str(X_train.shape[0]))
    print('Columns x: ' + str(X_train.shape[1]))
    print('Labels y: ' + str(len(train_labels)))

    # Train the linear SVM with stochastic gradient descent
    model = SGDClassifier(max_iter=7000, random_state=random_state)
    model.fit(X_train, train_labels)

    # Vectorize the test sentences with the vocabulary and scaling fitted above
    X_test = scaler.transform(vectorizer.transform(test))

    # Predict
    return list(model.predict(X_test))

def mnb(train, submission, alpha, range, min):
    """Label sentences with a Multinomial Naive Bayes classifier.

    Sentences are represented as TF-IDF weighted character n-grams (taken
    inside word boundaries, with sublinear term-frequency scaling).

    Args:
        train: iterable of ``(label, sentence)`` pairs used for training.
        submission: iterable of sentences to classify.
        alpha: smoothing parameter passed to ``MultinomialNB``.
        range: ``(min_n, max_n)`` tuple passed as the vectorizer's
            ``ngram_range``.
        min: value passed as the vectorizer's ``min_df``.

    Note:
        ``range`` and ``min`` shadow the built-ins of the same name inside
        this function. The names are kept so existing callers keep working.

    Returns:
        A list with one predicted label per submitted sentence, in input
        order. An empty list is returned (without training) when
        ``submission`` is empty.
    """
    # Materialise both inputs: `train` is traversed twice below, which would
    # silently yield no labels if a one-shot iterator were passed in.
    train = list(train)
    submission = list(submission)

    # Nothing to classify (e.g. the first stage flagged no target sentence):
    # skip the expensive training step, and avoid the error scikit-learn
    # raises when asked to predict on zero samples.
    if len(submission) == 0:
        return []

    train_labels = [label for label, _ in train]
    train_sentences = [sentence for _, sentence in train]

    # Vectorize the training set
    vectorizer = TfidfVectorizer(analyzer='char_wb', ngram_range=range, min_df=min, sublinear_tf=True)
    X_train = vectorizer.fit_transform(train_sentences)
    print('Rows x: ' + str(X_train.shape[0]))
    print('Columns x: ' + str(X_train.shape[1]))
    print('Labels y: ' + str(len(train_labels)))

    # Train a Naive Bayes classifier
    model = MultinomialNB(alpha=alpha)
    model.fit(X_train, train_labels)
    print('Model fitted')

    # Vectorize the submission with the vocabulary fitted above
    X_test = vectorizer.transform(submission)
    print('Test set vectorized.')

    # Predict
    ypred = model.predict(X_test)
    print('Predictions have been made.')
    return list(ypred)









This block loads the data.


In [None]:
# The corpus is stored in a dictionary in JSON format:
# {category: {language: [list of texts]}}
# The encoding is given explicitly so loading does not depend on the platform
# default; the test file below is read as UTF-8 as well.
with open('data.json', encoding='utf-8') as f:
    data = json.load(f)

# Flatten the nested dictionary into (category, lang, sentence) tuples,
# keeping the order in which the entries appear in the file.
dataset = [
    (category, lang, sentence)
    for category, languages in data.items()
    for lang, sentences in languages.items()
    for sentence in sentences
]

print("Length of the dataset:")
print(len(dataset))

# First-stage training data: only the category label is kept.
training = [(category, sentence) for category, _, sentence in dataset]

# Characters stripped from every test line: ASCII punctuation, digits, a few
# typographic marks and tabs. The table is identical for every line, so it is
# built once instead of once per line.
strip_table = str.maketrans('', '', string.punctuation + '|-0123456789”„…' + '\t')

# One cleaned sentence per line of the test file (a line that ends up empty is
# kept as '' so that positions still match the lines of the file).
# The `with` block guarantees that the file handle is closed.
test_sentences = []
with open('test.txt', encoding='utf-8', errors='ignore') as test_file:
    for line in test_file:
        cleaned = line.lower().translate(strip_table).replace('http', '').replace('www', '')
        # split/join collapses runs of whitespace and drops the trailing newline
        test_sentences.append(' '.join(cleaned.split()))


Length of the dataset:
1391043


This block runs the SVM on the data to single out 'target' languages.

In [None]:
# First stage: predict a category label for every cleaned test sentence.
# The result has one entry per element of test_sentences, in the same order.
y_eval_pred = svm(training, test_sentences)



Rows x: 1391043
Columns x: 100000
Labels y: 1391043


This block runs the MNB on the sentences that the SVM singled out as 'target' languages.

In [None]:
# Second-stage training data: (language, sentence) pairs taken from the
# target ("UR") category only.
training_ural = [
    (lang, text)
    for cat, lang, text in dataset
    if cat == "UR"
]

# Test sentences that the first stage labelled "UR". Their relative order is
# preserved so the predictions can be matched back to the full test set.
test_ural = [
    text
    for label, text in zip(y_eval_pred, test_sentences)
    if label == "UR"
]

# Predict the language of those sentences only.
# Positional arguments: alpha, n-gram range, min_df.
y_ural_predict = mnb(training_ural, test_ural, 0.0000001, (3, 5), 0.000001)




Rows x: 646043
Columns x: 1677212
Labels y: 646043
Model fitted
Test set vectorized.
Predictions have been made.


Combine the predictions of the two classifiers.

In [None]:
# Every sentence the SVM labelled "UR" must have exactly one MNB prediction;
# fail early with a clear message if the two stages are out of sync.
n_target = sum(1 for prediction in y_eval_pred if prediction == "UR")
if n_target != len(y_ural_predict):
    raise ValueError(
        'Expected ' + str(n_target) + ' MNB predictions, got ' + str(len(y_ural_predict))
    )

# Consume the MNB predictions through an iterator rather than with pop(0):
# this leaves y_ural_predict untouched, so the cell can be re-run, and it
# avoids the quadratic cost of repeatedly removing the first list element.
ural_labels = iter(y_ural_predict)

predicted_labels = []
for prediction in y_eval_pred:
    if prediction == "UR":
        # Target language according to the SVM: use the language label
        # predicted by the MNB for this sentence.
        predicted_labels.append(next(ural_labels))
    else:
        # Non-target language according to the SVM: keep the SVM label as is
        # (presumably 'NA'; verify against the categories in data.json).
        predicted_labels.append(prediction)

# Write one label per line, in the order of the test sentences.
with open('ULI-track-1-Phlyers.txt', 'w', encoding='utf-8') as f:
    for label in predicted_labels:
        f.write(label + '\n')


