# Uralic Language Identification Task - VarDial2021 - Part 1

This notebook contains the code developed by Team Phlyers to extract the Wanca 2017 Corpus (Jauhiainen et al. 2020) for the ULI shared task at VarDial2021.

The first few cells mount Google Drive and change into the project directory.

In [None]:
# Bring in Colab's Google Drive helper.
from google.colab import drive

# Mounting asks for authorization.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Switch to the project folder on the mounted Drive so that the relative paths
# used below ('ULI_data', 'data.json') resolve inside it.
%cd /content/drive/My Drive/Colab Notebooks/ULI-VarDial2021

/content/drive/My Drive/Colab Notebooks/ULI-VarDial2021


This cell loads the data, stores it in a dictionary, and dumps that dictionary to `data.json`.

In [None]:
from collections import defaultdict
import os
import string
import json

###############
#Load the data
###############

# Deletion table applied to every line: ASCII punctuation, digits, a few
# typographic characters and tabs are removed. Built once, not once per line.
_STRIP_TABLE = str.maketrans('', '', string.punctuation+'|-0123456789”„…'+'\t')


def _read_sentences(path, max_lines=None, drop_last_token=False, strip_url_markers=False):
    """Read a corpus file and return its lines as cleaned, lower-cased sentences.

    Each line is lower-cased, stripped of the characters in ``_STRIP_TABLE``,
    and its whitespace-separated tokens are re-joined with single spaces.

    Args:
        path: Path of the text file to read (decoded as UTF-8; undecodable
            bytes are ignored).
        max_lines: If given, only the first ``max_lines`` lines are read and
            reading stops there instead of scanning the rest of the file.
        drop_last_token: If true, the final token of every line is discarded.
        strip_url_markers: If true, the substrings 'http' and 'www' are removed
            before tokenising.

    Returns:
        A list with one cleaned string per line read.
    """
    sentences = []
    # The context manager closes the handle even if cleaning a line fails.
    with open(path, encoding='utf-8', errors='ignore') as handle:
        for line_no, line in enumerate(handle):
            if max_lines is not None and line_no >= max_lines:
                break
            text = line.lower().translate(_STRIP_TABLE)
            if strip_url_markers:
                text = text.replace('http', '').replace('www', '')
            tokens = text.split()
            if drop_last_token:
                tokens = tokens[:-1]
            sentences.append(' '.join(tokens))
    return sentences


# Dictionary format: {category:{language:[list of texts]}}
lang_dic=defaultdict(dict)

# os.listdir returns entries in arbitrary order; sorting keeps the key order of
# the resulting dictionary (and of the dumped JSON) stable between runs.
for file in sorted(os.listdir('ULI_data')):
    print(file, len(file))
    path = os.path.join('ULI_data', file)
    # The first three characters of the file name are used as the language name
    name = file[:3]
    # All files of length 7 contain the 'target' languages
    if len(file) == 7:
        # 'UR' is the label we attribute to the target language category.
        # The last token of each line is dropped here -- presumably a trailing
        # non-text field in these files; TODO confirm against the corpus format.
        lang_dic['UR'][name] = _read_sentences(path, drop_last_token=True)
    # The range covers all the files containing 'non-target' languages
    elif len(file) in range(29,42):
        # 'Na' is the label we attribute to the non-target language category.
        # Only the first 5000 lines of each non-target file are kept.
        lang_dic['Na'][name] = _read_sentences(path, max_lines=5000, strip_url_markers=True)

# Overview of what was loaded: each heading is followed by its value on the
# next line.
print('Categories:', lang_dic.keys(), sep='\n')
print('# of Target langs:', len(lang_dic['UR']), sep='\n')
print('# of Non-target langs:', len(lang_dic['Na']), sep='\n')
print('Target langs:', lang_dic['UR'].keys(), sep='\n')
print('Non-Target langs:', lang_dic['Na'].keys(), sep='\n')

# Show the first five cleaned sentences for a few sample languages.
print("Examples:")
for category, language in (('UR', 'nio'), ('UR', 'sme'), ('Na', 'ceb'), ('Na', 'fin')):
    print(lang_dic[category][language][:5])


# Serialise the corpus dictionary to data.json in the working directory
with open('data.json', mode='w') as corpus_file:
    json.dump(lang_dic, fp=corpus_file)

