<a href="https://colab.research.google.com/github/Dagobert42/langID-NLP/blob/main/langID_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import string
import re
from nltk import ngrams
from collections import defaultdict
import nltk
import collections
import torch

In [None]:
# read data
# this was written for the WiLI-2018 data set: https://zenodo.org/record/841984
# make sure txt-files are in the specified directory when running this
# the context managers close each file as soon as its content has been read
with open('x_train.txt', encoding="utf8") as x_file:
    X_train = x_file.read().split('\n')
with open('y_train.txt', encoding="utf8") as y_file:
    Y_train = y_file.read().split('\n')
labels = pd.read_csv('labels.csv', delimiter = ';')
# split('\n') leaves one trailing empty string when a file ends with a newline;
# drop only that artefact so a file without a final newline keeps its last line
if Y_train[-1] == '':
    Y_train = Y_train[:-1]
if X_train[-1] == '':
    X_train = X_train[:-1]

In [None]:
# strip punctuation, digits and symbols from every sample,
# then squeeze each run of spaces down to a single space
extras = '!"$%&/{}[]()=?\\`´*+~#-_.:,;<>|1234567890°-\'' # every character listed here is deleted
rx = '[{}]'.format(re.escape(extras)) # character class matching any one of the extras
extras_pattern = re.compile(rx)
spaces_pattern = re.compile(' +')
x_train = [
    spaces_pattern.sub(' ', extras_pattern.sub('', example))
    for example in X_train
]

In [None]:
# translate the label codes into English language names => 'en' -> 'English'
# lab_dict pairs every entry of the 'Label' column with the 'English' entry of the same row
lab_dict = dict(zip(labels['Label'], labels['English']))
# the literal label 'nan' is not looked up in lab_dict but mapped by hand
y_train = [ 'Min Nan Chinese' if item == 'nan' else lab_dict[item] for item in Y_train ]

In [None]:
# bucket the cleaned sentences by the language name found at the same position
lang_corpora = defaultdict(list)
for position, sentence in enumerate(x_train):
    lang_corpora[y_train[position]].append(sentence)

In [None]:
# builds the character n-grams of every language
def n_grams_per_lang(n, data):
    """Collect all length-n slices of every sentence, grouped by language.

    n    -- length of the slices taken from each sentence
    data -- dict of lang : corpus, where a corpus is a list of sentences

    Returns a defaultdict(list) of lang : n-grams. The n-grams of one language
    are listed sentence by sentence, left to right, duplicates included.
    A language only gets an entry if its corpus holds at least one sentence.
    """
    gram_per_lang = defaultdict(list)
    for lang, corpus in data.items():
        for sent in corpus:
            # one slice per start position that still leaves room for n items
            last_start = len(sent) - n + 1
            gram_per_lang[lang].extend(sent[start:start + n] for start in range(last_start))

    return gram_per_lang

In [None]:
# counting and sorting n-grams for each language
# data has to be a dict of lang : n-grams
# returns a sorted dict of lang : {n-gram : count}
def sort_by_tf(data):
    """Count the n-grams of every language and order them by frequency.

    data -- dict of lang : n-grams (an iterable of hashable n-grams per language)

    Returns a defaultdict(list) of lang : {n-gram : count}. Each inner dict is
    ordered from the most to the least frequent n-gram; n-grams with equal
    counts stay in the order in which they first occur in the input.
    """
    sorted_tf_per_lang = defaultdict(list)
    for lang, grams in data.items():
        # a single Counter pass per language; most_common() sorts by descending
        # count and keeps first-encountered order among equal counts
        sorted_tf_per_lang[lang] = dict(collections.Counter(grams).most_common())

    return sorted_tf_per_lang

In [None]:
# make some n-grams and print examples
n_samples = 10 # number of most frequent n-grams looked at per language
languages = ['German', 'English', 'Arabic'] # languages whose top n-grams are printed
# languages that are compared pairwise for shared top n-grams
# NOTE(review): 'Portugese' was respelled 'Portuguese' on the assumption that the name
# has to match the 'English' column of labels.csv - confirm against that file
latin_languages = ['German', 'English', 'French', 'Spanish', 'Italian', 'Portuguese', 'Estonian',
                    'Turkish', 'Romanian', 'Swedish', 'Latin', 'Dutch']

for n in range(3, 6):
    ngpl = n_grams_per_lang(n, lang_corpora)
    sorted_tf_per_lang = sort_by_tf(ngpl)

    # print some examples
    for lang_key in languages:
        # sort_by_tf returns a defaultdict(list): indexing an unknown language would
        # insert a list, which has no .keys()/.values(); .get() falls back to an empty dict
        tf = sorted_tf_per_lang.get(lang_key, {})
        print(lang_key, ':', n, '- grams')
        print(list(tf.keys())[:n_samples])
        print(list(tf.values())[:n_samples])
    print('##########################')

    # the n_samples most frequent n-grams of every compared language, computed once per n
    top_per_lang = { lang : list(sorted_tf_per_lang.get(lang, {}))[:n_samples] for lang in latin_languages }

    ng_related = {}
    for lang_key in latin_languages:
        for otherlang in latin_languages:
            if otherlang == lang_key:
                continue
            # n-grams that appear in the top lists of both languages
            common_ngrams = list(set(top_per_lang[lang_key]).intersection(top_per_lang[otherlang]))
            if len(common_ngrams) > 0:
                # NOTE: every match overwrites the previous one, so ng_related keeps
                # only the last language in latin_languages that shares an n-gram
                ng_related[lang_key] = otherlang

    # n is an int and has to be converted before it can be concatenated to a str
    print('common '+str(n)+ '- grams dictionary: ')
    print(ng_related)

German : 3 - grams
['en ', 'er ', ' de', 'der', 'sch', 'ie ', 'che', 'nd ', 'ein', 'ch ']
[3915, 3021, 2241, 1655, 1473, 1336, 1184, 1175, 1101, 1073]
English : 3 - grams
[' th', 'he ', 'the', 'ed ', ' in', ' an', 'nd ', 'and', ' of', 'of ']
[2838, 2765, 2569, 1660, 1371, 1326, 1287, 1277, 1240, 1182]
Arabic : 3 - grams
[' ال', 'الم', 'ية ', 'في ', ' في', 'ة ا', ' من', 'من ', 'ن ا', 'ات ']
[8365, 1595, 1468, 1441, 1424, 1378, 1168, 1086, 1084, 937]
##########################
German : 4 - grams
['der ', ' der', 'und ', ' die', ' und', 'den ', 'die ', 'ten ', 'sche', ' ein']
[1316, 1137, 886, 863, 845, 800, 782, 687, 656, 631]
English : 4 - grams
[' the', 'the ', ' of ', 'and ', ' and', ' in ', 'ing ', ' to ', 'tion', 'ion ']
[2368, 2149, 1179, 1149, 1067, 926, 900, 763, 630, 519]
Arabic : 4 - grams
[' الم', 'ة ال', ' في ', 'ن ال', ' من ', ' وال', 'ي ال', 'ت ال', ' الأ', ' الت']
[1359, 1300, 1289, 1015, 948, 696, 686, 626, 611, 603]
##########################
German : 5 - grams
[' der ',