In [1]:
import nltk
nltk.download('udhr')
from nltk.corpus import udhr

[nltk_data] Downloading package udhr to
[nltk_data]     C:\Users\gauta\AppData\Roaming\nltk_data...
[nltk_data]   Package udhr is already up-to-date!


In [2]:
## to see how many languages exist
#print(f"There are {len(udhr.fileids())} files with the following ids: {udhr.fileids()}")

In [8]:
## ngram extractor

#import typing
# module used for declaring and integrating generic and default types

#def extract_xgrams(text: str, n_vals: typing.List[int]) -> typing.List[str]:
def extract_xgrams(text,n_vals):
    ## extract n grams from text. letter by letter
    xgrams = []
    
    for n in n_vals:
        # if n > len(text) then no ngrams will fit, and we would return an empty list
        if n < len(text):
            for i in range(len(text) - n + 1) :
                ng = text[i:i+n]
                xgrams.append(ng)
        
    return xgrams

text = "test sentence.".lower()

# Extract uni/bi/tri grams
ngrams = extract_xgrams(text, n_vals=range(1,3))

print(ngrams)

['t', 'e', 's', 't', ' ', 's', 'e', 'n', 't', 'e', 'n', 'c', 'e', '.', 'te', 'es', 'st', 't ', ' s', 'se', 'en', 'nt', 'te', 'en', 'nc', 'ce', 'e.']


In [11]:
## model

from collections import Counter


def build_model(text, n_vals):
    """
    Build a simple model of probabilities of xgrams of various lengths in a text
    Parms:
        text: the text from which to extract the n_grams
        n_vals: a list of n_gram sizes to extract
    Returns:
        A dictionary of ngrams and their probabilities given the input text
    """
    model = Counter(extract_xgrams(text, n_vals))  
    num_ngrams = sum(model.values())

    for ng in model:
        model[ng] = model[ng] / num_ngrams

    return model

test_model = build_model(text, n_vals=range(1,4))
print({k: v for k, v in sorted(test_model.items(), key=lambda item: item[1], reverse=True)})

{'e': 0.10256410256410256, 't': 0.07692307692307693, 's': 0.05128205128205128, 'n': 0.05128205128205128, 'te': 0.05128205128205128, 'en': 0.05128205128205128, ' ': 0.02564102564102564, 'c': 0.02564102564102564, '.': 0.02564102564102564, 'es': 0.02564102564102564, 'st': 0.02564102564102564, 't ': 0.02564102564102564, ' s': 0.02564102564102564, 'se': 0.02564102564102564, 'nt': 0.02564102564102564, 'nc': 0.02564102564102564, 'ce': 0.02564102564102564, 'e.': 0.02564102564102564, 'tes': 0.02564102564102564, 'est': 0.02564102564102564, 'st ': 0.02564102564102564, 't s': 0.02564102564102564, ' se': 0.02564102564102564, 'sen': 0.02564102564102564, 'ent': 0.02564102564102564, 'nte': 0.02564102564102564, 'ten': 0.02564102564102564, 'enc': 0.02564102564102564, 'nce': 0.02564102564102564, 'ce.': 0.02564102564102564}


In [None]:
## model

from collections import Counter


def build_modelwords(text, n_vals):
    """
    Build a simple model of probabilities of xgrams of various lengths in a text
    Parms:
        text: the text from which to extract the n_grams
        n_vals: a list of n_gram sizes to extract
    Returns:
        A dictionary of ngrams and their probabilities given the input text
    """
    model = Counter(extract_xgrams(text, n_vals))  
    num_ngrams = sum(model.values())

    for ng in model:
        model[ng] = model[ng] / num_ngrams

    return model

test_model = build_model(text, n_vals=range(1,4))
print({k: v for k, v in sorted(test_model.items(), key=lambda item: item[1], reverse=True)})

In [13]:
languages = ['english', 'german', 'dutch', 'french', 'italian', 'spanish']
language_ids = ['English-Latin1', 'German_Deutsch-Latin1', 'Dutch_Nederlands-Latin1', 'French_Francais-Latin1', 'Italian_Italiano-Latin1', 'Spanish_Espanol-Latin1']

# First use this function to find the language file id
def retrieve_fileid_by_first_letter(fileids, letter):
    return [id for id in fileids if id.lower().startswith(letter.lower())]

# Example usage
print(f"Fileids beginning with 'R': {retrieve_fileid_by_first_letter(udhr.fileids(), letter='R')}")

# Then copy-paste the language name and language id into the relevant list:
languages += []
language_ids += []

Fileids beginning with 'R': ['Rarotongan_MaoriCookIslands-Latin1', 'Rhaeto-Romance_Rumantsch-Latin1', 'Romani-Latin1', 'Romani-UTF8', 'Romanian-Latin2', 'Romanian_Romana-Latin2', 'Rukonzo_Konjo-Latin1', 'Rundi_Kirundi-Latin1', 'Runyankore-rukiga_Nkore-kiga-Latin1', 'Russian-Cyrillic', 'Russian-UTF8', 'Russian_Russky-Cyrillic', 'Russian_Russky-UTF8']


In [14]:
raw_texts = {language: udhr.raw(language_id) for language, language_id in zip(languages, language_ids)}
print(raw_texts['english'][:1000]) # Just print the first 1000 characters

# Build a model of each language
models = {language: build_model(text=raw_texts[language], n_vals=range(1,6)) for language in languages}
print(models['german'])

Universal Declaration of Human Rights
Preamble
Whereas recognition of the inherent dignity and of the equal and inalienable rights of all members of the human family is the foundation of freedom, justice and peace in the world, 

Whereas disregard and contempt for human rights have resulted in barbarous acts which have outraged the conscience of mankind, and the advent of a world in which human beings shall enjoy freedom of speech and belief and freedom from fear and want has been proclaimed as the highest aspiration of the common people, 

Whereas it is essential, if man is not to be compelled to have recourse, as a last resort, to rebellion against tyranny and oppression, that human rights should be protected by the rule of law, 

Whereas it is essential to promote the development of friendly relations between nations, 

Whereas the peoples of the United Nations have in the Charter reaffirmed their faith in fundamental human rights, in the dignity and worth of the human person and in

In [16]:
import math
import typing
def calculate_cosine(a: typing.Dict[str, float], b: typing.Dict[str, float]) -> float:
    """
    Calculate the cosine between two numeric vectors
    Params:
        a, b: two dictionaries containing items and their corresponding numeric values
        (e.g. ngrams and their corresponding probabilities)
    """
    numerator = sum([a[k]*b[k] for k in a if k in b])
    denominator = (math.sqrt(sum([a[k]**2 for k in a])) * math.sqrt(sum([b[k]**2 for k in b])))
    return numerator / denominator

In [21]:
import typing
def identify_language(
    text: str,
    language_models: typing.Dict[str, typing.Dict[str, float]],
    n_vals: typing.List[int]
    ) -> str:
    """
    Given a text and a dictionary of language models, return the language model 
    whose ngram probabilities best match those of the test text
    Params:
        text: the text whose language we want to identify
        language_models: a Dict of Dicts, where each key is a language name and 
        each value is a dictionary of ngram: probability pairs
        n_vals: a list of n_gram sizes to extract to build a model of the test 
        text; ideally reflect the n_gram sizes used in 'language_models'
    """
    text_model = build_model(text, n_vals)
    language = ""
    max_c = 0
    for m in language_models:
        c = calculate_cosine(language_models[m], text_model)
        print(c)
        # The following line is just for demonstration, and can be deleted
        print(f'Language: {m}; similarity with test text: {c}')
        if c > max_c:
            max_c = c
            language = m
    return language




text = "kjsdcgh hdcgaey lhveriuf"
print(f"Test text: {text}")
print(f"Identified language: {identify_language(text, models, n_vals=range(1,6))}")

# Prints
# Test text: i was taught that the way of progress was neither swift nor easy.
# Language: english; similarity with test text: 0.7812347488239613
# Language: german; similarity with test text: 0.6638235631734796
# Language: dutch; similarity with test text: 0.6495872103674768
# Language: french; similarity with test text: 0.7073331083503462
# Language: italian; similarity with test text: 0.6635204671187273
# Language: spanish; similarity with test text: 0.6811923819801172
# Identified language: english

Test text: kjsdcgh hdcgaey lhveriuf
0.35605226485925373
Language: english; similarity with test text: 0.35605226485925373
0.37624383798754973
Language: german; similarity with test text: 0.37624383798754973
0.34725473222005715
Language: dutch; similarity with test text: 0.34725473222005715
0.3198395538113705
Language: french; similarity with test text: 0.3198395538113705
0.32057205121253096
Language: italian; similarity with test text: 0.32057205121253096
0.34466570224668425
Language: spanish; similarity with test text: 0.34466570224668425
Identified language: german
