In [None]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from nltk.probability import FreqDist
from scipy.stats import entropy
from sklearn.metrics import accuracy_score

1. Import data

In [None]:
# Read the semicolon-separated corpus file into a DataFrame and show it
data = pd.read_csv('formatted_data.csv', sep=';')
print(data)

   language                                               text  length_text
0        bg  Състав на Парламента: вж. протоколиОдобряване ...       327263
1        cs  Schválení zápisu z předchozího zasedání: viz z...       317927
2        da  Genoptagelse af sessionenJeg erklærer Europa-P...       678400
3        de  Wiederaufnahme der SitzungsperiodeIch erkläre ...       747690
4        el  Επαvάληψη της συvσδoυΚηρύσσω την επανάληψη της...       523277
5        en  Resumption of the sessionI declare resumed the...       690268
6        es  Reanudación del período de sesionesDeclaro rea...       733658
7        et  Eelmise istungi protokolli kinnitamine vaata p...       324119
8        fi  Istuntokauden uudelleenavaaminen Julistan perj...       694523
9        fr  Reprise de la sessionJe déclare reprise la ses...       756201
10       hu  Az előző ülés jegyzőkönyvének elfogadása: lásd...       330524
11       it  Ripresa della sessioneDichiaro ripresa la sess...       729712
12       lt 

2. Preprocess

In [None]:
# Assuming columns are 'text' and 'language'
# Function to clean text
def clean_text(text):
    """Normalise a raw string for character-frequency analysis.

    Removes every character that is neither a word character (``\\w``:
    letters, digits, underscore) nor whitespace, lower-cases the result and
    trims leading/trailing whitespace. Interior whitespace is left untouched.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', text)
    return without_punctuation.lower().strip()

# Store the cleaned version of every text next to the raw one, then preview
raw_texts = data['text']
data['cleaned_text'] = raw_texts.map(clean_text)
print(data.head())

  language                                               text  length_text  \
0       bg  Състав на Парламента: вж. протоколиОдобряване ...       327263   
1       cs  Schválení zápisu z předchozího zasedání: viz z...       317927   
2       da  Genoptagelse af sessionenJeg erklærer Europa-P...       678400   
3       de  Wiederaufnahme der SitzungsperiodeIch erkläre ...       747690   
4       el  Επαvάληψη της συvσδoυΚηρύσσω την επανάληψη της...       523277   

                                        cleaned_text  
0  състав на парламента вж протоколиодобряване на...  
1  schválení zápisu z předchozího zasedání viz zá...  
2  genoptagelse af sessionenjeg erklærer europapa...  
3  wiederaufnahme der sitzungsperiodeich erkläre ...  
4  επαvάληψη της συvσδoυκηρύσσω την επανάληψη της...  


Splitting data

In [None]:
# Fraction of each language's text held out for the test set (e.g., 20%)
test_size = 0.2

# One record per language is collected for each split
train_rows = []
test_rows = []

for language_code, cleaned in zip(data['language'], data['cleaned_text']):
    # Number of leading characters reserved for the test sample
    cut = int(len(cleaned) * test_size)

    # Leading slice -> test sample, remainder -> training sample
    test_rows.append({'cleaned_text': cleaned[:cut], 'language': language_code})
    train_rows.append({'cleaned_text': cleaned[cut:], 'language': language_code})

# Materialise both splits as DataFrames
train_data = pd.DataFrame(train_rows)
test_data = pd.DataFrame(test_rows)

# Report how many rows ended up in each split
print(f"Training set size: {train_data.shape[0]}")
print(f"Test set size: {test_data.shape[0]}")

# Preview the first rows of each split
print("\nTraining Set:")
print(train_data.head())
print("\nTest Set:")
print(test_data.head())


Training set size: 21
Test set size: 21

Training Set:
                                        cleaned_text language
0  а заседаниетоdie sitzung wird um 900 eröffnetп...       bg
1  ů hlasování34 ochrana údajů v rámci policejní ...       cs
2  gioner ikke letter en regional analyse denne f...       da
3  rahmen zu sein scheintviele ihrer äußerungen m...       de
4  ου προηγήθηκε σχετικά με τα ιδιαίτερα προβλήμα...       el

Test Set:
                                        cleaned_text language
0  състав на парламента вж протоколиодобряване на...       bg
1  schválení zápisu z předchozího zasedání viz zá...       cs
2  genoptagelse af sessionenjeg erklærer europapa...       da
3  wiederaufnahme der sitzungsperiodeich erkläre ...       de
4  επαvάληψη της συvσδoυκηρύσσω την επανάληψη της...       el


3. Build Unigram Language Profile (Character-based unigram) with NLTK

In [None]:
# Get unigram language profile on training set (character frequencies)
unigram_language_profiles = {}

for language in train_data['language'].unique():
    # Join the TRAINING text of this language only. Reading from the full
    # `data` frame here would include the held-out test characters in the
    # profile and make the evaluation below meaningless.
    text = ''.join(train_data[train_data['language'] == language]['cleaned_text'])
    # Calculate frequencies of characters
    char_counts = FreqDist(text)
    total_chars = sum(char_counts.values())
    # Convert counts to a probability distribution (relative frequencies)
    unigram_distribution = {char: count / total_chars for char, count in char_counts.items()}
    unigram_language_profiles[language] = unigram_distribution
    print(language, unigram_distribution)

bg {'с': 0.03882033362400487, 'ъ': 0.009959159603766601, 'т': 0.05943187307482387, 'а': 0.09423717542549921, 'в': 0.038920709405838104, ' ': 0.14336798388968702, 'н': 0.06127941480919191, 'п': 0.027760177163254934, 'р': 0.04188806845628321, 'л': 0.02393021373768044, 'м': 0.015975433027396315, 'е': 0.07211999924718164, 'ж': 0.00833118989215877, 'о': 0.0768753019115313, 'к': 0.026141617681193968, 'и': 0.07026304728326672, 'д': 0.026078882817548197, 'б': 0.009316127251397419, 'я': 0.012609707592800548, 'ш': 0.002487437343554934, 'з': 0.02100990583496967, 'щ': 0.0052916857485210255, 'у': 0.008751513478585455, 'г': 0.008845615774054115, 'ц': 0.005407745246265707, 'й': 0.003356315205048902, 'ю': 0.001596602279784945, 's': 0.0045074999529488525, 'e': 0.007754029146617649, 'l': 0.002854436295882711, 'v': 0.0005897077182702744, 'a': 0.005323053180343913, 'n': 0.0037829122778401643, 't': 0.0041781419188085395, 'i': 0.003858194114215093, 'ó': 0.0001348799568384138, '1': 0.002465480141278913, '9':

4. Calculate KL Divergence (Relative Entropy)

In [None]:
def calculate_kl_divergence(p, q, epsilon=1e-10):
    """Compute the Kullback-Leibler divergence D(p || q) of two distributions.

    Both distributions are aligned over the union of their keys; a key that
    is missing from one side is given the probability ``epsilon`` so that the
    logarithm never sees a zero. ``scipy.stats.entropy`` normalises both
    vectors to sum to 1 before computing the divergence.

    Args:
        p: dict mapping a character to its probability (the "observed" side).
        q: dict mapping a character to its probability (the "reference" side).
        epsilon: smoothing value used for keys absent from ``p`` or ``q``.

    Returns:
        The divergence as returned by ``scipy.stats.entropy(p_vals, q_vals)``.
        The measure is not symmetric: swapping ``p`` and ``q`` generally
        changes the result.
    """
    # Walk the keys in insertion order (p first, then keys only q has) rather
    # than over a set: set order of strings is hash-randomised per process,
    # which would change the floating-point summation order between runs.
    support = list(p)
    support.extend(char for char in q if char not in p)

    p_vals = [p.get(char, epsilon) for char in support]
    q_vals = [q.get(char, epsilon) for char in support]

    # Calculate KL divergence
    return entropy(p_vals, q_vals)


Create a function to predict the language based on KL divergence (relative entropy)

In [None]:
def predict_language(test_text, unigram_language_profiles):
    """Predict the language of ``test_text`` from character frequencies.

    The text is cleaned with ``clean_text``, turned into a relative
    character-frequency distribution, and compared against every language
    profile with ``calculate_kl_divergence(text_distribution, profile)``.

    Args:
        test_text: The string to classify.
        unigram_language_profiles: dict mapping a language label to a dict of
            character -> probability.

    Returns:
        The language label whose profile yields the smallest divergence. On a
        tie the first such label in the dict's iteration order is returned.

    Raises:
        ValueError: if ``unigram_language_profiles`` is empty (``min`` of an
            empty mapping).
    """
    test_text = clean_text(test_text)
    test_char_counts = FreqDist(test_text)
    total_test_chars = sum(test_char_counts.values())
    # Normalise by this text's own character total so the function does not
    # depend on any notebook-global variable being defined.
    test_text_freqs = {char: count / total_test_chars for char, count in test_char_counts.items()}

    # Compare the char distribution from training set and char distribution from test set
    divergences = {
        lang: calculate_kl_divergence(test_text_freqs, lang_freqs)
        for lang, lang_freqs in unigram_language_profiles.items()
    }
    return min(divergences, key=divergences.get)  # Select the language with the smallest divergence


5. Test the model using test set

In [None]:
# Classify every held-out sample and record prediction / ground truth pairs
predictions, actuals = [], []
for sample_text, actual in zip(test_data['cleaned_text'], test_data['language']):
    prediction = predict_language(sample_text, unigram_language_profiles)
    print(f"Predicted: {prediction}, Actual: {actual}")

    predictions.append(prediction)
    actuals.append(actual)

Predicted: bg, Actual: bg
Predicted: cs, Actual: cs
Predicted: da, Actual: da
Predicted: de, Actual: de
Predicted: el, Actual: el
Predicted: en, Actual: en
Predicted: es, Actual: es
Predicted: et, Actual: et
Predicted: fi, Actual: fi
Predicted: fr, Actual: fr
Predicted: hu, Actual: hu
Predicted: it, Actual: it
Predicted: lt, Actual: lt
Predicted: lv, Actual: lv
Predicted: nl, Actual: nl
Predicted: pl, Actual: pl
Predicted: pt, Actual: pt
Predicted: ro, Actual: ro
Predicted: sk, Actual: sk
Predicted: sl, Actual: sl
Predicted: sv, Actual: sv


6. Evaluate Model's Performance

In [None]:
# Score the model: predict a language for each held-out text, store the
# predictions as a new column, and report the share of correct labels.
held_out_texts = test_data['cleaned_text']
test_data['predicted_language'] = held_out_texts.map(
    lambda sample: predict_language(sample, unigram_language_profiles)
)
accuracy = accuracy_score(
    y_true=test_data['language'],
    y_pred=test_data['predicted_language'],
)
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 100.00%


Test the model using text from languages_test.txt

In [None]:
# With the model in place, apply it to an external text file, one sentence per line.
# The string below is not used by the code; it is kept as a reference sample of
# sentences for languages_test.txt.
"""
как у тебя сегодня дела
Az alma nagyon finom volt, de a férfi nem szereti.
Jabłko było bardzo smaczne, ale mężczyźnie nie smakowało.
Университет находился очень далеко, и поездка занимала много времени.
Universitatea era foarte departe și a durat mult timp de călătorie.
L'università era molto lontana e richiedeva un lungo viaggio.
Cosa c'è in un nome? L'universo è in continua espansione. Come lo chiami?
O que tem num nome? O universo está se expandindo o tempo todo. Como você chama isso?
Qu'y a-t-il dans un nom? L'univers est en expansion constante. Comment appelles-tu cela?
Was ist in einem Namen? Das Universum dehnt sich ständig aus. Wie nennt man es?
"""

with open('languages_test.txt', 'r', encoding='utf-8') as file:
    for line in file:
        sentence = line.strip()
        # Skip blank lines: an empty string has no characters to compare, so
        # any language printed for it would be arbitrary.
        if not sentence:
            continue
        print(predict_language(sentence, unigram_language_profiles))


bg
hu
pl
bg
ro
it
it
pt
fr
de
