<a href="https://colab.research.google.com/github/AsadiAhmad/Language-Identification/blob/main/Code/Language_Identification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Step 1: Import Libraries

In [122]:
import os
import re

import polars as pl
import numpy as np

from collections import Counter
from typing import Dict, Set, Tuple

import matplotlib.pyplot as plt
import seaborn as sns

# Step 2: Select Languages

1.   ara: Arabic
2.   nld: Dutch
3.   eng: English
4.   ita: Italian
5.   fra: French
6.   deu: German
7.   pes: Persian
8.   rus: Russian
9.   spa: Spanish
10.  tur: Turkish

In [66]:
languages = [
    "ara",
    "nld",
    "eng",
    "ita",
    "fra",
    "deu",
    "pes",
    "rus",
    "spa",
    "tur"
]

# Step 3: Download Dataset

In [3]:
!mkdir -p "dataset-bz2"

In [4]:
links = []
for language in languages:
    links.append(f"https://downloads.tatoeba.org/exports/per_language/{language}/{language}_sentences.tsv.bz2")

In [5]:
for link in links:
    !wget -P ./dataset-bz2/ {link}

--2025-05-21 17:13:04--  https://downloads.tatoeba.org/exports/per_language/ara/ara_sentences.tsv.bz2
Resolving downloads.tatoeba.org (downloads.tatoeba.org)... 94.130.77.194
Connecting to downloads.tatoeba.org (downloads.tatoeba.org)|94.130.77.194|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 743296 (726K) [application/octet-stream]
Saving to: ‘./dataset-bz2/ara_sentences.tsv.bz2’


2025-05-21 17:13:06 (717 KB/s) - ‘./dataset-bz2/ara_sentences.tsv.bz2’ saved [743296/743296]

--2025-05-21 17:13:06--  https://downloads.tatoeba.org/exports/per_language/nld/nld_sentences.tsv.bz2
Resolving downloads.tatoeba.org (downloads.tatoeba.org)... 94.130.77.194
Connecting to downloads.tatoeba.org (downloads.tatoeba.org)|94.130.77.194|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2376235 (2.3M) [application/octet-stream]
Saving to: ‘./dataset-bz2/nld_sentences.tsv.bz2’


2025-05-21 17:13:09 (1.60 MB/s) - ‘./dataset-bz2/nld_sentences.tsv.bz2’ sav

# Step 4: Unzip Dataset

In [6]:
# !rm -rf /content/dataset-btsv/

In [7]:
!mkdir -p "dataset-tsv"

In [8]:
for filename in os.listdir("./dataset-bz2"):
    input_path = os.path.join("dataset-bz2", filename)
    output_path = os.path.join("dataset-tsv", filename.replace(".bz2", ""))
    !bzip2 -dkc {input_path} > {output_path}

# Step 5: Convert TSV files to TXT files

In [9]:
!mkdir -p "dataset-txt"

In [10]:
for filename in os.listdir("./dataset-tsv"):
    input_path = os.path.join("dataset-tsv", filename)
    output_path = os.path.join("dataset-txt", filename.replace(".tsv", ".txt"))
    !cp {input_path} {output_path}

# Step 6: Convert text files into Polars frame

In [30]:
def convert_file_to_data(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    data = []
    for line in lines:
        parts = line.strip().split('\t')
        if len(parts) >= 3:
            language = parts[1]
            text = parts[2]
            data.append((language, text))
    return data

In [31]:
data = pl.DataFrame(
    {
        "language": pl.Series(dtype=pl.Utf8),
        "text": pl.Series(dtype=pl.Utf8)
    }
).with_row_index(name="index")

In [32]:
for filename in os.listdir("./dataset-txt"):
    input_path = os.path.join("dataset-txt", filename)
    file_data = convert_file_to_data(input_path)

    temp_dataframe = pl.DataFrame(
        {
            "language": [item[0] for item in file_data],
            "text": [item[1] for item in file_data]
        }
    )

    data = pl.concat([data, temp_dataframe], how="diagonal")

In [33]:
if "index" in data.columns:
    data = data.drop("index")
data = data.with_row_index(name="index", offset=0)

In [34]:
data.head(10)

index,language,text
u32,str,str
0,"""spa""","""¡Intentemos algo!"""
1,"""spa""","""Tengo que irme a dormir."""
2,"""spa""","""¿Qué estás haciendo?"""
3,"""spa""","""¿Qué es eso?"""
4,"""spa""","""¡Hoy es 18 de junio y es el cu…"
5,"""spa""","""¡Feliz cumpleaños, Muiriel!"""
6,"""spa""","""Ahora, Muiriel tiene 20 años."""
7,"""spa""","""La contraseña es ""Muiriel""."""
8,"""spa""","""Volveré pronto."""
9,"""spa""","""No tengo palabras."""


In [35]:
data.height

6854075

# Step 7: Preprocess

## Clean garbage characters

In [85]:
def clean_text(dataframe):
    chars_to_replace = ["！", "？", "｡", "。", "＂", "＃", "＄", "％", "＆", "＇",
                        "（", "）", "＊", "＋", "，", "－", "／", "：", "；", "＜",
                        "＝", "＞", "＠", "［", "＼", "］", "＾", "＿", "｀", "｛",
                        "｜", "｝", "～", "｟", "｠", "｢", "｣", "､", "、", "〃",
                        "》", "「", "」", "『", "』", "【", "】", "〔", "〕", "〖",
                        "〗", "〘", "〙", "〚", "〛", "〜", "〝", "〞", "〟", "〰",
                        "〾", "〿", "–", "—", "‘", "’", "‛", "“", "”", "„", "‟",
                        "…", "‧", "﹏", ".", ",", "(", ")", "{", "}", "[", "]",
                        "!", "@", "#", "$", "%", "^", "&", "*", "?", "_", "-",
                        "+", "=", "/", "`", "~", "'", ";", ":", "]", "¡", "\"",
                        "<", ">", "€", "𝑐", "π", "№", "―", "̀", "́", "×", "œ", "Œ",
                        "ŭ", "ɛ", "ɣ", "а", "ḍ", "ḥ", "ṛ", "ẓ", "መ", "ሀ", "ለ",
                        "ል", "ም", "ር", "ሰ", "በ", "ብ", "ት", "ኒ", "ን", "አ", "እ",
                        "ካ", "ክ", "ይ", "።", "ٱ", "ٰ", "ْ", "ٍ", "«", "»", "°", "¹",
                        "²", "³", "´", "§", "½", "√", "−", "ℝ", "𝑘", "𝑥", "₄",
                        "₂", "⁰", "›", "‹", "′", "¿", "º", "ま", "£", "¥", "か",
                        "の", "₺", "٪", "؟", "؛", "ѣ" ]
    pattern = "[" + re.escape("".join(chars_to_replace)) + "]"

    dataframe = dataframe.with_columns(
        pl.col("text")
        .str.replace_all(pattern, " ")  # Remove special characters
        .str.replace_all(r"\d", " ")    # Remove digits
        .str.replace_all(r"\s+", " ")   # Replace multiple whitespace with single space
        .str.strip_chars()              # Remove leading/trailing whitespace
    )
    return dataframe

In [86]:
cleaned_data = clean_text(data)

# Step 8: Split Train set and Validation set

In [106]:
def train_validation_split_by_language(data, samples_per_language=100):
    validation_set = (
        data
        .group_by("language")
        .map_groups(lambda group:
            group.sample(n=min(samples_per_language, len(group))))
    )

    training_set = data.join(
        validation_set.select("index"),
        on="index",
        how="anti"
    )

    if "index" in training_set.columns:
        training_set = training_set.drop("index")
    training_set = training_set.with_row_index(name="index", offset=0)

    if "index" in validation_set.columns:
        validation_set = validation_set.drop("index")
    validation_set = validation_set.with_row_index(name="index", offset=0)

    return training_set, validation_set

In [107]:
np.random.seed(42)
training_set, validation_set = train_validation_split_by_language(cleaned_data)

In [108]:
print(f"Training data shape: {training_set.shape}")
print(f"Validation data shape: {validation_set.shape}")
print("\nValidation samples per language:")
print(validation_set["language"].value_counts())

Training data shape: (6853075, 3)
Validation data shape: (1000, 3)

Validation samples per language:
shape: (10, 2)
┌──────────┬───────┐
│ language ┆ count │
│ ---      ┆ ---   │
│ str      ┆ u32   │
╞══════════╪═══════╡
│ eng      ┆ 100   │
│ ita      ┆ 100   │
│ pes      ┆ 100   │
│ deu      ┆ 100   │
│ tur      ┆ 100   │
│ spa      ┆ 100   │
│ ara      ┆ 100   │
│ fra      ┆ 100   │
│ nld      ┆ 100   │
│ rus      ┆ 100   │
└──────────┴───────┘


# Step 9: Filter all Language chracters

In [109]:
def create_language_chars_dictionary(
    data: pl.DataFrame,
    cutoff_thresholds: Dict[str, int] = None,
    default_cutoff: int = 10
) -> Tuple[Dict[str, Set[str]], Dict[str, Dict[str, int]]]:

    languages = data["language"].unique().to_list()
    char_frequencies = {lang: Counter() for lang in languages}
    language_chars = {lang: set() for lang in languages}

    for language, text in data.select(["language", "text"]).iter_rows():
        char_frequencies[language].update(text)

    for lang in languages:
        threshold = cutoff_thresholds.get(lang, default_cutoff)
        language_chars[lang] = {
            char for char, count in char_frequencies[lang].items()
            if count >= threshold
        }

    return language_chars, char_frequencies

In [114]:
cutoff_thresholds = {
    "ara": 100,
    "nld": 100,
    "eng": 100,
    "ita": 100,
    "fra": 100,
    "deu": 100,
    "pes": 100,
    "rus": 100,
    "spa": 100,
    "tur": 100
}

language_chars, char_frequencies = create_language_chars_dictionary(training_set, cutoff_thresholds=cutoff_thresholds, default_cutoff=100)

for lang, chars in language_chars.items():
    threshold = cutoff_thresholds.get(lang, 100)
    print(f"\nLanguage: {lang} (cutoff: {threshold})")
    print(f"Total unique chars: {len(char_frequencies[lang])}")
    print(f"Filtered chars count: {len(chars)}")
    print("Sample frequent chars:", sorted(list(chars))[:200])


Language: deu (cutoff: 100)
Total unique chars: 335
Filtered chars count: 64
Sample frequent chars: [' ', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\xad', 'Ä', 'Ö', 'Ü', 'ß', 'ä', 'é', 'ö', 'ü', 'ō', '‚']

Language: ara (cutoff: 100)
Total unique chars: 183
Filtered chars count: 45
Sample frequent chars: [' ', '،', 'ء', 'آ', 'أ', 'ؤ', 'إ', 'ئ', 'ا', 'ب', 'ة', 'ت', 'ث', 'ج', 'ح', 'خ', 'د', 'ذ', 'ر', 'ز', 'س', 'ش', 'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ف', 'ق', 'ك', 'ل', 'م', 'ن', 'ه', 'و', 'ى', 'ي', 'ً', 'ٌ', 'َ', 'ُ', 'ِ', 'ّ', 'ی']

Language: tur (cutoff: 100)
Total unique chars: 180
Filtered chars count: 65
Sample frequent chars: [' ', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'R', 'S', 'T', 'U', 'V', 'W', 'Y', 'Z', 'a', 'b', 'c', 'd', '

# Step 10: Predict Language by chars

In [112]:
def predict_language_by_chars(validation_set: pl.DataFrame, language_chars: dict[str, set[str]]) -> pl.DataFrame:
    lang_char_sets = {lang: set(chars) for lang, chars in language_chars.items()}
    languages = list(lang_char_sets.keys())

    text_chars_list = validation_set["text"].to_list()
    predictions = []

    for text in text_chars_list:
        text_chars = set(text)
        best_lang = max(
            languages,
            key=lambda lang: len(text_chars & lang_char_sets[lang])
        )
        predictions.append(best_lang)

    return validation_set.with_columns(
        predicted_language=pl.Series(predictions)
    )

In [115]:
validation_set = predict_language_by_chars(validation_set, language_chars)

In [116]:
validation_set.head(10)

index,language,text,predicted_language
u32,str,str,str
0,"""tur""","""O kadar da komik değildi""","""tur"""
1,"""tur""","""Avustralya da konuşulan dil İn…","""tur"""
2,"""tur""","""Bu fotoğrafı görünce ailemi dü…","""tur"""
3,"""tur""","""Burada İspanyolca konuşulur""","""tur"""
4,"""tur""","""Dinsiz piskoposun yorumuyla al…","""deu"""
5,"""tur""","""Üye misiniz""","""deu"""
6,"""tur""","""Yayan gittim""","""deu"""
7,"""tur""","""Bilim hakkında daha fazla bilm…","""tur"""
8,"""tur""","""Altın gümüşten daha ağırdır""","""tur"""
9,"""tur""","""Tom Mary ile konuşuyor""","""tur"""


# Step 11: Calculate Detailed Accuracy

In [132]:
def calculate_detailed_accuracy(df: pl.DataFrame) -> dict:
    confusion_counts = (
        df
        .group_by("language", "predicted_language")
        .agg(pl.len().alias("n"))
        .sort("language", "predicted_language")
    )

    class_acc = (
        df
        .group_by("language")
        .agg(
            (pl.col("language") == pl.col("predicted_language"))
            .mean()
            .alias("accuracy")
        )
        .sort("language")
    )

    class_acc_dict = {
        row["language"]: row["accuracy"]
        for row in class_acc.iter_rows(named=True)
    }

    return {
        "overall_accuracy": (df["language"] == df["predicted_language"]).mean(),
        "class_accuracy": class_acc_dict,
        "confusion_counts": confusion_counts
    }

In [133]:
metrics = calculate_detailed_accuracy(validation_set)

In [134]:
print(f"Overall Accuracy: {metrics['overall_accuracy']:.2%}")
print("\nPer-Class Accuracy:")
for lang, acc in metrics["class_accuracy"].items():
    print(f"{lang}: {acc:.2%}")

Overall Accuracy: 52.50%

Per-Class Accuracy:
ara: 100.00%
deu: 100.00%
eng: 0.00%
fra: 35.00%
ita: 3.00%
nld: 0.00%
pes: 81.00%
rus: 100.00%
spa: 23.00%
tur: 83.00%


# Step 12: