# 	The	language classification	problem

Paper experiment setup: While in this study, in order to enable training of SOMs, each language corpus was divided into samples where the length of each sample was set to 1, 000 symbols. The data for each language was preprocessed such that the text included only lower case letters and spaces. All punctuation was removed. Lastly, all text used the 26-letter ISO basic Latin alphabet, i.e., the alphabet for both training and test data was the same and it included 27 symbols. 

In [1]:
import re
import numpy as np
import os
import random
import pickle
from collections import defaultdict

In [2]:
alphabet = 'abcdefghijklmnopqrstuvwxyz '  # 27 symbols including space
alphabet_size = len(alphabet)
hd_dimension1 = 100  
hd_dimension2 = 1000

In [3]:
def ngrams(text, n=3):
    ngrams = []
    for i in range(len(text) - n + 1):
        ngram = text[i:i + n]
        ngrams.append(ngram)
    
    return ngrams


In [4]:
def preprocess(text):
    # Lowercase the text
    text = text.lower()
    text = re.sub(r'[^a-z ]', '', text)
    return text

example1 = '4	Ah, dernier truc : l’idéal est de vider la cuisine d’un trait en prenant bien le temps de ranger et non pas petit à petit mais dans l’urgence comme j’ai fait car cela devient vite le bazar.'
processed_text1 = preprocess(example1)

print("Original text:")
print(example1)
print("\nPreprocessed text:")
print(processed_text1)


Original text:
4	Ah, dernier truc : l’idéal est de vider la cuisine d’un trait en prenant bien le temps de ranger et non pas petit à petit mais dans l’urgence comme j’ai fait car cela devient vite le bazar.

Preprocessed text:
ah dernier truc  lidal est de vider la cuisine dun trait en prenant bien le temps de ranger et non pas petit  petit mais dans lurgence comme jai fait car cela devient vite le bazar


In [5]:
ngrams_example1 = ngrams(processed_text1)

In [None]:
def split_data(file_path, train_ratio=0.7, val_ratio=0.15, test_ratio=0.15):
    with open(file_path, 'r') as file:
        sentences = file.readlines()

    random.shuffle(sentences)

    total_sentences = len(sentences)
    train_end = int(train_ratio * total_sentences)
    val_end = train_end + int(val_ratio * total_sentences)

    train_sentences = sentences[:train_end]
    val_sentences = sentences[train_end:val_end]
    test_sentences = sentences[val_end:]

    base, ext = os.path.splitext(file_path)
    with open(f"{base}_train{ext}", 'w') as train_file:
        train_file.writelines(train_sentences)
    with open(f"{base}_val{ext}", 'w') as val_file:
        val_file.writelines(val_sentences)
    with open(f"{base}_test{ext}", 'w') as test_file:
        test_file.writelines(test_sentences)



In [None]:
def ngram_stat(ngrams):
    s_i = np.zeros(alphabet_size ** n)

    # Create a mapping from n-gram to index in vector s
    ngram_to_index = defaultdict(lambda: len(ngram_to_index))
    
    for ngram in ngrams:
        index = ngram_to_index[ngram]
        s_i[index] += 1

    return s_i  

In [None]:
def create_dataset_stat(folder_path):
    dataset = {}
    for filename in os.listdir(folder_path):
        if filename.endswith('.txt'):
            file_path = os.path.join(folder_path, filename)
            sentences = read_sentences(file_path)
            for sentence in sentences:
                preprocessed_sentence = preprocess(sentence)
                ngrams_sentence = ngrams(preprocessed_sentence)
                s_i = ngram_sta(ngrams_sentence)
            dataset[filename] = s_i  

    return dataset

# Hyperdimensional Centroid Language classification

In [6]:
def initialize_hd_vectors(ngrams, d):
    hd_vectors = {}
    for ngram in ngrams:
        hd_vectors[ngram] = np.random.choice([-1, 1], size=(d,))
    return hd_vectors


In [7]:
def init_memory(alphabet_size, hd_dimension):
    H = np.random.choice([-1, 1], size=(hd_dimension, alphabet_size))
    return H

def get_hd_vector(symbol_index, item_memory):
    # Retrieve the HD vector for the given symbol index from item memory H
    return item_memory[:, symbol_index]


In [8]:
# Initialize item memory
item_memory = init_memory(alphabet_size, hd_dimension1)

# Retrieve HD vector for a specific symbol (e.g., 'a')
symbol_index = alphabet.index('a')
hd_vector_a = get_hd_vector(symbol_index, item_memory)

print(f"Item memory (H):\n{item_memory.shape}")
print(f"\nHD vector for symbol 'a' (H[a]):\n{hd_vector_a.shape}")

Item memory (H):
(100, 27)

HD vector for symbol 'a' (H[a]):
(100,)


In [9]:
def rho(hd_vector, times, shift=5):
    if times == 0:
        return hd_vector
    else:
        # Rotate the hd_vector by 'shift' positions
        return rho(np.roll(hd_vector, shift), shift, times - 1)


# forming HD vector of an n-gram

From paper: for the trigram ‘cba’ will be mapped to its HD vector as follows:
ρ
1
(Hc)  ρ
2
(Hb)  ρ
3
(Ha) the process of forming HD vector of an n-gram can be formalized as follows

In [10]:
def ngram_HD(ngram, hd_dimension):
    ngram_hd = np.ones((hd_dimension,) )
    for i in range(len(ngram)):
        sym_idx = alphabet.index(ngram[i])
        hd_vector = get_hd_vector(sym_idx, item_memory)
        # apply i+1 times rho
        hd_vector = rho(hd_vector, i+1)
        ngram_hd = ngram_hd*hd_vector
    return ngram_hd

ngram_example = 'cba'
example_output = ngram_HD(ngram_example, 100)
print('cab hd vector:', example_output)

cab hd vector: [-1. -1.  1. -1.  1.  1.  1.  1. -1. -1.  1. -1. -1. -1. -1.  1. -1. -1.
  1. -1. -1.  1. -1. -1. -1.  1.  1. -1. -1. -1. -1. -1.  1. -1. -1. -1.
  1.  1.  1. -1. -1. -1.  1.  1. -1. -1. -1.  1.  1. -1.  1. -1.  1.  1.
 -1. -1. -1. -1. -1. -1. -1. -1. -1.  1.  1.  1.  1.  1.  1.  1. -1. -1.
 -1.  1. -1. -1.  1.  1. -1.  1. -1. -1. -1. -1. -1.  1. -1. -1.  1.  1.
 -1.  1. -1. -1. -1. -1.  1. -1.  1. -1.]


# mapping the whole n-gram statistics s

In [11]:
def ngram_stat_h(ngrams, hd_dimension, n=3, alphabet_size=alphabet_size):

    h = np.zeros((hd_dimension,))
    # Initialize vector s
    num_ngrams = alphabet_size ** n
    s_i = np.zeros(num_ngrams)

    # Create a mapping from n-gram to index in vector s
    ngram_to_index = defaultdict(lambda: len(ngram_to_index))
    
    for ngram in ngrams:
        index = ngram_to_index[ngram]
        s_i[index] += 1
        
    for ngram in ngrams:    
        ngram_hd = ngram_HD(ngram, hd_dimension)
        h += s_i[ngram_to_index[ngram]]*ngram_hd
    return h



In [12]:
example1_h_stat = ngram_stat_h(ngrams_example1, hd_dimension1)
print('a short sentence example n-gram statistics:\n', example1_h_stat)

a short sentence example n-gram statistics:
 [ 12. -22.  18.  -4. -22. -16.  18.  12.  32.  12.  -2.  -6.   4.  36.
 -22. -44. -26. -28.  42.   8.  10. -32. -20.  30. -26.   8.   6.  24.
 -16.  -6.  26.   8. -22.   8. -38.  52.  28.  18.  24.  10. -12.  -8.
  12.  34. -48.   4. -14.  10.   4.  -2.   0.  38.  44.  -6.  18. -18.
 -18.  28. -54.  -2.  22.  14. -34. -18.  20. -30.  42. -14. -16.   0.
 -22. -24.   2. -30.  18.  -8.  18.  -2.   4.  -6. -10. -22.  34.  36.
  30. -28.   2.  -2. -54.  12.  12.  46.  12.  54. -44.  12.  20.  24.
  48.  60.]


In [13]:
l2_norm_example1 = np.linalg.norm(example1_h_stat)

normalized_array = example1_h_stat / l2_norm_example1

print("\nL2 norm of the array:")
print(l2_norm_example1)
print("\nNormalized array:")
print(normalized_array)



L2 norm of the array:
253.7794317906792

Normalized array:
[ 0.04728516 -0.08668945  0.07092773 -0.01576172 -0.08668945 -0.06304687
  0.07092773  0.04728516  0.12609375  0.04728516 -0.00788086 -0.02364258
  0.01576172  0.14185547 -0.08668945 -0.17337891 -0.10245117 -0.11033203
  0.16549805  0.03152344  0.0394043  -0.12609375 -0.07880859  0.11821289
 -0.10245117  0.03152344  0.02364258  0.09457031 -0.06304687 -0.02364258
  0.10245117  0.03152344 -0.08668945  0.03152344 -0.14973633  0.20490234
  0.11033203  0.07092773  0.09457031  0.0394043  -0.04728516 -0.03152344
  0.04728516  0.13397461 -0.18914062  0.01576172 -0.05516602  0.0394043
  0.01576172 -0.00788086  0.          0.14973633  0.17337891 -0.02364258
  0.07092773 -0.07092773 -0.07092773  0.11033203 -0.2127832  -0.00788086
  0.08668945  0.05516602 -0.13397461 -0.07092773  0.07880859 -0.11821289
  0.16549805 -0.05516602 -0.06304687  0.         -0.08668945 -0.09457031
  0.00788086 -0.11821289  0.07092773 -0.03152344  0.07092773 -0.0

# 21 languages centroid HD

In [14]:
def read_sentences(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        sentences = file.readlines()
    return sentences

In [43]:
def pick_random_sentences(sentences, num_sentences):
    return random.sample(sentences, min(num_sentences, len(sentences)))

In [46]:
folder_path = './news'

def HD_centroid_1(hd_dimension):
    # Initialize item memory
    item_memory = init_memory(alphabet_size, hd_dimension)

    HD_centroids = {}
    for filename in os.listdir(folder_path):
        if filename.endswith('.txt'):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()
                preprocessed_text = preprocess(text)
                ngrams_text = ngrams(preprocessed_text)
                h_stat = ngram_stat_h(ngrams_text, hd_dimension)
                # L2 normalize h
                l2_norm = np.linalg.norm(h_stat)
                normed_h = h_stat / l2_norm
                HD_centroids[filename.split('_')[0]] = normed_h
    return HD_centroids

def HD_centroid(hd_dimension):
    # Initialize item memory
    item_memory = init_memory(alphabet_size, hd_dimension)

    HD_centroids = {}
    for filename in os.listdir(folder_path):
        if filename.endswith('.txt'):
            centroid = []
            file_path = os.path.join(folder_path, filename)
            sentences = read_sentences(file_path)
            sentences = pick_random_sentences(sentences, 1000)
            for sentence in sentences:
                preprocessed_sentence = preprocess(sentence)
                ngrams_sentence = ngrams(preprocessed_sentence)
                h_stat = ngram_stat_h(ngrams_sentence, hd_dimension)
                # L2 normalize h
                epsilon = 1e-4
                l2_norm = np.linalg.norm(h_stat)
                normed_h = h_stat / (l2_norm + epsilon)
                centroid.append(normed_h)
            mean = np.mean(np.array(centroid), axis=0)
            HD_centroids[filename.split('_')[0]] = mean
            print(f'Language: {filename.split('_')[0]} HD centroid is generated')
    return HD_centroids

In [47]:
HD_centroids_100dimension = HD_centroid(hd_dimension1)

Language: bul HD centroid is generated
Language: ces HD centroid is generated
Language: dan HD centroid is generated
Language: deu HD centroid is generated
Language: ell HD centroid is generated
Language: eng HD centroid is generated
Language: est HD centroid is generated
Language: fin HD centroid is generated
Language: fra HD centroid is generated
Language: hun HD centroid is generated
Language: ita HD centroid is generated
Language: lav HD centroid is generated
Language: lit-lit HD centroid is generated
Language: nld HD centroid is generated
Language: pol HD centroid is generated
Language: por HD centroid is generated
Language: ron HD centroid is generated
Language: slk HD centroid is generated
Language: slv HD centroid is generated
Language: spa HD centroid is generated
Language: swe HD centroid is generated


In [48]:
with open('HD_centroids_100dimension.pkl', 'wb') as f:
    pickle.dump(HD_centroids_100dimension, f)

print("Dictionary saved to HD_centroids_100dimension.pkl")


Dictionary saved to HD_centroids_100dimension.pkl


In [49]:
print('Italian HD cenroid vector:\n')
HD_centroids_100dimension['ita']

Italian HD cenroid vector:



array([-9.44645268e-04,  1.60869581e-02,  3.00033016e-02,  5.16537924e-02,
        3.75799441e-03, -1.17678688e-02,  3.05207273e-02, -1.07287779e-02,
       -2.97417883e-02, -1.13961939e-02,  3.99341935e-04,  3.16737991e-02,
        6.00054670e-03,  3.95645109e-02, -6.00400523e-02, -6.88222889e-02,
        1.60882953e-02,  3.51360092e-02,  7.20531819e-02,  8.35000324e-03,
        1.26103393e-02, -3.49147394e-02, -1.36873093e-05, -4.61239159e-02,
       -9.36766483e-03, -2.70180106e-02, -2.90798384e-02, -1.22021627e-02,
       -1.60786985e-02,  6.34481904e-02, -7.87592577e-02, -1.18824795e-02,
       -3.23104839e-02, -4.93591628e-03,  6.91565511e-03,  7.08775695e-02,
        3.50864906e-02, -9.30714308e-03,  2.45613731e-02,  5.38381264e-02,
        6.68466700e-02, -5.92544510e-03,  1.14969262e-03,  5.68186525e-02,
       -1.61811860e-02,  1.16903657e-02,  8.48007083e-02,  4.52082478e-02,
       -1.57128743e-02, -2.51552098e-02,  3.61076571e-02, -2.62413438e-02,
        3.24842403e-02,  

# Test phase using Europarl Parallel Corpus

In [50]:
# Load the dictionary from the file
with open('HD_centroids_100dimension.pkl', 'rb') as f:
    HD_centroids_100dimension = pickle.load(f)




In [51]:
test_path = './finaltest'
classes = HD_centroids_100dimension.keys()

def cosine_similarity(a, b):
    dot_product = np.dot(a, b)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    return dot_product / (norm_a * norm_b)

def classify(file_path, centroids, hd_dimension):
    sentences = read_sentences(file_path)
    sentences = pick_random_sentences(sentences, num_sentences=100)
    confusion_matrix = {}
    
    for sentence in sentences:
        preprocessed_sentence = preprocess(sentence)
        ngrams_sentence = ngrams(preprocessed_sentence)
        h_stat = ngram_stat_h(ngrams_sentence, hd_dimension)
        # L2 normalize h
        l2_norm = np.linalg.norm(h_stat)
        epsilon = 1e-4
        normed_h = h_stat / (l2_norm + epsilon)
        similarity = {}
        for label, centroid in centroids.items():
            similarity[label] = cosine_similarity(normed_h, centroid)
        pred = max(similarity, key=similarity.get)
        try:
            confusion_matrix[pred] += 1
        except KeyError:
            confusion_matrix[pred] = 0
    return confusion_matrix
        
    
    
def cosine_classify(test_path, centroids, hd_dimension):

    for filename in os.listdir(test_path):
        file_path = os.path.join(test_path, filename)
        sentences = read_sentences(file_path)
        sentences = pick_random_sentences(sentences, num_sentences=100)
        
        for sentence in sentences:
            similarity = {}
            preprocessed_sentence = preprocess(sentence)
            ngrams_sentence = ngrams(preprocessed_sentence)
            h_stat = ngram_stat_h(ngrams_sentence, hd_dimension)
            # L2 normalize h
            l2_norm = np.linalg.norm(h_stat)
            epsilon = 1e-4
            normed_h = h_stat / (l2_norm + epsilon)
            
            for label, centroid in centroids.items():
                similarity[label] = cosine_similarity(normed_h, centroid)
    return similarity


In [52]:
for filename in os.listdir(test_path):
        file_path = os.path.join(test_path, filename)
swedish_test = classify(file_path, HD_centroids_100dimension, hd_dimension1)

  return dot_product / (norm_a * norm_b)


In [53]:
a = cosine_classify(test_path, HD_centroids_100dimension, hd_dimension1)

In [54]:
swedish_test

{'swe': 67,
 'dan': 12,
 'nld': 5,
 'por': 0,
 'bul': 0,
 'deu': 0,
 'ron': 1,
 'lav': 0,
 'ces': 0,
 'fra': 0,
 'eng': 2,
 'spa': 0,
 'pol': 0}