## **Identification de langue avec un modèle de Deep Learning et**
## **une couche d'embedding en input**

In [1]:
import numpy as np
import pandas as pd
import random
import joblib
import pickle
import json
import keras
import csv
import os
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Embedding, Dense, GlobalAveragePooling1D
from tensorflow.keras.models import Sequential
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

# Expose no GPU to TensorFlow (an empty device list is passed for the 'GPU' type).
tf.config.set_visible_devices([], 'GPU')

# Tokenizer selection (1 = Keras, 2 = BERT, 3 = Tiktoken)
sel_tokenization = 3

# Length passed as `maxlen` when padding the token sequences (see encode_text);
# bounded to work around memory and performance problems.
max_length = 250

# Silence every Python warning for the rest of the notebook.
import warnings
warnings.filterwarnings('ignore')

#### **Lecture des phrases et de leur étiquette "Langue" pour les langues sélectionnées**

In [2]:
# Read the input file into a dataframe
def create_lang_df(path):
    """Load the sentence corpus from a CSV file.

    Args:
        path: Location of the CSV file; it must contain an ``id`` column.

    Returns:
        pandas.DataFrame indexed by the ``id`` column.
    """
    return pd.read_csv(path, index_col='id')

def save_list_lan(lan_code, path='../data/multilingue/lan_code.csv'):
    """Write the language codes as a single CSV row.

    Args:
        lan_code: Iterable of language codes.
        path: Destination file. Defaults to the location used by this notebook,
            so existing one-argument calls keep working.
    """
    # newline='' lets the csv module control the line endings itself;
    # the encoding is explicit so the file does not depend on the platform default.
    with open(path, 'w', newline='', encoding='utf-8') as fichier_csv:
        csv.writer(fichier_csv).writerow(lan_code)
    
# Load the corpus and shuffle it once with a fixed seed so the split made later is reproducible.
df = create_lang_df('../data/multilingue/sentences-big.csv')
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Mapping used by lang_id_dl to turn a language code into a displayable name.
# JSON is UTF-8 by specification: state it instead of relying on the platform default encoding.
with open('../data/multilingue/lan_to_language.json', 'r', encoding='utf-8') as fichier:
    lan_to_language = json.load(fichier)
n_rows = len(df)
print('Nombre de lignes de sentence.csv:',n_rows)
df

Nombre de lignes de sentence.csv: 10345978


Unnamed: 0,lan_code,sentence
0,ber,Yeqber uɛebbuḍ-iw seg wayen ččiɣ.
1,ita,Sono venute alla moschea.
2,heb,אני לא יכול לפענח מה קרה.
3,rus,У меня очень много дел.
4,ita,Andiamo a imparare qualcosa in Uganda.
...,...,...
10345973,deu,Wir werden das Problem nicht aufgreifen.
10345974,fra,Je suis cuit !
10345975,epo,Atomenergio estas sekura.
10345976,tok,o pana ala e moku tawa soweli tomo.


#### **Réalisation d'un jeu de données d'entrainement et de test**

In [3]:
# Build two dataframes: train (first 95% of the shuffled rows) and test (remaining 5%)
n_train = int(n_rows*0.95)
df_train = df.iloc[:n_train].sample(frac=1, random_state=42).reset_index(drop=True)
df_test = df.iloc[n_train:].sample(frac=1, random_state=24).reset_index(drop=True)
pd.set_option('display.max_colwidth', 150)
df_lan = pd.DataFrame(data= df.groupby('lan_code').size(), columns = ['nb_phrases_lang'] )

# Keep only the languages that have at least 2000 sentences
df_lan = df_lan.loc[df_lan['nb_phrases_lang']>=2000]
# sorted() instead of list(set(...)): the groupby index is already unique, and a set
# gives a run-dependent order, which made the saved lan_code.csv differ between runs.
# The sorted order also matches the order of label_encoder.classes_.
list_lan = sorted(df_lan.index)
save_list_lan(list_lan)

# Drop the sentences written in a filtered-out language from both sets
df_train = df_train[df_train['lan_code'].isin(list_lan)]
df_test = df_test[df_test['lan_code'].isin(list_lan)]
print('df_train:')
display(df_train)
print('Nombre de langues à classer:',len(list_lan))
print('Nombre de lignes par langue:')
display(df_lan)

df_train:


Unnamed: 0,lan_code,sentence
0,deu,"Wie die Erde entstanden ist, das ist eine Frage, die sich allen stellt."
1,deu,Drinnen ist etwas Lebendiges.
2,rus,Я не хотела замуж.
3,kab,Ssarden-ak-t.
4,por,"Estou piscando para ele, mas ele não está olhando."
...,...,...
9828674,tat,"Йөрәген бүләк иткәннәр, кан әйләнешендә кыенлыклар барлыкка килсә, гаҗәпләнергә тиеш түгел."
9828675,ber,Cmumeḥ-d yid-i ma ulac aɣilif.
9828676,kab,Ceɛɛel-as ɣer Tasga Mellul.
9828677,eng,Tom can't stand vegetables.


Nombre de langues à classer: 95
Nombre de lignes par langue:


Unnamed: 0_level_0,nb_phrases_lang
lan_code,Unnamed: 1_level_1
afr,4137
ara,38651
arq,2336
asm,3205
avk,4102
...,...
war,2025
wuu,4757
yid,9632
yue,6230


#### **Sélection du tokenizer,**
#### **encodage et padding du texte avec le tokenizer**

In [4]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenizer selection: each backend is imported inside its own branch,
# so only the selected one is actually imported.
if sel_tokenization==3:
    import tiktoken
    tokenizer = tiktoken.get_encoding("cl100k_base")
elif sel_tokenization==2:
    from transformers import BertTokenizerFast
    tokenizer = BertTokenizerFast.from_pretrained('bert-base-multilingual-uncased')
else:
    from tensorflow.keras.preprocessing.text import Tokenizer
    tokenizer = Tokenizer()
    # NOTE(review): the vocabulary is fitted on the full corpus `df`, test sentences included.
    tokenizer.fit_on_texts(df['sentence'])

# Training texts and their language labels
textes = df_train['sentence']    
langues = df_train['lan_code']
    
# Encoding of the labels (languages) as integers
label_encoder = LabelEncoder()
label_encoder.fit(list_lan)
labels_encoded = label_encoder.transform(langues)
    
def encode_text(textes):
    """Tokenize texts with the selected tokenizer and pad them to ``max_length``.

    Side effect: sets the module-level ``nb_unique_tokens`` to the vocabulary
    size, which is later used as ``input_dim`` of the embedding layer.

    Args:
        textes: Collection of strings (list, pandas Series, ...).

    Returns:
        The result of ``pad_sequences``: one row of token ids per text,
        padded after the last token up to ``max_length``.
    """
    global nb_unique_tokens

    if sel_tokenization==3:
        sequences = tokenizer.encode_batch(textes)
        nb_unique_tokens = tokenizer.max_token_value + 1
    elif sel_tokenization==2:
        # list() accepts plain lists as well as pandas Series; the former
        # `.tolist()` call failed on the list built by lang_id_dl for a single string.
        sequences = tokenizer.batch_encode_plus(list(textes)).input_ids
        nb_unique_tokens = len(tokenizer.get_vocab())
    else:
        sequences = tokenizer.texts_to_sequences(textes)
        # Keras word indices start at 1 (0 is reserved), so ids range over
        # 0..len(word_index) and the embedding needs one more row than len(word_index).
        nb_unique_tokens = len(tokenizer.word_index) + 1
    return pad_sequences(sequences, maxlen=max_length, padding='post')

#### **Définition du modèle d'identification et encodage de l'ensemble Train**

In [5]:
# Tokenize and pad the training sentences; encode_text also sets nb_unique_tokens
padded_sequences = encode_text(textes) # returns pad_sequences(sequences, maxlen=max_length, padding='post')
print("Nombre de tokens uniques :",nb_unique_tokens)
print("======")
    

Nombre de tokens uniques : 100277


In [6]:
# Number of languages to classify (one output unit per class known to the label encoder)
n_classes = len(label_encoder.classes_)

# One-hot encoding of the labels. num_classes is pinned so the width always matches
# the output layer, even if the highest-indexed language were absent from the training rows.
labels_one_hot = to_categorical(labels_encoded, num_classes=n_classes)

# Model: token embedding -> average over the sequence -> 4 dense tanh layers -> softmax
model = Sequential()
model.add(Embedding(input_dim=nb_unique_tokens, output_dim=200, input_length=max_length))
model.add(GlobalAveragePooling1D())
model.add(Dense(units = 400, activation = "tanh", kernel_initializer='glorot_uniform', name = "Dense_1"))
model.add(Dense(units = 200, activation = "tanh", kernel_initializer='glorot_uniform', name = "Dense_2"))
model.add(Dense(units = 100, activation = "tanh", kernel_initializer='glorot_uniform', name = "Dense_3"))
model.add(Dense(units = 100, activation = "tanh", kernel_initializer='glorot_uniform', name = "Dense_4"))
model.add(Dense(n_classes, activation='softmax'))

# Model compilation
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 250, 200)          20055400  
                                                                 
 global_average_pooling1d (G  (None, 200)              0         
 lobalAveragePooling1D)                                          
                                                                 
 Dense_1 (Dense)             (None, 400)               80400     
                                                                 
 Dense_2 (Dense)             (None, 200)               80200     
                                                                 
 Dense_3 (Dense)             (None, 100)               20100     
                                                                 
 Dense_4 (Dense)             (None, 100)               10100     
                                                        

#### **Entraînement du modèle**

In [7]:
# Model training
# Stop once val_loss has not improved for 3 epochs and restore the best weights seen.
stop_early = keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True,)
# ModelCheckpoint is reached through keras.callbacks (like EarlyStopping above):
# the bare name is never imported, so the cell raised NameError on a fresh kernel.
checkpoint = keras.callbacks.ModelCheckpoint('best_model.h5',
                                             monitor='val_accuracy',
                                             save_best_only=True,
                                             # save_weights_only=True,
                                             mode='max',
                                             verbose=1)
model.fit(padded_sequences, labels_one_hot, epochs=40, validation_split=0.1, batch_size=1024, verbose=1, callbacks=[stop_early,checkpoint])

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40


<keras.callbacks.History at 0x2c8d8252a00>

#### **Sauvegarde et/ou chargement du modèle**

In [5]:
# Name of the file the trained model is saved to / loaded from
def get_file_name(sel_tokenization):
    """Return the model file path associated with a tokenizer choice.

    Args:
        sel_tokenization: 3 selects the Tiktoken model file, 2 the BERT one;
            any other value selects the default file.

    Returns:
        Relative path of the ``.h5`` model file.
    """
    if sel_tokenization == 3:
        return "../data/dl_tiktoken_id_language_model_big.h5"
    if sel_tokenization == 2:
        return "../data/dl_BERT_id_language_model_big.h5"
    return "../data/dl_default_id_language_model_big.h5"


# Saving the trained model (currently commented out)
# model.save(get_file_name(sel_tokenization))

# Instruction used to split the saved model into parts, to get below the 100 MB mark
# from filesplit.split import Split
# Split(get_file_name(sel_tokenization),"../data/dl_id_lang_split").bysize(66846720)
# Persist the tokenizer when the Keras tokenizer is selected
# NOTE(review): the selection cell falls back to Keras for any value other than 2 or 3,
# whereas this test is strictly `== 1` — confirm the two are meant to agree.
if sel_tokenization==1:
    with open('../data/tokenizer_Keras.pkl', 'wb') as tokenizer_file:
        pickle.dump(tokenizer, tokenizer_file)


# Merge the split parts back into the model file, then load the model
from filesplit.merge import Merge
merge = Merge("../data/dl_id_lang_split",  "../data", get_file_name(sel_tokenization)).merge(cleanup=False)
model = keras.models.load_model(get_file_name(sel_tokenization))
# Reload the pickled Keras tokenizer.
# NOTE(review): unpickling can execute arbitrary code; only load a file produced by this notebook.
if sel_tokenization==1:
    with open('../data/tokenizer_Keras.pkl', 'rb') as tokenizer_file:
        tokenizer = pickle.load(tokenizer_file)


In [6]:
# Print the layer-by-layer summary of the current `model`
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 250, 200)          20055400  
                                                                 
 global_average_pooling1d (G  (None, 200)              0         
 lobalAveragePooling1D)                                          
                                                                 
 Dense_1 (Dense)             (None, 400)               80400     
                                                                 
 Dense_2 (Dense)             (None, 200)               80200     
                                                                 
 Dense_3 (Dense)             (None, 100)               20100     
                                                                 
 Dense_4 (Dense)             (None, 100)               10100     
                                                        

#### **Test de l'efficacité du modèle**

In [7]:
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score
import random

# Test sentences and their true language codes
textes_test = df_test['sentence']
langues_test = df_test['lan_code']

# Run the model on the encoded test sentences
predictions = model.predict(encode_text(textes_test))

# Decode the predictions into language codes (highest-scoring class of each row)
predicted_labels_encoded = np.argmax(predictions, axis=1)
predicted_languages = label_encoder.classes_[predicted_labels_encoded]
print("======")

# Per-language classification report
print(classification_report(langues_test,predicted_languages))
print("======")

# Cross-tabulation: rows are the real languages, columns the predicted ones
ct = pd.crosstab(langues_test,predicted_languages,rownames=['Classe réelle'], colnames=['Classe prédite'])
display(ct)
accuracy_clf = accuracy_score(langues_test,predicted_languages)
print("Accuracy Classifier = {:.3f}".format(accuracy_clf))
print("======")


              precision    recall  f1-score   support

         afr       0.92      0.83      0.88       218
         ara       0.98      0.99      0.98      1930
         arq       0.88      0.65      0.75       112
         asm       0.95      0.84      0.90       173
         avk       0.91      0.76      0.83       207
         aze       0.98      0.84      0.90       273
         bel       0.99      0.81      0.89       609
         ben       0.91      0.98      0.94       266
         ber       0.88      0.91      0.90     29271
         bre       0.95      0.86      0.90       380
         bul       0.90      0.81      0.85      1291
         cat       0.91      0.81      0.86       426
         cbk       0.86      0.80      0.83       115
         ces       0.97      0.96      0.96      3159
         ckb       0.99      0.99      0.99       494
         cmn       0.98      0.98      0.98      3625
         cor       0.98      0.93      0.95       220
         dan       0.91    

Classe prédite,afr,ara,arq,asm,avk,aze,bel,ben,ber,bre,...,uig,ukr,urd,vie,vol,war,wuu,yid,yue,zsm
Classe réelle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
afr,182,0,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
ara,0,1912,9,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
arq,0,34,73,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
asm,0,0,0,146,0,0,0,27,0,0,...,0,0,0,0,0,0,0,0,0,0
avk,0,0,0,0,158,0,0,0,3,3,...,0,0,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
war,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,92,0,0,0,0
wuu,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,200,0,5,0
yid,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,440,0,0
yue,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2,0,288,0


Accuracy Classifier = 0.975


In [12]:
# Reduced confusion matrix: the five Western European languages, everything else grouped under 'etc'
weurope_langs = ['eng','deu','fra','ita','spa']
kept_labels = weurope_langs + ['etc']

# Work on an explicit copy: pd.DataFrame(ct) does not guarantee data independent from `ct`.
ct_weurope = ct.copy()

# Collapse all the other real languages into a single 'etc' row
lignes_a_sommer = ct.index.difference(weurope_langs)
somme_l = ct.loc[lignes_a_sommer].sum()
ct_weurope.loc['etc'] = somme_l
ct_weurope = ct_weurope.drop(index = ct_weurope.index.difference(kept_labels))

# Collapse all the other predicted languages into a single 'etc' column
colonnes_a_sommer = ct_weurope.columns.difference(weurope_langs)
somme_c = ct_weurope[colonnes_a_sommer].sum(axis=1)
ct_weurope['etc']= somme_c
ct_weurope = ct_weurope.drop(columns=ct_weurope.columns.difference(kept_labels))
display(ct_weurope)

# Correct predictions on the five languages, divided by every sentence that is
# really, or was predicted as, one of them (the 'etc'/'etc' cell is excluded).
bien_classes = sum(ct_weurope.loc[lang, lang] for lang in weurope_langs)
accuracy_weurope = bien_classes/(ct_weurope.sum().sum()-ct_weurope.loc['etc', 'etc'])
print("Accuracy pour les langues d'Europe de l'Ouest = {:.3f}".format(accuracy_weurope))

Classe prédite,deu,eng,fra,ita,spa,etc
Classe réelle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
deu,29225,10,5,3,1,131
eng,2,79035,8,6,6,70
fra,1,4,24864,17,6,45
ita,0,1,10,40442,20,103
spa,0,0,4,44,18322,139
etc,39,84,104,302,358,320154


Accuracy pour les langues d'Europe de l'Ouest = 0.992


#### **Affichage d'exemples**

In [13]:
# Display a few randomly drawn predictions
print("Exemples de prédiction de langue:")
print("Réelle\t- Prédite - Texte")
n_test = min(len(textes_test),10)
for _ in range(n_test):
    # randrange excludes its upper bound; random.randint(0, len(...)) could
    # return len(...) itself and raise IndexError on the lookups below.
    i = random.randrange(len(textes_test))
    print(f" {langues_test.iloc[i]}\t- {predicted_languages[i]}     -'{textes_test.iloc[i].ljust(120, '.')[:120]}'")

Exemples de prédiction de langue:
Réelle	- Prédite - Texte
 ita	- ita     -'Sto leggendo il libro 'La lingua pericolosa' di Ulrich Lins.............................................................'
 lit	- lit     -'Taip, tai yra labai gražu. Kiek tai kainuoja?...........................................................................'
 eng	- eng     -'The man gave a big cry..................................................................................................'
 tur	- tur     -'Burada bekle. Ben kısa zamanda dönerim..................................................................................'
 ita	- ita     -'Perché mi sospetti?.....................................................................................................'
 por	- por     -'Não sei a quem o darei..................................................................................................'
 rus	- rus     -'Отведите их туда..........................................................................

In [14]:
# Display a few wrong predictions
print("Exemples de mauvaises prédictions de langue:")
n = len(textes_test)
# Positions in the test set where the predicted language differs from the real one
list_bad = [
    i
    for i, (predite, reelle) in enumerate(zip(predicted_languages, langues_test))
    if predite != reelle
]
# Branch on the errors found, not on the test-set size: with zero errors the
# former code indexed an empty list instead of printing the message below.
if list_bad:
    print("Réelle\t- Prédite - Texte")
    n_test = min(len(list_bad),10)
    for _ in range(n_test):
        # randrange excludes its upper bound (randint could return len(list_bad) -> IndexError)
        i = random.randrange(len(list_bad))
        print(f" {langues_test.iloc[list_bad[i]]}\t- {predicted_languages[list_bad[i]]}     -'{textes_test.iloc[list_bad[i]].ljust(120, '.')[:120]}'")
else:
    print("Félicitations !!!! Le modèle n'a fait aucune mauvaise prédictions.")

Exemples de mauvaises prédictions de langue:
Réelle	- Prédite - Texte
 deu	- dan     -'Hilft Tom Mary?.........................................................................................................'
 ber	- kab     -'Iruka-a n yemma.........................................................................................................'
 kab	- ber     -'Lliɣ ttnaɣeɣ d yiḍ, tawla, iḍes asemmiḍ.................................................................................'
 por	- glg     -'É esta a chave que estás buscando?......................................................................................'
 slk	- ces     -'Je odo mňa o hodne vyššia...............................................................................................'
 ota	- uig     -'اونڭ بابه سى وار........................................................................................................'
 kab	- ber     -'Σerḍeɣ-t-id maca ɣas akken, ur d-yusa ara......................................

In [74]:
def lang_id_dl(sentences):
    """Identify the language of one sentence or of a collection of sentences.

    Args:
        sentences: A single string, or a collection of strings.

    Returns:
        For a single string: the language name looked up in ``lan_to_language``.
        For a collection: the list of predicted language codes, one per sentence.
    """
    # isinstance() replaces the search for "str" in the type's repr, which also
    # matched any unrelated type whose qualified name merely contains "str".
    single = isinstance(sentences, str)
    predictions = model.predict(encode_text([sentences] if single else sentences))
    # Decode the predictions into language codes
    predicted_labels_encoded = np.argmax(predictions, axis=1)
    predicted_languages = label_encoder.classes_[predicted_labels_encoded]
    if single:
        return lan_to_language[predicted_languages[0]]
    return list(predicted_languages)

lang_id_dl("Afin de mettre en oeuvre cette fonctionnalité ")



'French'

In [21]:
from sklearn.metrics import accuracy_score
import time

# Accuracy restricted to the test sentences whose language is one of the values of dict_xlmr.
y_ext_actual = []
y_dl_predicted=[]
# Two-letter code -> three-letter code used in the `lan_code` column
dict_xlmr  = {"ar":"ara", "bg":"bul", "de":"deu", "el": "ell", "en":"eng", "es":"spa", "fr":"fra", "hi": "hin","it":"ita","ja":"jpn", \
              "nl":"nld", "pl":"pol", "pt":"por", "ru":"rus", "sw":"swh", "th":"tha", "tr":"tur", "ur": "urd", "vi":"vie", "zh":"cmn"}
lang_available = list(dict_xlmr.values())
# NOTE(review): the loop below only reads predicted_languages, which was computed in an
# earlier cell, so the measured duration covers this filtering loop, not model inference.
start_time = time.time()
j= 0
for i in range(len(df_test)):
    if df_test.lan_code.iloc[i] in lang_available:
        y_ext_actual.append(df_test.lan_code.iloc[i])
        y_dl_predicted.append(predicted_languages[i])
        # Progress line, refreshed in place about every 10000 rows
        if (i-j)>=10000:
            j = (i//10000)*10000
            d = (time.time()- start_time)
            print("no",j," - ",d,"s (",d/len(y_ext_actual),"s/id )          ",end="\r")
            
end_time = time.time()
duration = end_time - start_time
print("")
print("Nombre de phrases prises en compte :",len(y_ext_actual))
print("Durée de traitement :",duration,"secondes, c'est à dire ", duration/len(y_ext_actual),"s/identification")
print("Accuracy du Deep Learnings :",accuracy_score(y_ext_actual, y_dl_predicted))

no 510000  -  9.855663537979126 s ( 3.0108154584438068e-05 s/id )           
Nombre de phrases prises en compte : 329683
Durée de traitement : 9.925775289535522 secondes, c'est à dire  3.0107027931484252e-05 s/identification
Accuracy du Deep Learnings : 0.992441223842297
