In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

#### Fetching data

In [10]:
url_dataset = 'https://drive.google.com/file/d/1V3WrHKzRewBwvo9AOL7Qz6V3PnwuHtD9/view?usp=drive_link'
file_id = url_dataset.split('/')[-2]

dwn_url = 'https://drive.google.com/uc?id=' + file_id
df = pd.read_csv(dwn_url)

In [11]:
df.shape

(27757, 5)

In [12]:
df.head()

Unnamed: 0,id,sentence,iso639-3,iso15924,language
0,tir_Ethi,ሕድሕድ ኦብ ስራሕ ዝርከብ ሰብ ሰብኦዊ ክብሩን ክብሪ ቤተሰቡን ዝሕለወሉ ...,tir,Ethi,Tigrinya
1,tir_Ethi,ሕድሕድ ሰብ ከቢድ ስቓይ ዘውርድ፣ ጭካነ ዝመልኦ፣ ኢሰብኦዊ ወይ ሰብኦዊ ...,tir,Ethi,Tigrinya
2,tir_Ethi,ሕድሕድ ሰብ መሰሉን ግቡእን፣ ከምኡ’ውን ንዝቐርበሉ ዝኾነ ይኹን ገበናዊ ...,tir,Ethi,Tigrinya
3,tir_Ethi,ንዓቕመ ኦዳምን ሄዋንን ዝበፅሑ ደቂ ተባዕትዮን ደቂ ኦንስትዮን ዘርኢ፣ ዜ...,tir,Ethi,Tigrinya
4,tir_Ethi,ሕድሕድ ሰብ ብማሕበር ንኽውደብ ኦይግደድን፡፡,tir,Ethi,Tigrinya


In [13]:
df.rename(columns = {'sentence':'Text', 'language':'Language'}, inplace=True)

In [14]:
df = df[['Text', 'Language']]

In [15]:
df['Language'].value_counts()

Language
Romansh                        399
Occitan (post 1500)            398
Mandarin Chinese               363
Karelian                       183
Chimborazo Highland Quichua    182
                              ... 
Ese Ejja                        35
Chuvash                         25
Tzeltal                         25
Central Atlas Tamazight          1
Eastern Tamang                   1
Name: count, Length: 419, dtype: int64

#### Cleaning text - removing special chars, numbers and spaces

In [16]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\s+', '', text)
    return text

In [17]:
texts = df['Text']
language = df['Language']

# texts = df['sentence']
# language = df['language']

texts = [preprocess_text(text) for text in texts]

In [18]:
language.unique()

array(['Tigrinya', 'Balkan Romani', 'Standard Arabic',
       'Metlatónoc Mixtec', 'Malayalam', 'Fijian', 'Somali', 'Caquinte',
       'Friulian', 'Vietnamese', 'Malay (individual language)', 'Bambara',
       'Cherokee', 'Central Mazahua', 'Yagua', 'Güilá Zapotec',
       'Northern Yukaghir', 'Chakma', 'Southern Altai', 'Central Aymara',
       'Ao Naga', 'Baoulé', 'Guarayu', 'Rundi', 'Hawaiian', 'Romagnol',
       'Kaqchikel', 'Awa-Cuaiquer', 'French', 'Aguaruna', 'Drung',
       'Iloko', 'Central Nahuatl', 'Tem', 'Hakha Chin', 'Tibetan',
       'Burmese', 'Adyghe', 'Polish', 'Eastern Yiddish', 'Corsican',
       'Otuho', 'Arabela', 'Manx', 'Gagauz', 'Bari', 'Afrikaans',
       'Ligurian', 'Ibibio', 'Tonga (Zambia)', 'Kabyle', 'Romanian',
       'Northwestern Ojibwa', 'Sanskrit', 'English', 'Bulu (Cameroon)',
       'Pampanga', 'Northern Kissi', 'Zarma', 'Waorani', 'Samoan',
       'Portuguese', 'Western Frisian', 'Ladino', 'Upper Guinea Crioulo',
       'Tuvinian', 'Wayuu', 'Murui H

In [9]:
# vectorizer = TfidfVectorizer(ngram_range=(2, 5), analyzer='char')
# vectorizer = TfidfVectorizer()
# texts_tfidf = vectorizer.fit_transform(texts)
# X_test_tfidf = vectorizer.transform(X_test)

In [19]:
X_train, X_test, y_train, y_test = train_test_split(texts, language, test_size=0.2, random_state=42)

#### Converting text to numerical features

In [20]:
# vectorizer = TfidfVectorizer(ngram_range=(1, 3), analyzer='char')
# # vectorizer = TfidfVectorizer()
# X_train_tfidf = vectorizer.fit_transform(X_train)
# X_test_tfidf = vectorizer.transform(X_test)

In [21]:
# X_train_tfidf = vectorizer.fit_transform(X_train)
# X_test_tfidf = vectorizer.transform(X_test)

In [22]:
# print(vectorizer.get_feature_names_out())

In [23]:
# print(X_train_tfidf.toarray())

In [24]:
param_grid = {
    'vectorizer__ngram_range': [(1,1), (1,2), (2,3)],  # Unigrams, bigrams, char-level ngrams
    'vectorizer__analyzer': ['char', 'word'],
    # 'vectorizer__max_df': [0.7, 0.9],  # Ignore overly common words
    # 'vectorizer__min_df': [1, 2],  # Ignore rare words
    # 'vectorizer__use_idf': [True, False],  # With/without inverse document frequency
    'model__alpha': [0.001, 0.01, 0.1, 1, 10, 100]  # Laplace smoothing parameter
}
cv = StratifiedKFold(n_splits=5, shuffle=True)

In [25]:
pipeline = Pipeline(steps=[('vectorizer', TfidfVectorizer()), ('model', MultinomialNB())])
# vectorizer = TfidfVectorizer()
# X_train_tfidf = vectorizer.fit_transform(X_train)
# X_test_tfidf = vectorizer.transform(X_test)

In [26]:
# model = GridSearchCV(estimator=MultinomialNB(), param_grid=param_grid, cv=cv, scoring='accuracy', verbose=2)
model = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=cv, scoring='accuracy', verbose=2)

In [27]:
model.fit(X_train, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits




[CV] END model__alpha=0.001, vectorizer__analyzer=char, vectorizer__ngram_range=(1, 1); total time=   0.8s
[CV] END model__alpha=0.001, vectorizer__analyzer=char, vectorizer__ngram_range=(1, 1); total time=   1.1s
[CV] END model__alpha=0.001, vectorizer__analyzer=char, vectorizer__ngram_range=(1, 1); total time=   1.2s
[CV] END model__alpha=0.001, vectorizer__analyzer=char, vectorizer__ngram_range=(1, 1); total time=   1.1s
[CV] END model__alpha=0.001, vectorizer__analyzer=char, vectorizer__ngram_range=(1, 1); total time=   1.1s
[CV] END model__alpha=0.001, vectorizer__analyzer=char, vectorizer__ngram_range=(1, 2); total time=   4.8s
[CV] END model__alpha=0.001, vectorizer__analyzer=char, vectorizer__ngram_range=(1, 2); total time=   3.9s
[CV] END model__alpha=0.001, vectorizer__analyzer=char, vectorizer__ngram_range=(1, 2); total time=   3.9s
[CV] END model__alpha=0.001, vectorizer__analyzer=char, vectorizer__ngram_range=(1, 2); total time=   4.1s
[CV] END model__alpha=0.001, vectoriz

30 fits failed out of a total of 180.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\hiami\anaconda3\envs\study\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\hiami\anaconda3\envs\study\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\hiami\anaconda3\envs\study\Lib\site-packages\sklearn\pipeline.py", line 654, in fit
    Xt = self._fit(X, y, routed_params, raw_params=params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

In [28]:
# Print best parameters and accuracy
print(f"Best Parameters: {model.best_params_}")
print(f"Best Accuracy: {model.best_score_:.4f}")
best_model = model.best_estimator_
best_model

Best Parameters: {'model__alpha': 0.001, 'vectorizer__analyzer': 'char', 'vectorizer__ngram_range': (2, 3)}
Best Accuracy: 0.9565


In [29]:
# y_pred = model.predict(X_test_tfidf)
y_pred = model.predict(X_test)
# y_pred = le.inverse_transform(y_pred)

In [30]:
accuracy_score(y_test, y_pred)*100

94.59654178674351

In [31]:
print(classification_report(y_test, y_pred))

                                                            precision    recall  f1-score   support

                                                 Abkhazian       1.00      1.00      1.00        12
                                                  Achinese       0.90      1.00      0.95         9
                                            Achuar-Shiwiar       1.00      0.93      0.96        41
                                                   Adangme       1.00      1.00      1.00        13
                                                    Adyghe       1.00      1.00      1.00        10
                                                      Afar       1.00      1.00      1.00        15
                                                 Afrikaans       1.00      1.00      1.00        12
                                                  Aguaruna       1.00      1.00      1.00        10
                                               Aja (Benin)       1.00      1.00      1.00         6

In [32]:
new_text = ["Ciao buon pomeriggio"]
# new_text = ["टेक्स्ट एक आम शब्द है जिसका मतलब अक्सर लिखित शब्दों या ऐसी चीज़ों से होता है जिनमें बहुत ज़्यादा लिखा होता है।"]
new_text = ["Guten abend!"]
new_text = ["وہ تبدیلی بنیں جو آپ دنیا میں دیکھنا چاہتے ہیں", "நம்பிக்கை, ஊக்கம், மோட்டிவேஷன். நீங்கள் பொருளீட்டுவது நலமாய் வாழ்வதற்கு, மன அழுத்தத்தினால் உங்களை நீங்களே அழிப்பதற்கல்ல.", "努力は夢中に勝てない。"]
new_text_preprocessed = [preprocess_text(text) for text in new_text]

# new_text_tfidf = vectorizer.transform(new_text_preprocessed)
# predicted_language = model.predict(new_text_tfidf)
# # predicted_language = le.inverse_transform(predicted_language)
# print(f"Predicted Language: {predicted_language[0]}")

In [33]:
predicted_language = best_model.predict(new_text_preprocessed)
print(f"Predicted Language: {predicted_language}")

Predicted Language: ['Urdu' 'Tamil' 'Japanese']
