In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_score, recall_score, accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
import pickle

#### Fetching data

In [2]:
url_dataset = 'https://drive.google.com/file/d/1BlvYVY3f4S8zU34njCJWlVmuhJX8Yq1F/view?usp=drive_link'
file_id = url_dataset.split('/')[-2]

dwn_url = 'https://drive.google.com/uc?id=' + file_id
df = pd.read_csv(dwn_url)

In [3]:
# df.shape

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,Text,Language
0,0,സ്വയരക്ഷാബോധത്തോടും സ്വാതന്ത്ര്യത്തോടും കൂടി ജ...,Malayalam
1,1,രാഷ്ട്രീയങ്ങളല്ലാത്ത കുറ്റങ്ങൾക്കും ഐക്യരാഷ്ട്...,Malayalam
2,2,രാഷ്ട്രീയങ്ങളല്ലാത്ത കുറ്റങ്ങള്‍ക്കും ഐക്യരാഷ്...,Malayalam
3,3,തുല്യമായ പ്രവൃത്തിയെടുത്താൽ തുല്യമായ ശമ്പളത്തി...,Malayalam
4,4,സമുദായത്തിലെ സാംസ്കാരിക സംരംഭങ്ങളില്‍ പങ്കെടുക...,Malayalam


In [5]:
df.drop(columns=['Unnamed: 0'], inplace=True)

In [6]:
df['Language'].value_counts()

Language
English     2443
French      2072
Spanish     1897
Russian     1750
Dutch       1604
            ... 
Yoruba        57
Somali        57
Burmese       56
Bhojpuri      56
Magahi        53
Name: count, Length: 62, dtype: int64

#### Cleaning text - removing special chars, numbers and spaces

In [7]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\s+', '', text)
    return text

In [8]:
texts = df['Text']
language = df['Language']

texts = [preprocess_text(text) for text in texts]

In [32]:
# len(language.unique())

In [11]:
X_train, X_test, y_train, y_test = train_test_split(texts, language, test_size=0.2, random_state=42)

#### Converting text to numerical features using TfidfVectorizer
#### Using Mutinomial Naive Bayes model for language detection

In [16]:
param_grid = {
    'vectorizer__ngram_range': [(1,1), (1,2), (2,3)],  # Unigrams, bigrams, char-level ngrams
    'vectorizer__analyzer': ['char', 'word'],
    # 'vectorizer__max_df': [0.7, 0.9],  # Ignore overly common words
    # 'vectorizer__min_df': [1, 2],  # Ignore rare words
    # 'vectorizer__use_idf': [True, False],  # With/without inverse document frequency
    'model__alpha': [0.001, 0.01, 0.1, 1, 10, 100]  # Laplace smoothing parameter
}
cv = StratifiedKFold(n_splits=5, shuffle=True)

In [17]:
pipeline = Pipeline(steps=[('vectorizer', TfidfVectorizer()), ('model', MultinomialNB())])
model = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=cv, scoring='accuracy', verbose=2)

In [18]:
model.fit(X_train, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] END model__alpha=0.001, vectorizer__analyzer=char, vectorizer__ngram_range=(1, 1); total time=   1.8s
[CV] END model__alpha=0.001, vectorizer__analyzer=char, vectorizer__ngram_range=(1, 1); total time=   1.6s
[CV] END model__alpha=0.001, vectorizer__analyzer=char, vectorizer__ngram_range=(1, 1); total time=   1.7s
[CV] END model__alpha=0.001, vectorizer__analyzer=char, vectorizer__ngram_range=(1, 1); total time=   1.6s
[CV] END model__alpha=0.001, vectorizer__analyzer=char, vectorizer__ngram_range=(1, 1); total time=   1.6s
[CV] END model__alpha=0.001, vectorizer__analyzer=char, vectorizer__ngram_range=(1, 2); total time=   5.5s
[CV] END model__alpha=0.001, vectorizer__analyzer=char, vectorizer__ngram_range=(1, 2); total time=   5.6s
[CV] END model__alpha=0.001, vectorizer__analyzer=char, vectorizer__ngram_range=(1, 2); total time=   5.6s
[CV] END model__alpha=0.001, vectorizer__analyzer=char, vectorizer__ngram_range=(1

30 fits failed out of a total of 180.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\hiami\anaconda3\envs\study\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\hiami\anaconda3\envs\study\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\hiami\anaconda3\envs\study\Lib\site-packages\sklearn\pipeline.py", line 654, in fit
    Xt = self._fit(X, y, routed_params, raw_params=params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

In [19]:
# Print best parameters and accuracy
print(f"Best Parameters: {model.best_params_}")
print(f"Best Accuracy: {model.best_score_:.4f}")
best_model = model.best_estimator_
best_model

Best Parameters: {'model__alpha': 0.001, 'vectorizer__analyzer': 'char', 'vectorizer__ngram_range': (2, 3)}
Best Accuracy: 0.9591


In [19]:
# with open("../Language_Detection/language_detector_model.pkl", "rb") as file:
#     model = pickle.load(file)


In [12]:
y_pred = model.predict(X_test)

In [13]:
accuracy_score(y_test, y_pred)*100

96.14796273119177

In [14]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

      Arabic       0.96      0.99      0.98       300
     Bengali       1.00      1.00      1.00        21
    Bhojpuri       0.00      0.00      0.00         8
   Bulgarian       1.00      0.58      0.74        12
     Burmese       1.00      1.00      1.00         8
     Cebuano       1.00      0.94      0.97        16
     Chinese       1.00      0.98      0.99       206
       Czech       1.00      1.00      1.00        11
      Danish       0.97      0.84      0.90        86
       Dutch       0.96      0.98      0.97       317
     English       0.87      1.00      0.93       499
    Estonian       0.98      0.96      0.97       183
      French       0.98      0.99      0.98       437
      German       0.98      0.95      0.96        95
       Greek       1.00      1.00      1.00        72
    Gujarati       1.00      1.00      1.00        10
     Haitian       1.00      0.96      0.98        24
       Hausa       1.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [15]:
precision = precision_score(y_test, y_pred, average='weighted') * 100
recall = recall_score(y_test, y_pred, average='weighted') * 100
accuracy = accuracy_score(y_test, y_pred) * 100

print(f"Accuracy: {accuracy:.2f}%")
print(f"Precision: {precision:.2f}%")
print(f"Recall: {recall:.2f}%")

Accuracy: 96.15%
Precision: 96.28%
Recall: 96.15%


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [39]:
new_text = ["Ciao buon pomeriggio",
            "Guten abend!",
            "وہ تبدیلی بنیں جو آپ دنیا میں دیکھنا چاہتے ہیں", 
            "நம்பிக்கை, ஊக்கம், மோட்டிவேஷன். நீங்கள் பொருளீட்டுவது நலமாய் வாழ்வதற்கு, மன அழுத்தத்தினால் உங்களை நீங்களே அழிப்பதற்கல்ல.", 
            "努力は夢中に勝てない。", 
            "टेक्स्ट एक आम शब्द है जिसका मतलब अक्सर लिखित शब्दों या ऐसी चीज़ों से होता है जिनमें बहुत ज़्यादा लिखा होता है।"]
new_text_preprocessed = [preprocess_text(text) for text in new_text]

predicted_language = best_model.predict(new_text_preprocessed)
print(f"Predicted Language: {predicted_language}")

Predicted Language: ['Italian' 'German' 'Urdu' 'Tamil' 'Japanese' 'Hindi']


In [25]:
pickle.dump(best_model, open("language_detector_model.pkl", "wb"))