In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Source data: the Kaggle "Language Detection" dataset
# https://www.kaggle.com/datasets/basilb2s/language-detection
# The CSV is read from the location below; adjust DATA_PATH if the file lives elsewhere.
DATA_PATH = '/content/Language Detection.csv'

# Load the whole CSV into a DataFrame; later cells read its 'Text' and 'Language' columns.
df = pd.read_csv(DATA_PATH)

In [None]:
# Show the loaded frame; the notebook renders a truncated preview (first and last rows).
df

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English
...,...,...
10332,ನಿಮ್ಮ ತಪ್ಪು ಏನು ಬಂದಿದೆಯೆಂದರೆ ಆ ದಿನದಿಂದ ನಿಮಗೆ ಒ...,Kannada
10333,ನಾರ್ಸಿಸಾ ತಾನು ಮೊದಲಿಗೆ ಹೆಣಗಾಡುತ್ತಿದ್ದ ಮಾರ್ಗಗಳನ್...,Kannada
10334,ಹೇಗೆ ' ನಾರ್ಸಿಸಿಸಮ್ ಈಗ ಮರಿಯನ್ ಅವರಿಗೆ ಸಂಭವಿಸಿದ ಎ...,Kannada
10335,ಅವಳು ಈಗ ಹೆಚ್ಚು ಚಿನ್ನದ ಬ್ರೆಡ್ ಬಯಸುವುದಿಲ್ಲ ಎಂದು ...,Kannada


In [None]:
# (number of rows, number of columns) of the loaded dataset.
df.shape

(10337, 2)

In [None]:
# Distinct labels in the 'Language' column, i.e. the classes the model will predict.
df['Language'].unique()

array(['English', 'Malayalam', 'Hindi', 'Tamil', 'Portugeese', 'French',
       'Dutch', 'Spanish', 'Greek', 'Russian', 'Danish', 'Italian',
       'Turkish', 'Sweedish', 'Arabic', 'German', 'Kannada'], dtype=object)

In [None]:
# Number of rows per language label, useful for spotting class imbalance.
df['Language'].value_counts()

Unnamed: 0_level_0,count
Language,Unnamed: 1_level_1
English,1385
French,1014
Spanish,819
Portugeese,739
Italian,698
Russian,692
Sweedish,676
Malayalam,594
Dutch,546
Arabic,536


In [None]:
count_vectorizer = CountVectorizer()
count_vectorizer_text = count_vectorizer.fit_transform(df['Text'])

In [None]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer_text = tfidf_vectorizer.fit_transform(df['Text'])

In [None]:
Processed_text = hstack((count_vectorizer_text, tfidf_vectorizer_text))

In [None]:
X = Processed_text
y = df['Language']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Train a multinomial Naive Bayes classifier on the training split.
multinomial = MultinomialNB()
multinomial.fit(X_train, y_train)

# `model` is the name the later cells use; it refers to the same fitted estimator.
model = multinomial

In [None]:
# Accuracy of the fitted classifier on the held-out test split.
model.score(X_test, y_test)

0.9830754352030948

In [None]:
# Predicted language label for each test sample; consumed by the metrics cell below.
y_pred = model.predict(X_test)

In [None]:
# Compute the evaluation metrics first, then print them.
accuracy = accuracy_score(y_test, y_pred)              # overall fraction of correct predictions
report = classification_report(y_test, y_pred)         # per-language precision / recall / f1 / support

print('Accuracy:', accuracy)
print('Classification Report:\n', report)

Accuracy: 0.9830754352030948
Classification Report:
               precision    recall  f1-score   support

      Arabic       1.00      0.98      0.99       106
      Danish       0.97      0.96      0.97        73
       Dutch       0.98      0.97      0.98       111
     English       0.92      1.00      0.96       291
      French       0.99      0.99      0.99       219
      German       1.00      0.97      0.98        93
       Greek       1.00      0.99      0.99        68
       Hindi       1.00      1.00      1.00        10
     Italian       1.00      0.99      1.00       145
     Kannada       1.00      1.00      1.00        66
   Malayalam       1.00      0.98      0.99       121
  Portugeese       0.99      0.98      0.99       144
     Russian       1.00      0.99      1.00       136
     Spanish       0.99      0.97      0.98       160
    Sweedish       1.00      0.98      0.99       133
       Tamil       1.00      0.99      0.99        87
     Turkish       1.00     

In [None]:
# Interactive check: classify one piece of text typed by the user.
User = input("Enter a Text: ")

# The vectorizers expect an iterable of documents, so wrap the single string in a list
# and push it through both fitted vectorizers (count first, then TF-IDF).
user_docs = [User]
count_vectorizer_user, tfidf_vectorizer_user = (
    vectorizer.transform(user_docs)
    for vectorizer in (count_vectorizer, tfidf_vectorizer)
)

# Stack the two feature blocks in the same column order used for training.
Processed_user_text = hstack((count_vectorizer_user, tfidf_vectorizer_user))

# Predict and show the result (an array holding one label).
language_detection = model.predict(Processed_user_text)
print(language_detection)

Enter a Text: Love!
['English']
