# Language Detection
The task of the model is to detect language given as input

## Import the libraries

In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


import string

import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,precision_score,confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import BaggingClassifier,ExtraTreesClassifier,AdaBoostClassifier,HistGradientBoostingClassifier,GradientBoostingClassifier,RandomForestClassifier
from sklearn.naive_bayes import GaussianNB,BernoulliNB,MultinomialNB
from sklearn.neural_network import MLPClassifier

# initializing the tools
lemmatizer=WordNetLemmatizer()
punctuations=string.punctuation
tfidf=TfidfVectorizer()
le=LabelEncoder()



## Importing the datasets

In [2]:
dataset=pd.read_csv("lang detection/Language Detection.csv")

In [3]:
dataset.head()

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English


## Analysing the dataset

In [4]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10337 entries, 0 to 10336
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Text      10337 non-null  object
 1   Language  10337 non-null  object
dtypes: object(2)
memory usage: 161.6+ KB


In [5]:
dataset.value_counts(subset='Language')

Language
English       1385
French        1014
Spanish        819
Portugeese     739
Italian        698
Russian        692
Sweedish       676
Malayalam      594
Dutch          546
Arabic         536
Turkish        474
German         470
Tamil          469
Danish         428
Kannada        369
Greek          365
Hindi           63
Name: count, dtype: int64

## Pre processing

In [6]:
def preprocess(obj):
    words=word_tokenize(obj)
    corpus=[word for word in words if word not in punctuations]
    corpus=[word.lower() for word in corpus]   
    corpus=' '.join(corpus)
    lemmatizer.lemmatize(corpus)
    return corpus

In [7]:
dataset['Text']=dataset['Text'].apply(preprocess)

In [8]:
y=le.fit_transform(dataset['Language'])

## Data Transformation

In [9]:
vectors=tfidf.fit_transform(dataset['Text'])
vectors


<10337x39917 sparse matrix of type '<class 'numpy.float64'>'
	with 163192 stored elements in Compressed Sparse Row format>

In [10]:
vectors=vectors.toarray()

In [11]:
x=vectors

## Training the model

In [12]:
xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.2,random_state=42)

In [13]:
nestimators=int(len(tfidf.get_feature_names_out())*0.5)
# initializing the models
bagging=BaggingClassifier(n_estimators=nestimators)
extra_trees=ExtraTreesClassifier(n_estimators=nestimators)
# ada=AdaBoostClassifier()
# histo_grad=HistGradientBoostingClassifier()
rfc=RandomForestClassifier(n_estimators=nestimators)
gradient=GradientBoostingClassifier(n_estimators=nestimators)
mlpc=MLPClassifier()
gnb=GaussianNB()
mnb=MultinomialNB()
bnb=BernoulliNB()

clfs={
    # 'Extra Trees':extra_trees,
    # 'Ada Boost':ada,
    # 'Random Forest':rfc,
    # 'Gradient Boost':gradient,
    'MLP':mlpc,
    'Gaussian Naive Bayes':gnb,
    'Multinomial Naive Bayes':mnb,
    'Bernoulli Naive Bayes':bnb
}

In [14]:
for name,clf in clfs.items():
    print(f'Model: {name}')
    clf.fit(xtrain,ytrain)
    ypred=clf.predict(xtest)
    print('Model training completed')
    print(f'Accuracy: {accuracy_score(ytest,ypred)}')
    print(f'Precision: {precision_score(ytest, ypred, average="weighted")}')
    # print(f'Confusion Matrix:\n{confusion_matrix(ytest,ypred)}\n')

Model: MLP
Model training completed
Accuracy: 0.9584139264990329
Precision: 0.9767796308879578
Model: Gaussian Naive Bayes
Model training completed
Accuracy: 0.9787234042553191
Precision: 0.9802463670842443
Model: Multinomial Naive Bayes
Model training completed
Accuracy: 0.9468085106382979
Precision: 0.9563969500017051
Model: Bernoulli Naive Bayes
Model training completed
Accuracy: 0.5203094777562862
Precision: 0.865611165447317


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [19]:
pickle.dump(gnb, open('gnb_model.pkl','wb'))