In [1]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

# Suppress every warning for the rest of the session. Note that this is a
# blanket filter: it also hides deprecation and FutureWarning messages that
# later cells might raise.
import warnings
warnings.simplefilter("ignore")

In [2]:
# Show the scikit-learn version this notebook was executed with.
import sklearn
sklearn.__version__

'1.6.1'

In [3]:
# Read the labelled corpus from disk into a DataFrame.
DATASET_PATH = "Language_Detection.csv"
data = pd.read_csv(DATASET_PATH)

# Inspect the data

In [4]:
# First rows of the frame, followed by the number of missing values per column.
preview = data.head()
missing_per_column = data.isna().sum()

print(preview)
print(missing_per_column)

                                                Text Language
0   Nature, in the broadest sense, is the natural...  English
1  "Nature" can refer to the phenomena of the phy...  English
2  The study of nature is a large, if not the onl...  English
3  Although humans are part of nature, human acti...  English
4  [1] The word nature is borrowed from the Old F...  English
Text        0
Language    0
dtype: int64


In [5]:
# Model input is the raw text column; the target is the language name column.
X, y = data["Text"], data["Language"]

In [6]:
from sklearn.preprocessing import LabelEncoder

# Encode the language names as integer class ids.
#
# The encoder is fitted on the original "Language" column rather than on `y`
# itself: `y = le.fit_transform(y)` is not idempotent, so re-running the cell
# after `y` has been overwritten would refit the encoder on non-label data and
# `le.classes_` would no longer hold the language names.
le = LabelEncoder()
y = le.fit_transform(data["Language"])

In [7]:
# Language names known to the encoder; a later cell indexes this array with a
# predicted class id to recover the language name.
le.classes_

array(['Arabic', 'Danish', 'Dutch', 'English', 'French', 'German',
       'Greek', 'Hindi', 'Italian', 'Kannada', 'Malayalam', 'Portugeese',
       'Russian', 'Spanish', 'Sweedish', 'Tamil', 'Turkish'], dtype=object)

In [8]:
# Characters replaced by a single space before lower-casing: the listed
# punctuation marks, newlines, ASCII digits and square brackets.
#
# The brackets are escaped inside the character class on purpose. The previous
# second pattern, r'[[]]', parses as the class `[[]` followed by a literal `]`,
# so it only matched the two-character sequence "[]" and left brackets such as
# those in "[1]" untouched.
NOISE_PATTERN = re.compile(r'[!@#$(),\n"%^*?\:;~`0-9\[\]]')

# NOTE(review): `data_list` is not consumed by the train/test split cell, which
# splits the raw `X` instead -- confirm whether the cleaned text was meant to
# be used for training.
data_list = [NOISE_PATTERN.sub(' ', text).lower() for text in X]

In [9]:
from sklearn.model_selection import train_test_split

# Fixed seed so the split -- and every metric computed from it -- is identical
# on each Restart & Run All instead of changing from run to run.
RANDOM_STATE = 42

# Hold out 20% of the samples for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=RANDOM_STATE
)

print(f"Training set size: {X_train.shape[0]}")
print(f"Testing set size: {X_test.shape[0]}")

Training set size: 8269
Testing set size: 2068


In [10]:
# Creating bag of words using countvectorizer
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
cv.fit(X_train)

x_train = cv.transform(X_train).toarray()
x_test = cv.transform(X_test).toarray()

print(x_train.shape[0])

8269


In [11]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes trained on the bag-of-words counts.
model = MultinomialNB()
model.fit(X=x_train, y=y_train)

In [12]:
# Predicted class ids for the held-out bag-of-words features.
y_pred = model.predict(X=x_test)

In [13]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Compare the held-out labels with the predictions. The confusion matrix and
# the per-class report are computed here but only the accuracy is printed.
ac = accuracy_score(y_true=y_test, y_pred=y_pred)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cr = classification_report(y_true=y_test, y_pred=y_pred)

print("Accuracy is : ", ac)

Accuracy is :  0.97678916827853


In [14]:
from sklearn.pipeline import Pipeline

# Chain the vectorizer and the classifier so raw text can be passed straight
# to fit/predict. The pipeline wraps the existing `cv` and `model` objects, so
# fitting it refits those same instances.
pipe = Pipeline(
    steps=[
        ('vectorizer', cv),
        ('multinomialNB', model),
    ]
)
pipe.fit(X_train, y_train)

In [15]:
# Score the end-to-end pipeline on the raw held-out texts.
y_pred2 = pipe.predict(X_test)
ac2 = accuracy_score(y_true=y_test, y_pred=y_pred2)

print("Accuracy is :", ac2)

Accuracy is : 0.97678916827853


In [18]:
# Serialize the fitted pipeline (vectorizer + classifier) to disk.
with open('model-0.1.0.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)

In [17]:
# Smoke-test the fitted pipeline on a single sentence.
text = "Hello World"
#text = "Bonjour le monde"

# The result gets its own name: assigning it to `y` would overwrite the encoded
# label vector, so re-running the train/test split cell afterwards would split
# against a one-element array instead of the real labels.
prediction = pipe.predict([text])

# Show the language name for the predicted class id, plus the raw prediction.
le.classes_[prediction[0]], prediction

('English', array([3]))