In [7]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import joblib

In [35]:
# Load the dataset
data = pd.read_csv("../data/dataset.csv")

In [36]:
# shape
print(data.shape)
# head
print(data.head(20))

(10337, 2)
                                                 Text Language
0    Nature, in the broadest sense, is the natural...  English
1   "Nature" can refer to the phenomena of the phy...  English
2   The study of nature is a large, if not the onl...  English
3   Although humans are part of nature, human acti...  English
4   [1] The word nature is borrowed from the Old F...  English
5   [2] In ancient philosophy, natura is mostly us...  English
6   [3][4] \nThe concept of nature as a whole, the...  English
7   During the advent of modern scientific method ...  English
8   [5][6] With the Industrial revolution, nature ...  English
9   However, a vitalist vision of nature, closer t...  English
10  [1] Within the various uses of the word today,...  English
11  Nature can refer to the general realm of livin...  English
12  It is often taken to mean the "natural environ...  English
13  For example, manufactured objects and human in...  English
14  This more traditional concept of natural

In [37]:
# descriptions
print(data.describe())

                 Text Language
count           10337    10337
unique          10267       17
top     mijn excuses.  English
freq                3     1385


In [38]:
# Check for null values
print(data.isnull().sum())

Text        0
Language    0
dtype: int64


In [39]:
# View language value counts
print(data["Language"].value_counts())

Language
English       1385
French        1014
Spanish        819
Portugeese     739
Italian        698
Russian        692
Sweedish       676
Malayalam      594
Dutch          546
Arabic         536
Turkish        474
German         470
Tamil          469
Danish         428
Kannada        369
Greek          365
Hindi           63
Name: count, dtype: int64


In [40]:
# Prepare the features
x = np.array(data["Text"])
y = np.array(data["Language"])

# Initialize the CountVectorizer and fit it to the text data
cv = CountVectorizer()
X = cv.fit_transform(x)
print(X)

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 163190 stored elements and shape (10337, 39928)>
  Coords	Values
  (0, 16925)	1
  (0, 12833)	1
  (0, 24867)	2
  (0, 3809)	1
  (0, 22461)	1
  (0, 13674)	1
  (0, 16915)	1
  (0, 18946)	1
  (0, 15807)	1
  (0, 27569)	1
  (0, 17998)	1
  (0, 25847)	1
  (1, 16925)	1
  (1, 12833)	1
  (1, 24867)	2
  (1, 18946)	1
  (1, 27569)	1
  (1, 4068)	1
  (1, 20888)	1
  (1, 25057)	2
  (1, 18911)	1
  (1, 17617)	1
  (1, 1603)	1
  (1, 1392)	1
  (1, 15023)	1
  :	:
  (10334, 38523)	1
  (10334, 38849)	1
  (10334, 38526)	1
  (10334, 38813)	1
  (10334, 38587)	1
  (10334, 38991)	1
  (10334, 38835)	1
  (10334, 38995)	1
  (10335, 9065)	1
  (10335, 38570)	1
  (10335, 38815)	1
  (10335, 38526)	1
  (10335, 38770)	1
  (10336, 38592)	1
  (10336, 38954)	1
  (10336, 38817)	1
  (10336, 38666)	1
  (10336, 38541)	1
  (10336, 38637)	1
  (10336, 38521)	1
  (10336, 38707)	1
  (10336, 38604)	1
  (10336, 38874)	1
  (10336, 38563)	1
  (10336, 38946)	1


In [41]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)

# Initialize the model
model = MultinomialNB()

# Train the model
model.fit(X_train, y_train)

# Print model accuracy
print(f"Model Accuracy: {(model.score(X_test, y_test)) * 100:.2f}%")

Model Accuracy: 98.40%


In [42]:
# Save the trained model using joblib
joblib.dump(model, '../model/lang_detect.pkl')

# Save the CountVectorizer
joblib.dump(cv, '../model/count_vectorizer.pkl')

['../model/count_vectorizer.pkl']