In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import joblib

In [3]:
# Load the dataset
data = pd.read_csv("../data/dataset.csv")

# Preview the data
print(data.head())

                                                Text  language
0  klement gottwaldi surnukeha palsameeriti ning ...  Estonian
1  sebes joseph pereira thomas  på eng the jesuit...   Swedish
2  ถนนเจริญกรุง อักษรโรมัน thanon charoen krung เ...      Thai
3  விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திர...     Tamil
4  de spons behoort tot het geslacht haliclona en...     Dutch


In [4]:
# Check for null values
print(data.isnull().sum())

Text        0
language    0
dtype: int64


In [5]:
# View language value counts
print(data["language"].value_counts())

language
Estonian      1000
Swedish       1000
Thai          1000
Tamil         1000
Dutch         1000
Japanese      1000
Turkish       1000
Latin         1000
Urdu          1000
Indonesian    1000
Portugese     1000
French        1000
Chinese       1000
Korean        1000
Hindi         1000
Spanish       1000
Pushto        1000
Persian       1000
Romanian      1000
Russian       1000
English       1000
Arabic        1000
Name: count, dtype: int64


In [6]:
# Prepare the features and labels
x = np.array(data["Text"])
y = np.array(data["language"])

# Initialize the CountVectorizer and fit it to the text data
cv = CountVectorizer()
X = cv.fit_transform(x)

In [7]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Initialize the model
model = MultinomialNB()

# Train the model on the training data
model.fit(X_train, y_train)

# Evaluate the model on the test data
score = model.score(X_test, y_test)
print(f"Model Accuracy: {score * 100:.2f}%")

Model Accuracy: 95.32%


In [8]:
# Save the trained model using joblib
joblib.dump(model, '../model/lang_detect.pkl')

# Save the CountVectorizer for later use
joblib.dump(cv, '../model/count_vectorizer.pkl')

['../model/count_vectorizer.pkl']