In [1]:
import sys
import os

# Add the parent directory to the system path
parent_dir = os.path.abspath("..")
sys.path.append(parent_dir)

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
import xgboost as xgb
import joblib

In [3]:
# Load and preprocess the data
data = pd.read_csv("../data/Language_det_train.csv")
texts = data["Text"].tolist()
labels = data["Language"].tolist()

# Label encoding
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)

# Save the label encoder
joblib.dump(label_encoder, "../encoders/label_encoder.pkl")

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(texts, labels_encoded, test_size=0.2, random_state=42)

In [4]:
# Create vectorizer
vectorizer = CountVectorizer()

# Vectorize the training and test data
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Train and evaluate models
models = {
    "SVM": SVC(),
    "Random Forest": RandomForestClassifier(),
    'Naive Bayes': MultinomialNB(),
    "XGBoost": xgb.XGBClassifier()
}

# Evaluate models
for model_name, model in models.items():
    model.fit(X_train_vectorized, y_train)
    y_pred = model.predict(X_test_vectorized)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy}")

SVM Accuracy: 0.8920570264765784
Random Forest Accuracy: 0.9210794297352343
Naive Bayes Accuracy: 0.9760692464358453
XGBoost Accuracy: 0.9124236252545825


In [5]:
# Create vectorizer
vectorizer = CountVectorizer()

# Vectorize the training data
X_train_vectorized = vectorizer.fit_transform(X_train)

# Define the parameter grid for grid search
param_grid = {
    'alpha': [0.1, 1.0, 10.0],
    'fit_prior': [True, False]
}

classifier = MultinomialNB()
grid_search = GridSearchCV(estimator=classifier, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train_vectorized, y_train)

# Get the best parameters and score
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print("Best Parameters:", best_params)
print("Best Score:", best_score)

Best Parameters: {'alpha': 0.1, 'fit_prior': True}
Best Score: 0.9801427916612002


In [6]:
# Vectorize the training and test data
X_train_vectorized = vectorizer.transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Train MultinomialNB with best parameters
best_naive = MultinomialNB(**best_params)
best_naive.fit(X_train_vectorized, y_train)

# Make predictions on the test data
y_pred = best_naive.predict(X_test_vectorized)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", accuracy)

Test Accuracy: 0.9796334012219959


In [7]:
# Create pipeline with Vectorizer and Naive Bayes
pipeline = Pipeline([
    ('count_vectorizer', CountVectorizer()),
    ('naive_bayes', MultinomialNB())
])

# Fit the pipeline to the training data
pipeline.fit(X_train, y_train)

# Save the trained model
joblib.dump(pipeline, "../models/trained_model.pkl")

['../models/trained_model.pkl']

In [8]:
# Load the trained model
pipeline_loaded = joblib.load("../models/trained_model.pkl")

# Make predictions on the test data using the loaded model
y_pred = pipeline_loaded.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy}")

Test Accuracy: 0.9760692464358453
