In [1]:
import pandas as pd

def load_texts(file_name, label):
    """Read a UTF-8 text file into a labelled DataFrame, one row per non-blank line.

    Parameters
    ----------
    file_name : str
        Path of the text file to read; every line is treated as one sample.
    label : str
        Value written to the ``label`` column of every row.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``text`` (the line without its line terminator) and
        ``label``. Blank and whitespace-only lines are skipped. An empty file
        yields an empty frame that still has both columns.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    with open(file_name, 'r', encoding='utf-8') as file:
        # Remove only the line terminator: readlines() used to keep it, so every
        # sample carried a trailing '\n' into the CSV. Inner and edge spaces of
        # the sentence itself are left untouched.
        lines = [line.rstrip('\r\n') for line in file]

    # A blank line has no tokens; keeping it would add a labelled sample that
    # contributes nothing but a shift in the per-class sample counts.
    texts = [line for line in lines if line.strip()]

    return pd.DataFrame({'text': texts, 'label': [label] * len(texts)})

# One labelled frame per language corpus; the second argument becomes the
# value of the 'label' column for every row of that file.
df_vietnamese = load_texts(file_name='vi.txt', label='Vietnamese')
df_lao = load_texts(file_name='laos.txt', label='Lao')

# Stack both frames under a fresh 0..n-1 index.
df_combined = pd.concat(
    objs=[df_vietnamese, df_lao],
    ignore_index=True,
)

# Persist the merged data without the index column so the file holds only
# 'text' and 'label'.
df_combined.to_csv(path_or_buf='combined_data.csv', index=False)

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
import joblib

# Load the combined dataset.
# read_csv's default NA parsing converts a text cell that is empty or reads
# "NA", "null", "nan", ... into float NaN, and TfidfVectorizer refuses NaN
# documents at fit time. Reading every cell as a literal string keeps such
# rows as ordinary text.
df = pd.read_csv('combined_data.csv', dtype=str, keep_default_na=False)

# Create a pipeline with a TfidfVectorizer and a Naive Bayes classifier
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB())
])

# Train the model: 'text' holds the documents, 'label' the target classes
pipeline.fit(df['text'], df['label'])

# Save the fitted pipeline (vectorizer + classifier) as a single artifact
joblib.dump(pipeline, 'vi_laos_detection.pkl')

print("Model training completed and saved as 'vi_laos_detection.pkl'")

Model training completed and saved as 'vi_laos_detection.pkl'


In [3]:
import sys

class LanguageIdentifier:
    """Run a saved text-classification pipeline on the contents of a text file.

    Intended call order: ``load_text`` -> ``load_model`` -> ``make_instance``
    -> ``classify``. Every step reports its result with ``print`` and returns
    ``None``; read failures (``IOError``) in the two loaders are reported the
    same way instead of being raised.

    Attributes are documented next to their initialisation in ``__init__``.
    """

    def __init__(self):
        # Raw contents of the most recently loaded text file ("" until loaded).
        self.text = ""
        # Fitted pipeline returned by joblib.load (None until loaded).
        self.pipeline = None
        # Single-element list handed to pipeline.predict (None until built).
        # Initialised here so classify() can test it before make_instance()
        # has run instead of failing with AttributeError.
        self.instance = None

    def load_text(self, file_name):
        """Read ``file_name`` as UTF-8 into ``self.text`` and print it.

        On ``IOError`` a message is printed and ``self.text`` keeps its
        previous value.
        """
        try:
            with open(file_name, 'r', encoding='utf-8') as file:
                self.text = file.read()
            print(f"===== Loaded text data: {file_name} =====")
            print(self.text)
        except IOError:
            print(f"Problem found when reading: {file_name}")

    def load_model(self, file_name):
        """Load a pipeline from ``file_name`` into ``self.pipeline``.

        On ``IOError`` a message is printed and ``self.pipeline`` keeps its
        previous value.
        """
        try:
            # SECURITY: joblib.load is pickle-based and can execute arbitrary
            # code while loading; only open model files from a trusted source.
            self.pipeline = joblib.load(file_name)
            print(f"===== Loaded model: {file_name} =====")
        except IOError:
            print(f"Problem found when reading: {file_name}")

    def make_instance(self):
        """Wrap the loaded text in a one-element list for ``predict``.

        When no text is loaded, the instance is reset to ``None`` so that a
        later ``classify`` cannot silently reuse an instance built from an
        earlier text.
        """
        if self.text:
            self.instance = [self.text]
            print("===== Instance created =====")
            print(self.instance)
        else:
            self.instance = None
            print("No text data loaded.")

    def classify(self):
        """Print the class the pipeline predicts for the current instance.

        Prints a diagnostic instead when the model has not been loaded or no
        instance has been created.
        """
        if self.pipeline is None or not self.instance:
            print("Model or instance not properly loaded or created.")
            return

        prediction = self.pipeline.predict(self.instance)
        print("===== Classified instance =====")
        print(f"Class predicted: {prediction[0]}")

In [4]:
# Run the whole workflow on the sample stored in test_laos_text.txt.
# The call order matters: classify() needs both the model and the instance,
# and make_instance() needs the text loaded first.
classifier = LanguageIdentifier()

classifier.load_text("test_laos_text.txt")  # prints the file contents
classifier.load_model("vi_laos_detection.pkl")  # file written by the training cell
classifier.make_instance()  # wraps the text in a one-element list and prints it
classifier.classify()  # prints the predicted class

===== Loaded text data: test_laos_text.txt =====
niaemn paojk pheu thodsob phondaihab
===== Loaded model: vi_laos_detection.pkl =====
===== Instance created =====
['niaemn paojk pheu thodsob phondaihab']
===== Classified instance =====
Class predicted: Lao


In [5]:
# Same workflow on the sample stored in test_vi_text.txt.
# `classifier` is rebound to a fresh LanguageIdentifier, so no text, model or
# instance carries over from the previous cell.
classifier = LanguageIdentifier()

classifier.load_text("test_vi_text.txt")  # prints the file contents
classifier.load_model("vi_laos_detection.pkl")  # file written by the training cell
classifier.make_instance()  # wraps the text in a one-element list and prints it
classifier.classify()  # prints the predicted class

===== Loaded text data: test_vi_text.txt =====
Đây là câu để thử nghiệm kết quả 
===== Loaded model: vi_laos_detection.pkl =====
===== Instance created =====
['Đây là câu để thử nghiệm kết quả ']
===== Classified instance =====
Class predicted: Vietnamese
