In [None]:
import os
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
import joblib

def load_and_clean_data(csv_path):
    """
    Load the question/answer CSV dataset and return a cleaned DataFrame.

    Cleaning steps, in order:
    - Drop rows where 'pertanyaan' or 'jawaban' is missing.
    - Cast both columns to str, so cells that pandas parsed as numbers are
      kept as text instead of breaking the `.str` accessor or silently
      becoming NaN.
    - Strip surrounding whitespace from both columns and lowercase 'pertanyaan'.
    - Drop rows where either column is empty after stripping.
    - Remove exact duplicate rows (compared after the normalization above).

    Parameters:
    - csv_path (str): Path to a UTF-8 encoded CSV file that contains at least
      the 'pertanyaan' and 'jawaban' columns.

    Returns:
    - pandas.DataFrame: The cleaned rows. All columns of the file are kept and
      the original row index is preserved (it is not reset).

    Raises:
    - FileNotFoundError: If `csv_path` is not an existing file.
    - RuntimeError: If pandas cannot read/parse the file; the original error
      is chained as `__cause__`.
    - ValueError: If a required column is missing.
    """
    if not os.path.isfile(csv_path):
        raise FileNotFoundError(f"CSV file '{csv_path}' not found.")

    print(f"Loading dataset from '{csv_path}'...")
    try:
        df = pd.read_csv(csv_path, engine='python', encoding='utf-8', quotechar='"')
    except Exception as e:
        # Chain explicitly so the parser's own traceback stays attached.
        raise RuntimeError(f"Failed to read CSV file: {e}") from e

    required_columns = {'pertanyaan', 'jawaban'}
    if not required_columns.issubset(df.columns):
        raise ValueError(f"CSV must contain columns {required_columns}. Found columns: {df.columns}")

    print(f"Initial dataset size: {len(df)} rows")

    # Missing values must go first: astype(str) would otherwise turn NaN into
    # the literal text "nan" and the row would survive the emptiness check.
    df = df.dropna(subset=['pertanyaan', 'jawaban'])

    # Normalize on standalone Series so the `.str` accessor always sees text,
    # whatever dtype pandas inferred for the column.
    questions = df['pertanyaan'].astype(str).str.strip()
    answers = df['jawaban'].astype(str).str.strip()
    non_empty = (questions != '') & (answers != '')
    print(f"After dropping missing/empty rows: {int(non_empty.sum())} rows")

    # `.assign` builds a new frame, which avoids writing into a filtered
    # slice (the source of pandas' SettingWithCopyWarning).
    df = df.loc[non_empty].assign(
        pertanyaan=questions[non_empty].str.lower(),
        jawaban=answers[non_empty],
    )

    # Deduplicate after normalization so "Halo " and "halo" collapse together.
    df = df.drop_duplicates()
    print(f"After removing duplicates: {len(df)} rows")

    return df

def train_and_save_model(df, model_path='naive_bayes_model.pkl', vectorizer_path='vectorizer.pkl'):
    """
    Train a Multinomial Naive Bayes classifier and persist it with its vectorizer.

    The 'pertanyaan' column is turned into bag-of-words counts with
    CountVectorizer, and each distinct 'jawaban' string is used as a class
    label for MultinomialNB. Both fitted objects are written with joblib.

    Parameters:
    - df (pandas.DataFrame): Dataset with 'pertanyaan' and 'jawaban' columns.
    - model_path (str): Destination file for the fitted classifier.
    - vectorizer_path (str): Destination file for the fitted vectorizer.

    Returns:
    - tuple: `(model, vectorizer)`, the fitted objects, so callers can use
      them directly without reloading the files.

    Raises:
    - ValueError: If `df` has no rows.
    """
    # Fail early with a clear message; otherwise the failure surfaces inside
    # scikit-learn's fitting code with a far less obvious error.
    if len(df) == 0:
        raise ValueError("Cannot train on an empty dataset: no question/answer rows were provided.")

    X = df['pertanyaan'].astype(str)
    y = df['jawaban'].astype(str)

    print("Vectorizing text data...")
    vectorizer = CountVectorizer()
    X_vec = vectorizer.fit_transform(X)

    print("Training Naive Bayes classifier...")
    model = MultinomialNB()
    model.fit(X_vec, y)

    print(f"Saving model to '{model_path}' and vectorizer to '{vectorizer_path}'...")
    joblib.dump(model, model_path)
    joblib.dump(vectorizer, vectorizer_path)

    print("Training complete and files saved.")
    return model, vectorizer

def main(csv_filename='/content/out_dataset_pertanyaan_jawaban.csv'):
    """
    Run the full training pipeline: load and clean the CSV, then train and save.

    Parameters:
    - csv_filename (str): Path to the question/answer CSV. The default is the
      location this notebook was originally run with; pass another path to
      train on a dataset stored elsewhere.
    """
    # Load and clean data
    df_cleaned = load_and_clean_data(csv_filename)

    # Train model and save artifacts (default output file names are used)
    train_and_save_model(df_cleaned)

if __name__ == "__main__":
    main()


Loading dataset from '/content/out_dataset_pertanyaan_jawaban.csv'...
Initial dataset size: 16 rows
After dropping missing/empty rows: 16 rows
After removing duplicates: 16 rows
Vectorizing text data...
Training Naive Bayes classifier...
Saving model to 'naive_bayes_model.pkl' and vectorizer to 'vectorizer.pkl'...
Training complete and files saved.


In [None]:
import os
import joblib

# File names of the persisted artifacts. They are relative paths, so they are
# resolved against the current working directory, and they match the default
# output names used by train_and_save_model.
model_path = 'naive_bayes_model.pkl'
vectorizer_path = 'vectorizer.pkl'

# Fail fast with an actionable message if either artifact is missing, rather
# than letting joblib raise a lower-level error. The model is checked first,
# so when both files are absent only the model error is reported.
if not os.path.isfile(model_path):
    raise FileNotFoundError(f"Model file '{model_path}' not found. Please train and save the model first.")
if not os.path.isfile(vectorizer_path):
    raise FileNotFoundError(f"Vectorizer file '{vectorizer_path}' not found. Please train and save the vectorizer first.")

# Load both artifacts into module-level names; predict_answer reads `model`
# and `vectorizer` as globals.
# NOTE(review): joblib.load unpickles the file contents, so only load
# artifacts that come from a trusted source.
model = joblib.load(model_path)
vectorizer = joblib.load(vectorizer_path)

def predict_answer(question):
    """
    Return the answer the loaded classifier assigns to one question.

    The question is wrapped in a single-item list, encoded with the
    module-level `vectorizer`, and classified by the module-level `model`.

    Parameters:
    - question (str): The input question text.

    Returns:
    - str: The predicted answer (first and only element of the prediction).
    """
    # A one-element batch goes in, so exactly one prediction comes out.
    features = vectorizer.transform([question])
    return model.predict(features)[0]

# Demo: classify one sample question and show the question next to the
# predicted answer.
if __name__ == "__main__":
    sample_question = "halo"
    predicted = predict_answer(sample_question)
    # One print call emitting the same two lines of output.
    print(f"Question: {sample_question}\nPredicted Answer: {predicted}")


Question: halo
Predicted Answer: Berikut pilihan kost dalam rentang harga tersebut:
• Kost Mawar — Rp1.000.000/bulan (AC, Kamar Mandi Dalam)
• Kost Melati — Rp1.200.000/bulan (AC, WiFi, Parkir Motor)
• Kost Anggrek — Rp1.500.000/bulan (Lengkap, Dapur Bersama)
