In [1]:
import os

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

def _majority_vote(models, texts):
    """Combine the per-column classifiers into one 0/1 prediction per row.

    Args:
        models: Mapping of column name -> fitted pipeline for that column.
        texts: DataFrame containing one text column per key in ``models``.

    Returns:
        Integer NumPy array holding 1 where a strict majority of the
        classifiers predicted 1, and 0 otherwise.
    """
    votes = sum(model.predict(texts[column]) for column, model in models.items())
    # Strict majority; with three classifiers this is the same as "votes >= 2".
    return (2 * votes > len(models)).astype(int)


def train_hallucination_detector(input_csv, test_size=0.2, random_state=42):
    """Train, evaluate and apply a majority-vote hallucination detector.

    One TF-IDF + LinearSVC pipeline is fitted per text column ('Context',
    'Question', 'Answer'). Their 0/1 predictions are combined by majority
    vote, accuracy and F1 on a held-out split are printed, and a copy of the
    input with a 'Prediction' column is written next to the input file as
    ``<input name without extension>_updated.csv``.

    Args:
        input_csv: Path to a CSV file with 'Context', 'Question', 'Answer'
            and 'Hallucination' columns. Labels must be 0/1 (booleans are
            accepted and converted).
        test_size: Fraction of rows held out for evaluation.
        random_state: Seed used for both the train/test split and LinearSVC.

    Raises:
        KeyError: If a required column is absent from the CSV.
        ValueError: If any label is missing or is not 0/1.
    """
    text_columns = ['Context', 'Question', 'Answer']
    label_column = 'Hallucination'

    data = pd.read_csv(input_csv)

    missing_columns = [c for c in text_columns + [label_column] if c not in data.columns]
    if missing_columns:
        raise KeyError(f"Input CSV is missing required column(s): {missing_columns}")

    # A row without a label cannot be trained on or scored; fail early with a
    # clear message instead of turning the gap into an empty-string "class".
    if data[label_column].isna().any():
        raise ValueError(f"Column '{label_column}' contains missing values")

    # Only the text features get the empty-string fill. Casting to str keeps
    # TfidfVectorizer working when a column was parsed as numeric.
    texts = data[text_columns].fillna('').astype(str)

    # The vote below adds predictions together, so labels must be 0/1
    # integers; boolean arrays would be OR-ed by NumPy instead of summed.
    y = data[label_column].astype(int)
    if not y.isin([0, 1]).all():
        raise ValueError(f"Column '{label_column}' must contain only 0/1 labels")

    # Splitting the whole text frame at once keeps all three columns aligned
    # on the same train/test rows.
    texts_train, texts_test, y_train, y_test = train_test_split(
        texts, y, test_size=test_size, random_state=random_state
    )

    # One independent TF-IDF + linear SVM pipeline per text column.
    models = {}
    for column in text_columns:
        model = Pipeline([
            ('tfidf', TfidfVectorizer()),
            ('svm', LinearSVC(random_state=random_state)),
        ])
        models[column] = model.fit(texts_train[column], y_train)

    # Evaluate the voted prediction on the held-out rows.
    y_pred_combined = _majority_vote(models, texts_test)

    accuracy = accuracy_score(y_test, y_pred_combined)
    print("Accuracy:", accuracy)

    f1 = f1_score(y_test, y_pred_combined)
    print("F1 Score:", f1)

    # Predict for every row. NOTE: this includes the training rows, so the
    # saved 'Prediction' column is not an out-of-sample estimate.
    data['Prediction'] = _majority_vote(models, texts)

    # splitext only removes the final extension, so dots in directory names
    # (or a leading "./") no longer truncate the output path.
    root, _ext = os.path.splitext(input_csv)
    output_csv = root + '_updated.csv'
    data.to_csv(output_csv, index=False)

    print("Updated CSV file saved as:", output_csv)

# Ask for the dataset path interactively. Pasted paths often carry stray
# whitespace or wrapping quotes, which would otherwise surface as a
# FileNotFoundError when the file is read, so normalise the value first.
input_csv = input("Enter the path to the CSV file: ").strip().strip('"\'')
train_hallucination_detector(input_csv)

Enter the path to the CSV file: /content/Hallucination-Dataset-400-Samples.csv
Accuracy: 0.8518518518518519
F1 Score: 0.890909090909091
Updated CSV file saved as: /content/Hallucination-Dataset-400-Samples_updated.csv
