In [66]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score

# --- Data loading -----------------------------------------------------------
# Read the three training splits and stack them into a single frame; the row
# index is rebuilt so it runs 0..n-1 across the combined data.
train1, train2, train3 = map(pd.read_csv, ('train-1.csv', 'train-2.csv', 'train-3.csv'))
TRAIN = pd.concat([train1, train2, train3], ignore_index=True)

# The test splits stay separate so each one can be scored on its own.
test1, test2, test3 = map(pd.read_csv, ('test-1.csv', 'test-2.csv', 'test-3.csv'))

# --- Features ("Sentence" column) and targets ("Label" column) --------------
# Combined training data.
X_train, y_train = TRAIN["Sentence"], TRAIN["Label"]

# Third training split on its own.
X_train3, y_train3 = train3["Sentence"], train3["Label"]

# One (features, labels) pair per test split.
X_test1, y_test1 = test1["Sentence"], test1["Label"]
X_test2, y_test2 = test2["Sentence"], test2["Label"]
X_test3, y_test3 = test3["Sentence"], test3["Label"]

In [67]:
def evaluate_model(model, X_tests, y_tests, test_names):
    """Score an already-fitted model on several labelled test sets.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X)``.
    X_tests : iterable
        Feature collections, one entry per test set.
    y_tests : iterable
        True labels, positionally aligned with ``X_tests``.
    test_names : iterable of str
        Display names, positionally aligned with ``X_tests``.

    Returns
    -------
    list of dict
        One dict per test set, in input order, with the keys ``"Test Set"``,
        ``"Precision"``, ``"Recall"``, ``"F1-Score"`` and ``"Accuracy"``.
        Precision/recall/F1 are the "weighted avg" row of
        ``classification_report``; every metric is rounded to 4 decimals.

    Raises
    ------
    ValueError
        If ``X_tests``, ``y_tests`` and ``test_names`` do not all have the
        same number of entries.

    Notes
    -----
    For single-label classification the support-weighted recall is
    mathematically the same quantity as accuracy, so those two columns are
    expected to coincide.
    """
    # Materialise the inputs so their lengths can be compared even when a
    # caller passes generators.
    X_tests, y_tests, test_names = list(X_tests), list(y_tests), list(test_names)

    # zip() would silently stop at the shortest input and drop test sets from
    # the report; fail loudly instead.
    if not (len(X_tests) == len(y_tests) == len(test_names)):
        raise ValueError(
            "X_tests, y_tests and test_names must have the same length, got "
            f"{len(X_tests)}, {len(y_tests)} and {len(test_names)}"
        )

    results = []
    for X_test, y_test, name in zip(X_tests, y_tests, test_names):
        y_pred = model.predict(X_test)
        # zero_division=0: a class that is never predicted scores 0 rather
        # than triggering an undefined-metric warning.
        report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
        weighted = report["weighted avg"]
        accuracy = accuracy_score(y_test, y_pred)
        results.append({
            "Test Set": name,
            "Precision": round(weighted["precision"], 4),
            "Recall": round(weighted["recall"], 4),
            "F1-Score": round(weighted["f1-score"], 4),
            "Accuracy": round(accuracy, 4)
        })
    return results

def display_results(model_name, results):
    """Print evaluation results as a Markdown-style table on stdout.

    Parameters
    ----------
    model_name : str
        Label used in the banner line printed above the table.
    results : iterable of dict
        One mapping per table row; each must provide the keys "Test Set",
        "Precision", "Recall", "F1-Score" and "Accuracy".

    Returns
    -------
    None
    """
    columns = ("Test Set", "Precision", "Recall", "F1-Score", "Accuracy")

    # Banner (preceded by a blank line), then header row and separator.
    print(f"\n=== {model_name} Results ===")
    print("| Test Set | Precision | Recall | F1-Score | Accuracy |")
    print("|----------|-----------|--------|----------|----------|")

    # One table row per result, values pulled in column order.
    for entry in results:
        name, precision, recall, f1, accuracy = (entry[key] for key in columns)
        print(f"| {name} | {precision} | {recall} | {f1} | {accuracy} |")


In [68]:
# TF-IDF features feeding a logistic-regression classifier. fit() returns the
# pipeline itself, so construction and training are chained.
logreg_pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression(max_iter=1000, random_state=42)),
]).fit(X_train, y_train)

# Score the fitted pipeline on each test split, then print the summary table.
logreg_results = evaluate_model(
    model=logreg_pipeline,
    X_tests=[X_test1, X_test2, X_test3],
    y_tests=[y_test1, y_test2, y_test3],
    test_names=["Test-1", "Test-2", "Test-3"],
)
display_results("Logistic Regression", logreg_results)



=== Logistic Regression Results ===
| Test Set | Precision | Recall | F1-Score | Accuracy |
|----------|-----------|--------|----------|----------|
| Test-1 | 0.4856 | 0.5758 | 0.5088 | 0.5758 |
| Test-2 | 0.5755 | 0.6113 | 0.5388 | 0.6113 |
| Test-3 | 0.5478 | 0.668 | 0.5913 | 0.668 |


In [69]:
# TF-IDF features feeding a depth-limited decision tree. fit() returns the
# pipeline itself, so construction and training are chained.
dt_pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', DecisionTreeClassifier(max_depth=10, random_state=42)),
]).fit(X_train, y_train)

# Score the fitted pipeline on each test split, then print the summary table.
dt_results = evaluate_model(
    model=dt_pipeline,
    X_tests=[X_test1, X_test2, X_test3],
    y_tests=[y_test1, y_test2, y_test3],
    test_names=["Test-1", "Test-2", "Test-3"],
)
display_results("Decision Tree", dt_results)



=== Decision Tree Results ===
| Test Set | Precision | Recall | F1-Score | Accuracy |
|----------|-----------|--------|----------|----------|
| Test-1 | 0.5089 | 0.5697 | 0.4646 | 0.5697 |
| Test-2 | 0.5046 | 0.5735 | 0.4674 | 0.5735 |
| Test-3 | 0.5284 | 0.646 | 0.5514 | 0.646 |
