In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from scipy.stats import uniform, randint

# Load data (same as before)
train1 = pd.read_csv('train-1.csv')
train2 = pd.read_csv('train-2.csv')
train3 = pd.read_csv('train-3.csv')
TRAIN = pd.concat([train1, train2, train3], ignore_index=True)

test1 = pd.read_csv('test-1.csv')
test2 = pd.read_csv('test-2.csv')
test3 = pd.read_csv('test-3.csv')

X_train = TRAIN["Sentence"]
y_train = TRAIN["Label"]

X_train3 = train3["Sentence"]
y_train3 = train3["Label"]

X_test1 = test1["Sentence"]
y_test1 = test1["Label"]

X_test2 = test2["Sentence"]
y_test2 = test2["Label"]

X_test3 = test3["Sentence"]
y_test3 = test3["Label"]

# Vectorize text data
vectorizer = TfidfVectorizer(max_features=20000, ngram_range=(1,2), stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_train3_vec = vectorizer.transform(X_train3)
X_test1_vec = vectorizer.transform(X_test1)
X_test2_vec = vectorizer.transform(X_test2)
X_test3_vec = vectorizer.transform(X_test3)

# Define models and hyperparameter distributions for RandomizedSearchCV
param_dist_logreg = {
    'C': uniform(0.01, 10),          # Regularization strength between 0.01 and 10
    'penalty': ['l2'],               # 'l1' can be used with saga, but l2 is more stable
    'solver': ['saga'],              # saga supports l1 and l2, good for sparse data
    'max_iter': [200, 300, 400]
}

param_dist_rf = {
    'n_estimators': randint(50, 150),    # Number of trees between 50 and 150
    'max_depth': randint(10, 40),        # Max depth between 10 and 40
    'min_samples_split': randint(2, 10), # Minimum samples to split nodes
    'min_samples_leaf': randint(1, 5),   # Minimum samples at leaf nodes
    'bootstrap': [True, False]
}

# Setup RandomizedSearchCV for Logistic Regression
logreg = LogisticRegression(random_state=42, n_jobs=-1)
rs_logreg = RandomizedSearchCV(
    logreg, param_distributions=param_dist_logreg,
    n_iter=20, scoring='f1_weighted', cv=3, verbose=2, random_state=42, n_jobs=-1
)

# Setup RandomizedSearchCV for Random Forest
rf = RandomForestClassifier(random_state=42, n_jobs=-1)
rs_rf = RandomizedSearchCV(
    rf, param_distributions=param_dist_rf,
    n_iter=20, scoring='f1_weighted', cv=3, verbose=2, random_state=42, n_jobs=-1
)

# Fit and tune Logistic Regression
print("Tuning Logistic Regression...")
rs_logreg.fit(X_train_vec, y_train)
print("Best Logistic Regression params:", rs_logreg.best_params_)

# Fit and tune Random Forest
print("\nTuning Random Forest...")
rs_rf.fit(X_train_vec, y_train)
print("Best Random Forest params:", rs_rf.best_params_)

# Function to evaluate model on multiple datasets
def evaluate_model(name, model, X_train, y_train, X_train3, y_train3, X_tests, y_tests, test_names):
    print(f"\nEvaluating {name} with best hyperparameters:")
    y_pred_train = model.predict(X_train)
    print("  On TRAIN set:")
    print(f"    Accuracy: {accuracy_score(y_train, y_pred_train):.4f}")
    print(f"    Precision: {precision_score(y_train, y_pred_train, average='weighted'):.4f}")
    print(f"    Recall: {recall_score(y_train, y_pred_train, average='weighted'):.4f}")
    print(f"    F1 Score: {f1_score(y_train, y_pred_train, average='weighted'):.4f}")

    y_pred_train3 = model.predict(X_train3)
    print("  On train3 subset:")
    print(f"    Accuracy: {accuracy_score(y_train3, y_pred_train3):.4f}")
    print(f"    Precision: {precision_score(y_train3, y_pred_train3, average='weighted'):.4f}")
    print(f"    Recall: {recall_score(y_train3, y_pred_train3, average='weighted'):.4f}")
    print(f"    F1 Score: {f1_score(y_train3, y_pred_train3, average='weighted'):.4f}")

    for X_test, y_test, tname in zip(X_tests, y_tests, test_names):
        y_pred_test = model.predict(X_test)
        print(f"  On {tname}:")
        print(f"    Accuracy: {accuracy_score(y_test, y_pred_test):.4f}")
        print(f"    Precision: {precision_score(y_test, y_pred_test, average='weighted'):.4f}")
        print(f"    Recall: {recall_score(y_test, y_pred_test, average='weighted'):.4f}")
        print(f"    F1 Score: {f1_score(y_test, y_pred_test, average='weighted'):.4f}")

# Prepare test sets and names
test_sets = [X_test1_vec, X_test2_vec, X_test3_vec]
test_labels = [y_test1, y_test2, y_test3]
test_names = ['Test 1', 'Test 2', 'Test 3']

# Evaluate Logistic Regression best model
best_logreg = rs_logreg.best_estimator_
evaluate_model("Logistic Regression", best_logreg, X_train_vec, y_train, X_train3_vec, y_train3, test_sets, test_labels, test_names)

# Evaluate Random Forest best model
best_rf = rs_rf.best_estimator_
evaluate_model("Random Forest", best_rf, X_train_vec, y_train, X_train3_vec, y_train3, test_sets, test_labels, test_names)


Tuning Logistic Regression...
Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best Logistic Regression params: {'C': np.float64(9.842308858067883), 'max_iter': 200, 'penalty': 'l2', 'solver': 'saga'}

Tuning Random Forest...
Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best Random Forest params: {'bootstrap': False, 'max_depth': 37, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 125}

Evaluating Logistic Regression with best hyperparameters:
  On TRAIN set:
    Accuracy: 0.9985
    Precision: 0.9985
    Recall: 0.9985
    F1 Score: 0.9985
  On train3 subset:
    Accuracy: 0.9975
    Precision: 0.9976
    Recall: 0.9975
    F1 Score: 0.9975
  On Test 1:
    Accuracy: 0.6141
    Precision: 0.5680
    Recall: 0.6141
    F1 Score: 0.5863
  On Test 2:
    Accuracy: 0.6208
    Precision: 0.5958
    Recall: 0.6208
    F1 Score: 0.6028
  On Test 3:
    Accuracy: 0.6227
    Precision: 0.5619
    Recall: 0.6227
    F1 Score: 0.5841

Evaluating Random 