In [32]:
import pandas as pd
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Load CSVs
train1 = pd.read_csv('train-1.csv')
train2 = pd.read_csv('train-2.csv')
train3 = pd.read_csv('train-3.csv')
TRAIN = pd.concat([train1, train2, train3], ignore_index=True)

test1 = pd.read_csv('test-1.csv')
test2 = pd.read_csv('test-2.csv')
test3 = pd.read_csv('test-3.csv')

# Features and labels
X_train = TRAIN["Sentence"]
y_train = TRAIN["Label"]

X_train3 = train3["Sentence"]
y_train3 = train3["Label"]

X_test1 = test1["Sentence"]
y_test1 = test1["Label"]

X_test2 = test2["Sentence"]
y_test2 = test2["Label"]

X_test3 = test3["Sentence"]
y_test3 = test3["Label"]

In [33]:
def evaluate_model(model, X_tests, y_tests, test_names):
    results = []
    for X_test, y_test, name in zip(X_tests, y_tests, test_names):
        y_pred = model.predict(X_test)
        report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
        accuracy = accuracy_score(y_test, y_pred)
        results.append({
            "Test Set": name,
            "Precision": round(report["weighted avg"]["precision"], 4),
            "Recall": round(report["weighted avg"]["recall"], 4),
            "F1-Score": round(report["weighted avg"]["f1-score"], 4),
            "Accuracy": round(accuracy, 4)
        })
    return results

In [34]:
def display_results(model_name, results):
    print(f"\n=== {model_name} Results ===")
    print("| Test Set | Precision | Recall | F1-Score | Accuracy |")
    print("|----------|-----------|--------|----------|----------|")
    for r in results:
        print(f"| {r['Test Set']} | {r['Precision']} | {r['Recall']} | {r['F1-Score']} | {r['Accuracy']} |")







In [37]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.pipeline import Pipeline
import numpy as np
import traceback

# Logistic Regression Pipeline with RandomizedSearchCV
logreg_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression(random_state=42, max_iter=1000))
])

# Define a smaller hyperparameter grid for Logistic Regression
logreg_param_dist = {
    'clf__C': [0.1, 1, 10],  # Reduced range for C
    'clf__solver': ['liblinear', 'saga'],  # Fewer solvers
    'clf__penalty': ['l2'],  # 'l1' and 'elasticnet' might not perform well for large data
    'clf__max_iter': [1000],  # Reasonable number of iterations
    'clf__tol': [1e-4],  # Convergence tolerance
}

# RandomizedSearchCV for Logistic Regression with error handling
logreg_search = RandomizedSearchCV(logreg_pipeline, logreg_param_dist, n_iter=5, cv=2, n_jobs=-1, verbose=1, random_state=42, error_score='raise')
try:
    logreg_search.fit(X_train, y_train)
except Exception as e:
    print(f"Error during Logistic Regression RandomizedSearchCV: {e}")
    # You can also log more information here for debugging, like the problematic parameters
    print("The error occurred with parameters:")
    print(logreg_search.best_params_ if hasattr(logreg_search, 'best_params_') else "No best parameters found yet")
    print("Stack trace:")
    traceback.print_exc()

# If successful, get the best model and evaluate
if 'logreg_search' in locals() and hasattr(logreg_search, 'best_estimator_') and logreg_search.best_estimator_:
    logreg_best_model = logreg_search.best_estimator_
    logreg_results = evaluate_model(
        logreg_best_model,
        [X_test1, X_test2, X_test3],
        [y_test1, y_test2, y_test3],
        ["Test-1", "Test-2", "Test-3"]
    )
    display_results("Logistic Regression", logreg_results)
    print("Best hyperparameters for Logistic Regression:", logreg_search.best_params_)
else:
    print("Logistic Regression RandomizedSearchCV failed. Please check the error messages above.")

# Use SGDClassifier as an alternative for faster performance on large datasets
sgd_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', SGDClassifier(loss='log_loss', max_iter=1000, random_state=42))  # Fix 'log' to 'log_loss'
])

# Define hyperparameter grid for SGDClassifier
sgd_param_dist = {
    'clf__alpha': np.logspace(-5, 5, 11),  # Regularization strength
    'clf__penalty': ['l2', 'elasticnet'],  # Regularization type
    'clf__max_iter': [1000, 2000],
}

# Perform RandomizedSearchCV for SGDClassifier with error handling
sgd_search = RandomizedSearchCV(sgd_pipeline, sgd_param_dist, n_iter=5, cv=2, n_jobs=-1, verbose=1, random_state=42, error_score='raise')
try:
    sgd_search.fit(X_train, y_train)
except Exception as e:
    print(f"Error during SGDClassifier RandomizedSearchCV: {e}")
    print("The error occurred with parameters:")
    print(sgd_search.best_params_ if hasattr(sgd_search, 'best_params_') else "No best parameters found yet")
    print("Stack trace:")
    traceback.print_exc()

# If successful, get the best model and evaluate
if 'sgd_search' in locals() and hasattr(sgd_search, 'best_estimator_') and sgd_search.best_estimator_:
    sgd_best_model = sgd_search.best_estimator_
    sgd_results = evaluate_model(
        sgd_best_model,
        [X_test1, X_test2, X_test3],
        [y_test1, y_test2, y_test3],
        ["Test-1", "Test-2", "Test-3"]
    )
    display_results("SGD Classifier", sgd_results)
    print("Best hyperparameters for SGD Classifier:", sgd_search.best_params_)
else:
    print("SGDClassifier RandomizedSearchCV failed. Please check the error messages above.")



Fitting 2 folds for each of 5 candidates, totalling 10 fits

=== Logistic Regression Results ===
| Test Set | Precision | Recall | F1-Score | Accuracy |
|----------|-----------|--------|----------|----------|
| Test-1 | 0.3369 | 0.5804 | 0.4263 | 0.5804 |
| Test-2 | 0.3383 | 0.5816 | 0.4278 | 0.5816 |
| Test-3 | 0.4223 | 0.6499 | 0.512 | 0.6499 |
Best hyperparameters for Logistic Regression: {'clf__tol': 0.0001, 'clf__solver': 'liblinear', 'clf__penalty': 'l2', 'clf__max_iter': 1000, 'clf__C': 0.1}
Fitting 2 folds for each of 5 candidates, totalling 10 fits

=== SGD Classifier Results ===
| Test Set | Precision | Recall | F1-Score | Accuracy |
|----------|-----------|--------|----------|----------|
| Test-1 | 0.3369 | 0.5804 | 0.4263 | 0.5804 |
| Test-2 | 0.3383 | 0.5816 | 0.4278 | 0.5816 |
| Test-3 | 0.4223 | 0.6499 | 0.512 | 0.6499 |
Best hyperparameters for SGD Classifier: {'clf__penalty': 'elasticnet', 'clf__max_iter': 1000, 'clf__alpha': np.float64(10000.0)}
