In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.pipeline import Pipeline
import time
import warnings
warnings.filterwarnings('ignore')

# Load datasets as provided
train1 = pd.read_csv('train-1.csv')
train2 = pd.read_csv('train-2.csv')
train3 = pd.read_csv('train-3.csv')
TRAIN = pd.concat([train1, train2, train3], ignore_index=True)
test1 = pd.read_csv('test-1.csv')
test2 = pd.read_csv('test-2.csv')
test3 = pd.read_csv('test-3.csv')

# Extract features and labels
X_train = TRAIN["Sentence"]
y_train = TRAIN["Label"]
X_train3 = train3["Sentence"]
y_train3 = train3["Label"]
X_test1 = test1["Sentence"]
y_test1 = test1["Label"]
X_test2 = test2["Sentence"]
y_test2 = test2["Label"]
X_test3 = test3["Sentence"]
y_test3 = test3["Label"]

print(f"TRAIN dataset size: {len(TRAIN)}")
print(f"Train-3 dataset size: {len(train3)}")
print(f"Test-1 dataset size: {len(test1)}")
print(f"Test-2 dataset size: {len(test2)}")
print(f"Test-3 dataset size: {len(test3)}")

# Check for any missing values
print("\nChecking for missing values in datasets:")
print(f"TRAIN missing sentences: {X_train.isna().sum()}")
print(f"TRAIN missing labels: {y_train.isna().sum()}")
print(f"Train-3 missing sentences: {X_train3.isna().sum()}")
print(f"Test-1 missing sentences: {X_test1.isna().sum()}")
print(f"Test-2 missing sentences: {X_test2.isna().sum()}")
print(f"Test-3 missing sentences: {X_test3.isna().sum()}")

# Function to evaluate models
def evaluate_model(model, model_name, train_data, test_datasets):
    results = []

    # Train on the specified training data and evaluate on test sets
    X_train_data, y_train_data = train_data

    print(f"\n===== Training {model_name} on dataset with {len(X_train_data)} samples =====")
    start_time = time.time()
    model.fit(X_train_data, y_train_data)
    train_time = time.time() - start_time
    print(f"Training time: {train_time:.2f} seconds")

    # Evaluate on test datasets
    for dataset_name, (X_test, y_test) in test_datasets.items():
        start_time = time.time()
        y_pred = model.predict(X_test)
        test_time = time.time() - start_time

        precision = precision_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted')
        f1 = f1_score(y_test, y_pred, average='weighted')
        accuracy = accuracy_score(y_test, y_pred)

        results.append({
            'Training Set': f"{model_name} on {len(X_train_data)} samples",
            'Test Set': dataset_name,
            'Precision': precision,
            'Recall': recall,
            'F1': f1,
            'Accuracy': accuracy,
            'Inference Time': f"{test_time:.2f} sec"
        })

    return pd.DataFrame(results)

# Define training datasets as required
train_datasets = {
    'TRAIN (All)': (X_train, y_train),
    'Train-3': (X_train3, y_train3)
}

# Define test datasets
test_datasets = {
    'Test-1': (X_test1, y_test1),
    'Test-2': (X_test2, y_test2),
    'Test-3': (X_test3, y_test3)
}

# Print some basic info about the datasets
print("\nClass distribution in TRAIN:")
print(y_train.value_counts().to_string())
print("\nClass distribution in Train-3:")
print(y_train3.value_counts().to_string())

# Method 1: Logistic Regression with TF-IDF
# Using parameters that preserve all words/tokens
lr_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(
        max_features=None,    # Don't limit features - use all words
        min_df=1,             # Include all terms, even those appearing in just one document
        max_df=1.0,           # Include all terms, even extremely common ones
        ngram_range=(1, 2),   # Include unigrams and bigrams
        sublinear_tf=True     # Apply sublinear tf scaling for better results
    )),
    ('classifier', LogisticRegression(
        C=1.0,                # Regularization strength
        max_iter=300,         # Increased iterations for convergence
        class_weight='balanced', # Handle class imbalance
        solver='saga',        # Efficient for larger datasets
        n_jobs=-1,            # Use all CPU cores
        random_state=42       # For reproducibility
    ))
])

# Method 2: Random Forest with TF-IDF
# Using parameters that preserve all words/tokens
rf_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(
        max_features=None,    # Don't limit features - use all words
        min_df=1,             # Include all terms, even those appearing in just one document
        max_df=1.0,           # Include all terms, even extremely common ones
        ngram_range=(1, 2),   # Include unigrams and bigrams
        sublinear_tf=True     # Apply sublinear tf scaling for better results
    )),
    ('classifier', RandomForestClassifier(
        n_estimators=100,     # Number of trees
        max_depth=None,       # No maximum depth - let trees grow fully
        min_samples_split=2,  # Default - split after at least 2 samples
        min_samples_leaf=1,   # Default - allow leaf nodes with just 1 sample
        max_features='sqrt',  # Consider sqrt(n_features) at each split
        class_weight='balanced', # Handle class imbalance
        n_jobs=-1,            # Use all CPU cores
        random_state=42       # For reproducibility
    ))
])

# Run evaluations for each model on each training dataset
all_results = []

print("\n==== Evaluating Logistic Regression ====")
for train_name, train_data in train_datasets.items():
    lr_results = evaluate_model(lr_pipeline, f"Logistic Regression on {train_name}", train_data, test_datasets)
    all_results.append(lr_results)

print("\n==== Evaluating Random Forest ====")
for train_name, train_data in train_datasets.items():
    rf_results = evaluate_model(rf_pipeline, f"Random Forest on {train_name}", train_data, test_datasets)
    all_results.append(rf_results)

# Combine all results
combined_results = pd.concat(all_results)
print("\n==== All Evaluation Results ====")
print(combined_results.round(4).to_string(index=False))

# Summary by training dataset and model
print("\n==== Average F1 Scores by Model and Training Set ====")
summary = combined_results.groupby(['Training Set'])['F1'].mean().reset_index()
print(summary.round(4).sort_values('F1', ascending=False).to_string(index=False))

# Find best combination
best_row = combined_results.loc[combined_results['F1'].idxmax()]
print(f"\nBest performance: {best_row['Training Set']} on {best_row['Test Set']}")
print(f"Precision: {best_row['Precision']:.4f}, Recall: {best_row['Recall']:.4f}, F1: {best_row['F1']:.4f}, Accuracy: {best_row['Accuracy']:.4f}")


TRAIN dataset size: 7278
Train-3 dataset size: 2448
Test-1 dataset size: 653
Test-2 dataset size: 741
Test-3 dataset size: 774

Checking for missing values in datasets:
TRAIN missing sentences: 0
TRAIN missing labels: 0
Train-3 missing sentences: 0
Test-1 missing sentences: 0
Test-2 missing sentences: 0
Test-3 missing sentences: 0

Class distribution in TRAIN:
Label
1    3881
0    2003
2     790
3     452
4     152

Class distribution in Train-3:
Label
1    1142
0     636
2     310
3     213
4     147

==== Evaluating Logistic Regression ====

===== Training Logistic Regression on TRAIN (All) on dataset with 7278 samples =====
Training time: 17.74 seconds

===== Training Logistic Regression on Train-3 on dataset with 2448 samples =====
Training time: 2.92 seconds

==== Evaluating Random Forest ====

===== Training Random Forest on TRAIN (All) on dataset with 7278 samples =====
Training time: 55.62 seconds

===== Training Random Forest on Train-3 on dataset with 2448 samples =====
Train

TypeError: unsupported format string passed to Series.__format__