In [None]:
# Simple Subjectivity Detection - 3 ML + 3 DL Models
# Easy to understand and run

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

# ML Models
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# DL Models
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, TrainingArguments, Trainer
import warnings
# Suppress every Python warning for the rest of the session (keeps the
# printed report clean, but also hides deprecation/convergence warnings).
warnings.filterwarnings('ignore')

# Disable Weights & Biases logging through its environment switch.
# NOTE(review): presumably consulted by the transformers Trainer imported
# above; nothing in the visible code creates a Trainer - confirm it is needed.
import os
os.environ["WANDB_DISABLED"] = "true"

# Banner printed once when the cell/script starts.
print("🚀 Simple Subjectivity Detection Pipeline")
print("=" * 50)

# ===== DATA LOADING =====
def load_data():
    """Load the train/dev/test TSV files and attach numeric labels.

    Reads ``train_en.tsv``, ``dev_en.tsv`` and ``test_en_labeled.tsv`` from
    the current working directory (tab separated, with ``sentence`` and
    ``label`` columns). Train and dev are concatenated into one training set.

    Rows are discarded when the sentence or label is missing, or when the
    label is not one of ``'SUBJ'`` / ``'OBJ'``; otherwise the unmapped label
    would surface as NaN in ``label_num`` and break the classifiers later.

    Returns:
        tuple: ``(full_train, test_df)`` DataFrames, each with an integer
        ``label_num`` column (SUBJ -> 1, OBJ -> 0) and a fresh 0..n-1 index.
    """
    print("📂 Loading data...")

    train_df = pd.read_csv('train_en.tsv', sep='\t')
    dev_df = pd.read_csv('dev_en.tsv', sep='\t')
    test_df = pd.read_csv('test_en_labeled.tsv', sep='\t')

    # Combine train and dev for more data
    full_train = pd.concat([train_df, dev_df], ignore_index=True)

    label_map = {'SUBJ': 1, 'OBJ': 0}

    def _clean_and_encode(frame):
        # Drop incomplete rows first, then encode; assign() returns a new
        # frame so we never write into a view of the original.
        frame = frame.dropna(subset=['sentence', 'label'])
        frame = frame.assign(label_num=frame['label'].map(label_map))
        # Labels outside label_map became NaN above - remove those rows so
        # label_num can be a true integer column.
        frame = frame.dropna(subset=['label_num'])
        frame = frame.assign(label_num=frame['label_num'].astype(int))
        return frame.reset_index(drop=True)

    full_train = _clean_and_encode(full_train)
    test_df = _clean_and_encode(test_df)

    print(f"✅ Train samples: {len(full_train)}")
    print(f"✅ Test samples: {len(test_df)}")
    print(f"📊 Label distribution: {full_train['label'].value_counts().to_dict()}")

    return full_train, test_df

# ===== TRADITIONAL ML MODELS =====
def train_ml_models(train_df, test_df):
    """Fit three TF-IDF based classifiers and report their test-set metrics.

    Sentences are vectorised with a TF-IDF vocabulary fitted on the training
    sentences only; Naive Bayes, a linear SVM and a Random Forest are then
    trained and evaluated on the test sentences.

    Args:
        train_df: DataFrame with ``sentence`` and ``label_num`` columns.
        test_df: DataFrame with the same columns, used only for evaluation.

    Returns:
        dict: model name -> dict with ``accuracy``, ``f1_score`` (weighted),
        ``predictions``, ``confusion_matrix`` (always 2x2, rows = actual,
        columns = predicted, order OBJ then SUBJ) and
        ``classification_report`` (text).
    """
    print("\n🤖 Training Traditional ML Models")
    print("-" * 40)

    # Fixed class order (0: OBJ, 1: SUBJ). Passing it explicitly keeps the
    # confusion matrix 2x2 even when a class is absent from the test labels
    # and predictions; otherwise sklearn infers a smaller label set and the
    # labelled DataFrame below fails to build.
    class_ids = [0, 1]
    class_names = ['OBJ', 'SUBJ']

    # Prepare data
    X_train = train_df['sentence'].values
    y_train = train_df['label_num'].values
    X_test = test_df['sentence'].values
    y_test = test_df['label_num'].values

    # TF-IDF Vectorization (vocabulary learned from the training split only)
    print("📝 Creating TF-IDF features...")
    vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)

    # Models
    models = {
        'Naive Bayes': MultinomialNB(),
        'SVM': SVC(kernel='linear', random_state=42),
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42)
    }

    ml_results = {}

    for name, model in models.items():
        print(f"\n🔥 Training {name}...")

        # Train
        model.fit(X_train_tfidf, y_train)

        # Predict
        y_pred = model.predict(X_test_tfidf)

        # Metrics
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')
        conf_matrix = confusion_matrix(y_test, y_pred, labels=class_ids)
        class_report = classification_report(
            y_test, y_pred, labels=class_ids, target_names=class_names
        )

        print(f"✅ {name} Results:")
        print(f"   📊 Accuracy: {accuracy:.4f}")
        print(f"   🎯 F1 Score: {f1:.4f}")
        print("\n   Confusion Matrix:")
        # Wrap the matrix in a DataFrame so rows/columns carry readable labels.
        cm_df = pd.DataFrame(
            conf_matrix,
            index=[f'Actual {label}' for label in class_names],
            columns=[f'Predicted {label}' for label in class_names],
        )
        print(cm_df)
        print("\n   Classification Report:")
        print(class_report)

        ml_results[name] = {
            'accuracy': accuracy,
            'f1_score': f1,
            'predictions': y_pred,
            'confusion_matrix': conf_matrix,
            'classification_report': class_report
        }

    return ml_results


# ===== MAIN FUNCTION =====
def main():
    """Run the ML pipeline: load data, train the models, print a comparison.

    After the per-model summary, the model with the highest weighted F1 is
    announced and, if one was found, its confusion matrix and classification
    report are printed again.
    """
    train_df, test_df = load_data()
    ml_results = train_ml_models(train_df, test_df)

    # Summary header
    rule = '=' * 60
    print(f"\n{rule}")
    print("🏆 FINAL RESULTS COMPARISON")
    print(f"{rule}")

    print("\n🤖 Traditional ML Models:")
    best_ml_model, best_ml_f1 = "", 0

    for model_name, results in ml_results.items():
        acc = results['accuracy']
        f1 = results['f1_score']
        print(f"   {model_name:<15}: Accuracy={acc:.4f}, F1={f1:.4f}")
        # Strict comparison: on ties the earlier model keeps the title.
        if f1 > best_ml_f1:
            best_ml_model, best_ml_f1 = model_name, f1

    print(f"\n🥇 BEST ML MODEL: {best_ml_model} (F1: {best_ml_f1:.4f})")

    # Nothing beat an F1 of 0, so there is no winner to detail.
    if not best_ml_model:
        return

    winner = ml_results[best_ml_model]
    print(f"\nDetailed Report for Best ML Model: {best_ml_model}")
    print("Confusion Matrix:")
    cm_df = pd.DataFrame(
        winner['confusion_matrix'],
        index=['Actual OBJ', 'Actual SUBJ'],
        columns=['Predicted OBJ', 'Predicted SUBJ'],
    )
    print(cm_df)
    print("\nClassification Report:")
    print(winner['classification_report'])


# Run the ML pipeline when this code is executed directly (script or
# notebook cell), but not when it is imported as a module.
if __name__ == "__main__":
    main()

🚀 Simple Subjectivity Detection Pipeline
📂 Loading data...
✅ Train samples: 1292
✅ Test samples: 300
📊 Label distribution: {'OBJ': 754, 'SUBJ': 538}

🤖 Training Traditional ML Models
----------------------------------------
📝 Creating TF-IDF features...

🔥 Training Naive Bayes...
✅ Naive Bayes Results:
   📊 Accuracy: 0.6533
   🎯 F1 Score: 0.6415

   Confusion Matrix:
             Predicted OBJ  Predicted SUBJ
Actual OBJ             171              44
Actual SUBJ             60              25

   Classification Report:
              precision    recall  f1-score   support

         OBJ       0.74      0.80      0.77       215
        SUBJ       0.36      0.29      0.32        85

    accuracy                           0.65       300
   macro avg       0.55      0.54      0.55       300
weighted avg       0.63      0.65      0.64       300


🔥 Training SVM...
✅ SVM Results:
   📊 Accuracy: 0.5833
   🎯 F1 Score: 0.5937

   Confusion Matrix:
             Predicted OBJ  Predicted SUBJ
Actu

In [None]:
# Simple & Precise Subjectivity Detection - 3 Deep Learning Models
# Models: CNN, LSTM, CNN+LSTM Combined

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix # Import confusion_matrix
from collections import Counter
import re

# Select the compute device once at import time: the GPU when CUDA is
# available, otherwise the CPU. Models and batches are moved to this device
# by the training, evaluation and prediction functions below.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# =============================================================================
# 1. SIMPLE TEXT PREPROCESSING
# =============================================================================

def build_vocab(texts, max_vocab=20000):
    """Map the most frequent whitespace tokens in ``texts`` to integer ids.

    Each text is stringified, lower-cased and split on whitespace. Id 0 is
    reserved for ``<PAD>`` and id 1 for ``<UNK>``; the remaining
    ``max_vocab - 2`` slots go to tokens in descending frequency order.

    Returns:
        dict: token -> id.
    """
    print("Building vocabulary...")

    # Count tokens straight from a generator; no intermediate word list.
    frequencies = Counter(
        token
        for text in texts
        for token in str(text).lower().split()
    )

    # Reserved entries first, then the frequent tokens numbered from 2.
    vocab = {'<PAD>': 0, '<UNK>': 1}
    ranked = frequencies.most_common(max_vocab - 2)
    for token_id, (token, _) in enumerate(ranked, start=2):
        vocab[token] = token_id

    print(f"Vocabulary size: {len(vocab)}")
    return vocab

def text_to_sequence(text, vocab, max_len=100):
    """Encode ``text`` as a list of exactly ``max_len`` vocabulary ids.

    The text is stringified, lower-cased and split on whitespace. Tokens
    missing from ``vocab`` become 1 (``<UNK>``); longer inputs are truncated
    and shorter ones are right-padded with 0 (``<PAD>``).
    """
    tokens = str(text).lower().split()

    # Truncate before the lookup so we never encode tokens we would drop.
    ids = [vocab.get(token, 1) for token in tokens[:max_len]]

    # A non-positive repeat count yields an empty list, so this is a no-op
    # when the sequence is already full length.
    ids.extend([0] * (max_len - len(ids)))
    return ids

# =============================================================================
# 2. DATASET CLASS
# =============================================================================

class TextDataset(Dataset):
    """Dataset that yields padded token-id tensors with their labels.

    Args:
        texts: Iterable of raw sentences (list, array, pandas Series, ...).
        labels: Iterable of integer class ids, aligned with ``texts``.
        vocab: token -> id mapping used by ``text_to_sequence``.
        max_len: Fixed length every sequence is padded/truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, vocab, max_len=100):
        # Materialise both inputs as plain lists so __getitem__ indexes by
        # position. Indexing a pandas Series with an int is a *label* lookup,
        # which fails (or returns the wrong row) once the Series has been
        # filtered and its index is no longer 0..n-1.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(self.texts)} and {len(self.labels)})"
            )
        self.vocab = vocab
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        sequence = text_to_sequence(text, self.vocab, self.max_len)

        # Both tensors are int64: ids feed nn.Embedding, labels feed
        # CrossEntropyLoss.
        return {
            'text': torch.tensor(sequence, dtype=torch.long),
            'label': torch.tensor(label, dtype=torch.long)
        }

# =============================================================================
# 3. MODEL 1: CNN MODEL
# =============================================================================

class CNNModel(nn.Module):
    """Text CNN: three parallel 1-D convolutions with global max pooling.

    Token ids are embedded, passed through convolutions with kernel sizes
    3, 4 and 5, max-pooled over the sequence axis, concatenated and fed to a
    two-layer classifier that emits 2 logits per example.
    """

    def __init__(self, vocab_size, embed_dim=128, num_filters=100):
        super().__init__()

        # Id 0 is padding; its embedding row stays at zero.
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)

        # One convolution per n-gram width.
        self.conv1 = nn.Conv1d(embed_dim, num_filters, kernel_size=3)
        self.conv2 = nn.Conv1d(embed_dim, num_filters, kernel_size=4)
        self.conv3 = nn.Conv1d(embed_dim, num_filters, kernel_size=5)

        # Classifier head over the three pooled feature vectors.
        self.fc1 = nn.Linear(num_filters * 3, 128)
        self.fc2 = nn.Linear(128, 2)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        # (batch, seq_len) -> (batch, seq_len, embed_dim) -> channels first,
        # which is the layout Conv1d expects.
        channels_first = self.embedding(x).permute(0, 2, 1)

        pooled = []
        for conv in (self.conv1, self.conv2, self.conv3):
            activated = F.relu(conv(channels_first))
            # Pool over the full remaining length: one value per filter.
            pooled.append(F.max_pool1d(activated, activated.size(2)).squeeze(2))

        features = torch.cat(pooled, dim=1)

        hidden = self.dropout(F.relu(self.fc1(features)))
        return self.fc2(hidden)

# =============================================================================
# 4. MODEL 2: LSTM MODEL
# =============================================================================

class LSTMModel(nn.Module):
    """Two-layer bidirectional LSTM sentence classifier (2 output logits).

    The final hidden states of the last two entries of the LSTM's ``h_n``
    are concatenated and passed through a two-layer classifier.
    """

    def __init__(self, vocab_size, embed_dim=128, hidden_dim=128):
        super().__init__()

        # Id 0 is padding; its embedding row stays at zero.
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)

        # Stacked bidirectional LSTM with dropout between the two layers.
        self.lstm = nn.LSTM(
            embed_dim,
            hidden_dim,
            batch_first=True,
            num_layers=2,
            dropout=0.3,
            bidirectional=True,
        )

        # Input width is doubled because both directions are concatenated.
        self.fc1 = nn.Linear(hidden_dim * 2, 128)
        self.fc2 = nn.Linear(128, 2)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        # Only the final hidden states are needed; per-step outputs and the
        # cell state are discarded.
        _, (h_n, _) = self.lstm(self.embedding(x))

        # Last two entries of h_n: the top layer's two directions.
        direction_a, direction_b = h_n[-2], h_n[-1]
        sentence_vec = torch.cat((direction_a, direction_b), dim=1)

        hidden = self.dropout(F.relu(self.fc1(sentence_vec)))
        return self.fc2(hidden)

# =============================================================================
# 5. MODEL 3: CNN + LSTM COMBINED
# =============================================================================

class CNNLSTMModel(nn.Module):
    """Hybrid classifier joining a CNN branch and a bidirectional LSTM branch.

    Both branches read the same embeddings. The CNN branch max-pools two
    convolutions (kernel sizes 3 and 5); the LSTM branch contributes the
    last two entries of its final hidden state. The concatenated features
    go through a three-layer classifier that emits 2 logits.
    """

    def __init__(self, vocab_size, embed_dim=128, hidden_dim=128, num_filters=64):
        super().__init__()

        # Shared embedding; id 0 is padding.
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)

        # CNN branch
        self.conv1 = nn.Conv1d(embed_dim, num_filters, kernel_size=3)
        self.conv2 = nn.Conv1d(embed_dim, num_filters, kernel_size=5)

        # LSTM branch (single layer, both directions)
        self.lstm = nn.LSTM(
            embed_dim, hidden_dim, batch_first=True, bidirectional=True
        )

        # Two pooled conv outputs plus the two LSTM directions.
        combined_dim = 2 * num_filters + 2 * hidden_dim

        # Classifier head
        self.fc1 = nn.Linear(combined_dim, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 2)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        embedded = self.embedding(x)

        # CNN branch: channels-first layout, then global max pool per filter.
        channels_first = embedded.permute(0, 2, 1)
        pooled = []
        for conv in (self.conv1, self.conv2):
            activated = F.relu(conv(channels_first))
            pooled.append(F.max_pool1d(activated, activated.size(2)).squeeze(2))
        cnn_features = torch.cat(pooled, dim=1)

        # LSTM branch: keep only the final hidden states.
        _, (h_n, _) = self.lstm(embedded)
        lstm_features = torch.cat((h_n[-2], h_n[-1]), dim=1)

        # Merge the branches and classify.
        merged = torch.cat((cnn_features, lstm_features), dim=1)
        hidden = self.dropout(F.relu(self.fc1(merged)))
        hidden = self.dropout(F.relu(self.fc2(hidden)))
        return self.fc3(hidden)

# =============================================================================
# 6. TRAINING FUNCTION
# =============================================================================

def train_model(model, train_loader, val_loader, epochs=10, lr=0.001):
    """Train ``model`` and restore the weights of its best validation epoch.

    Uses Adam (weight decay 1e-4) with cross-entropy loss. After every epoch
    the model is scored on ``val_loader`` and a snapshot of the weights is
    kept whenever the weighted F1 improves.

    Args:
        model: Module mapping a batch of token ids to class logits.
        train_loader: Iterable of dicts with ``'text'`` and ``'label'`` tensors.
        val_loader: Loader of the same form, used for model selection.
        epochs: Number of passes over ``train_loader``.
        lr: Adam learning rate.

    Returns:
        The same ``model`` object, loaded with the best-epoch weights (left
        untouched if no epoch was run).
    """
    optimizer = Adam(model.parameters(), lr=lr, weight_decay=1e-4)
    criterion = nn.CrossEntropyLoss()

    best_f1 = 0
    best_state = None

    for epoch in range(epochs):
        # Training
        model.train()
        train_loss = 0.0

        for batch in train_loader:
            optimizer.zero_grad()

            texts = batch['text'].to(device)
            labels = batch['label'].to(device)

            outputs = model(texts)
            loss = criterion(outputs, labels)

            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        # Validation (only accuracy and F1 are needed here)
        val_acc, val_f1, _, _ = evaluate_model(model, val_loader)

        # max(..., 1) avoids a ZeroDivisionError on an empty loader.
        mean_loss = train_loss / max(len(train_loader), 1)
        print(f"Epoch {epoch+1}/{epochs}")
        print(f"Train Loss: {mean_loss:.4f}")
        print(f"Val Accuracy: {val_acc:.4f}, Val F1: {val_f1:.4f}")

        # Save best model. state_dict() hands back references to the live
        # parameter tensors, so a plain dict .copy() would keep changing as
        # training continues; clone every tensor to freeze the snapshot.
        # The `is None` test guarantees a snapshot exists even if F1 never
        # rises above 0.
        if best_state is None or val_f1 > best_f1:
            best_f1 = val_f1
            best_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }

        print("-" * 40)

    # Load best model (nothing to restore when epochs == 0)
    if best_state is not None:
        model.load_state_dict(best_state)
    return model
# =============================================================================
# 7. EVALUATION FUNCTION
# =============================================================================

def evaluate_model(model, data_loader):
    """Score ``model`` on ``data_loader`` without tracking gradients.

    Returns:
        tuple: ``(accuracy, weighted_f1, true_labels, predictions)`` where
        the last two are flat lists collected over all batches.
    """
    model.eval()
    true_labels, predictions = [], []

    with torch.no_grad():
        for batch in data_loader:
            inputs = batch['text'].to(device)
            targets = batch['label'].to(device)

            # Predicted class = index of the largest logit.
            predicted = model(inputs).argmax(dim=1)

            predictions.extend(predicted.cpu().numpy())
            true_labels.extend(targets.cpu().numpy())

    return (
        accuracy_score(true_labels, predictions),
        f1_score(true_labels, predictions, average='weighted'),
        true_labels,
        predictions,
    )

Using device: cuda


In [None]:
# =============================================================================
# 8. MAIN FUNCTION
# =============================================================================

def _confusion_frame(true_labels, pred_labels):
    """Return the OBJ/SUBJ confusion matrix as a labelled 2x2 DataFrame."""
    # labels=[0, 1] pins the matrix to 2x2 even if a class never occurs.
    matrix = confusion_matrix(true_labels, pred_labels, labels=[0, 1])
    return pd.DataFrame(
        matrix,
        index=['Actual OBJ', 'Actual SUBJ'],
        columns=['Predicted OBJ', 'Predicted SUBJ'],
    )


def _class_report(true_labels, pred_labels):
    """Return the per-class precision/recall/F1 text report for OBJ/SUBJ."""
    return classification_report(
        true_labels, pred_labels, labels=[0, 1], target_names=['OBJ', 'SUBJ']
    )


def main():
    """Train the CNN, LSTM and CNN+LSTM models and compare them on the test set.

    Reads ``train_en.tsv``, ``dev_en.tsv`` and ``test_en_labeled.tsv`` from
    the working directory, trains each model on the train split with the dev
    split used for model selection, then prints per-model test metrics, a
    final comparison and a detailed report for the model with the best
    weighted F1.
    """
    print("Loading datasets...")

    label_map = {'SUBJ': 1, 'OBJ': 0}

    def _load_split(path):
        # Drop rows with a missing sentence/label or a label outside
        # label_map: a NaN label cannot be turned into a class-id tensor.
        frame = pd.read_csv(path, sep='\t').dropna(subset=['sentence', 'label'])
        frame = frame.assign(label_num=frame['label'].map(label_map))
        frame = frame.dropna(subset=['label_num'])
        frame = frame.assign(label_num=frame['label_num'].astype(int))
        return frame.reset_index(drop=True)

    train_df = _load_split('train_en.tsv')
    dev_df = _load_split('dev_en.tsv')
    test_df = _load_split('test_en_labeled.tsv')

    print(f"Train: {len(train_df)}, Dev: {len(dev_df)}, Test: {len(test_df)}")

    # Build vocabulary (from train + dev sentences, never from test)
    all_texts = list(train_df['sentence']) + list(dev_df['sentence'])
    vocab = build_vocab(all_texts)

    # Create datasets
    max_len = 100
    batch_size = 32

    def _make_dataset(frame):
        # Plain lists keep the dataset's integer indexing positional.
        return TextDataset(
            frame['sentence'].tolist(), frame['label_num'].tolist(), vocab, max_len
        )

    train_loader = DataLoader(_make_dataset(train_df), batch_size=batch_size, shuffle=True)
    dev_loader = DataLoader(_make_dataset(dev_df), batch_size=batch_size)
    test_loader = DataLoader(_make_dataset(test_df), batch_size=batch_size)

    vocab_size = len(vocab)

    # (display name, banner text, model class, training epochs)
    experiments = [
        ("CNN", "TRAINING CNN MODEL", CNNModel, 8),
        ("LSTM", "TRAINING LSTM MODEL", LSTMModel, 8),
        ("Combined", "TRAINING CNN+LSTM MODEL", CNNLSTMModel, 10),
    ]

    # One entry per model: (name, accuracy, f1, true labels, predictions)
    results = []

    for name, banner, model_cls, epochs in experiments:
        print("\n" + "="*50)
        print(banner)
        print("="*50)

        model = model_cls(vocab_size).to(device)
        print(f"{name} Parameters: {sum(p.numel() for p in model.parameters()):,}")

        model = train_model(model, train_loader, dev_loader, epochs=epochs)

        # Held-out test evaluation
        test_acc, test_f1, y_true, y_pred = evaluate_model(model, test_loader)
        print(f"{name} Test - Accuracy: {test_acc:.4f}, F1: {test_f1:.4f}")
        print(f"\n{name} Confusion Matrix:")
        print(_confusion_frame(y_true, y_pred))
        print(f"\n{name} Classification Report:")
        print(_class_report(y_true, y_pred))

        results.append((name, test_acc, test_f1, y_true, y_pred))

    # Final results comparison
    print("\n" + "="*60)
    print("FINAL RESULTS COMPARISON")
    print("="*60)
    for name, test_acc, test_f1, _, _ in results:
        # Width 14 matches the longest label, "Combined Model".
        print(f"{name + ' Model':<14} - Accuracy: {test_acc:.4f}, F1: {test_f1:.4f}")

    # Best model by weighted F1 (the earliest one wins ties)
    best_name, _, best_f1, best_true, best_pred = max(results, key=lambda r: r[2])
    print(f"\nBest Model: {best_name} with F1 Score: {best_f1:.4f}")

    # Detailed report for best model
    print(f"\nDetailed Classification Report for {best_name} Model:")
    print(_class_report(best_true, best_pred))

    print(f"\nConfusion Matrix for Best Model ({best_name}):")
    print(_confusion_frame(best_true, best_pred))


# =============================================================================
# 9. PREDICTION FUNCTION
# =============================================================================

def predict_text(text, model_path):
    """Classify a single text with a model restored from a checkpoint file.

    The checkpoint must be a dict with the keys ``'vocab'``,
    ``'model_type'`` (``'CNN'``, ``'LSTM'`` or ``'Combined'``),
    ``'vocab_size'`` and ``'model_state_dict'``.

    Args:
        text: Raw sentence to classify.
        model_path: Path to the checkpoint file.

    Returns:
        tuple: ``(label, confidence)`` where ``label`` is ``"SUBJ"`` or
        ``"OBJ"`` and ``confidence`` is the softmax probability of that label.

    Raises:
        ValueError: If the checkpoint names an unknown ``model_type``.
    """
    # NOTE(security): torch.load relies on pickle; only open checkpoints
    # from a trusted source.
    # map_location='cpu' lets a checkpoint saved on a GPU be opened on a
    # CPU-only machine; the model is moved to `device` further down.
    checkpoint = torch.load(model_path, map_location='cpu')
    vocab = checkpoint['vocab']
    model_type = checkpoint['model_type']
    vocab_size = checkpoint['vocab_size']

    # Initialize model based on type
    if model_type == 'CNN':
        model = CNNModel(vocab_size)
    elif model_type == 'LSTM':
        model = LSTMModel(vocab_size)
    elif model_type == 'Combined':
        model = CNNLSTMModel(vocab_size)
    else:
        # Fail with a clear message instead of an UnboundLocalError below.
        raise ValueError(
            f"Unknown model_type {model_type!r} in checkpoint; "
            "expected 'CNN', 'LSTM' or 'Combined'"
        )

    model.load_state_dict(checkpoint['model_state_dict'])
    model.to(device)
    model.eval()

    # Process text: encode as a batch holding one sequence
    sequence = text_to_sequence(text, vocab)
    input_tensor = torch.tensor([sequence], dtype=torch.long).to(device)

    with torch.no_grad():
        output = model(input_tensor)
        probabilities = F.softmax(output, dim=1)
        predicted_class = int(torch.argmax(output, dim=1).item())

    label = "SUBJ" if predicted_class == 1 else "OBJ"
    confidence = probabilities[0, predicted_class].item()

    return label, confidence

# Run the deep-learning pipeline when this code is executed directly (script
# or notebook cell), but not when it is imported as a module.
if __name__ == "__main__":
    main()

Loading datasets...
Train: 830, Dev: 462, Test: 300
Building vocabulary...
Vocabulary size: 8041

TRAINING CNN MODEL
CNN Parameters: 1,221,934
Epoch 1/8
Train Loss: 0.6992
Val Accuracy: 0.4805, Val F1: 0.3119
----------------------------------------
Epoch 2/8
Train Loss: 0.6209
Val Accuracy: 0.4805, Val F1: 0.3119
----------------------------------------
Epoch 3/8
Train Loss: 0.5244
Val Accuracy: 0.4870, Val F1: 0.3470
----------------------------------------
Epoch 4/8
Train Loss: 0.3232
Val Accuracy: 0.4913, Val F1: 0.3492
----------------------------------------
Epoch 5/8
Train Loss: 0.1380
Val Accuracy: 0.4935, Val F1: 0.3692
----------------------------------------
Epoch 6/8
Train Loss: 0.0455
Val Accuracy: 0.4978, Val F1: 0.3939
----------------------------------------
Epoch 7/8
Train Loss: 0.0191
Val Accuracy: 0.5152, Val F1: 0.4547
----------------------------------------
Epoch 8/8
Train Loss: 0.0118
Val Accuracy: 0.5152, Val F1: 0.4396
----------------------------------------
C