In [1]:
import pandas as pd
import re
import numpy as np

In [4]:
from google.colab import drive
drive.mount('/content/drive')

# Read in data
train = pd.read_csv('/content/drive/MyDrive/cs_data/train.csv')
val = pd.read_csv('/content/drive/MyDrive/cs_data/val.csv')


Mounted at /content/drive


In [5]:
# Helper function for cleaning text
def clean_html(text):
    if pd.isna(text):
        return text
    # Remove HTML tags
    clean = re.sub(r'<.*?>', '', str(text))
    # Remove extra whitespaces
    clean = re.sub(r'\s+', ' ', clean).strip()
    # Replace HTML entities
    clean = re.sub(r'&amp;', '&', clean)
    clean = re.sub(r'&lt;', '<', clean)
    clean = re.sub(r'&gt;', '>', clean)
    clean = re.sub(r'&quot;|&#34;', '"', clean)
    clean = re.sub(r'&apos;|&#39;', "'", clean)
    return clean

In [7]:
#cleaning data
train = train.map(clean_html)
val = val.map(clean_html)

In [6]:
#pre-processing for Count Vectorizer
from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer(stop_words='english')
train_features_cv = count_vectorizer.fit_transform(train['snip'])
val_features_cv = count_vectorizer.transform(val['snip'])

KeyboardInterrupt: 

In [None]:
#training LR model for Count Vectorizer
from sklearn.linear_model import LogisticRegression
LR_cv = LogisticRegression(solver='saga', max_iter=1000)
LR_cv.fit(train_features_cv, train['channel'])
predictions_cv = LR_cv.predict(val_features_cv)

In [None]:
#pre-processing for Tfdif Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
train_features_tv = tfidf_vectorizer.fit_transform(train['snip'])
val_features_tv = tfidf_vectorizer.transform(val['snip'])

In [None]:
#training LR model for Tfdif Vectorizer
from sklearn.linear_model import LogisticRegression
LR_tv = LogisticRegression(solver='saga', max_iter=10000)
LR_tv.fit(train_features_tv, train['channel'])
predictions_tv = LR_tv.predict(val_features_tv)

In [8]:
# Neural Network Representation Learning + Logistic Regression
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import re
import pandas as pd
from tqdm.auto import tqdm

# Check if GPU is available
device = torch.device('mps' if torch.backends.mps.is_available() else 'cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

train['snip'] = train['snip'].apply(clean_html)
val['snip'] = val['snip'].apply(clean_html)

# Vocabulary class
class Vocabulary:
    """Simple vocabulary class to convert words to indices"""
    def __init__(self, min_freq=2, max_size=50000):
        self.word2idx = {"<pad>": 0, "<unk>": 1}
        self.idx2word = {0: "<pad>", 1: "<unk>"}
        self.word_counts = {}
        self.min_freq = min_freq
        self.max_size = max_size

    def preprocess_text(self, text):
        # Just lowercase and split
        if pd.isna(text):
            return []
        return text.lower().split()

    def build_vocab(self, texts):
        print("Building vocabulary...")
        # Count word frequencies
        for text in texts:
            words = self.preprocess_text(text)
            for word in words:
                if word not in self.word_counts:
                    self.word_counts[word] = 0
                self.word_counts[word] += 1

        # Sort words by frequency in descending order
        sorted_words = sorted(self.word_counts.items(), key=lambda x: x[1], reverse=True)

        # Add words that meet minimum frequency to vocabulary
        idx = len(self.word2idx)
        for word, count in sorted_words:
            if count >= self.min_freq and idx < self.max_size:
                self.word2idx[word] = idx
                self.idx2word[idx] = word
                idx += 1

        print(f"Vocabulary built with {len(self.word2idx)} words")

    def text_to_indices(self, text, max_length=None):
        words = self.preprocess_text(text)
        # Convert words to indices
        indices = []
        for word in words:
            if word in self.word2idx:
                indices.append(self.word2idx[word])
            else:
                indices.append(self.word2idx["<unk>"])
        # Truncate if needed
        if max_length is not None and len(indices) > max_length:
            indices = indices[:max_length]
        return indices

# Dataset class
class TextDataset(Dataset):
    """Custom Dataset"""
    def __init__(self, texts, labels, vocab, max_length=200):
        self.texts = texts
        self.labels = labels
        self.vocab = vocab
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        indices = self.vocab.text_to_indices(text, self.max_length)
        # Make sure indices is not empty
        if len(indices) == 0:
            indices = [1]  # Use <unk> token if empty
        # Ensure label is an integer
        label = int(self.labels[idx])
        return {
            'text_indices': torch.tensor(indices, dtype=torch.long),
            'text_length': len(indices),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Collate function for DataLoader
def collate_batch(batch):
    text_indices = [item['text_indices'] for item in batch]
    text_lengths = torch.tensor([item['text_length'] for item in batch])
    labels = torch.stack([item['label'] for item in batch])

    # Pad sequences
    padded_text = pad_sequence(text_indices, batch_first=True, padding_value=0)

    return {
        'text_indices': padded_text,
        'text_length': text_lengths,
        'label': labels
    }

# Neural Network Model
class RepresentationModel(nn.Module):
    """Model that learns text representations"""
    def __init__(self, vocab_size, num_classes, embedding_dim=300, hidden_dim=256,
                 num_layers=2, bidirectional=True):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.bidirectional = bidirectional
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=bidirectional
        )
        # If bidirectional, we have 2x hidden dimensions
        lstm_output_dim = hidden_dim * 2 if bidirectional else hidden_dim
        self.fc = nn.Linear(lstm_output_dim, num_classes)

    def forward(self, text_indices, text_lengths):
        # Get embeddings
        embedded = self.embedding(text_indices)

        # Pack sequence for LSTM
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, text_lengths.cpu(), batch_first=True, enforce_sorted=False
        )

        # Pass through LSTM
        output, (hidden, cell) = self.lstm(packed)

        # Use the final hidden state
        if self.bidirectional:
            # Concatenate the last hidden state from both directions
            hidden_final = torch.cat((hidden[-2], hidden[-1]), dim=1)
        else:
            hidden_final = hidden[-1]

        # Pass through fully connected layer
        return self.fc(hidden_final)

    def get_representation(self, text_indices, text_lengths):
        # Get embeddings
        embedded = self.embedding(text_indices)

        # Pack sequence for LSTM
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, text_lengths.cpu(), batch_first=True, enforce_sorted=False
        )

        # Pass through LSTM
        output, (hidden, cell) = self.lstm(packed)

        # Use the final hidden state as our representation
        if self.bidirectional:
            # Concatenate the last hidden state from both directions
            representation = torch.cat((hidden[-2], hidden[-1]), dim=1)
        else:
            representation = hidden[-1]

        return representation

# Build vocabulary from training data
vocab = Vocabulary(min_freq=2)
vocab.build_vocab(train['snip'].tolist())

# Encode labels
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
all_channels = pd.concat([train['channel'], val['channel']]).unique()
label_encoder.fit(all_channels)
train_labels = label_encoder.transform(train['channel'])
val_labels = label_encoder.transform(val['channel'])

# Create datasets
train_dataset = TextDataset(train['snip'].tolist(), train_labels, vocab)
val_dataset = TextDataset(val['snip'].tolist(), val_labels, vocab)

# Create dataloaders
batch_size = 32
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_batch
)
val_dataloader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_batch
)

# Initialize model
vocab_size = len(vocab.word2idx)
num_classes = len(label_encoder.classes_)
model = RepresentationModel(
    vocab_size=vocab_size,
    num_classes=num_classes,
    embedding_dim=300,
    hidden_dim=256,
    num_layers=2,
    bidirectional=True
).to(device)

# Train the model
def train_model(model, train_dataloader, val_dataloader, device, num_epochs=10):
    """Train the representation learning model with progress bars"""
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    best_val_accuracy = 0.0

    # Create outer progress bar for epochs
    epoch_pbar = tqdm(range(num_epochs), desc="Training Progress", position=0)

    for epoch in epoch_pbar:
        # Training phase
        model.train()
        train_loss = 0.0

        # Create progress bar for training batches
        train_pbar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs} [Train]",
                          leave=False, position=1)

        for batch in train_pbar:
            text_indices = batch['text_indices'].to(device)
            text_lengths = batch['text_length']
            labels = batch['label'].to(device)

            # Zero the gradients
            optimizer.zero_grad()

            # Forward pass
            outputs = model(text_indices, text_lengths)

            # Calculate loss
            loss = criterion(outputs, labels)

            # Backward pass and optimize
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

            # Update training progress bar
            train_pbar.set_postfix({"loss": f"{loss.item():.4f}"})

        # Validation phase
        model.eval()
        val_loss = 0.0
        all_preds = []
        all_labels = []

        # Create progress bar for validation batches
        val_pbar = tqdm(val_dataloader, desc=f"Epoch {epoch+1}/{num_epochs} [Val]",
                        leave=False, position=1)

        with torch.no_grad():
            for batch in val_pbar:
                text_indices = batch['text_indices'].to(device)
                text_lengths = batch['text_length']
                labels = batch['label'].to(device)

                # Forward pass
                outputs = model(text_indices, text_lengths)

                # Calculate loss
                loss = criterion(outputs, labels)
                val_loss += loss.item()

                # Get predictions
                _, preds = torch.max(outputs, 1)

                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

                # Update validation progress bar
                val_pbar.set_postfix({"loss": f"{loss.item():.4f}"})

        # Calculate metrics
        val_accuracy = accuracy_score(all_labels, all_preds)

        # Update epoch progress bar
        epoch_pbar.set_postfix({
            "Train Loss": f"{train_loss/len(train_dataloader):.4f}",
            "Val Loss": f"{val_loss/len(val_dataloader):.4f}",
            "Val Acc": f"{val_accuracy:.4f}"
        })

        # Print progress
        print(f"\nEpoch {epoch+1}/{num_epochs}")
        print(f"Train Loss: {train_loss/len(train_dataloader):.4f}")
        print(f"Val Loss: {val_loss/len(val_dataloader):.4f}")
        print(f"Val Accuracy: {val_accuracy:.4f}")

        # Save best model
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            torch.save(model.state_dict(), 'best_representation_model.pt')
            print(f"Model saved with Accuracy: {val_accuracy:.4f}")

    # Load best model
    model.load_state_dict(torch.load('best_representation_model.pt'))
    return model
model = train_model(model, train_dataloader, val_dataloader, device, num_epochs=10)

# Extract features function
def extract_features(model, dataloader, device):
    model.eval()  # Set model to evaluation mode
    features = []
    labels = []

    # Create progress bar for feature extraction
    pbar = tqdm(dataloader, desc="Extracting Features", position=0)

    with torch.no_grad():
        for batch in pbar:
            # Move batch to device
            text_indices = batch['text_indices'].to(device)
            text_lengths = batch['text_length']
            batch_labels = batch['label']

            # Get the representation (features) from the model
            batch_features = model.get_representation(text_indices, text_lengths)

            # Move to CPU and convert to numpy
            features.append(batch_features.cpu().numpy())
            labels.append(batch_labels.numpy())

    return np.vstack(features), np.concatenate(labels)

# Extract features from both training and validation sets
print("Extracting features from the neural network...")
train_features, train_labels = extract_features(model, train_dataloader, device)
val_features, val_labels = extract_features(model, val_dataloader, device)

# Train a logistic regression model on these features
print("Training logistic regression on neural network features...")
lr_model = LogisticRegression(solver='saga', max_iter=1000, C=1.0, class_weight='balanced')
lr_model.fit(train_features, train_labels)

# Evaluate the model
val_predictions = lr_model.predict(val_features)
val_accuracy = accuracy_score(val_labels, val_predictions)
print(f"Logistic Regression Accuracy on extracted features: {val_accuracy:.4f}")

# Print classification report
print("Classification Report:")
print(classification_report(val_labels, val_predictions))

Using device: cuda
Building vocabulary...
Vocabulary built with 50000 words


Training Progress:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1/10 [Train]:   0%|          | 0/622 [00:00<?, ?it/s]

Epoch 1/10 [Val]:   0%|          | 0/95 [00:00<?, ?it/s]


Epoch 1/10
Train Loss: 2.2277
Val Loss: 2.2102
Val Accuracy: 0.3090
Model saved with Accuracy: 0.3090


Epoch 2/10 [Train]:   0%|          | 0/622 [00:00<?, ?it/s]

Epoch 2/10 [Val]:   0%|          | 0/95 [00:00<?, ?it/s]


Epoch 2/10
Train Loss: 1.3805
Val Loss: 2.0986
Val Accuracy: 0.3866
Model saved with Accuracy: 0.3866


Epoch 3/10 [Train]:   0%|          | 0/622 [00:00<?, ?it/s]

Epoch 3/10 [Val]:   0%|          | 0/95 [00:00<?, ?it/s]


Epoch 3/10
Train Loss: 0.9073
Val Loss: 1.9367
Val Accuracy: 0.4363
Model saved with Accuracy: 0.4363


Epoch 4/10 [Train]:   0%|          | 0/622 [00:00<?, ?it/s]

Epoch 4/10 [Val]:   0%|          | 0/95 [00:00<?, ?it/s]


Epoch 4/10
Train Loss: 0.5369
Val Loss: 2.0792
Val Accuracy: 0.4258


Epoch 5/10 [Train]:   0%|          | 0/622 [00:00<?, ?it/s]

Epoch 5/10 [Val]:   0%|          | 0/95 [00:00<?, ?it/s]


Epoch 5/10
Train Loss: 0.2656
Val Loss: 2.2576
Val Accuracy: 0.4603
Model saved with Accuracy: 0.4603


Epoch 6/10 [Train]:   0%|          | 0/622 [00:00<?, ?it/s]

Epoch 6/10 [Val]:   0%|          | 0/95 [00:00<?, ?it/s]


Epoch 6/10
Train Loss: 0.1294
Val Loss: 2.4873
Val Accuracy: 0.4521


Epoch 7/10 [Train]:   0%|          | 0/622 [00:00<?, ?it/s]

Epoch 7/10 [Val]:   0%|          | 0/95 [00:00<?, ?it/s]


Epoch 7/10
Train Loss: 0.0933
Val Loss: 2.8038
Val Accuracy: 0.4330


Epoch 8/10 [Train]:   0%|          | 0/622 [00:00<?, ?it/s]

Epoch 8/10 [Val]:   0%|          | 0/95 [00:00<?, ?it/s]


Epoch 8/10
Train Loss: 0.0911
Val Loss: 2.7109
Val Accuracy: 0.4521


Epoch 9/10 [Train]:   0%|          | 0/622 [00:00<?, ?it/s]

Epoch 9/10 [Val]:   0%|          | 0/95 [00:00<?, ?it/s]


Epoch 9/10
Train Loss: 0.0526
Val Loss: 3.0405
Val Accuracy: 0.4432


Epoch 10/10 [Train]:   0%|          | 0/622 [00:00<?, ?it/s]

Epoch 10/10 [Val]:   0%|          | 0/95 [00:00<?, ?it/s]


Epoch 10/10
Train Loss: 0.0763
Val Loss: 3.0237
Val Accuracy: 0.4383
Extracting features from the neural network...


Extracting Features:   0%|          | 0/622 [00:00<?, ?it/s]

Extracting Features:   0%|          | 0/95 [00:00<?, ?it/s]

Training logistic regression on neural network features...
Logistic Regression Accuracy on extracted features: 0.4505
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.45      0.17      0.25        29
           2       0.74      0.78      0.76       271
           3       0.00      0.00      0.00         0
           4       0.56      0.73      0.63       170
           5       0.62      0.62      0.62       256
           6       0.48      0.38      0.42       219
           7       0.00      0.00      0.00         0
           8       0.40      0.39      0.40       204
           9       0.24      0.35      0.28       165
          10       0.23      0.06      0.10       177
          11       0.57      0.61      0.59        46
          12       0.62      0.64      0.63       214
          13       0.38      0.42      0.40       250
          14       0.18      0.50      0.27     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
#measuring accuracy
from sklearn.metrics import accuracy_score
true = val['channel'].to_numpy()
accuracy_cv = accuracy_score(true, predictions_cv)
print(f"The accuracy of the Count Vectorizer Logistic Regressor is {accuracy_cv}")
accuracy_tv = accuracy_score(true, predictions_tv)
print(f"The accuracy of the Tfdif Vectorizer Logistic Regressor is {accuracy_tv}")

from sklearn.metrics import classification_report
print(f"The per-class accuracy of the Count Vectorizer Logistic Regressor is as below:")
print(classification_report(true, predictions_cv, zero_division = np.nan))
print(f"The per-class accuracy of the Tfdif Vectorizer Logistic Regressor is as below:")
print(classification_report(true, predictions_tv, zero_division = np.nan))

print("\nComparison with previous models:")
print(f"Neural Network + Logistic Regression: {val_accuracy:.4f}")
print(f"Count Vectorizer + Logistic Regression: {accuracy_cv}")
print(f"TF-IDF Vectorizer + Logistic Regression: {accuracy_tv}")

The accuracy of the Count Vectorizer Logistic Regressor is 0.5143139190523198
The accuracy of the Tfdif Vectorizer Logistic Regressor is 0.5235274761434683
The per-class accuracy of the Count Vectorizer Logistic Regressor is as below:
              precision    recall  f1-score   support

         1TV       0.00       nan      0.00         0
       ALJAZ       0.67      0.34      0.45        29
     BBCNEWS       0.62      0.62      0.62       271
   BLOOMBERG       0.66      0.82      0.73       170
        CNBC       0.58      0.69      0.63       256
        CNNW       0.44      0.58      0.50       219
       CSPAN       0.43      0.50      0.46       204
      CSPAN2       0.26      0.38      0.30       165
      CSPAN3       0.29      0.10      0.14       177
          DW       0.67      0.30      0.42        46
         FBC       0.62      0.68      0.65       214
    FOXNEWSW       0.55      0.48      0.51       250
         GBN       0.21      0.79      0.33        14
        