# The code below covers the other models: MultiChannelCNN, LSTM, and BERT.

# In [2]:
# MultiChannelCNN

# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Input, Embedding, Conv1D, GlobalMaxPooling1D, concatenate, Dense, Dropout
)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, Callback
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the dataset
# Both CSV paths are relative to the current working directory.
TRAIN_CSV_PATH = "Data/train.csv"
TEST_CSV_PATH = "Data/test.csv"

train_data = pd.read_csv(TRAIN_CSV_PATH)
test_data = pd.read_csv(TEST_CSV_PATH)

# Data Preprocessing
def preprocess_text(df):
    """Build a cleaned ``combined_text`` column on ``df`` (modified in place).

    Missing values in ``keyword``, ``location`` and ``text`` are replaced by
    empty strings, the three columns are joined with single spaces, and then
    URLs (``http...``), ``@mentions`` and ``#`` characters are stripped.

    Args:
        df: DataFrame with ``keyword``, ``location`` and ``text`` columns.

    Returns:
        The same DataFrame object, with the extra ``combined_text`` column.
    """
    for column in ('keyword', 'location', 'text'):
        df[column] = df[column].fillna('')

    df['combined_text'] = df['keyword'] + " " + df['location'] + " " + df['text']

    # Applied in this order: links first, then user mentions, then the hash
    # sign itself (the hashtag word is kept).
    for pattern in (r'http\S+', r'@\w+', r'#'):
        df['combined_text'] = df['combined_text'].str.replace(pattern, '', regex=True)
    return df

train_data = preprocess_text(train_data)
test_data = preprocess_text(test_data)

# Tokenization and Padding
max_words = 20000    # vocabulary cap passed to the tokenizer and the embedding
max_len = 100        # length every sequence is padded to
embedding_dim = 128  # Embedding dimension for trainable embeddings

# The vocabulary is fitted on the training texts only.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_data['combined_text'])

train_sequences = tokenizer.texts_to_sequences(train_data['combined_text'])
X = pad_sequences(train_sequences, maxlen=max_len)
y = train_data['target']

test_sequences = tokenizer.texts_to_sequences(test_data['combined_text'])
test_padded = pad_sequences(test_sequences, maxlen=max_len)

# Train/validation split: 20% of the labelled rows are held out, fixed seed.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_valid, y_train, y_valid = train_test_split(X, y, **split_options)

# Custom Callback to Compute F1-Score During Training
class F1ScoreCallback(Callback):
    """Keras callback that records the validation F1-score after each epoch.

    Args:
        validation_data: ``(X_val, y_val)`` pair. ``X_val`` is fed to
            ``self.model.predict``; ``y_val`` holds the true binary labels.
    """

    def __init__(self, validation_data):
        # Let the Keras base class initialise its own state first.
        super().__init__()
        self.validation_data = validation_data
        # One F1 value is appended per completed epoch.
        self.f1_scores = []

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the validation set, then store and print its F1-score.

        Args:
            epoch: Zero-based epoch index supplied by Keras.
            logs: Metrics dict supplied by Keras (not used here).
        """
        X_val, y_val = self.validation_data
        # verbose=0 stops predict() from printing its own progress bar in the
        # middle of fit()'s per-epoch output.
        y_pred_prob = self.model.predict(X_val, verbose=0).flatten()
        # Probabilities above 0.5 are counted as the positive class.
        y_pred = (y_pred_prob > 0.5).astype(int)
        f1 = f1_score(y_val, y_pred)
        self.f1_scores.append(f1)
        print(f"Epoch {epoch + 1} F1-Score: {f1:.4f}")

# Hyperparameters
filters = 256
kernel_sizes = [3, 5, 7]
dropout_rate = 0.75
learning_rate = 0.00005
batch_size = 32
epochs = 10

# Build the Multi-Channel CNN Model
input_layer = Input(shape=(max_len,))
# `input_length` is intentionally not passed: the sequence length is already
# fixed by `input_layer`, and the argument is deprecated in Keras 3.
embedding_layer = Embedding(input_dim=max_words, output_dim=embedding_dim)(input_layer)

# Convolutional layers with multiple kernel sizes: every channel reads the same
# embedding and is reduced to one vector by global max pooling.
conv_blocks = []
for kernel_size in kernel_sizes:
    conv = Conv1D(filters=filters, kernel_size=kernel_size, activation='relu')(embedding_layer)
    pool = GlobalMaxPooling1D()(conv)
    conv_blocks.append(pool)

# Concatenate pooled outputs
concat = concatenate(conv_blocks)

# Fully connected layers
dense1 = Dense(128, activation='relu')(concat)
dropout = Dropout(dropout_rate)(dense1)
output_layer = Dense(1, activation='sigmoid')(dropout)

# Define and Compile the Model
model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer=Adam(learning_rate=learning_rate), loss='binary_crossentropy', metrics=['accuracy'])

# Summary of the model
model.summary()

# Early stopping (on validation loss, restoring the best weights) and the
# F1-score callback
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
f1_callback = F1ScoreCallback(validation_data=(X_valid, y_valid))

# Train the model
history = model.fit(
    X_train, y_train,
    validation_data=(X_valid, y_valid),
    epochs=epochs,
    batch_size=batch_size,
    callbacks=[early_stopping, f1_callback],
    verbose=1
)

# Evaluate the Model
y_pred_prob = model.predict(X_valid).flatten()
y_pred = (y_pred_prob > 0.5).astype(int)
print("\nClassification Report:")
print(classification_report(y_valid, y_pred))

# Plot Training History (Accuracy, Loss, and F1-Score)
# Each entry: (y-axis label, figure title, [(values, legend label), ...]).
plot_specs = [
    (
        "Accuracy",
        "Training and Validation Accuracy",
        [
            (history.history['accuracy'], 'Train Accuracy'),
            (history.history['val_accuracy'], 'Validation Accuracy'),
        ],
    ),
    (
        "Loss",
        "Training and Validation Loss",
        [
            (history.history['loss'], 'Train Loss'),
            (history.history['val_loss'], 'Validation Loss'),
        ],
    ),
    (
        "F1-Score",
        "Validation F1-Score per Epoch",
        [(f1_callback.f1_scores, 'Validation F1-Score')],
    ),
]
for y_axis_label, figure_title, curves in plot_specs:
    plt.figure(figsize=(12, 4))
    for curve_values, curve_label in curves:
        plt.plot(curve_values, label=curve_label)
    plt.xlabel("Epochs")
    plt.ylabel(y_axis_label)
    plt.legend()
    plt.title(figure_title)
    plt.show()

# Cell output: FileNotFoundError: [Errno 2] No such file or directory: 'Data/train.csv'

# In [None]:
#  RNN and LSTM

# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, Callback
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.regularizers import l2

# Refined Hyperparameters
MAX_WORDS = 20000            # vocabulary cap for the tokenizer / embedding
MAX_LEN = 100                # length every input sequence is padded to
EMBEDDING_DIM = 128          # size of each embedding vector
LSTM_UNITS_1 = 64            # units of the first LSTM layer
LSTM_UNITS_2 = 32            # units of the second LSTM layer
DENSE_UNITS = 32             # units of the dense layer before the output
DROPOUT_RATE = 0.70          # dropout applied after the dense layer
LEARNING_RATE = 7e-5         # Adam learning rate
BATCH_SIZE = 32              # samples per gradient step
EPOCHS = 5                   # upper bound on training epochs
EARLY_STOPPING_PATIENCE = 2  # epochs without val_loss improvement before stopping

# Load the dataset
# Both CSV paths are relative to the current working directory.
TRAIN_CSV_PATH = "Data/train.csv"
TEST_CSV_PATH = "Data/test.csv"

train_data = pd.read_csv(TRAIN_CSV_PATH)
test_data = pd.read_csv(TEST_CSV_PATH)

# Data Preprocessing
def preprocess_text(df):
    """Add a cleaned ``combined_text`` column to ``df`` and return ``df``.

    ``keyword``, ``location`` and ``text`` have their missing values replaced
    with ``''`` (in place), are joined with single spaces, and the result is
    stripped of ``http...`` links, ``@mentions`` and ``#`` characters.

    Args:
        df: DataFrame providing ``keyword``, ``location`` and ``text``.

    Returns:
        The DataFrame that was passed in, now carrying ``combined_text``.
    """
    for name in ('keyword', 'location', 'text'):
        df[name] = df[name].fillna('')

    joined = df['keyword'] + " " + df['location'] + " " + df['text']
    # Order matters only for readability here: links, mentions, then '#'.
    df['combined_text'] = (
        joined
        .str.replace(r'http\S+', '', regex=True)
        .str.replace(r'@\w+', '', regex=True)
        .str.replace(r'#', '', regex=True)
    )
    return df

train_data = preprocess_text(train_data)
test_data = preprocess_text(test_data)

# Tokenization and Padding
# The tokenizer vocabulary is learned from the training texts only.
tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(train_data['combined_text'])

train_token_ids = tokenizer.texts_to_sequences(train_data['combined_text'])
X = pad_sequences(train_token_ids, maxlen=MAX_LEN)
y = train_data['target']

test_sequences = tokenizer.texts_to_sequences(test_data['combined_text'])
test_padded = pad_sequences(test_sequences, maxlen=MAX_LEN)

# Train/validation split: hold out 20% of the labelled rows with a fixed seed.
holdout_fraction = 0.2
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=holdout_fraction, random_state=42
)

# Custom Callback to Compute F1-Score During Training
class F1ScoreCallback(Callback):
    """Keras callback that tracks the validation F1-score epoch by epoch.

    Args:
        validation_data: ``(X_val, y_val)`` pair. ``X_val`` is passed to
            ``self.model.predict``; ``y_val`` contains the true binary labels.
    """

    def __init__(self, validation_data):
        # Initialise the Keras base class before attaching our own state.
        super().__init__()
        self.validation_data = validation_data
        # Grows by one F1 value at the end of every epoch.
        self.f1_scores = []

    def on_epoch_end(self, epoch, logs=None):
        """Score the validation set and append/print the resulting F1.

        Args:
            epoch: Zero-based epoch index supplied by Keras.
            logs: Metrics dict supplied by Keras (not used here).
        """
        X_val, y_val = self.validation_data
        # verbose=0 keeps predict() from emitting a second progress bar inside
        # fit()'s per-epoch log.
        y_pred_prob = self.model.predict(X_val, verbose=0).flatten()
        # Threshold the sigmoid output at 0.5 to get hard class labels.
        y_pred = (y_pred_prob > 0.5).astype(int)
        f1 = f1_score(y_val, y_pred)
        self.f1_scores.append(f1)
        print(f"Epoch {epoch + 1} F1-Score: {f1:.4f}")

# Build the Refined RNN with LSTM Model
# Layers are listed in forward order and added to the Sequential model one by one.
layer_stack = [
    Embedding(input_dim=MAX_WORDS, output_dim=EMBEDDING_DIM, input_length=MAX_LEN),
    Bidirectional(LSTM(LSTM_UNITS_1, return_sequences=True, dropout=0.3, recurrent_dropout=0.3)),
    Bidirectional(LSTM(LSTM_UNITS_2, dropout=0.3, recurrent_dropout=0.3)),
    Dense(DENSE_UNITS, activation='relu', kernel_regularizer=l2(0.01)),  # L2-regularised dense layer
    Dropout(DROPOUT_RATE),
    Dense(1, activation='sigmoid'),
]
model = Sequential()
for layer in layer_stack:
    model.add(layer)

# Compile the Model
adam_optimizer = Adam(learning_rate=LEARNING_RATE)
model.compile(optimizer=adam_optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Summary of the Model
model.summary()

# Early stopping watches validation loss and restores the best weights;
# the F1 callback scores the same validation split after each epoch.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=EARLY_STOPPING_PATIENCE,
    restore_best_weights=True,
)
f1_callback = F1ScoreCallback(validation_data=(X_valid, y_valid))

# Train the Model
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_valid, y_valid),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=[early_stopping, f1_callback],
    verbose=1,
)

# Evaluate the Model
y_pred_prob = model.predict(X_valid).flatten()
y_pred = (y_pred_prob > 0.5).astype(int)
print("\nClassification Report:")
print(classification_report(y_valid, y_pred))

# Plot Training History (Accuracy, Loss, and F1-Score)
# One figure per entry: (y-axis label, title, [(values, legend label), ...]).
figure_specs = (
    ("Accuracy", "Training and Validation Accuracy", [
        (history.history['accuracy'], 'Train Accuracy'),
        (history.history['val_accuracy'], 'Validation Accuracy'),
    ]),
    ("Loss", "Training and Validation Loss", [
        (history.history['loss'], 'Train Loss'),
        (history.history['val_loss'], 'Validation Loss'),
    ]),
    ("F1-Score", "Validation F1-Score per Epoch", [
        (f1_callback.f1_scores, 'Validation F1-Score'),
    ]),
)
for y_axis_label, figure_title, curves in figure_specs:
    plt.figure(figsize=(12, 4))
    for curve_values, curve_label in curves:
        plt.plot(curve_values, label=curve_label)
    plt.xlabel("Epochs")
    plt.ylabel(y_axis_label)
    plt.legend()
    plt.title(figure_title)
    plt.show()

# In [None]:
# BERT


# pip install transformers datasets scikit-learn torch tqdm matplotlib

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AdamW, get_scheduler
from torch.utils.data import DataLoader, Dataset
import torch
from tqdm import tqdm
import matplotlib.pyplot as plt

# Check if GPU is available
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Hyperparameters
MODEL_NAME = "bert-base-uncased"  # checkpoint name for tokenizer and model
MAX_LEN = 128                     # tokens per example after padding/truncation
BATCH_SIZE = 16                   # examples per batch
LEARNING_RATE = 2e-6              # optimizer learning rate
EPOCHS = 3                        # passes over the training set
EPS = 1e-8                        # epsilon passed to the AdamW optimizer

# Load the dataset
# Both CSV paths are relative to the current working directory.
TRAIN_CSV_PATH = "Data/train.csv"
TEST_CSV_PATH = "Data/test.csv"

train_data = pd.read_csv(TRAIN_CSV_PATH)
test_data = pd.read_csv(TEST_CSV_PATH)

# Preprocessing
def preprocess_text(df):
    """Fill missing text fields and add a ``combined_text`` column in place.

    Unlike a cleaning step, this only blanks out missing values in
    ``keyword``, ``location`` and ``text`` and joins them with single spaces;
    URLs, mentions and hashtags are left untouched.

    Args:
        df: DataFrame with ``keyword``, ``location`` and ``text`` columns.

    Returns:
        The same DataFrame object, with ``combined_text`` added.
    """
    fields = ('keyword', 'location', 'text')
    for field in fields:
        df[field] = df[field].fillna('')

    keyword, location, text = (df[field] for field in fields)
    df['combined_text'] = keyword + " " + location + " " + text
    return df

train_data = preprocess_text(train_data)
test_data = preprocess_text(test_data)

# Split into train and validation sets (20% held out, fixed seed)
all_texts = train_data['combined_text']
all_targets = train_data['target']
X_train, X_valid, y_train, y_valid = train_test_split(
    all_texts,
    all_targets,
    test_size=0.2,
    random_state=42,
)

# Tokenizer initialization (the model itself is loaded further down)
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

# Dataset Class
class TweetDataset(Dataset):
    """Torch dataset that tokenizes one text per item on demand.

    Args:
        texts: Sequence of raw texts (list, pandas Series, ...).
        labels: Optional sequence of integer class labels aligned with
            ``texts``; when omitted, items carry no ``"labels"`` entry.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding="max_length", max_length=..., return_tensors="pt")`` and
            returning a mapping of tensors with a leading batch dimension.
        max_len: Length passed to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels=None, tokenizer=None, max_len=128):
        # Materialise to plain lists so __getitem__ always indexes by
        # position. A pandas Series coming out of train_test_split keeps its
        # shuffled index, so Series[i] would look up a label instead.
        self.texts = list(texts)
        self.labels = None if labels is None else list(labels)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, index):
        """Tokenize the text at position ``index``.

        Returns:
            Dict of the tokenizer's tensors with the batch dimension removed,
            plus a ``"labels"`` long tensor when labels were provided.
        """
        text = str(self.texts[index])
        inputs = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt",
        )
        # The tokenizer returns tensors with a leading batch dimension of 1;
        # drop it so the DataLoader can stack items itself.
        item = {key: val.squeeze(0) for key, val in inputs.items()}
        if self.labels is not None:
            item["labels"] = torch.tensor(self.labels[index], dtype=torch.long)
        return item

# Create Dataset and DataLoader
train_dataset = TweetDataset(X_train.tolist(), y_train.tolist(), tokenizer, max_len=MAX_LEN)
valid_dataset = TweetDataset(X_valid.tolist(), y_valid.tolist(), tokenizer, max_len=MAX_LEN)

# Only the training batches are shuffled.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=BATCH_SIZE)

# Load Pretrained BERT Model
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
model = model.to(device)

# Optimizer and Scheduler
# PyTorch's AdamW is used because the AdamW class shipped with transformers is
# deprecated in favour of torch.optim.AdamW. weight_decay=0.0 is passed
# explicitly to keep the transformers default (torch's own default is 0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, eps=EPS, weight_decay=0.0)
# The scheduler is stepped once per batch, so its horizon is batches * epochs.
num_training_steps = len(train_dataloader) * EPOCHS
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Enhanced Training Loop with Training and Validation Metrics Tracking
def train_model(model, train_dataloader, valid_dataloader, optimizer, lr_scheduler, device, epochs=3):
    """Train ``model`` for ``epochs`` epochs, tracking loss, accuracy and F1.

    Every epoch runs one optimisation pass over ``train_dataloader`` (stepping
    ``optimizer`` and ``lr_scheduler`` once per batch) followed by a
    gradient-free pass over ``valid_dataloader``. Metrics are printed as they
    are computed.

    Args:
        model: Model called as ``model(**batch)``; its output must expose
            ``.loss`` and ``.logits``.
        train_dataloader: Iterable of dict batches that include ``"labels"``.
        valid_dataloader: Same format, used for evaluation only.
        optimizer: Optimizer already bound to the model's parameters.
        lr_scheduler: Scheduler stepped after every optimizer step.
        device: Device each batch tensor is moved to.
        epochs: Number of epochs to run.

    Returns:
        Tuple of six per-epoch lists, in this order: training loss,
        validation loss, training accuracy, validation accuracy, training F1,
        validation F1. Losses are batch means; F1 is the ``'weighted avg'``
        F1 from ``classification_report``.
    """
    tracked = {
        "train_loss": [],
        "valid_loss": [],
        "train_acc": [],
        "valid_acc": [],
        "train_f1": [],
        "valid_f1": [],
    }

    for epoch in range(epochs):
        epoch_number = epoch + 1
        print(f"Epoch {epoch_number}/{epochs}")

        # ---- Training pass ----
        model.train()
        running_train_loss = 0.0
        train_predicted = []
        train_expected = []
        progress = tqdm(train_dataloader, leave=True)
        for batch in progress:
            batch = {name: tensor.to(device) for name, tensor in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss

            # Gradients are cleared right before the backward pass.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()

            batch_loss = loss.item()
            running_train_loss += batch_loss

            batch_predictions = torch.argmax(outputs.logits, dim=1)
            train_predicted.extend(batch_predictions.cpu().numpy())
            train_expected.extend(batch["labels"].cpu().numpy())

            progress.set_description(f"Epoch {epoch_number}")
            progress.set_postfix(loss=batch_loss)

        # Training metrics are accumulated while the weights are still changing.
        mean_train_loss = running_train_loss / len(train_dataloader)
        train_accuracy = np.mean(np.array(train_predicted) == np.array(train_expected))
        train_report = classification_report(train_expected, train_predicted, output_dict=True)
        train_f1 = train_report['weighted avg']['f1-score']
        tracked["train_loss"].append(mean_train_loss)
        tracked["train_acc"].append(train_accuracy)
        tracked["train_f1"].append(train_f1)
        print(f"Training Loss: {mean_train_loss:.4f}")
        print(f"Training Accuracy: {train_accuracy:.4f}")
        print(f"Training F1-Score: {train_f1:.4f}")

        # ---- Validation pass (no gradients) ----
        model.eval()
        running_valid_loss = 0.0
        valid_predicted = []
        valid_expected = []
        with torch.no_grad():
            for batch in valid_dataloader:
                batch = {name: tensor.to(device) for name, tensor in batch.items()}
                outputs = model(**batch)
                running_valid_loss += outputs.loss.item()
                batch_predictions = torch.argmax(outputs.logits, dim=1)
                valid_predicted.extend(batch_predictions.cpu().numpy())
                valid_expected.extend(batch["labels"].cpu().numpy())

        mean_valid_loss = running_valid_loss / len(valid_dataloader)
        valid_accuracy = np.mean(np.array(valid_predicted) == np.array(valid_expected))
        valid_report = classification_report(valid_expected, valid_predicted, output_dict=True)
        valid_f1 = valid_report['weighted avg']['f1-score']
        tracked["valid_loss"].append(mean_valid_loss)
        tracked["valid_acc"].append(valid_accuracy)
        tracked["valid_f1"].append(valid_f1)
        print(f"Validation Loss: {mean_valid_loss:.4f}")
        print(f"Validation Accuracy: {valid_accuracy:.4f}")
        print(f"Validation F1-Score: {valid_f1:.4f}")

    return (
        tracked["train_loss"],
        tracked["valid_loss"],
        tracked["train_acc"],
        tracked["valid_acc"],
        tracked["train_f1"],
        tracked["valid_f1"],
    )

# Train the Model
# train_model returns six per-epoch lists: loss, accuracy and F1, each as a
# (training, validation) pair in the order unpacked below.
metric_histories = train_model(
    model,
    train_dataloader,
    valid_dataloader,
    optimizer,
    lr_scheduler,
    device,
    epochs=EPOCHS,
)
(
    training_loss_history,
    validation_loss_history,
    training_accuracy_history,
    validation_accuracy_history,
    training_f1_history,
    validation_f1_history,
) = metric_histories

# Plot Training and Validation Loss, Accuracy and F1-Score (one figure each)
# Each entry: (metric name, training values, validation values).
metric_figures = [
    ("Loss", training_loss_history, validation_loss_history),
    ("Accuracy", training_accuracy_history, validation_accuracy_history),
    ("F1-Score", training_f1_history, validation_f1_history),
]
for metric_name, train_values, valid_values in metric_figures:
    plt.figure(figsize=(12, 4))
    plt.plot(train_values, label=f'Train {metric_name}')
    plt.plot(valid_values, label=f'Validation {metric_name}')
    plt.xlabel("Epochs")
    plt.ylabel(metric_name)
    plt.legend()
    plt.title(f"Training and Validation {metric_name}")
    plt.show()