### ========== 1. Import Libraries ==========

In [1]:
import numpy as np
import pandas as pd
import time
from tqdm import tqdm
tqdm.pandas()  # Enable progress bars for pandas operations
import re
import unicodedata
import torch
from sklearn.metrics import accuracy_score
from torch.optim import Adam
from transformers import AutoTokenizer, BertModel

In [2]:
# Set device for PyTorch: prefer an NVIDIA GPU (CUDA), then Apple-silicon GPU (MPS), else CPU.
# Previously only MPS was probed, so a CUDA machine silently fell back to the CPU.

if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using device: {device}")

Using device: mps


### ========== 2. Load and Explore Dataset ==========


In [3]:
# Read both CSV files, keeping only the columns used in this notebook
# (the test file has no "target" column requested).
train_df, test_df = (
    pd.read_csv(csv_path, usecols=wanted_columns)
    for csv_path, wanted_columns in (
        ("train.csv", ["id", "text", "target"]),
        ("test.csv", ["id", "text"]),
    )
)

print("Training data preview:")
train_df.head()

Training data preview:


Unnamed: 0,id,text,target
0,1,Our Deeds are the Reason of this #earthquake M...,1
1,4,Forest fire near La Ronge Sask. Canada,1
2,5,All residents asked to 'shelter in place' are ...,1
3,6,"13,000 people receive #wildfires evacuation or...",1
4,7,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
train_df.shape

(7613, 3)

### ========== 3. Preprocess Text Data ==========


In [5]:
# Function to clean and preprocess text
def preprocess_text(text):
    """Normalise one raw text value into lowercase ASCII without punctuation.

    Steps, in order:
      1. lowercase;
      2. drop every whitespace-delimited token containing ``@`` (e-mail
         addresses, and therefore @mentions as well);
      3. drop ``<...>`` tag-like spans;
      4. drop every character that is neither a word character nor whitespace;
      5. decompose accented characters (NFKD) and discard anything that has
         no ASCII representation.

    Args:
        text: Value to clean. ``None`` and float NaN (how pandas represents a
            missing cell) yield ``""``; any other non-string value is
            converted with ``str()`` before cleaning.

    Returns:
        str: The cleaned text. Whitespace is not collapsed, so a removed token
        can leave two consecutive spaces behind.
    """
    # A missing cell used to crash on .lower() with AttributeError.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()  # Lowercase the text
    text = re.sub(r'\S*@\S*', '', text)  # Remove emails (any token containing "@")
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'[^\w\s]', '', text)  # Remove special characters
    # NFKD splits e.g. "é" into "e" + combining accent; the ASCII round-trip drops the accent.
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')
    return text

# Apply preprocessing to a specified column in a DataFrame
def text_preprocessing(df, col_name):
    """Return a copy of ``df`` in which ``col_name`` has been cleaned.

    Every value of the column is passed through ``preprocess_text``. The input
    frame is left untouched, so re-running a cell that displayed the raw data
    still shows the raw data.

    Args:
        df: DataFrame containing the column to clean.
        col_name: Name of the text column.

    Returns:
        A new DataFrame with the same columns as ``df`` and the cleaned column
        in place of the original one.

    Raises:
        KeyError: If ``col_name`` is not a column of ``df``.
    """
    cleaned_df = df.copy()
    column = cleaned_df[col_name]
    # progress_apply only exists once tqdm.pandas() has been called; use the
    # plain apply (no progress bar) when it has not.
    apply_with_progress = getattr(column, "progress_apply", column.apply)
    cleaned_df[col_name] = apply_with_progress(preprocess_text)
    return cleaned_df

# Clean the training text column, then print the first rows of the result
train_df = text_preprocessing(df=train_df, col_name="text")
print("Data after preprocessing:", train_df.head(), sep="\n")

100%|██████████| 7613/7613 [00:00<00:00, 85248.40it/s]

Data after preprocessing:
   id                                               text  target
0   1  our deeds are the reason of this earthquake ma...       1
1   4              forest fire near la ronge sask canada       1
2   5  all residents asked to shelter in place are be...       1
3   6  13000 people receive wildfires evacuation orde...       1
4   7  just got sent this photo from ruby alaska as s...       1





### ========== 4. Tokenization and Input Preparation ==========

In [6]:
# Tokenizer for the "bert-large-uncased" checkpoint
tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased")

# Cap on the number of tokens per example; longer inputs are truncated below
max_len = 36
print(f"Maximum token length: {max_len}")

# Encode the training text into PyTorch tensors (ids + attention mask)
x_train = tokenizer(
    train_df["text"].tolist(),
    max_length=max_len,
    truncation=True,
    padding=True,
    add_special_tokens=True,  # wrap each sequence in [CLS] ... [SEP]
    return_attention_mask=True,
    return_tensors="pt",
)

# Training labels as a NumPy array
y_train = train_df["target"].values



Maximum token length: 36


### ========== 5. Define the BERT Classifier ==========

In [7]:
# Import necessary libraries
import torch.nn as nn
from transformers import BertModel

# Custom classifier: pre-trained BERT encoder followed by a small feed-forward head
class BertClassifier(nn.Module):
    """Binary classifier on top of the ``bert-large-uncased`` encoder.

    The encoder's pooled output is passed through dropout and three linear
    layers (hidden_size -> 128 -> 32 -> 1) with ReLU after the first two; a
    final sigmoid squashes the single score into a probability.
    """

    def __init__(self, max_len=36):
        """Create the encoder, the classification head and the activations.

        Args:
            max_len: Accepted so existing callers keep working; it is not read
                anywhere inside this class.
        """
        super().__init__()

        # Pre-trained encoder loaded through Hugging Face
        self.bert = BertModel.from_pretrained("bert-large-uncased")

        # A single dropout module (p=0.1), applied at two points in forward()
        self.dropout = nn.Dropout(0.1)

        # Classification head: encoder width -> 128 -> 32 -> 1
        encoder_width = self.bert.config.hidden_size
        self.fc1 = nn.Linear(encoder_width, 128)
        self.fc2 = nn.Linear(128, 32)
        self.fc3 = nn.Linear(32, 1)

        # Activations: ReLU between the linear layers, sigmoid on the output
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """Compute the positive-class probability for each input sequence.

        Args:
            input_ids: Token ids, passed to the encoder as its first argument.
            attention_mask: Mask passed to the encoder as ``attention_mask``.

        Returns:
            Tensor of sigmoid outputs whose last dimension has size 1.
        """
        encoded = self.bert(input_ids, attention_mask=attention_mask)

        # Pooled encoder output -> dropout -> fc1 + ReLU -> dropout
        hidden = self.dropout(encoded.pooler_output)
        hidden = self.dropout(self.relu(self.fc1(hidden)))

        # fc2 + ReLU, then the single-unit output layer
        hidden = self.relu(self.fc2(hidden))
        score = self.fc3(hidden)

        # Sigmoid turns the raw score into a probability
        return self.sigmoid(score)
    
# Build the classifier (max_len is forwarded to the constructor) and move it
# to the selected device in a single step
model = BertClassifier(max_len=max_len).to(device)

# Confirm where the model's parameters ended up
print(f"Model initialized on device: {next(model.parameters()).device}")


Model initialized on device: mps:0


### ========== 6. Training Setup ==========

In [8]:
# Loss function and optimizer used for fine-tuning
# (BCELoss takes probabilities, which the model's final sigmoid provides)
criterion = nn.BCELoss().to(device)
optimizer = Adam(params=model.parameters(), lr=2e-5)

def calculate_accuracy(predictions, labels):
    """Accuracy of probability predictions after thresholding at 0.5.

    Args:
        predictions: Tensor of predicted probabilities.
        labels: Tensor of ground-truth labels.

    Returns:
        The value returned by ``accuracy_score`` for the thresholded
        predictions against ``labels``.
    """
    # Strictly greater than 0.5 -> 1.0, otherwise 0.0
    is_positive = predictions > 0.5
    hard_predictions = is_positive.float()
    # Both tensors are moved to the CPU before being handed to accuracy_score
    return accuracy_score(labels.cpu(), hard_predictions.cpu())

# Create a custom dataset for training
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    """Map-style dataset of ``(input_ids, attention_mask, label)`` triples.

    The dataset does not move anything to a compute device: whoever consumes
    the batches is expected to do that. This keeps the class independent of
    any notebook-level ``device`` variable and keeps labels on the same side
    as the inputs they belong to.
    """

    def __init__(self, input_ids, attention_mask, labels):
        """Store the inputs and convert the labels to a float tensor.

        Args:
            input_ids: Indexable, sized collection of token-id rows.
            attention_mask: Indexable, sized collection of mask rows, aligned
                with ``input_ids``.
            labels: Array-like of numeric labels (list, NumPy array or
                tensor), one per row; converted to ``torch.float``.

        Raises:
            ValueError: If the three inputs do not have the same length.
        """
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        # as_tensor handles lists, arrays and tensors without the copy warning
        # that torch.tensor() emits when it is given a tensor.
        self.labels = torch.as_tensor(labels, dtype=torch.float)

        if not (len(self.input_ids) == len(self.attention_mask) == len(self.labels)):
            raise ValueError(
                "input_ids, attention_mask and labels must have the same length, got "
                f"{len(self.input_ids)}, {len(self.attention_mask)} and {len(self.labels)}"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.input_ids)

    def __getitem__(self, index):
        """Return the ``(input_ids, attention_mask, label)`` triple at ``index``."""
        return (self.input_ids[index], self.attention_mask[index], self.labels[index])

# Wrap the encoded training tensors and labels, then serve them in shuffled batches of 10
train_dataset = CustomDataset(
    input_ids=x_train["input_ids"],
    attention_mask=x_train["attention_mask"],
    labels=y_train,
)
train_dataloader = DataLoader(dataset=train_dataset, batch_size=10, shuffle=True)

### ========== 7. Train the Model ==========

In [9]:
# Set number of epochs.
# NOTE: this was 0, which skipped the loop below entirely, so the classifier
# head stayed at its random initialisation and the later predictions came from
# an untrained model.
epochs = 3

# Loop over each epoch
for epoch in range(epochs):
    model.train()  # Set model to training mode
    total_loss, correct_predictions, total_samples = 0, 0, 0  # Initialize metrics

    # Loop over batches in the training dataloader
    for input_ids, attention_mask, labels in train_dataloader:
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass: Get model outputs
        outputs = model(input_ids, attention_mask)

        # One probability per sample. reshape(-1) keeps a one-sample batch 1-D;
        # a bare squeeze() would collapse it to 0-D, which no longer matches
        # the shape of `labels`.
        probabilities = outputs.reshape(-1)

        # Compute loss
        loss = criterion(probabilities, labels)

        # Backward pass: Calculate gradients
        loss.backward()

        # Update model parameters
        optimizer.step()

        # Track loss and accuracy
        total_loss += loss.item()
        correct_predictions += (probabilities > 0.5).float().eq(labels).sum().item()
        total_samples += labels.size(0)

    # Print loss (mean of the per-batch losses) and accuracy after each epoch
    print(f"Epoch {epoch+1}/{epochs} - Loss: {total_loss/len(train_dataloader):.4f}, Accuracy: {correct_predictions/total_samples:.2%}")

### ========== 8. Test Data Preparation and Prediction ==========


In [10]:
# Clean the test text with the same routine applied to the training text
test_df = text_preprocessing(df=test_df, col_name="text")

# Encode the test text with the same tokenizer settings as the training text
x_test = tokenizer(
    test_df["text"].tolist(),
    max_length=max_len,
    truncation=True,
    padding=True,
    add_special_tokens=True,
    return_attention_mask=True,
    return_tensors="pt",
)

# Dataset wrapper for the unlabeled test inputs
class TestDataset(Dataset):
    """Map-style dataset yielding ``(input_ids, attention_mask)`` pairs."""

    def __init__(self, input_ids, attention_mask):
        """Keep references to the two index-aligned inputs."""
        self.input_ids, self.attention_mask = input_ids, attention_mask

    def __len__(self):
        """Number of samples, taken from ``input_ids``."""
        sample_count = len(self.input_ids)
        return sample_count

    def __getitem__(self, index):
        """Return the ``(input_ids, attention_mask)`` tuple stored at ``index``."""
        ids_row = self.input_ids[index]
        mask_row = self.attention_mask[index]
        return ids_row, mask_row

# Wrap the encoded test tensors and serve them in batches of 32;
# shuffling is off so predictions come out in the same row order as test_df
test_dataset = TestDataset(
    input_ids=x_test["input_ids"],
    attention_mask=x_test["attention_mask"],
)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)


100%|██████████| 3263/3263 [00:00<00:00, 85854.71it/s]


In [None]:
# Predict using the trained model

# Set the model to evaluation mode (disables dropout)
model.eval()

# Initialize a list to store one probability per test row
predictions = []

# No need to calculate gradients for inference (to save memory and computation)
with torch.no_grad():
    # Iterate over batches in the test dataloader
    for input_ids, attention_mask in test_dataloader:
        input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)  # Move data to device (GPU/CPU)

        # Get model predictions (outputs)
        outputs = model(input_ids, attention_mask)

        # Flatten to 1-D before extending. reshape(-1) stays iterable for a
        # one-sample batch, whereas a bare squeeze() would produce a 0-D array
        # that extend() cannot iterate over.
        predictions.extend(outputs.reshape(-1).cpu().numpy())

# Convert the model's continuous predictions to binary labels (0 or 1)
y_predicted = np.where(np.array(predictions) > 0.5, 1, 0)

### ========== 9. Save Submission ==========


In [None]:
# Pair each test id with its predicted label and write them out without the index column
submission_columns = {"id": test_df["id"], "target": y_predicted}
submission = pd.DataFrame(submission_columns)
submission.to_csv("submission.csv", index=False)
print("Submission saved successfully!")