In [4]:
# This command installs the 'transformers' library, which is a popular library developed by Hugging Face. 
# It provides pre-trained models and tools for natural language processing (NLP) tasks, including BERT, GPT, and others.
!pip install transformers

# This command installs the 'torch' library, which is also known as PyTorch. 
# PyTorch is a deep learning framework that provides a flexible and efficient platform for building and training neural networks.
!pip install torch





In [8]:
# Import the PyTorch library, which provides tools for building and training neural networks.
import torch

# Import Dataset and DataLoader classes from torch.utils.data to handle data loading and batching.
from torch.utils.data import Dataset, DataLoader

# Import BertTokenizer and BertForSequenceClassification from the transformers library 
# to utilize BERT for tokenization and sequence classification tasks.
from transformers import BertTokenizer, BertForSequenceClassification

# Import numpy for numerical operations and linear algebra.
import numpy as np

# Import pandas for data manipulation and analysis.
import pandas as pd


In [2]:
# Load the training dataset from the specified CSV file using pandas.
data = pd.read_csv('/kaggle/input/lady-end-of-year/data-ladies-end-of-year-hackathon/train_dataset.csv')

# Load the sample submission dataset from the specified CSV file using pandas.
sub = pd.read_csv('/kaggle/input/lady-end-of-year/data-ladies-end-of-year-hackathon/sample_submission_dataset.csv')

# Load the test dataset from the specified CSV file using pandas.
test = pd.read_csv('/kaggle/input/lady-end-of-year/data-ladies-end-of-year-hackathon/test_dataset.csv')


In [3]:
# Display the first few rows of the training dataset to understand its structure and contents.
data.head()

Unnamed: 0,headline,clickbait,ID
0,"Hussein enters ""not guilty"" plea at trial",0,84698cc7-8ae2-4ea3-a425-b7091561cee6
1,Iraq peace talks draw to a close in Finland,0,a4e35ca4-15fa-43e8-b68e-91457b23afee
2,British Premier Visits Northern Ireland,0,162991ee-ea2f-41ad-a753-649a68f54311
3,The Decline of Left-Handed First Basemen,0,2cd3aa32-6ec2-4af1-bd1d-560709066b8b
4,Who Said It: Donald Trump Or Kanye West,1,72553370-c348-4603-882b-39e04b610c39
...,...,...,...
30395,I Was Added To A Group Chat With Eight Strange...,1,5e583898-902d-42fc-b076-9d015cd932b1
30396,20 Tweets That Capture The Internet's Reaction...,1,11e5e1a0-ed9c-4403-a59e-62282d6a33c1
30397,Zach Johnson Wins Sony Open,0,45196512-5f6b-4814-90d6-4fae1dc07a4d
30398,Finding Your Way Through a Maze of Smartphones,0,5c01cff5-6bdd-4dd1-bc3e-27134cc4f2f3


In [12]:
# Display concise summary of the training dataset, including the data types of each column and the number of non-null entries.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30400 entries, 0 to 30399
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   headline   30400 non-null  object
 1   clickbait  30400 non-null  int64 
 2   ID         30400 non-null  object
dtypes: int64(1), object(2)
memory usage: 712.6+ KB


In [13]:
# Generate descriptive statistics for the numerical columns in the training dataset.
data.describe()

<bound method NDFrame.describe of                                                 headline  clickbait  \
0              Hussein enters "not guilty" plea at trial          0   
1            Iraq peace talks draw to a close in Finland          0   
2                British Premier Visits Northern Ireland          0   
3               The Decline of Left-Handed First Basemen          0   
4                Who Said It: Donald Trump Or Kanye West          1   
...                                                  ...        ...   
30395  I Was Added To A Group Chat With Eight Strange...          1   
30396  20 Tweets That Capture The Internet's Reaction...          1   
30397                        Zach Johnson Wins Sony Open          0   
30398     Finding Your Way Through a Maze of Smartphones          0   
30399  Pete Wentz's Emo Dating Profile Will Make You ...          1   

                                         ID  
0      84698cc7-8ae2-4ea3-a425-b7091561cee6  
1      a4e35ca4-15fa-

In [9]:
# Initialize the BERT tokenizer using the 'bert-base-uncased' model, with lowercasing enabled.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Define a function to tokenize the input headlines using the BERT tokenizer.
def tokenize_headlines(headlines):
    return tokenizer(headlines, padding=True, truncation=True, return_tensors='pt')

# Tokenize the headlines in the training data.
tokenized_data = tokenize_headlines(data['headline'].tolist())

# Tokenize the headlines in the test data.
tokenized_test = tokenize_headlines(test['headline'].tolist())


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [10]:
# Initialize BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased',num_labels=2)
# Check model structure
model

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [14]:
# Define a custom dataset for PyTorch to handle the headline data

class HeadlineDataset(Dataset):
    def __init__(self, encodings, labels=None):
        """
        Initialize the dataset with encoded inputs and optional labels.

        Args:
            encodings (dict): The encoded input data.
            labels (list, optional): The corresponding labels for the input data.
        """
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """
        Retrieve an item from the dataset at the specified index.

        Args:
            idx (int): The index of the item to retrieve.

        Returns:
            dict: A dictionary containing the input tensors and optionally the label tensor.
        """
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """
        Return the number of items in the dataset.

        Returns:
            int: The length of the dataset.
        """
        return len(self.encodings['input_ids'])

# Instantiate the dataset for training
train_dataset = HeadlineDataset(tokenized_data, data['clickbait'].tolist())


In [16]:
from transformers import AdamW
from torch.utils.data import DataLoader
import torch

# Initialize a DataLoader for the training dataset
# The DataLoader is set up with a batch size of 16 and shuffling enabled to ensure that data is randomly sampled during training
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# Set up the AdamW optimizer
# The optimizer is configured with a learning rate of 1e-5 and an epsilon value of 3e-10 for numerical stability
optimizer = AdamW(model.parameters(), eps=3e-10, lr=1e-5)

# Determine the device to use for model training
# The device will be set to GPU if available, otherwise it defaults to CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)  # Move the model to the selected device




BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [17]:
# Training the model

# Move the model to the appropriate device (GPU or CPU)
model.to(device)

# Set the model to training mode to enable dropout and batch normalization
model.train()

# Training loop for a specified number of epochs
for epoch in range(50):  # Total number of epochs for training
    total_loss = 0  # Initialize the total loss for this epoch

    # Iterate over the batches in the training data
    for batch in train_loader:
        # Move batch data to the appropriate device
        batch = {k: v.to(device) for k, v in batch.items()}

        # Clear previously calculated gradients
        optimizer.zero_grad()

        # Perform a forward pass through the model
        outputs = model(**batch)

        # Extract the loss value from the model outputs
        loss = outputs.loss
        total_loss += loss.item()  # Accumulate the loss
        loss.backward()  # Perform a backward pass to compute gradients

        # Update the model parameters
        optimizer.step()

    # Print the average loss for the current epoch
    print(f'Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}')


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 1, Loss: 0.055454948335536755


In [19]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Evaluate the model
model.eval()

# Prepare validation set (splitting from training data)
train_texts, val_texts, train_labels, val_labels = train_test_split(data['headline'], data['clickbait'], test_size=0.1)

# Tokenize validation texts
tokenized_val = tokenize_headlines(val_texts.tolist())
val_dataset = HeadlineDataset(tokenized_val, val_labels.tolist())
val_loader = DataLoader(val_dataset, batch_size=4)

# Calculate F1 score on validation set
model.to(device)
total_eval_f1 = 0
for batch in val_loader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    total_eval_f1 += f1_score(batch['labels'].cpu(), predictions.cpu(), average='weighted')

print(f'F1 Score on Validation Set: {total_eval_f1 / len(val_loader)}')

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encod

F1 Score on Validation Set: 0.9944736842105263


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


In [20]:
# Predicting on the test set
model.eval()

# Tokenize test headlines
tokenized_test = tokenize_headlines(test['headline'].tolist())
test_dataset = HeadlineDataset(tokenized_test)
test_loader = DataLoader(test_dataset, batch_size=32)

# Initialize list to store predictions
predictions = []

# Predict
for batch in test_loader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
    logits = outputs.logits
    batch_predictions = torch.argmax(logits, dim=-1)
    predictions.extend(batch_predictions.cpu().numpy())

# Create submission file
submission = pd.DataFrame({'ID': test['ID'], 'TARGET': predictions})
submission.to_csv('sample_prediction_on_new_dataset.csv', index=False)
print('Sample Prediction Successful')

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Submission file created successfully.
