In [6]:
# !pip install torch
# !pip install pandas
!pip install datasets
# !pip install scikit-learn==1.3.0  # Install scikit-learn for metrics calculation


Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [7]:
from transformers import BertTokenizer, BertForSequenceClassification
import pandas as pd
from datasets import Dataset
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding
from transformers import AdamW
from transformers import Trainer, TrainingArguments





In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def compute_metrics(pred):
    """
    Computes and returns a dictionary of metrics (accuracy, precision, recall, F1-score).
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(axis=1)

    accuracy = accuracy_score(labels, preds)
    precision = precision_score(labels, preds)
    recall = recall_score(labels, preds)
    f1 = f1_score(labels, preds)

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the dataset from the specified CSV file
raw_datasets = pd.read_csv("/content/drive/MyDrive/nlp/clickbait_data.csv")


In [None]:
print(raw_datasets)

                                                headline  clickbait
0                                     Should I Get Bings          1
1          Which TV Female Friend Group Do You Belong In          1
2      The New "Star Wars: The Force Awakens" Trailer...          1
3      This Vine Of New York On "Celebrity Big Brothe...          1
4      A Couple Did A Stunning Photo Shoot With Their...          1
...                                                  ...        ...
31995  To Make Female Hearts Flutter in Iraq, Throw a...          0
31996  British Liberal Democrat Patsy Calton, 56, die...          0
31997  Drone smartphone app to help heart attack vict...          0
31998  Netanyahu Urges Pope Benedict, in Israel, to D...          0
31999  Computer Makers Prepare to Stake Bigger Claim ...          0

[32000 rows x 2 columns]


In [None]:
df = pd.DataFrame(raw_datasets, columns=["headline", "clickbait"])



In [None]:
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Display the first few rows of the shuffled DataFrame
print(df)

                                                headline  clickbait
0      Filipino activist arrested for disrupting Mani...          0
1      International Board fixes soccer field size, h...          0
2          24 Rules For Women On A First Date With A Man          1
3      Political fallout from the sacking of Professo...          0
4      Which "Clueless" Character Are You Based On Yo...          1
...                                                  ...        ...
31995  Rocket strike near hotel in Afghan capital inj...          0
31996  How Well Do You Remember The First Episode Of ...          1
31997  16 Photos From The Delhi Queer Pride Parade Th...          1
31998             33 Of The Most Canadian Sentences Ever          1
31999  Man killed after shop robbery in West Yorkshir...          0

[32000 rows x 2 columns]


In [None]:
# Step 1: Clean text (lowercase, remove special characters, normalize spaces)
df['headline'] = df['headline'].str.lower()  # Convert to lowercase
df['headline'] = df['headline'].str.replace(r'[^a-z0-9\s]', '', regex=True)  # Remove special characters
df['headline'] = df['headline'].str.replace(r'\s+', ' ', regex=True)  # Normalize multiple spaces

# Step 2: Split into DatasetDict format (to be used with Hugging Face's `datasets` library)
# Convert the cleaned dataframe into a Dataset object for easy tokenization with Hugging Face


In [None]:
df

Unnamed: 0,headline,clickbait
0,filipino activist arrested for disrupting mani...,0
1,international board fixes soccer field size ha...,0
2,24 rules for women on a first date with a man,1
3,political fallout from the sacking of professo...,0
4,which clueless character are you based on your...,1
...,...,...
31995,rocket strike near hotel in afghan capital inj...,0
31996,how well do you remember the first episode of ...,1
31997,16 photos from the delhi queer pride parade th...,1
31998,33 of the most canadian sentences ever,1


In [None]:
pre_processed_dataset = Dataset.from_pandas(df)
pre_processed_dataset

Dataset({
    features: ['headline', 'clickbait'],
    num_rows: 32000
})

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') #Downloads the base version of BERT trained on lowercase English text (e.g., "hello" and "Hello" are treated the same).
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2) #Configures the model to handle a binary classification problem  "clickbait" vs. "non-clickbait".


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def tokenize_function(batch):
    # Tokenize the 'headline' column
    return tokenizer(batch['headline'], truncation=True, padding=True, max_length=512)

In [None]:
tokenized_datasets = pre_processed_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/32000 [00:00<?, ? examples/s]

In [None]:
print(tokenized_datasets)

Dataset({
    features: ['headline', 'clickbait', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 32000
})


In [None]:
tokenized_datasets = tokenized_datasets.rename_column('clickbait', 'labels')

In [None]:
split_datasets = tokenized_datasets.train_test_split(test_size=0.2)

# Further split the train data into train and validation (80% train, 20% validation)
train_val_split = split_datasets['train'].train_test_split(test_size=0.2)

# Access the splits
train_dataset = train_val_split['train']
validation_dataset = train_val_split['test']
test_dataset = split_datasets['test']

In [None]:
split_datasets

DatasetDict({
    train: Dataset({
        features: ['headline', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25600
    })
    test: Dataset({
        features: ['headline', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 6400
    })
})

In [None]:
train_val_split

DatasetDict({
    train: Dataset({
        features: ['headline', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 20480
    })
    test: Dataset({
        features: ['headline', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5120
    })
})

In [None]:
test_dataset

Dataset({
    features: ['headline', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 6400
})

In [None]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
train_dataloader = DataLoader(
    train_dataset, batch_size=16, shuffle=True, collate_fn=data_collator
)

In [None]:
#AdamW: A type of optimizer that updates the model’s weights during training to minimize the loss.
#lr=5e-5: Sets the learning rate to 0.00005, controlling how much the model adjusts weights during training.

optimizer = AdamW(model.parameters(), lr=5e-5)



In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def compute_metrics(pred):
    """
    Computes and returns a dictionary of metrics (accuracy, precision, recall, F1-score).
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(axis=1)

    accuracy = accuracy_score(labels, preds)
    precision = precision_score(labels, preds)
    recall = recall_score(labels, preds)
    f1 = f1_score(labels, preds)

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

In [None]:
# output_dir: Saves the trained model and logs to a directory named "results."
# evaluation_strategy="epoch": Evaluates the model after every epoch (one pass through the dataset).
# learning_rate: Sets the learning rate to 0.00002.
# num_train_epochs=3: Specifies 3 training iterations through the dataset.
# weight_decay=0.01: Prevents overfitting by slightly penalizing large model weights.

training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/nlp/results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)




In [None]:
# Creates a Trainer object that automates:
# Training: Feeds the training dataset into the model.
# Evaluation: Tests the model's performance on the validation dataset.

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics

)


  trainer = Trainer(


In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0627,0.051676,0.986133,0.989864,0.982585,0.986211
2,0.0181,0.069427,0.986328,0.981979,0.991099,0.986518
3,0.0056,0.069584,0.989258,0.990306,0.98839,0.989347


TrainOutput(global_step=3840, training_loss=0.03588072238489985, metrics={'train_runtime': 595.5357, 'train_samples_per_second': 103.168, 'train_steps_per_second': 6.448, 'total_flos': 1035958669377600.0, 'train_loss': 0.03588072238489985, 'epoch': 3.0})

In [None]:
model.save_pretrained("/content/drive/MyDrive/nlp/fine_tuned_bert")
tokenizer.save_pretrained("/content/drive/MyDrive/nlp/fine_tuned_bert")

('/content/drive/MyDrive/nlp/fine_tuned_bert/tokenizer_config.json',
 '/content/drive/MyDrive/nlp/fine_tuned_bert/special_tokens_map.json',
 '/content/drive/MyDrive/nlp/fine_tuned_bert/vocab.txt',
 '/content/drive/MyDrive/nlp/fine_tuned_bert/added_tokens.json')

In [None]:
trainer.save_model("/content/drive/MyDrive/nlp/api_saved_bert")

In [None]:
results = trainer.evaluate(eval_dataset=test_dataset)
print(results)

{'eval_loss': 0.06335476785898209, 'eval_accuracy': 0.99046875, 'eval_precision': 0.9921826141338337, 'eval_recall': 0.9887815518853226, 'eval_f1': 0.9904791634150147, 'eval_runtime': 14.213, 'eval_samples_per_second': 450.292, 'eval_steps_per_second': 28.143, 'epoch': 3.0}


In [None]:
import torch

# Assuming you want to use the GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Move your model to the device
model.to(device)

# Move your input tensors to the device
text = "How to get 6 pack abs in 5 days?"
inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(device)

# Now, run the inference
outputs = model(**inputs)
predicted_class = outputs.logits.argmax(dim=1).item()

# Print the prediction
print("Predicted class:", "Clickbait" if predicted_class == 1 else "Non-Clickbait")

Predicted class: Clickbait


##Loading trained model and testing

In [8]:
from transformers import BertTokenizer, BertForSequenceClassification
import pandas as pd
from datasets import Dataset
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding
from transformers import AdamW
from transformers import Trainer, TrainingArguments
import torch

In [10]:
from transformers import BertTokenizer, BertForSequenceClassification

# Load the saved model and tokenizer
model = BertForSequenceClassification.from_pretrained("/content/drive/MyDrive/nlp/fine_tuned_bert")
tokenizer = BertTokenizer.from_pretrained("/content/drive/MyDrive/nlp/fine_tuned_bert")


OSError: Error no file named pytorch_model.bin, model.safetensors, tf_model.h5, model.ckpt.index or flax_model.msgpack found in directory /content/drive/MyDrive/nlp/fine_tuned_bert.

In [None]:

text = "Is this a clickbait headline?"
inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
outputs = model(**inputs)
predicted_class = outputs.logits.argmax(dim=1).item()

# Print the prediction
print("Predicted class:", "Clickbait" if predicted_class == 1 else "Non-Clickbait")
