<a href="https://colab.research.google.com/github/Arunavaonly/Image-Based-Clickbait-Detection-Model-Pipeline/blob/main/thumbnail_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import os
import requests
from PIL import Image
import csv

# Load the labelled sheet; only the thumbnail URL and the human label are used below.
dataset = pd.read_excel("BaitBuster-Bangla.xlsx", index_col= False)

# Keep the two relevant columns, drop rows missing either value, then shuffle.
# The shuffle is seeded because the row index becomes the image file name further
# down: without a fixed seed every run would map different thumbnails to the same
# file names, making the downloaded folders (and any split taken from them)
# impossible to reproduce.
final_dataset = dataset[['thumbnail','human_labeled']]
final_dataset = final_dataset.dropna()
final_dataset = final_dataset.sample(frac=1, random_state=42).reset_index(drop=True)

# Quick sanity check: class balance plus the first and last few rows.
print(final_dataset.value_counts('human_labeled'))
print(final_dataset.head())
print(final_dataset.tail())


# Make sure a training folder exists for each of the two classes
# (exist_ok lets the cell be re-run without raising).
for class_folder in ("clickbait", "non-clickbait"):
    os.makedirs(f"data_dir/train/{class_folder}", exist_ok=True)

def download_image(url, save_path, timeout=10):
    """Download ``url`` and write the response body to ``save_path``.

    This is a best-effort helper: any failure is reported on stdout and
    signalled through the return value instead of being raised, so one bad
    thumbnail does not abort a long download loop.

    Args:
        url: Address of the image to fetch.
        save_path: File path the bytes are written to (opened in ``"wb"`` mode).
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot hang the notebook forever.

    Returns:
        ``True`` if the server answered 200 and the file was written,
        ``False`` otherwise (non-200 status or any exception).
    """
    try:
        # The whole body is read through ``.content``, so ``stream=True`` would
        # bring nothing; the context manager releases the connection.
        with requests.get(url, timeout=timeout) as response:
            if response.status_code != 200:
                # Previously these were dropped silently, which hid how many
                # thumbnails never made it to disk.
                print(f"Skipping {url}: HTTP {response.status_code}")
                return False
            payload = response.content

        with open(save_path, "wb") as f:
            f.write(payload)
        return True
    except Exception as e:
        # Deliberately broad: keep the surrounding loop going whatever went wrong.
        print(f"Error downloading {url}: {e}")
        return False

# Fetch every thumbnail into the training folder that matches its label.
# Rows labelled 'Clickbait' go to clickbait/; every other label goes to
# non-clickbait/. The DataFrame index is used as the file name.
for row_id, thumbnail_url, label in zip(
    final_dataset.index, final_dataset["thumbnail"], final_dataset["human_labeled"]
):
    class_folder = "clickbait" if label == 'Clickbait' else "non-clickbait"
    save_path = f"data_dir/train/{class_folder}/{row_id}.jpg"
    download_image(thumbnail_url, save_path)



human_labeled
Clickbait        5644
Not Clickbait    4356
Name: count, dtype: int64
                                        thumbnail  human_labeled
0  https://i.ytimg.com/vi/g1mJZh0R2IU/default.jpg  Not Clickbait
1  https://i.ytimg.com/vi/ZtzaXJ_kUVw/default.jpg      Clickbait
2  https://i.ytimg.com/vi/Fw6gnlE6F3w/default.jpg  Not Clickbait
3  https://i.ytimg.com/vi/mZ-DrisVSM4/default.jpg      Clickbait
4  https://i.ytimg.com/vi/vZbFpMxZ_gQ/default.jpg      Clickbait
                                           thumbnail  human_labeled
9995  https://i.ytimg.com/vi/cJOvWTr7fcY/default.jpg      Clickbait
9996  https://i.ytimg.com/vi/5IQDNB_P4k0/default.jpg      Clickbait
9997  https://i.ytimg.com/vi/AN4RkfchXoM/default.jpg      Clickbait
9998  https://i.ytimg.com/vi/hGGaREktoOU/default.jpg  Not Clickbait
9999  https://i.ytimg.com/vi/VEsrmownUYo/default.jpg  Not Clickbait


In [None]:
# Count the files present in each training class folder.
lst, lst2 = (
    os.listdir(folder)
    for folder in ("data_dir/train/clickbait", "data_dir/train/non-clickbait")
)
print(len(lst), len(lst2), sep="\n")

3963
4160


In [None]:
# Create the validation folders, one per class (safe to re-run).
for class_folder in ("clickbait", "non-clickbait"):
    os.makedirs(f"data_dir/val/{class_folder}", exist_ok=True)

In [None]:
import os
import shutil

# Define paths
clickbait_dir = "data_dir/train/clickbait"
non_clickbait_dir = "data_dir/train/non-clickbait"
val_clickbait_dir = "data_dir/val/clickbait"
val_non_clickbait_dir = "data_dir/val/non-clickbait"

# Number of images each validation class folder should end up holding
VAL_IMAGES_PER_CLASS = 800


def _top_up_validation(src_dir, dst_dir, target_count):
    """Move files from ``src_dir`` to ``dst_dir`` until ``dst_dir`` holds ``target_count``.

    Two properties the plain "move the first N of os.listdir()" approach lacks:

    * Reproducible: ``os.listdir`` returns entries in arbitrary order, so the
      candidates are sorted by name before slicing.
    * Idempotent: files already in ``dst_dir`` count towards the target, so
      re-running the cell does not drain more images out of the training set.

    Args:
        src_dir: Folder to take files from.
        dst_dir: Folder to move files into (created if missing).
        target_count: Desired number of files in ``dst_dir``.

    Returns:
        The number of files moved by this call.
    """
    os.makedirs(dst_dir, exist_ok=True)
    missing = max(target_count - len(os.listdir(dst_dir)), 0)
    to_move = sorted(os.listdir(src_dir))[:missing]
    for file in to_move:
        shutil.move(os.path.join(src_dir, file), os.path.join(dst_dir, file))
    return len(to_move)


# Move files for validation
moved_clickbait = _top_up_validation(clickbait_dir, val_clickbait_dir, VAL_IMAGES_PER_CLASS)
moved_non_clickbait = _top_up_validation(non_clickbait_dir, val_non_clickbait_dir, VAL_IMAGES_PER_CLASS)

# Report what actually happened instead of a fixed "800"
print(
    f"Moved {moved_clickbait} clickbait and {moved_non_clickbait} "
    "non-clickbait images to validation folders."
)


Successfully moved 800 images to validation folders.


In [None]:
# Count the files present in each validation class folder.
val_list1, val_list2 = (
    os.listdir(folder)
    for folder in ("data_dir/val/clickbait", "data_dir/val/non-clickbait")
)
print(len(val_list1), len(val_list2), sep="\n")

800
800


In [None]:
# Root folder holding the train/ and val/ image directories.
# Resolved from the current working directory so it always points at the same
# relative "data_dir" the earlier cells wrote to, rather than assuming the
# notebook runs from /content.
data = os.path.abspath("data_dir")

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms, models
import time
import os

def train_model(model, dataloaders, criterion, optimizer, num_epochs=10, device="cuda"):
    """Train ``model`` and return it loaded with its best validation-epoch weights.

    Every epoch runs a training pass followed by a validation pass. The weights
    from the epoch with the highest validation accuracy are snapshotted and
    restored into ``model`` before returning.

    Args:
        model: ``torch.nn.Module`` to train. It is not moved here; only the
            batches are sent to ``device``.
        dataloaders: Mapping with ``"train"`` and ``"val"`` keys, each an
            iterable yielding ``(inputs, labels)`` batches.
        criterion: Callable ``criterion(outputs, labels)`` returning a scalar loss.
        optimizer: Optimizer already bound to ``model``'s parameters.
        num_epochs: Number of train+validation epochs to run.
        device: Device the batches are moved to.

    Returns:
        The same ``model`` object, holding the best weights seen on validation.
    """
    import copy  # local import keeps this definition self-contained

    since = time.time()

    # state_dict() hands back references to the live parameter tensors, which
    # the optimizer keeps updating in place. A real snapshot needs a deep copy,
    # otherwise "best" silently becomes "last".
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print(f"Epoch {epoch+1}/{num_epochs}")
        print("-" * 10)

        # Each epoch has a training and validation phase
        for phase in ["train", "val"]:
            if phase == "train":
                model.train()  # enable dropout / batch-norm updates
            else:
                model.eval()   # freeze dropout / batch-norm statistics

            running_loss = 0.0
            running_corrects = 0
            total_samples = 0

            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                optimizer.zero_grad()

                # Track gradients only while training
                with torch.set_grad_enabled(phase == "train"):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)  # predicted class per sample
                    loss = criterion(outputs, labels)

                    if phase == "train":
                        loss.backward()
                        optimizer.step()

                # Accumulate as plain Python numbers. The loss is weighted by
                # batch size; this assumes the criterion averages over the batch.
                batch_size = inputs.size(0)
                running_loss += loss.item() * batch_size
                running_corrects += (preds == labels).sum().item()
                total_samples += batch_size

            epoch_loss = running_loss / total_samples
            epoch_acc = running_corrects / total_samples

            print(f"{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}")

            # Snapshot (deep copy) the weights whenever validation improves
            if phase == "val" and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

    # Restore the best validation weights before handing the model back
    model.load_state_dict(best_model_wts)
    time_elapsed = time.time() - since
    print(f"Training complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s")
    print(f"Best val Accuracy: {best_acc:.4f}")

    return model


In [None]:
# Per-channel normalisation constants shared by both pipelines.
# NOTE(review): these look like the usual ImageNet statistics used with
# torchvision's pretrained models — confirm.
NORM_MEAN = [0.485, 0.456, 0.406]
NORM_STD = [0.229, 0.224, 0.225]


def _build_transform(augment):
    """Return the preprocessing pipeline; ``augment`` adds a random horizontal flip."""
    steps = [transforms.Resize((224, 224))]
    if augment:
        steps.append(transforms.RandomHorizontalFlip())
    steps.append(transforms.ToTensor())
    steps.append(transforms.Normalize(NORM_MEAN, NORM_STD))
    return transforms.Compose(steps)


# Training images get the flip augmentation; validation images do not.
data_transforms = {
    "train": _build_transform(augment=True),
    "val": _build_transform(augment=False),
}

# Build one ImageFolder dataset and one DataLoader per split.
# ImageFolder is pointed at <data_dir>/<split>, i.e. the class sub-folders.
data_dir = data
image_datasets = {}
dataloaders = {}
for split in ("train", "val"):
    image_datasets[split] = datasets.ImageFolder(
        os.path.join(data_dir, split), data_transforms[split]
    )
for split in ("train", "val"):
    dataloaders[split] = DataLoader(
        image_datasets[split], batch_size=16, shuffle=True, num_workers=4
    )

# Use the GPU when one is visible, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Start from the pretrained ResNet-50 backbone
model = models.resnet50(pretrained=True)

# Swap the classification head for a 2-way layer (clickbait / non-clickbait)
num_ftrs = model.fc.in_features
model.fc = nn.Linear(in_features=num_ftrs, out_features=2)
model = model.to(device)

# Cross-entropy loss with Adam over every parameter (the whole network is fine-tuned)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

# Run the training loop; the returned model replaces the local one
model = train_model(
    model=model,
    dataloaders=dataloaders,
    criterion=criterion,
    optimizer=optimizer,
    num_epochs=10,
    device=device,
)


Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 170MB/s]


Epoch 1/10
----------
train Loss: 0.1443 Acc: 0.9601
val Loss: 0.2549 Acc: 0.8831
Epoch 2/10
----------
train Loss: 0.1088 Acc: 0.9689
val Loss: 0.0950 Acc: 0.9688
Epoch 3/10
----------
train Loss: 0.0927 Acc: 0.9733
val Loss: 0.0983 Acc: 0.9706
Epoch 4/10
----------
train Loss: 0.0967 Acc: 0.9700
val Loss: 0.0905 Acc: 0.9713
Epoch 5/10
----------
train Loss: 0.0882 Acc: 0.9738
val Loss: 0.0858 Acc: 0.9744
Epoch 6/10
----------
train Loss: 0.0819 Acc: 0.9749
val Loss: 0.1282 Acc: 0.9656
Epoch 7/10
----------
train Loss: 0.0791 Acc: 0.9759
val Loss: 0.0868 Acc: 0.9756
Epoch 8/10
----------
train Loss: 0.0836 Acc: 0.9742
val Loss: 0.0909 Acc: 0.9719
Epoch 9/10
----------
train Loss: 0.0784 Acc: 0.9769
val Loss: 0.0916 Acc: 0.9700
Epoch 10/10
----------
train Loss: 0.0709 Acc: 0.9776
val Loss: 0.0987 Acc: 0.9750
Training complete in 12m 43s
Best val Accuracy: 0.9756


In [None]:
# Persist only the parameters (state_dict), not the pickled module object.
checkpoint_path = "resnet50_clickbait_classifier.pth"
torch.save(model.state_dict(), checkpoint_path)
print("Model training complete and saved!")


Model training complete and saved!


In [None]:
from PIL import Image
import torch

# Class names in the index order ImageFolder assigned to the training set
class_names = image_datasets["train"].classes

# Load the saved weights back into the model.
# - map_location=device: a checkpoint written from a CUDA model would otherwise
#   fail to load on a CPU-only runtime.
# - weights_only=True: the file holds nothing but tensors, so restrict
#   unpickling to tensor data instead of allowing arbitrary pickled objects.
model.load_state_dict(
    torch.load(
        "resnet50_clickbait_classifier.pth",
        map_location=device,
        weights_only=True,
    )
)
model.eval()

# Transform the input image and predict
def predict_image(image_path):
    """Classify one image file and return the predicted class name.

    Uses the notebook-level ``model``, ``device``, ``data_transforms["val"]``
    and ``class_names``.

    Args:
        image_path: Path of the image to classify.

    Returns:
        The entry of ``class_names`` at the predicted class index.
    """
    # Open inside a context manager so the file handle is released; convert()
    # returns an independent RGB copy that stays valid after the file closes.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")

    # Same preprocessing as validation, plus a leading batch dimension
    input_tensor = data_transforms["val"](image).unsqueeze(0).to(device)

    # Inference only: skip building the autograd graph
    with torch.no_grad():
        outputs = model(input_tensor)

    _, preds = torch.max(outputs, 1)  # index of the highest-scoring class
    # .item() gives a plain int, so the list is not indexed with a tensor
    return class_names[preds[0].item()]

# Smoke-test the classifier on a single local image
sample_prediction = predict_image("default3.jpg")
print(sample_prediction)


  model.load_state_dict(torch.load("resnet50_clickbait_classifier.pth"))


clickbait


In [None]:
from transformers import PreTrainedModel, PretrainedConfig
import torch.nn as nn
import torch

# Define a custom configuration class
class ResNetConfig(PretrainedConfig):
    """Configuration object for the ResNet classifier wrapper.

    Carries the number of output classes on top of whatever
    ``PretrainedConfig`` itself accepts through ``**kwargs``.
    """

    # NOTE(review): transformers appears to ship its own "resnet" model type;
    # confirm this identifier does not clash when the checkpoint is loaded
    # through the Auto* classes.
    model_type = "resnet"

    def __init__(self, num_labels=2, **kwargs):
        """Store ``num_labels`` after the base class has consumed ``kwargs``.

        Args:
            num_labels: Number of output classes (defaults to 2).
            **kwargs: Forwarded unchanged to ``PretrainedConfig.__init__``.
        """
        super().__init__(**kwargs)
        self.num_labels = num_labels


# Hugging Face wrapper around a torchvision ResNet-50 classifier
class ResNetForImageClassification(PreTrainedModel):
    """``PreTrainedModel`` shell holding a torchvision ResNet-50 under ``self.resnet``.

    The backbone is built without pretrained weights and its final layer is
    sized from ``config.num_labels``; trained weights are expected to be loaded
    afterwards.
    """

    config_class = ResNetConfig

    def __init__(self, config):
        """Build the backbone and replace its head with a ``config.num_labels``-way layer."""
        super().__init__(config)
        import torchvision.models as tv_models

        backbone = tv_models.resnet50(pretrained=False)
        head_inputs = backbone.fc.in_features
        backbone.fc = nn.Linear(head_inputs, config.num_labels)  # new output layer
        self.resnet = backbone

    def forward(self, pixel_values, labels=None):
        """Run the backbone and optionally compute the cross-entropy loss.

        Args:
            pixel_values: Batch of image tensors passed straight to the ResNet.
            labels: Optional target class indices; when given, the loss is computed.

        Returns:
            Dict with ``"loss"`` (``None`` when no labels are given) and ``"logits"``.
        """
        logits = self.resnet(pixel_values)
        if labels is None:
            return {"loss": None, "logits": logits}
        return {"loss": nn.CrossEntropyLoss()(logits, labels), "logits": logits}


In [None]:
from transformers import AutoProcessor

# Load the configuration and model
config = ResNetConfig(num_labels=2)
model = ResNetForImageClassification(config)

# Load the original state_dict.
# - map_location="cpu": the wrapper built above is never moved off the CPU, and
#   a checkpoint saved from a CUDA model would otherwise fail to load on a
#   CPU-only runtime.
# - weights_only=True: the file contains only tensors, so restrict unpickling
#   to tensor data instead of allowing arbitrary pickled objects.
state_dict = torch.load(
    "resnet50_clickbait_classifier.pth",
    map_location="cpu",
    weights_only=True,
)

# The torchvision model lives under the `resnet` attribute of the wrapper,
# so every key needs that prefix
new_state_dict = {f"resnet.{key}": value for key, value in state_dict.items()}

# Load the modified state_dict into the model
model.load_state_dict(new_state_dict)

# Save the model and processor
# NOTE(review): the processor comes from "microsoft/resnet-50"; its
# preprocessing may not match the Resize((224, 224)) + Normalize pipeline used
# for training — confirm before relying on it for inference.
model.save_pretrained("resnet50_clickbait")
processor = AutoProcessor.from_pretrained("microsoft/resnet-50")
processor.save_pretrained("resnet50_clickbait")

  state_dict = torch.load("resnet50_clickbait_classifier.pth")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/266 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/69.6k [00:00<?, ?B/s]

['resnet50_clickbait/preprocessor_config.json']

In [None]:
!pip install huggingface_hub



In [None]:
# Interactive command-line login to the Hugging Face Hub; prompts for an access token.
# NOTE(review): the following cell logs in again through huggingface_hub.login();
# one of the two is probably enough — confirm.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) Y
Token is valid (permission: write).
The token `resnet50-clickbait-2` has been saved to /root/.cache/huggingface/stored_tokens
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to 

In [None]:
# Log in to the Hugging Face Hub from Python; in this notebook the call
# rendered a widget (see the recorded VBox output).
from huggingface_hub import login
login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from huggingface_hub import HfApi

api = HfApi()

# Destination on the Hub: a model repository
upload_target = {
    "repo_id": "Arunavaonly/YouTube-Clickbait-Video-Detection-Model",
    "repo_type": "model",
}

# Upload the whole export folder (model.safetensors, config.json, preprocessor_config.json)
api.upload_folder(folder_path="resnet50_clickbait", **upload_target)


model.safetensors:   0%|          | 0.00/94.3M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Arunavaonly/YouTube-Clickbait-Video-Detection-Model/commit/519a993f0408c8a10d2dfcc5d6f6cf403820fb3a', commit_message='Upload folder using huggingface_hub', commit_description='', oid='519a993f0408c8a10d2dfcc5d6f6cf403820fb3a', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Arunavaonly/YouTube-Clickbait-Video-Detection-Model', endpoint='https://huggingface.co', repo_type='model', repo_id='Arunavaonly/YouTube-Clickbait-Video-Detection-Model'), pr_revision=None, pr_num=None)

In [None]:
# Push the wrapped model to the same Hub repository.
# NOTE(review): the recorded output says "No files have been modified since last
# commit", so this looks redundant after the upload_folder call — confirm.
model.push_to_hub("Arunavaonly/YouTube-Clickbait-Video-Detection-Model")

README.md:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/Arunavaonly/YouTube-Clickbait-Video-Detection-Model/commit/519a993f0408c8a10d2dfcc5d6f6cf403820fb3a', commit_message='Upload ResNetForImageClassification', commit_description='', oid='519a993f0408c8a10d2dfcc5d6f6cf403820fb3a', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Arunavaonly/YouTube-Clickbait-Video-Detection-Model', endpoint='https://huggingface.co', repo_type='model', repo_id='Arunavaonly/YouTube-Clickbait-Video-Detection-Model'), pr_revision=None, pr_num=None)