In [25]:
!pip install accelerate

Collecting accelerate
  Downloading accelerate-1.8.1-py3-none-any.whl.metadata (19 kB)
Downloading accelerate-1.8.1-py3-none-any.whl (365 kB)
Installing collected packages: accelerate
Successfully installed accelerate-1.8.1


In [2]:
import pandas as pd
import tensorflow as tf
import numpy as np


In [3]:
# Load the labelled headline dataset and preview its first five rows.
data = pd.read_csv(filepath_or_buffer="clickbait_title_classification.csv")
data.head(5)

Unnamed: 0,title,clickbait
0,""".asia"" domain applications near 300,000 on op...",0
1,"""1 Indian + 1 Indian = Unrelatable"": Televisio...",1
2,"""7th Heaven"" television series comes to an end",0
3,"""Arm Glow"" Is Your New Life Goal, Thanks To Lu...",1
4,"""Beans Memes"" Is The Only Twitter Account That...",1


In [10]:
# Class balance check: number of headlines per label.
data["clickbait"].value_counts()

clickbait
0    16001
1    15999
Name: count, dtype: int64

In [12]:


# Pull the headline strings and their labels out of the frame as plain lists.
texts = data.loc[:, "title"].tolist()
labels = data.loc[:, "clickbait"].tolist()

In [15]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize all headlines in a single batched call, returning PyTorch tensors.
encoded_inputs = tokenizer(
    texts,
    return_tensors="pt",
    max_length=64,  # ample for headlines
    truncation=True,
    padding=True,
)

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [17]:
import torch

class ClickbaitDataset(torch.utils.data.Dataset):
    """Map-style dataset over already-tokenized headlines.

    ``encodings`` is a mapping whose values can be indexed by sample position
    (only ``.items()`` and ``value[idx]`` are used); ``labels`` is a sequence
    with one entry per sample.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The label sequence defines the dataset size.
        return len(self.labels)

    def __getitem__(self, idx):
        # Slice every encoded field at position ``idx`` and attach the label
        # under the "labels" key.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# Wrap the tokenized headlines and their labels in a single dataset object.
dataset = ClickbaitDataset(encodings=encoded_inputs, labels=labels)

In [18]:
from transformers import BertForSequenceClassification

# Pretrained BERT encoder with a 2-class head (0 = regular headline, 1 = clickbait).
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
from sklearn.model_selection import train_test_split

# Fixed seed so the 80/20 split is identical after a kernel restart.
train_dataset, test_dataset = train_test_split(dataset, test_size=0.2, random_state=42)

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
import pandas as pd

# 1. Load the data
df = data
texts = df.loc[:, "title"].tolist()
labels = df.loc[:, "clickbait"].tolist()

# 2. Prepare the data
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Hold out 20% of the headlines for validation (seeded for reproducibility)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    random_state=42,
    test_size=0.2,
)

# 3. Dataset definition
class ClickbaitDataset(Dataset):
    """Dataset that tokenizes each headline on access.

    Args:
        texts: Sequence of headlines; each item is passed through ``str()``.
        labels: Sequence of labels; each item is passed through ``int()``.
        tokenizer: Callable invoked with ``max_length``, ``padding``,
            ``truncation`` and ``return_tensors`` keywords; its result must
            expose ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Length every sample is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=64):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        headline = str(self.texts[idx])
        target = int(self.labels[idx])

        encoded = self.tokenizer(
            headline,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # The tokenizer output is flattened to 1-D so the DataLoader can
        # stack samples into a batch.
        sample = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        sample['label'] = torch.tensor(target, dtype=torch.long)
        return sample

train_dataset = ClickbaitDataset(texts=train_texts, labels=train_labels, tokenizer=tokenizer)
val_dataset = ClickbaitDataset(texts=val_texts, labels=val_labels, tokenizer=tokenizer)

# 4. DataLoaders: only the training split is shuffled
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
val_loader = DataLoader(val_dataset, batch_size=16)

# 5. Model definition
class ClickbaitClassifier(nn.Module):
    """BERT encoder followed by dropout and a 2-output linear layer."""

    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(p=0.1)
        self.classifier = nn.Linear(self.bert.config.hidden_size, 2)

    def forward(self, input_ids, attention_mask):
        """Return the classifier output for a batch of token ids and masks."""
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Use the hidden state at sequence position 0 as the summary of the
        # whole input.
        first_token_state = encoder_out.last_hidden_state[:, 0]
        return self.classifier(self.dropout(first_token_state))

# 6. Training setup: use the GPU when one is available, otherwise the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = ClickbaitClassifier().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# 7. Training function
def train_epoch(model, data_loader, optimizer, device):
    """Run one optimization pass over ``data_loader``.

    Puts ``model`` in training mode, then for every batch zeroes the
    gradients, computes the loss with the module-level ``criterion``,
    back-propagates and steps ``optimizer``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        data_loader: Iterable of dict batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors; must support ``len()``.
        optimizer: Optimizer over ``model``'s parameters.
        device: Device each batch tensor is moved to.

    Returns:
        Sum of per-batch losses divided by the number of batches.
    """
    model.train()
    running_loss = 0

    for batch in data_loader:
        optimizer.zero_grad()

        ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        targets = batch['label'].to(device)

        batch_loss = criterion(model(ids, mask), targets)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()

    n_batches = len(data_loader)
    return running_loss / n_batches

# 8. Validation function
def eval_epoch(model, data_loader, device):
    """Evaluate ``model`` on ``data_loader`` without tracking gradients.

    Puts ``model`` in eval mode and scores every batch with the module-level
    ``criterion``. The predicted class is the index of the largest output
    along dimension 1.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        data_loader: Iterable of dict batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors; must support ``len()``.
        device: Device each batch tensor is moved to.

    Returns:
        Tuple ``(mean batch loss, fraction of correctly predicted samples)``.
    """
    model.eval()
    loss_sum = 0
    n_correct = 0
    n_seen = 0

    with torch.no_grad():
        for batch in data_loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['label'].to(device)

            logits = model(ids, mask)
            loss_sum += criterion(logits, targets).item()

            predicted = torch.max(logits, 1).indices
            n_seen += targets.size(0)
            n_correct += (predicted == targets).sum().item()

    mean_loss = loss_sum / len(data_loader)
    accuracy = n_correct / n_seen
    return mean_loss, accuracy

# 9. Train the model
epochs = 3
best_accuracy = 0

for epoch in range(epochs):
    train_loss = train_epoch(model, train_loader, optimizer, device)
    val_loss, val_accuracy = eval_epoch(model, val_loader, device)

    # One line per metric, same text as four consecutive print() calls.
    print(
        f'Epoch {epoch + 1}/{epochs}',
        f'Train loss: {train_loss:.4f}',
        f'Val loss: {val_loss:.4f}',
        f'Val accuracy: {val_accuracy:.4f}',
        sep='\n',
    )

    # Checkpoint only when validation accuracy strictly improves
    improved = val_accuracy > best_accuracy
    if improved:
        best_accuracy = val_accuracy
        torch.save(model.state_dict(), 'best_model.pt')

# 10. Load the best checkpoint
# map_location remaps the saved tensors onto the current device, so a
# checkpoint written on a GPU still loads when only the CPU is available.
model.load_state_dict(torch.load('best_model.pt', map_location=device))

# 11. Prediction function
def predict_clickbait(text, model, tokenizer, device, max_len=64):
    """Classify a single headline as ``'Clickbait'`` or ``'Not Clickbait'``.

    The model is switched to eval mode for the forward pass, so layers such
    as dropout behave deterministically, and its previous training/eval mode
    is restored afterwards.

    Args:
        text: Headline to classify.
        model: ``torch.nn.Module`` called as ``model(input_ids, attention_mask)``
            and returning per-class scores with the class axis at ``dim=1``.
        tokenizer: Callable invoked with ``max_length``, ``padding``,
            ``truncation`` and ``return_tensors`` keywords; its result must
            expose ``'input_ids'`` and ``'attention_mask'`` tensors.
        device: Device the input tensors are moved to (presumably the device
            ``model`` already lives on; this function does not move the model).
        max_len: Length the input is padded/truncated to.

    Returns:
        ``'Clickbait'`` when the highest-scoring class index is 1, otherwise
        ``'Not Clickbait'``.
    """
    encoding = tokenizer(
        text,
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    # Remember the caller's mode so inference does not silently change it.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(input_ids, attention_mask)
            _, prediction = torch.max(outputs, dim=1)
    finally:
        model.train(was_training)

    return 'Clickbait' if prediction.item() == 1 else 'Not Clickbait'

# Example usage
for headline in (
    "You won't believe what happens next!",
    "The results of the study were published today",
):
    print(predict_clickbait(headline, model, tokenizer, device))

In [None]:
def predict_clickbait(title):
    """Return the probability that ``title`` is clickbait.

    Uses the module-level ``tokenizer`` and ``model``. Works both with models
    that return an object carrying ``.logits`` and with models that return
    the logits tensor directly.

    NOTE(review): this redefinition shadows the earlier five-argument
    ``predict_clickbait`` defined in this notebook.

    Args:
        title: Headline to score.

    Returns:
        Softmax probability of class index 1 as a Python float.
    """
    inputs = tokenizer(title, return_tensors="pt", truncation=True, max_length=64)

    # Run on whatever device the model's parameters live on.
    target_device = next(model.parameters()).device

    # Inference in eval mode without gradients; restore the previous mode after.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            # Pass only the two inputs every model variant in this notebook
            # accepts; the tokenizer may also return fields such as
            # token_type_ids that a custom forward() does not take.
            outputs = model(
                input_ids=inputs["input_ids"].to(target_device),
                attention_mask=inputs["attention_mask"].to(target_device),
            )
    finally:
        model.train(was_training)

    logits = getattr(outputs, "logits", outputs)
    probs = torch.nn.functional.softmax(logits, dim=-1)
    return probs[0][1].item()  # probability of the clickbait class

# Example usage
clickbait_probability = predict_clickbait("You won't believe what happened next!")
print(clickbait_probability)  # probably > 0.9