### 1. Wykorzystać model BERT do klasyfikacji tekstu, aby rozpoznać, z której powieści (Anna Karenina lub Jane Eyre) pochodzi dany fragment tekstu.


1. Przygotuj dane wejściowe:
   - Podziel teksty obu powieści na fragmenty o stałej długości (np. 100 słów lub 5 zdań).
   - Przypisz etykiety: `0` dla *Anna Karenina*, `1` dla *Jane Eyre*.
2. Skorzystaj z modelu `BertForSequenceClassification` do klasyfikacji tekstu.
3. Przeprowadź fine-tuning modelu na przygotowanym zbiorze danych.
4. Oceń skuteczność modelu na zbiorze testowym.

In [23]:
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import re
import random
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import classification_report
import spacy

nlp = spacy.load("en_core_web_sm")  
nlp.max_length = 2000000

def clean_text(text):
    doc = nlp(text)
    cleaned_tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    return ' '.join(cleaned_tokens)

def split_text_into_chunks(text, chunk_size=100):
    words = text.split()
    chunks = [' '.join(words[i:i+chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks

with open('anna_karenina.txt', 'r', encoding='utf-8') as file_anna:
    book1 = file_anna.read()

with open('jane_eyre.txt', 'r', encoding='utf-8') as file_jane:
    book2 = file_jane.read()

anna_clean = clean_text(book1)
jane_clean = clean_text(book2)

anna_chunks = split_text_into_chunks(anna_clean, chunk_size=100)
jane_chunks = split_text_into_chunks(jane_clean, chunk_size=100)

texts = anna_chunks + jane_chunks
labels = [0] * len(anna_chunks) + [1] * len(jane_chunks)

train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Tokenizer BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=128)

In [24]:
from torch.utils.data import DataLoader

class TextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Dataset
train_dataset = TextDataset(train_encodings, train_labels)
test_dataset = TextDataset(test_encodings, test_labels)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8)

# Model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
optimizer = AdamW(model.parameters(), lr=5e-5)

def encode_data(texts, tokenizer, max_length=100):
    return tokenizer(texts, padding=True, truncation=True, max_length=max_length, return_tensors="pt")

train_encodings = encode_data(train_texts, tokenizer)
test_encodings = encode_data(test_texts, tokenizer)

In [28]:
# Trening modelu
def train_model(model, data_loader, optimizer):
    model.train() 
    total_loss = 0
    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    avg_loss = total_loss / len(data_loader)  
    return avg_loss

In [32]:
def evaluate_model(model, loader):
    model.eval()
    correct = 0
    total = 0
    predictions, labels = [], []

    with torch.no_grad():
        for batch in loader:
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            labels_batch = batch['labels']

            outputs = model(input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)
            predictions.extend(preds.cpu().numpy())
            labels.extend(labels_batch.cpu().numpy())

            correct += (preds == labels_batch).sum().item()
            total += labels_batch.size(0)

    accuracy = correct / total
    return accuracy, predictions, labels


In [30]:
n = 2
for i in range(n):
    avg_loss = train_model(model, train_loader, optimizer)
    print(f"Epoch {i+1}, Loss: {avg_loss}")

Epoch 1, Loss: 0.6864771095189182
Epoch 2, Loss: 0.6856086790561676


In [33]:
accuracy, predictions, labels = evaluate_model(model, test_loader)
# Wyniki
print(f"Accuracy: {accuracy * 100:.2f}%")
print(classification_report(labels, predictions, target_names=["Anna Karenina", "Jane Eyre"]))

Accuracy: 62.50%
               precision    recall  f1-score   support

Anna Karenina       0.65      0.93      0.76       287
    Jane Eyre       0.27      0.05      0.08       153

     accuracy                           0.62       440
    macro avg       0.46      0.49      0.42       440
 weighted avg       0.52      0.62      0.53       440



### 2. Wykorzystać model BERT do analizy toksyczności komentarzy.


1. Załaduj zbiór danych o toksycznych komentarzach(dostępny na platformie).
2. Skorzystaj z modelu `BertForSequenceClassification` i przeprowadź fine-tuning na tym zbiorze danych.
3. Oceń model na zbiorze testowym i zinterpretuj wyniki.
4. Przeprowadź analizę – znajdź komentarze, które model zaklasyfikował jako toksyczne, a które jako neutralne.

In [34]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd

df = pd.read_csv("sample.csv")  
def get_toxic_comment(x):
    if x >= 0.5:
        return 1  
    else:
        return 0 
    
df['target'] = df['target'].apply(get_toxic_comment)

In [35]:
class CommentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt') 
        return { 
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

train_texts, test_texts, train_labels, test_labels = train_test_split(df['comment_text'].values, df['target'].values, test_size=0.2, random_state=42)

# Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
optimizer = AdamW(model.parameters(), lr=5e-5)

# Dataset
train_dataset = CommentDataset(train_texts, train_labels, tokenizer)
test_dataset = CommentDataset(test_texts, test_labels, tokenizer)

# DataLoader
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [37]:
def train_model(model, data_loader, optimizer):
    model.train()
    total_loss = 0
    for batch_data in train_loader:
        optimizer.zero_grad()
        input_ids = batch_data['input_ids']
        attention_mask = batch_data['attention_mask']
        labels = batch_data['labels']
        output = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = output.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(data_loader)

def evaluate_model(model, data_loader):
    model.eval()
    total_correct = 0
    total_samples = 0
    with torch.no_grad():
        for batch_data in data_loader:
            input_ids = batch_data['input_ids']
            attention_mask = batch_data['attention_mask']
            labels = batch_data['labels']
            output = model(input_ids=input_ids, attention_mask=attention_mask)
            predictions = torch.argmax(output.logits, dim=1)
            total_correct += (predictions == labels).sum().item()
            total_samples += labels.size(0)
    return total_correct / total_samples

In [38]:
for epoch in range(1, 4):
    avg_loss = train_model(model, train_loader, optimizer)
    print(f"Epoch {epoch}, Average Loss: {avg_loss:.4f}")

test_accuracy = evaluate_model(model, test_loader)
print(f"Test Set Accuracy: {test_accuracy * 100:.2f}%")

Epoch 1, Average Loss: 0.2252
Epoch 2, Average Loss: 0.1425
Epoch 3, Average Loss: 0.0875
Test Set Accuracy: 93.35%


In [39]:
# Predykcje i analiza wyników
def predict_and_analyze(model, data_loader):
    model.eval()
    labels_list = []
    predictions_list = []

    with torch.no_grad():
        for batch_data in data_loader:
            input_ids = batch_data['input_ids']
            attention_mask = batch_data['attention_mask']
            labels = batch_data['labels']

            output = model(input_ids=input_ids, attention_mask=attention_mask)
            predictions = torch.argmax(output.logits, dim=1)

            labels_list.extend(labels.cpu().numpy())
            predictions_list.extend(predictions.cpu().numpy())

    return labels_list, predictions_list

actual_labels, predicted_labels = predict_and_analyze(model, test_loader)
print(classification_report(actual_labels, predicted_labels, target_names=["Neutral", "Toxic"]))

              precision    recall  f1-score   support

     Neutral       0.97      0.96      0.96      1858
       Toxic       0.53      0.63      0.57       142

    accuracy                           0.93      2000
   macro avg       0.75      0.79      0.77      2000
weighted avg       0.94      0.93      0.94      2000



In [40]:
toxic_texts = [test_texts[i] for i, label in enumerate(predicted_labels) if label == 1]
neutral_texts = [test_texts[i] for i, label in enumerate(predicted_labels) if label == 0]
print("\nSample Toxic Comments:")
print(toxic_texts[:5])
print("\nSample Neutral Comments:")
print(neutral_texts[:5])


Sample Toxic Comments:
["Trump is under investigation for his Russian ties, and he just proved that he's a White Supremacist sympathizer, if he isn't one himself.", 'nobody has to work very hard to prove that the vile and disgusting so-called president is a racist, a bigot, and a hate-monger\n\njust sit at your computer and google Trump - Central park 5', 'oh crap...I live on the Oregon Coast!', 'Poor Jeremy.', "Yes ...... It takes a real fool to ignore the fact that the Dixiecrats all became Republicans back in the Johnson Administration.\n.\nBut in Trump's mind there are many sides that he is trying to unite:\nThe KKK side, the Nazi side, Southern Nationalists side, the 'poorly educated' unemployable white guy side, etc"]

Sample Neutral Comments:
["Ya, its almost like we need to do something besides lay off all the state workers. Entitlements cost almost all of that $3.7 billion. So we have 2 choices. reduce entitlements and spend my money on my family, or increase taxes and spend 