In [1]:
import pandas as pd
data_bin = pd.read_csv(r'C:\Users\erich\Desktop\DS_project\data\cleaned_data.csv')

In [2]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

lemmatizer = WordNetLemmatizer()

def data_binary(df):
    
    df = df[df['HTML_Content'].isin(['Teen And Up Audiences', 'Mature'])]
    
    df = df.dropna(subset=['TXT_Content'])

    def clean_text(text):

        if not isinstance(text, str):
            text = str(text)

        text = re.sub('[^a-zA-Z]', ' ', text)
        text = text.lower()
        tokenized_text = word_tokenize(text)
        cleaned_text = [lemmatizer.lemmatize(word) for word in tokenized_text if word not in set(stopwords.words('english'))]

        return ' '.join(cleaned_text)

    df['TXT_Content'] = df['TXT_Content'].apply(clean_text)

    return df

data_bin = data_binary(data_bin)
print(data_bin)



               HTML_Content                                        TXT_Content
0     Teen And Up Audiences  huckleberry finn stood front old wooden door m...
1     Teen And Up Audiences  right huck trying figure every sort way could ...
3     Teen And Up Audiences  air hot brow wet mind exhausted st petersburg ...
5     Teen And Up Audiences  rope burned skin rubbing flesh raw shoved push...
6     Teen And Up Audiences  tom sawyer liked consider hopeless romantic ot...
...                     ...                                                ...
5797                 Mature  sorry late robin blurted soon within earshot c...
5803  Teen And Up Audiences  seen news pat asked robin chance hang coat mor...
5804  Teen And Up Audiences  free next friday night murphy smiled phone rob...
5805                 Mature  turned walked away wan na say come baby give w...
5807                 Mature  heaven fact unlike earth pro con pro include t...

[2953 rows x 2 columns]


In [3]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, recall_score, f1_score

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

encoded_data = tokenizer(list(data_bin["TXT_Content"].values), truncation=True, padding=True, max_length=512)

labels = data_bin["HTML_Content"].astype('category').cat.codes
labels = torch.tensor(labels.tolist())

class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

dataset = CustomDataset(encoded_data, labels)

train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

def compute_metrics(p):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    preds = torch.argmax(torch.tensor(preds), dim=1)
    labels = p.label_ids
    accuracy = accuracy_score(labels, preds)
    recall = recall_score(labels, preds)
    f1 = f1_score(labels, preds)
    return {
        'accuracy': accuracy,
        'recall': recall,
        'f1': f1,
    }

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics, 
)

trainer.train()

results = trainer.evaluate()
print(results)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch,Training Loss,Validation Loss,Accuracy,Recall,F1
1,0.6623,0.666841,0.659898,1.0,0.795107
2,0.632,0.64024,0.651438,0.902564,0.773626
3,0.5734,0.714995,0.656514,0.828205,0.760895


{'eval_loss': 0.7149954438209534, 'eval_accuracy': 0.6565143824027073, 'eval_recall': 0.8282051282051283, 'eval_f1': 0.7608951707891637, 'eval_runtime': 7.3309, 'eval_samples_per_second': 80.618, 'eval_steps_per_second': 10.094, 'epoch': 3.0}
