In [4]:
!pip install gdown # Install gdown if you haven't already

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import gdown

# Replace with your file's Google Drive ID
file_id = '1unzca2u1H_I-GtPAvFGdCesoNBPUCqZs'
url = f'https://drive.google.com/uc?id={file_id}'

# Download the file to your Colab environment
output_file = 'downloaded_file.csv' # Choose a name for the downloaded file
gdown.download(url, output_file, quiet=False)

# Now read the downloaded CSV file
df = pd.read_csv(output_file)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Downloading...
From: https://drive.google.com/uc?id=1unzca2u1H_I-GtPAvFGdCesoNBPUCqZs
To: /content/downloaded_file.csv
100%|██████████| 4.92M/4.92M [00:00<00:00, 223MB/s]


In [5]:
# verify that the upload was sucessfull
import os
print(os.listdir())

['.config', 'drive', 'downloaded_file.csv', 'sample_data']


In [28]:
# Import necessary libraries
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd

In [29]:
data = data.dropna(subset=['content', 'category'])

In [30]:
# Map categories to numeric labels
categories = data['category'].unique()
category_to_id = {cat: idx for idx, cat in enumerate(category)}
data['label'] = data['category'].map(category_to_id)

# Define texts and labels
texts = data['content'].fillna("").values  # Replace NaN with an empty string
labels = data['label'].values  # Numeric labels

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Split the data into training and validation sets
from sklearn.model_selection import train_test_split

train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

In [31]:
# Step 1: Dataset Preparation
class WebsiteDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        # Tokenize text using BERT tokenizer
        encoding = self.tokenizer.encode_plus(
            text,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            add_special_tokens=True,
            return_attention_mask=True,
            return_tensors="pt"
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Initialize tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Create dataset and DataLoader
dataset = WebsiteDataset(texts, labels, tokenizer)
loader = DataLoader(dataset, batch_size=1, shuffle=True)

In [32]:
# Print a batch
for batch in loader:
    print(batch)
    break

{'input_ids': tensor([[  101,  6373,  1009,  1011,  5460, 14668,  5691,  1998,  2694,  2186,
         10866,  4024,  2005,  2035,  1012, 14012,  3488,  3225,  2012,  1002,
          2184,  1012,  5585,  1013,  3204,  3225,  2012,  1002,  2184,  1012,
          5585,  1013,  9587,  1012,  7276,  3408,  6611,  3225,  2012,  1002,
          2385,  1012,  5585,  1013,  9587,  1012,  7276,  3408,  6611,  1075,
         11387, 18827,  6373,  1998,  2049,  3141, 11422,  1012,  4098,  1075,
         11387, 18827,  2188,  3482,  2436,  1010,  4297,  1012,  2035,  1997,
          2122,  1998,  2062,  2085, 11058,  1012,  3193,  2035,  2933,  7047,
         10866,  4024,  2005,  2035,  1012, 14012,  3488,  3225,  2012,  1002,
          2184,  1012,  5585,  1013,  3204,  3225,  2012,  1002,  2184,  1012,
          5585,  1013,  9587,  1012,  7276,  3408,  6611,  3225,  2012,  1002,
          2385,  1012,  5585,  1013,  9587,  1012,  7276,  3408,  6611,  1075,
         11387, 18827,  6373,  1998,  

In [33]:
# Load the data
data = pd.read_csv('downloaded_file.csv')
data = data.dropna(subset=['content', 'category'])  # Drop rows with missing content or category

In [34]:
# Step 3: Define the Model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Assuming 'category' is a column in your DataFrame 'data'
category = data['category'].unique()  # Get unique categories from the 'category' column

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(category))
model = model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [35]:
# Define optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
loss_fn = torch.nn.CrossEntropyLoss()

In [36]:
# Step 4: Training Loop
def train_model(model, data_loader, optimizer, loss_fn, device):
    model.train()
    total_loss = 0
    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(data_loader)

In [None]:
# Training Loop
for epoch in range(3):  # Train for 3 epochs
    train_loss = train_model(model, train_loader, optimizer, loss_fn, device)
    print(f'Epoch {epoch + 1}, Training Loss: {train_loss}')

In [45]:
# Import necessary libraries
from torch.utils.data import DataLoader

# Prepare the validation dataset (assuming val_texts and val_labels are already defined)
val_dataset = WebsiteDataset(val_texts, val_labels, tokenizer)

# Define the validation DataLoader
val_loader = DataLoader(val_dataset, batch_size=16)

NameError: name 'val_texts' is not defined

In [40]:
# Define the evaluation function
from sklearn.metrics import classification_report, accuracy_score

# Step 5: Evaluation
def evaluate_model(model, data_loader, device):
    model.eval()
    predictions = []
    true_labels = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            logits = outputs.logits
            predictions.extend(torch.argmax(logits, dim=1).cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    print(classification_report(true_labels, predictions, target_names=categories))
    print("Accuracy:", accuracy_score(true_labels, predictions))

In [41]:
# Evaluate the model
evaluate_model(model, val_loader, device)

NameError: name 'val_loader' is not defined