In [1]:
import pandas as pd

# Load the uploaded dataset to examine its structure and content.
# NOTE(review): this is an absolute, machine-specific path; consider making the
# data directory configurable so the notebook runs on other machines.
file_path = 'P:/Aswath/unique_cyberbullying_comments.xlsx'
dataset = pd.read_excel(file_path)

# Print the schema summary first: info() writes to stdout and returns None.
dataset.info()

# Keep head() as the cell's LAST expression so Jupyter actually renders it;
# a bare expression in the middle of a cell is evaluated and then discarded.
dataset.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   comment  2000 non-null   object
 1   label    2000 non-null   object
dtypes: object(2)
memory usage: 31.4+ KB


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset

# Load the dataset
file_path = "P:/Aswath/unique_cyberbullying_comments.xlsx"  # Update with your file path
data = pd.read_excel(file_path)

# Encode the string labels as integer class ids (0..n_classes-1).
label_encoder = LabelEncoder()
data['label_encoded'] = label_encoder.fit_transform(data['label'])

# Split dataset (80% train / 20% validation, fixed seed for reproducibility).
# Stratify on the encoded label so both splits keep the same class proportions.
# train_test_split raises if a class has fewer than two samples, so fall back
# to a plain random split in that case.
class_counts = data['label_encoded'].value_counts()
stratify_labels = data['label_encoded'] if class_counts.min() >= 2 else None
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data['comment'],
    data['label_encoded'],
    test_size=0.2,
    random_state=42,
    stratify=stratify_labels,
)

# Tokenization
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class CyberbullyingDataset(Dataset):
    """Torch dataset that tokenizes one comment per item, on demand.

    Args:
        texts: pandas Series of comments (accessed positionally via ``.iloc``).
        labels: pandas Series of integer class ids, aligned with ``texts``.
        tokenizer: Optional callable used to encode a comment. When omitted,
            the notebook-level ``tokenizer`` variable is looked up at item
            access time, which preserves the original two-argument usage.
        max_length: Length every example is padded/truncated to (default 128).

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer=None, max_length=128):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )
        self.texts = texts
        self.labels = labels
        self._tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for row ``idx``."""
        # Prefer the injected tokenizer; otherwise use the notebook global.
        tokenize = self._tokenizer if self._tokenizer is not None else tokenizer
        encoding = tokenize(
            # Excel cells can come back as numbers; coerce so the tokenizer
            # always receives a string.
            str(self.texts.iloc[idx]),
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors="pt",
        )
        return {
            # return_tensors="pt" adds a leading batch dimension of 1; drop it
            # so the collator can stack examples into a batch.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            # Plain Python int rather than a NumPy scalar.
            'labels': int(self.labels.iloc[idx]),
        }

import inspect
import pickle

# Single source of truth for where the fine-tuned artifacts are written.
MODEL_DIR = "./cyberbullying_model"

train_dataset = CyberbullyingDataset(train_texts, train_labels)
val_dataset = CyberbullyingDataset(val_texts, val_labels)

# Map class ids <-> original label names so the saved model config carries
# human-readable labels.
id2label = {idx: str(name) for idx, name in enumerate(label_encoder.classes_)}
label2id = {name: idx for idx, name in id2label.items()}

# Load pre-trained BERT model with a classification head sized to our classes
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_encoder.classes_),
    id2label=id2label,
    label2id=label2id,
)

# Define Trainer
# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`;
# pick whichever keyword the installed version accepts.
eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)
training_args = TrainingArguments(
    output_dir="./results",
    logging_dir="./logs",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,  # keep only the two most recent checkpoints
    **{eval_strategy_key: "epoch"},  # evaluate on val_dataset after every epoch
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Train the model
trainer.train()

# Save the model and tokenizer side by side so they can be reloaded together
model.save_pretrained(MODEL_DIR)
tokenizer.save_pretrained(MODEL_DIR)

# Export the label encoder (MODEL_DIR was created by save_pretrained above).
# NOTE: only unpickle this file from a trusted location; pickle.load can
# execute arbitrary code.
with open(f"{MODEL_DIR}/label_encoder.pkl", "wb") as f:
    pickle.dump(label_encoder, f)

print(f"Model saved to {MODEL_DIR}")


ImportError: cannot import name 'BertForSequenceClassification' from 'transformers' (p:\Applicaton\Anaconda\anaconda3\Lib\site-packages\transformers\__init__.py)