Detecting Unfriendly Comments Using Deep Learning
=====================================

**Name**: Hong Liang

**Net ID**: hl5414

In [None]:
# Install the Kaggle command-line client that the download command below relies on.
!pip install kaggle
from google.colab import files

# Upload kaggle.json through Colab's file-upload dialog; the uploaded file is
# saved into the current working directory.
files.upload()
# Move kaggle.json to ~/.kaggle/ (presumably where the kaggle CLI looks for
# credentials — confirm against the Kaggle API docs) and restrict its permissions.
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json  # Secure the file

# Download the competition data as a single .zip archive into the working directory.
!kaggle competitions download -c jigsaw-unintended-bias-in-toxicity-classification




Saving kaggle.json to kaggle.json
Downloading jigsaw-unintended-bias-in-toxicity-classification.zip to /content
 97% 705M/723M [00:04<00:00, 144MB/s]
100% 723M/723M [00:04<00:00, 165MB/s]


In [None]:
!unzip jigsaw-unintended-bias-in-toxicity-classification.zip

Archive:  jigsaw-unintended-bias-in-toxicity-classification.zip
  inflating: all_data.csv            
  inflating: identity_individual_annotations.csv  
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: test_private_expanded.csv  
  inflating: test_public_expanded.csv  
  inflating: toxicity_individual_annotations.csv  
  inflating: train.csv               


In [None]:
# Pin protobuf to the 3.20.x series.
# NOTE(review): the reason for this pin is not stated in the notebook — presumably a
# compatibility requirement of a library used below; confirm before changing or removing it.
!pip install protobuf==3.20.*

Collecting protobuf==3.20.*
  Downloading protobuf-3.20.3-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (679 bytes)
Downloading protobuf-3.20.3-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/1.1 MB[0m [31m5.9 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.0/1.1 MB[0m [31m16.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: protobuf
  Attempting uninstall: protobuf
    Found existing installation: protobuf 4.25.5
    Uninstalling protobuf-4.25.5:
      Successfully uninstalled protobuf-4.25.5
[31mERROR: pip's dependency resolver does not currently take into ac

In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch.utils.data import Dataset

In [None]:
import os
# Disable the Weights & Biases integration before any training code runs.
# NOTE(review): transformers reports this environment variable as deprecated (to be
# removed in v5) and recommends controlling logging integrations via `report_to` instead.
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# Load the dataset
def load_data(file_path):
    """Read the comments CSV and derive a binary friendliness label.

    Args:
        file_path: Path (or any source accepted by ``pd.read_csv``) of a CSV that
            contains at least the ``target`` and ``comment_text`` columns.

    Returns:
        A new DataFrame with the columns ``target``, ``comment_text`` and
        ``label``, where ``label`` is 1 (Unfriendly) when ``target >= 0.5`` and
        0 (Friendly) otherwise.

    Raises:
        KeyError: If ``target`` or ``comment_text`` is missing from the file.
    """
    data = pd.read_csv(file_path)
    print(data.columns)
    # .copy() detaches the two-column selection from the full frame, so adding
    # ``label`` below writes to an independent frame instead of a slice
    # (which would raise pandas' SettingWithCopyWarning).
    data = data[['target', 'comment_text']].copy()
    # The competition treats target >= 0.5 as the positive (toxic) class, so a
    # score of exactly 0.5 must be labelled 1, not 0.
    data['label'] = (data['target'] >= 0.5).astype(int)  # Binary classification (0 = Friendly, 1 = Unfriendly)
    return data

class CommentDataset(Dataset):
    """Dataset that tokenizes a single comment each time it is indexed.

    Args:
        comments: Indexable collection of comments; each entry is converted
            with ``str()`` before tokenization.
        labels: Indexable collection of integer class labels, aligned with
            ``comments``.
        tokenizer: Callable invoked as ``tokenizer(text, **options)`` that
            returns a mapping with ``input_ids`` and ``attention_mask`` tensors.
        max_len: Value passed to the tokenizer as ``max_length``.
    """

    def __init__(self, comments, labels, tokenizer, max_len):
        self.comments, self.labels = comments, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Return the number of comments."""
        return len(self.comments)

    def __getitem__(self, item):
        """Tokenize comment ``item`` and return it together with its label.

        Returns:
            A dict with the flattened ``input_ids`` and ``attention_mask``
            tensors from the tokenizer and a ``labels`` tensor of dtype
            ``torch.long``.
        """
        text = str(self.comments[item])
        target = self.labels[item]

        # Pad/truncate to max_len and request PyTorch tensors from the tokenizer.
        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer(text, **tokenizer_options)

        # Flatten each field to 1-D so a sample carries no leading batch axis.
        sample = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample

# Define compute_metrics for evaluation
def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for an evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the arg-max over axis 1 is taken as the
            predicted class).

    Returns:
        A dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``;
        the last three use binary averaging.
    """
    y_true = pred.label_ids
    y_pred = np.argmax(pred.predictions, axis=1)

    # The fourth element of the result (support) is not reported.
    prf_scores = precision_recall_fscore_support(y_true, y_pred, average='binary')
    accuracy = accuracy_score(y_true, y_pred)

    metrics = {'accuracy': accuracy}
    metrics['f1'] = prf_scores[2]
    metrics['precision'] = prf_scores[0]
    metrics['recall'] = prf_scores[1]
    return metrics

# Main function
def main(data_path="train.csv", max_len=128):
    """Fine-tune ``bert-base-uncased`` as a two-class comment classifier and evaluate it.

    Loads the labelled comments, holds out 30% of them for validation
    (``random_state=42``), trains for two epochs with the Hugging Face
    ``Trainer`` and runs a final evaluation on the validation split.

    Args:
        data_path: CSV file handed to ``load_data``. Defaults to ``"train.csv"``.
        max_len: Maximum token length used for both datasets. Defaults to 128.

    Returns:
        The metrics dict returned by ``trainer.evaluate()`` on the validation split.
    """
    data = load_data(data_path)

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    train_texts, val_texts, train_labels, val_labels = train_test_split(
        data['comment_text'],
        data['label'],
        test_size=0.3,
        random_state=42
    )

    # .values drops the shuffled pandas index so the datasets can be indexed 0..n-1.
    train_dataset = CommentDataset(train_texts.values, train_labels.values, tokenizer, max_len=max_len)
    val_dataset = CommentDataset(val_texts.values, val_labels.values, tokenizer, max_len=max_len)

    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=2,  # Reduced epochs
        per_device_train_batch_size=64,  # Maximum batch size that fits in memory
        per_device_eval_batch_size=256,  # Maximum evaluation batch size
        warmup_steps=100,  # Minimal warmup steps
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=100,  # Log even less frequently
        # NOTE(review): newer transformers releases name this argument
        # `eval_strategy` — confirm against the installed version.
        evaluation_strategy="epoch",  # Evaluate after each epoch
        save_strategy="no",  # Skip model checkpointing during training
        # Mixed precision needs a CUDA device; enabling it unconditionally makes
        # TrainingArguments fail on a CPU-only runtime.
        fp16=torch.cuda.is_available(),
        dataloader_num_workers=8,  # Increase data loading workers
        disable_tqdm=True  # Disable progress bar for speed
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics
    )

    trainer.train()
    # Hand the final validation metrics back to the caller instead of discarding them.
    return trainer.evaluate()

if __name__ == "__main__":
    main()



Index(['id', 'target', 'comment_text', 'severe_toxicity', 'obscene',
       'identity_attack', 'insult', 'threat', 'asian', 'atheist', 'bisexual',
       'black', 'buddhist', 'christian', 'female', 'heterosexual', 'hindu',
       'homosexual_gay_or_lesbian', 'intellectual_or_learning_disability',
       'jewish', 'latino', 'male', 'muslim', 'other_disability',
       'other_gender', 'other_race_or_ethnicity', 'other_religion',
       'other_sexual_orientation', 'physical_disability',
       'psychiatric_or_mental_illness', 'transgender', 'white', 'created_date',
       'publication_id', 'parent_id', 'article_id', 'rating', 'funny', 'wow',
       'sad', 'likes', 'disagree', 'sexual_explicit',
       'identity_annotator_count', 'toxicity_annotator_count'],
      dtype='object')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


{'loss': 0.3358, 'grad_norm': 2.3965296745300293, 'learning_rate': 4.9500000000000004e-05, 'epoch': 0.0050655995137024465}
{'loss': 0.1544, 'grad_norm': 1.722949743270874, 'learning_rate': 4.9875577675079985e-05, 'epoch': 0.010131199027404893}
{'loss': 0.1341, 'grad_norm': 1.992794156074524, 'learning_rate': 4.974861611903916e-05, 'epoch': 0.01519679854110734}
{'loss': 0.12, 'grad_norm': 1.048354148864746, 'learning_rate': 4.9621654562998325e-05, 'epoch': 0.020262398054809786}
{'loss': 0.1243, 'grad_norm': 2.208587169647217, 'learning_rate': 4.9494693006957495e-05, 'epoch': 0.025327997568512232}
{'loss': 0.1193, 'grad_norm': 2.2358791828155518, 'learning_rate': 4.9367731450916665e-05, 'epoch': 0.03039359708221468}
{'loss': 0.1231, 'grad_norm': 0.8871657252311707, 'learning_rate': 4.9240769894875834e-05, 'epoch': 0.035459196595917127}
{'loss': 0.1067, 'grad_norm': 2.5412566661834717, 'learning_rate': 4.9113808338835004e-05, 'epoch': 0.04052479610961957}
{'loss': 0.1029, 'grad_norm': 1.6