In [None]:
!pip install captum

Collecting captum
  Downloading captum-0.8.0-py3-none-any.whl.metadata (26 kB)
Collecting numpy<2.0 (from captum)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.10->captum)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.10->captum)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.10->captum)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.10->captum)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1

In [None]:
!pip install transformers
!pip install transformers_interpret
!pip install evaluate
!pip install rouge_score
!pip install bert_score

Collecting transformers_interpret
  Downloading transformers_interpret-0.10.0-py3-none-any.whl.metadata (45 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.9/45.9 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Collecting jedi>=0.16 (from ipython<8.0.0,>=7.31.1->transformers_interpret)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading transformers_interpret-0.10.0-py3-none-any.whl (45 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.8/45.8 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jedi-0.19.2-py2.py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m52.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: jedi, transformers_interpret
Successfully installed jedi-0.19.2 transformers_interpret-0.10.0
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downlo

In [None]:
pip install --upgrade transformers



In [None]:
import pandas as pd
import torch
import numpy as np
import nltk
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from scipy.spatial.distance import cosine
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    AutoModelForSeq2SeqLM, Trainer, TrainingArguments,
    pipeline, GPT2LMHeadModel, GPT2TokenizerFast
)
from captum.attr import LayerIntegratedGradients
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from sentence_transformers import SentenceTransformer

# Select the compute device once: CUDA when torch reports it is available, otherwise the CPU.
# The functions below read this module-level `device` when moving models and inputs.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Download NLTK tokenizer data without progress output
# (presumably what nltk.word_tokenize in calculate_metrics relies on — confirm).
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

# Shared evaluation resources, built once at import time and read by calculate_metrics:
#   smoothie            - smoothing function passed to sentence_bleu
#   scorer              - ROUGE-1 / ROUGE-2 / ROUGE-L scorer with stemming enabled
#   similarity_model    - sentence encoder used for cosine similarity between texts
#   fluency_model / fluency_tokenizer - GPT-2 language model and tokenizer used for perplexity
#   sentiment_pipeline  - SST-2 DistilBERT sentiment classifier (GPU index 0 if CUDA is available, else -1)
smoothie = SmoothingFunction().method4
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
similarity_model = SentenceTransformer('paraphrase-MiniLM-L6-v2').to(device)
fluency_model = GPT2LMHeadModel.from_pretrained('gpt2').to(device)
fluency_tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
sentiment_pipeline = pipeline("sentiment-analysis",
                            model="distilbert-base-uncased-finetuned-sst-2-english",
                            device=0 if torch.cuda.is_available() else -1)

def load_data(data_path):
    """Read the CSV at ``data_path`` (a path or file-like object) into a pandas DataFrame."""
    return pd.read_csv(data_path)

def preprocess_data(df):
    """Clean the parallel toxic/neutral sentence frame.

    Steps, in order:
      1. Drop rows where either sentence is missing, so ``astype(str)`` cannot
         turn a missing value into a literal ``"nan"``/``"None"`` sentence.
      2. Cast both sentence columns to ``str`` and strip surrounding whitespace.
      3. Drop rows whose (stripped) ``toxic_sentence`` repeats an earlier one,
         keeping the first occurrence.

    Args:
        df: DataFrame with ``toxic_sentence`` and ``neutral_sentence`` columns.

    Returns:
        A new DataFrame; ``df`` itself is not modified. The original index
        labels of the surviving rows are kept.
    """
    text_columns = ['toxic_sentence', 'neutral_sentence']

    # Explicit copy: the column assignments below must not write into a view of `df`.
    cleaned = df.dropna(subset=text_columns).copy()
    for column in text_columns:
        cleaned[column] = cleaned[column].astype(str).str.strip()

    # De-duplicate after stripping so " foo " and "foo" count as the same sentence.
    return cleaned.drop_duplicates(subset=['toxic_sentence'])

def split_data(df, test_size=0.2, val_size=0.1):
    """Split ``df`` into train, validation and test partitions.

    ``test_size`` and ``val_size`` are both fractions of the full frame. The
    test partition is carved off first; the validation fraction is then
    rescaled relative to what remains. Both splits use ``random_state=42``.

    Returns:
        Tuple ``(train, val, test)``.
    """
    remaining_fraction = 1 - test_size
    # Share of the non-test rows that goes to training.
    train_fraction = 1 - val_size / remaining_fraction

    non_test_part, test_part = train_test_split(df, test_size=test_size, random_state=42)
    train_part, val_part = train_test_split(
        non_test_part, train_size=train_fraction, random_state=42
    )
    return train_part, val_part, test_part

class ToxicityDataset(Dataset):
    """Map-style dataset of (text, label) pairs for sequence classification.

    Each item is tokenized on access and padded/truncated to ``max_length``.

    Args:
        texts: Indexable collection of input strings.
        labels: Indexable collection of integer class labels, aligned with ``texts``.
        tokenizer: Callable returning a mapping with ``"input_ids"`` and
            ``"attention_mask"`` tensors of shape ``(1, max_length)`` when
            called with ``return_tensors="pt"`` (presumably a Hugging Face
            tokenizer — confirm against the caller).
        max_length: Fixed sequence length passed to the tokenizer.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.tokenizer = tokenizer
        self.texts = texts
        self.labels = labels
        self.max_length = max_length

    def __len__(self):
        """Number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the ``input_ids``, ``attention_mask`` and ``labels`` tensors for example ``idx``."""
        encoding = self.tokenizer(
            self.texts[idx],
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        return {
            # squeeze(0) removes only the batch dimension; a bare squeeze()
            # would also collapse the sequence dimension when max_length == 1.
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long),
        }

class Seq2SeqDataset(Dataset):
    """Map-style dataset of (source, target) text pairs for seq2seq training.

    Source and target are tokenized on access with the same tokenizer and
    padded/truncated to ``max_length``. Padding positions in the target are
    replaced by ``-100`` in the returned labels.

    Args:
        source_texts: Indexable collection of input strings.
        target_texts: Indexable collection of target strings, aligned with ``source_texts``.
        tokenizer: Callable returning ``"input_ids"`` / ``"attention_mask"``
            tensors of shape ``(1, max_length)`` and exposing ``pad_token_id``
            (presumably a Hugging Face tokenizer — confirm against the caller).
        max_length: Fixed sequence length passed to the tokenizer.
    """

    def __init__(self, source_texts, target_texts, tokenizer, max_length=128):
        self.tokenizer = tokenizer
        self.source_texts = source_texts
        self.target_texts = target_texts
        self.max_length = max_length

    def __len__(self):
        """Number of (source, target) pairs."""
        return len(self.source_texts)

    def _encode(self, text):
        """Tokenize one string to fixed-length ``(1, max_length)`` tensors."""
        return self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

    def __getitem__(self, idx):
        """Return the ``input_ids``, ``attention_mask`` and ``labels`` tensors for pair ``idx``."""
        source_encoding = self._encode(self.source_texts[idx])
        target_encoding = self._encode(self.target_texts[idx])

        labels = target_encoding["input_ids"].clone()
        # Replace padding ids with -100 so those positions are marked in the labels.
        labels[labels == self.tokenizer.pad_token_id] = -100

        return {
            # squeeze(0) removes only the batch dimension; a bare squeeze()
            # would also collapse the sequence dimension when max_length == 1.
            "input_ids": source_encoding["input_ids"].squeeze(0),
            "attention_mask": source_encoding["attention_mask"].squeeze(0),
            "labels": labels.squeeze(0),
        }

def train_toxicity_classifier(train_dataset, val_dataset, model_name="bert-base-multilingual-cased", output_dir="./toxicity_classifier"):
    """Fine-tune a two-label sequence classifier and save it to ``output_dir``.

    Loads the tokenizer and pretrained weights for ``model_name``, trains for
    3 epochs (learning rate 2e-5, batch size 16, weight decay 0.01) with
    evaluation and checkpointing after every epoch, and asks the trainer to
    reload the best checkpoint when training ends.

    Args:
        train_dataset: Dataset used for optimisation.
        val_dataset: Dataset evaluated after each epoch.
        model_name: Hugging Face model identifier to start from.
        output_dir: Directory for checkpoints and the final saved model.

    Returns:
        Tuple ``(model, tokenizer)``.
    """
    classifier_tokenizer = AutoTokenizer.from_pretrained(model_name)
    classifier = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=2, ignore_mismatched_sizes=True
    )
    classifier = classifier.to(device)

    hyperparameters = dict(
        output_dir=output_dir,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        num_train_epochs=3,
        learning_rate=2e-5,
        weight_decay=0.01,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
    )

    fine_tuner = Trainer(
        model=classifier,
        args=TrainingArguments(**hyperparameters),
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=classifier_tokenizer,
    )
    fine_tuner.train()
    fine_tuner.save_model(output_dir)
    return classifier, classifier_tokenizer

def compute_word_attributions(model, tokenizer, text, target=1):
    """Attribute the ``target`` logit of ``model`` to the tokens of ``text``.

    Runs layer integrated gradients over ``model.bert.embeddings`` against a
    baseline made entirely of the tokenizer's pad token (50 integration
    steps, internal batches of 4). The attribution of each token is summed
    over the embedding dimension.

    Args:
        model: Sequence classifier exposing ``.bert.embeddings`` and returning ``.logits``.
        tokenizer: Tokenizer matching ``model``.
        text: Input string.
        target: Index of the logit to explain.

    Returns:
        List of ``(token, attribution)`` pairs, one per token id in the encoded input.
    """
    model.eval()
    encoding = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
    ids = encoding["input_ids"]
    mask = encoding["attention_mask"]

    def target_logit(input_ids, attention_mask):
        # Scalar-per-example output that the attribution method differentiates.
        return model(input_ids=input_ids, attention_mask=attention_mask).logits[:, target]

    explainer = LayerIntegratedGradients(target_logit, model.bert.embeddings)

    # Reference input: same shape as the real ids, filled with the pad token.
    pad_baseline = torch.full_like(ids, tokenizer.pad_token_id)

    layer_attributions, _convergence_delta = explainer.attribute(
        inputs=(ids, mask),
        baselines=(pad_baseline, mask),
        n_steps=50,
        internal_batch_size=4,
        return_convergence_delta=True,
    )

    per_token = layer_attributions[0].sum(dim=-1).squeeze(0).detach().cpu().numpy()
    token_strings = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
    return list(zip(token_strings, per_token))

def delete_toxic_words(text, token_attributions, tokenizer, threshold=0.5):
    toxic_tokens = [token for token, score in token_attributions if score > threshold and not token.startswith("##")]
    toxic_words = []
    for token in toxic_tokens:
        if not token.startswith("##"):
            toxic_words.append(token)
        else:
            if toxic_words:
                toxic_words[-1] += token[2:]
    words = text.split()
    clean_words = [word for word in words if tokenizer.tokenize(word)[0] not in toxic_words]
    return " ".join(clean_words)

def train_reconstruction_model(train_dataset, val_dataset, model_name="facebook/mbart-large-50", output_dir="./reconstruction_model"):
    """Fine-tune a seq2seq model that rewrites toxic-word-stripped text and save it to ``output_dir``.

    Loads the tokenizer and pretrained weights for ``model_name``, trains for
    5 epochs (learning rate 1e-5, batch size 8, weight decay 0.01) with
    evaluation and checkpointing after every epoch, and asks the trainer to
    reload the best checkpoint when training ends.

    Args:
        train_dataset: Dataset used for optimisation.
        val_dataset: Dataset evaluated after each epoch.
        model_name: Hugging Face model identifier to start from.
        output_dir: Directory for checkpoints and the final saved model.

    Returns:
        Tuple ``(model, tokenizer)``.
    """
    seq2seq_tokenizer = AutoTokenizer.from_pretrained(model_name)
    seq2seq_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    seq2seq_model = seq2seq_model.to(device)

    hyperparameters = dict(
        output_dir=output_dir,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        num_train_epochs=5,
        learning_rate=1e-5,
        weight_decay=0.01,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
    )

    fine_tuner = Trainer(
        model=seq2seq_model,
        args=TrainingArguments(**hyperparameters),
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=seq2seq_tokenizer,
    )
    fine_tuner.train()
    fine_tuner.save_model(output_dir)
    return seq2seq_model, seq2seq_tokenizer

def delete_and_reconstruct_pipeline(toxic_text, toxicity_model, reconstruction_model, toxicity_tokenizer, reconstruction_tokenizer, threshold=0.5):
    """Detoxify one sentence: strip high-attribution words, then regenerate fluent text.

    Args:
        toxic_text: Sentence to detoxify.
        toxicity_model: Classifier passed to ``compute_word_attributions``.
        reconstruction_model: Seq2seq model used to generate the rewritten sentence.
        toxicity_tokenizer: Tokenizer paired with ``toxicity_model``.
        reconstruction_tokenizer: Tokenizer paired with ``reconstruction_model``.
        threshold: Attribution threshold forwarded to ``delete_toxic_words``.

    Returns:
        The first generated sequence, decoded with special tokens removed.
    """
    for network in (toxicity_model, reconstruction_model):
        network.eval()

    # Stage 1: find and drop the words the classifier attributes toxicity to.
    attributions = compute_word_attributions(toxicity_model, toxicity_tokenizer, toxic_text)
    stripped_text = delete_toxic_words(toxic_text, attributions, toxicity_tokenizer, threshold)

    # Stage 2: let the seq2seq model rewrite the stripped sentence (beam search, 4 beams).
    encoded = reconstruction_tokenizer(
        stripped_text, return_tensors="pt", truncation=True, padding=True
    ).to(device)
    generation_options = {"max_length": 128, "num_beams": 4, "early_stopping": True}
    generated = reconstruction_model.generate(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        **generation_options,
    )
    first_sequence = generated[0]
    return reconstruction_tokenizer.decode(first_sequence, skip_special_tokens=True)

def _gpt2_perplexity(text, max_tokens=512):
    """Return the perplexity of ``text`` under the module-level GPT-2 fluency model.

    The text is truncated to ``max_tokens`` tokens and scored in a single
    forward pass; perplexity is ``exp`` of the mean loss the model reports.
    No sliding window is needed because the truncated input is a single
    window (assumes ``fluency_model.config.n_positions >= max_tokens``).

    Raises:
        ValueError: if the text encodes to fewer than two tokens. There is
            then no next-token prediction to score and the loss would be NaN.
    """
    encodings = fluency_tokenizer(text, return_tensors='pt', truncation=True, max_length=max_tokens).to(device)
    input_ids = encodings.input_ids
    if input_ids.size(1) < 2:
        raise ValueError("perplexity is undefined for fewer than two tokens")

    with torch.no_grad():
        mean_loss = fluency_model(input_ids, labels=input_ids.clone())[0]
    return torch.exp(mean_loss).item()


def calculate_metrics(original_texts, generated_texts, reference_texts):
    """Score generated texts against references and originals.

    For every aligned triple whose generated text has at least 3
    non-whitespace-trimmed characters, computes:

      * BLEU of the generation against the reference (lower-cased, NLTK tokens),
      * ROUGE-1 / ROUGE-2 / ROUGE-L F-measure against the reference,
      * cosine similarity between sentence embeddings of original and generation,
      * sentiment: probability of the POSITIVE label for the generation,
      * fluency: GPT-2 perplexity of the generation.

    Metric computation is best-effort: when one metric fails for a sample, a
    ``RuntimeWarning`` is issued and a fallback value is recorded (0.0 for
    BLEU/ROUGE/similarity, 0.5 for sentiment, 100.0 for perplexity) so that one
    bad sample does not abort the evaluation. Only ``Exception`` subclasses are
    caught, so ``KeyboardInterrupt`` still stops the run.

    Args:
        original_texts: Source (toxic) sentences.
        generated_texts: Model outputs, aligned with ``original_texts``.
        reference_texts: Gold rewrites, aligned with ``original_texts``.

    Returns:
        Dict with keys ``bleu``, ``rouge1``, ``rouge2``, ``rougeL``,
        ``similarity``, ``sentiment`` and ``fluency``, each the mean over the
        scored samples (or the metric's fallback value if nothing was scored).
    """
    import warnings  # function-scope import keeps this block self-contained

    def report_failure(metric_name, exc, fallback):
        warnings.warn(
            f"{metric_name} failed ({type(exc).__name__}: {exc}); using fallback {fallback}",
            RuntimeWarning,
        )

    bleu_scores = []
    rouge1_scores = []
    rouge2_scores = []
    rougeL_scores = []
    similarity_scores = []
    sentiment_scores = []
    fluency_scores = []

    for orig, gen, ref in zip(original_texts, generated_texts, reference_texts):
        # Skip (near-)empty generations entirely; they are excluded from every mean.
        if len(gen.strip()) < 3:
            continue

        # BLEU Score
        try:
            ref_tokens = nltk.word_tokenize(ref.lower())
            gen_tokens = nltk.word_tokenize(gen.lower())
            bleu_scores.append(sentence_bleu([ref_tokens], gen_tokens, smoothing_function=smoothie))
        except Exception as exc:
            report_failure("BLEU", exc, 0.0)
            bleu_scores.append(0.0)

        # ROUGE Scores
        try:
            rouge_scores = scorer.score(ref, gen)
            rouge1 = rouge_scores['rouge1'].fmeasure
            rouge2 = rouge_scores['rouge2'].fmeasure
            rougeL = rouge_scores['rougeL'].fmeasure
        except Exception as exc:
            report_failure("ROUGE", exc, 0.0)
            rouge1 = rouge2 = rougeL = 0.0
        # Appended after the try block so the three lists always stay the same length.
        rouge1_scores.append(rouge1)
        rouge2_scores.append(rouge2)
        rougeL_scores.append(rougeL)

        # Semantic Similarity
        try:
            orig_embed = similarity_model.encode(orig, convert_to_tensor=True)
            gen_embed = similarity_model.encode(gen, convert_to_tensor=True)
            # scipy's `cosine` is a distance; 1 - distance is the similarity.
            similarity_scores.append(1 - cosine(orig_embed.cpu().numpy(), gen_embed.cpu().numpy()))
        except Exception as exc:
            report_failure("Semantic similarity", exc, 0.0)
            similarity_scores.append(0.0)

        # Sentiment Analysis
        try:
            sent_result = sentiment_pipeline(gen[:512])[:1]
            top = sent_result[0]
            # Convert to P(POSITIVE) regardless of which label came back.
            sentiment_scores.append(top['score'] if top['label'] == 'POSITIVE' else 1 - top['score'])
        except Exception as exc:
            report_failure("Sentiment", exc, 0.5)
            sentiment_scores.append(0.5)

        # Fluency (Perplexity)
        try:
            fluency_scores.append(_gpt2_perplexity(gen))
        except Exception as exc:
            report_failure("Perplexity", exc, 100.0)
            fluency_scores.append(100.0)

    return {
        'bleu': np.mean(bleu_scores) if bleu_scores else 0.0,
        'rouge1': np.mean(rouge1_scores) if rouge1_scores else 0.0,
        'rouge2': np.mean(rouge2_scores) if rouge2_scores else 0.0,
        'rougeL': np.mean(rougeL_scores) if rougeL_scores else 0.0,
        'similarity': np.mean(similarity_scores) if similarity_scores else 0.0,
        'sentiment': np.mean(sentiment_scores) if sentiment_scores else 0.5,
        'fluency': np.mean(fluency_scores) if fluency_scores else 100.0
    }

def main(data_path="/content/drive/MyDrive/Paradetox/multi_all.csv",
         results_path='/content/drive/MyDrive/Paradetox/detoxification_results.csv'):
    """Run the full delete-and-reconstruct detoxification experiment.

    Steps:
      1. Load, clean and split the parallel toxic/neutral corpus.
      2. Train a binary toxicity classifier (toxic = 1, neutral = 0).
      3. Strip high-attribution words from the toxic sentences and train a
         seq2seq model to map the stripped text to the neutral reference.
      4. Detoxify the test split, print evaluation metrics and write the
         original/generated/reference triples to ``results_path``.

    Args:
        data_path: CSV with ``toxic_sentence`` and ``neutral_sentence`` columns.
            Defaults to the previously hard-coded Google Drive location.
        results_path: Destination CSV for the test-set outputs. Defaults to
            the previously hard-coded Google Drive location.
    """
    # Load and prepare data
    df = preprocess_data(load_data(data_path))
    train_df, val_df, test_df = split_data(df)

    # Train toxicity classifier
    train_texts = train_df["toxic_sentence"].tolist() + train_df["neutral_sentence"].tolist()
    val_texts = val_df["toxic_sentence"].tolist() + val_df["neutral_sentence"].tolist()
    train_labels = [1]*len(train_df["toxic_sentence"]) + [0]*len(train_df["neutral_sentence"])
    val_labels = [1]*len(val_df["toxic_sentence"]) + [0]*len(val_df["neutral_sentence"])

    toxicity_tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
    train_toxicity_dataset = ToxicityDataset(train_texts, train_labels, toxicity_tokenizer)
    val_toxicity_dataset = ToxicityDataset(val_texts, val_labels, toxicity_tokenizer)
    toxicity_model, _ = train_toxicity_classifier(train_toxicity_dataset, val_toxicity_dataset)

    def strip_toxic_words(texts):
        """Apply attribution-based word deletion to each sentence in ``texts``."""
        return [
            delete_toxic_words(
                text,
                compute_word_attributions(toxicity_model, toxicity_tokenizer, text),
                toxicity_tokenizer
            ) for text in texts
        ]

    # Train reconstruction model
    train_modified_texts = strip_toxic_words(train_df["toxic_sentence"].tolist())
    train_non_toxic_texts = train_df["neutral_sentence"].tolist()
    val_modified_texts = strip_toxic_words(val_df["toxic_sentence"].tolist())
    val_non_toxic_texts = val_df["neutral_sentence"].tolist()

    reconstruction_tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50")
    train_reconstruction_dataset = Seq2SeqDataset(train_modified_texts, train_non_toxic_texts, reconstruction_tokenizer)
    val_reconstruction_dataset = Seq2SeqDataset(val_modified_texts, val_non_toxic_texts, reconstruction_tokenizer)
    # From here on, use the tokenizer instance returned by the training function.
    reconstruction_model, reconstruction_tokenizer = train_reconstruction_model(train_reconstruction_dataset, val_reconstruction_dataset)

    # Test pipeline
    test_toxic_texts = test_df["toxic_sentence"].tolist()
    test_reference_texts = test_df["neutral_sentence"].tolist()
    detoxified_texts = [
        delete_and_reconstruct_pipeline(
            text,
            toxicity_model,
            reconstruction_model,
            toxicity_tokenizer,
            reconstruction_tokenizer
        ) for text in test_toxic_texts
    ]

    # Calculate and print metrics
    metrics = calculate_metrics(test_toxic_texts, detoxified_texts, test_reference_texts)
    print("\nEvaluation Metrics:")
    print(f"BLEU Score: {metrics['bleu']:.4f}")
    print(f"ROUGE-1 F1: {metrics['rouge1']:.4f}")
    print(f"ROUGE-2 F1: {metrics['rouge2']:.4f}")
    print(f"ROUGE-L F1: {metrics['rougeL']:.4f}")
    print(f"Semantic Similarity: {metrics['similarity']:.4f}")
    print(f"Sentiment Score: {metrics['sentiment']:.4f}")
    print(f"Fluency (Perplexity): {metrics['fluency']:.2f}")

    # Save results
    results_df = pd.DataFrame({
        'original': test_toxic_texts,
        'generated': detoxified_texts,
        'reference': test_reference_texts
    })
    results_df.to_csv(results_path, index=False)

# Run the full experiment only when this code is executed directly
# (module name "__main__"), not when it is imported from elsewhere.
if __name__ == "__main__":
    main()

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.51k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cuda:0
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mabhishekbhamare18[0m ([33mabhishekbhamare18-national-institute-of-technology-karnataka[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,No log,0.42291
2,0.491200,0.405841
3,0.491200,0.447118


tokenizer_config.json:   0%|          | 0.00/531 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,1.830448
2,2.761300,1.512219
3,2.761300,1.498162
4,1.125700,1.498093
5,0.884900,1.532131


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset



Evaluation Metrics:
BLEU Score: 0.4511
ROUGE-1 F1: 0.3014
ROUGE-2 F1: 0.2481
ROUGE-L F1: 0.3003
Semantic Similarity: 0.9439
Sentiment Score: 0.2120
Fluency (Perplexity): 253.31
