In [None]:
import torch
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from transformers import BertTokenizer, BertModel
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# Columns the model regresses; the order here fixes the order of the model outputs.
TARGET_COLS = [
    'toxicity', 'severe_toxicity', 'obscene', 'threat',
    'insult', 'identity_attack', 'sexual_explicit',
]

# Read every parquet shard of the civil-comments dataset.
df_test = pd.read_parquet("/kaggle/input/civil-comments/test-00000-of-00001.parquet")
df_train1 = pd.read_parquet("/kaggle/input/civil-comments/train-00000-of-00002.parquet")
df_train2 = pd.read_parquet("/kaggle/input/civil-comments/train-00001-of-00002.parquet")
df_val = pd.read_parquet("/kaggle/input/civil-comments/validation-00000-of-00001.parquet")

# NOTE(review): the test and validation shards are pooled with the training
# shards before being re-split below, so no untouched hold-out set remains —
# confirm this is intended.
all_shards = [df_test, df_train1, df_train2, df_val]
df = pd.concat(all_shards, ignore_index=True)

# 80/20 split with a fixed seed so the partition is reproducible.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

In [None]:
# Down-sample both splits to a fixed number of rows (seeded for reproducibility).
# The requested sizes are capped at the rows actually available because
# DataFrame.sample raises ValueError when n exceeds the frame length
# (sampling without replacement).
TRAIN_SAMPLE_SIZE = 70000
VAL_SAMPLE_SIZE = 14000

train_df = train_df.sample(n=min(TRAIN_SAMPLE_SIZE, len(train_df)), random_state=42)

# `sample_df` is still bound to the validation sample afterwards, exactly as
# the original cell left it, in case a later cell reads it.
sample_df = val_df.sample(n=min(VAL_SAMPLE_SIZE, len(val_df)), random_state=42)
val_df = sample_df

In [None]:
class ToxicityDataset(Dataset):
    """Map-style dataset pairing each comment text with its row of target scores.

    Args:
        texts: Sequence of comment texts (list, NumPy array or pandas Series).
        targets: 2-D array-like of numeric targets, one row per text.
        tokenizer: Callable tokenizer (e.g. a Hugging Face ``BertTokenizer``)
            that accepts the keyword arguments used in ``__getitem__``.
        max_len: Length every encoded sequence is padded / truncated to.

    Raises:
        ValueError: If ``texts`` and ``targets`` do not have the same length.
    """

    def __init__(self, texts, targets, tokenizer, max_len=128):
        # Normalise both inputs to NumPy arrays so ``[idx]`` is always
        # positional. A pandas Series with a non-default index (e.g. straight
        # out of ``train_test_split``) would otherwise be looked up by label.
        self.texts = np.asarray(texts, dtype=object)
        self.targets = np.asarray(targets, dtype=np.float32)
        if len(self.texts) != len(self.targets):
            raise ValueError(
                f"texts and targets must have the same length, "
                f"got {len(self.texts)} and {len(self.targets)}"
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of (text, targets) pairs."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at position ``idx`` and return one training sample.

        Returns:
            dict with keys ``'text'`` (str), ``'input_ids'`` and
            ``'attention_mask'`` (flattened tensors from the tokenizer) and
            ``'targets'`` (float32 tensor holding the target row).
        """
        text = str(self.texts[idx])

        # Calling the tokenizer directly is the documented replacement for the
        # deprecated ``encode_plus`` and accepts the same keyword arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'text': text,
            # The tokenizer returns a leading batch axis of size 1; flatten drops it.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(self.targets[idx], dtype=torch.float32),
        }

# Tokenizer matching the pretrained encoder checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Wrap each split in a dataset. `.values` hands over plain NumPy arrays, so
# samples are addressed by position rather than by DataFrame index label.
train_dataset = ToxicityDataset(
    texts=train_df['text'].values,
    targets=train_df[TARGET_COLS].values,
    tokenizer=tokenizer,
)
val_dataset = ToxicityDataset(
    texts=val_df['text'].values,
    targets=val_df[TARGET_COLS].values,
    tokenizer=tokenizer,
)

In [None]:
BATCH_SIZE = 16

# Options shared by the training and validation loaders.
LOADER_KWARGS = dict(batch_size=BATCH_SIZE, num_workers=4)

# Only the training data is shuffled; validation keeps its order.
train_loader = DataLoader(train_dataset, shuffle=True, **LOADER_KWARGS)
val_loader = DataLoader(val_dataset, shuffle=False, **LOADER_KWARGS)

In [None]:
!pip install hf_xet

In [None]:
class ToxicityRegressor(torch.nn.Module):
    """BERT encoder with a linear head that regresses ``n_classes`` scores in (0, 1).

    Args:
        n_classes: Number of output scores (one per target column).
        model_name: Checkpoint name or path handed to
            ``BertModel.from_pretrained``. Defaults to ``'bert-base-uncased'``.
        dropout: Dropout probability applied to the pooled encoder output.
        config: Optional ``BertConfig``. When given, the encoder is built
            directly from it (no pretrained weights are fetched) and
            ``model_name`` is ignored. Intended for callers that restore the
            weights afterwards with ``load_state_dict``.
    """

    def __init__(self, n_classes, model_name='bert-base-uncased', dropout=0.2, config=None):
        super().__init__()
        if config is not None:
            self.bert = BertModel(config)
        else:
            self.bert = BertModel.from_pretrained(model_name)
        self.dropout = torch.nn.Dropout(p=dropout)
        self.linear = torch.nn.Linear(self.bert.config.hidden_size, n_classes)
        # Sigmoid bounds every output to (0, 1); targets are assumed to lie in [0, 1].
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """Return one sigmoid-activated score per class for each input sequence."""
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        pooled_output = outputs.pooler_output
        output = self.dropout(pooled_output)
        output = self.linear(output)
        return self.sigmoid(output)

# Build the regressor with one output unit per target column.
model = ToxicityRegressor(len(TARGET_COLS))

# Use CUDA when it is available and report how many GPUs are visible.
n_gpus = torch.cuda.device_count()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Number of GPUS: {n_gpus}")

# With more than one GPU, wrap the model in DataParallel. Note that the
# wrapper exposes the real model as `model.module`.
if n_gpus > 1:
    print(f"Using {n_gpus} GPUs!")
    model = torch.nn.DataParallel(model)

model = model.to(device)

In [None]:
EPOCHS = 1
LEARNING_RATE = 2e-5

# Mean squared error: the targets are continuous scores, so this is a regression.
# `criterion` is read as a module-level name by train_epoch / eval_epoch.
criterion = torch.nn.MSELoss()
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

def train_epoch(model, data_loader, optimizer, device):
    """Run one optimisation pass over ``data_loader`` and return the mean batch loss.

    The loss function is the module-level ``criterion``. Each batch must be a
    mapping with ``'input_ids'``, ``'attention_mask'`` and ``'targets'`` tensors.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: Iterable of batches.
        optimizer: Optimizer stepping the model's parameters.
        device: Device each batch tensor is moved to before the forward pass.

    Returns:
        ``np.mean`` of the per-batch loss values.
    """
    model.train()
    batch_losses = []

    for batch in tqdm(data_loader, desc='Training'):
        ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        labels = batch['targets'].to(device)

        # Clear gradients left over from the previous step before back-propagating.
        optimizer.zero_grad()
        predictions = model(input_ids=ids, attention_mask=mask)
        batch_loss = criterion(predictions, labels)
        batch_loss.backward()
        optimizer.step()

        batch_losses.append(batch_loss.item())

    return np.mean(batch_losses)

def eval_epoch(model, data_loader, device):
    """Evaluate ``model`` on ``data_loader`` without tracking gradients.

    The loss function is the module-level ``criterion`` and the column names
    come from the module-level ``TARGET_COLS``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: Iterable of batches holding ``'input_ids'``,
            ``'attention_mask'`` and ``'targets'`` tensors.
        device: Device each batch tensor is moved to before the forward pass.

    Returns:
        Tuple ``(mean_loss, mae_scores)``: the ``np.mean`` of the per-batch
        losses and a dict mapping each target column to its mean absolute error.
    """
    model.eval()
    batch_losses, target_rows, output_rows = [], [], []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc='Evaluating'):
            labels = batch['targets'].to(device)
            predictions = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            )

            batch_losses.append(criterion(predictions, labels).item())

            # extend() adds one row per sample, so the lists stack into
            # (n_samples, n_targets) matrices below.
            target_rows.extend(labels.cpu().numpy())
            output_rows.extend(predictions.cpu().numpy())

    target_matrix = np.array(target_rows)
    output_matrix = np.array(output_rows)

    # One MAE per target column, matched by position in TARGET_COLS.
    per_column_mae = {
        col: mean_absolute_error(target_matrix[:, i], output_matrix[:, i])
        for i, col in enumerate(TARGET_COLS)
    }

    return np.mean(batch_losses), per_column_mae

In [None]:
# Track the best validation loss seen so far and keep a per-epoch history.
best_val_loss = float('inf')
history = {'train_loss': [], 'val_loss': [], 'val_mae': []}

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    train_loss = train_epoch(model, train_loader, optimizer, device)
    val_loss, val_mae = eval_epoch(model, val_loader, device)

    print(f'Train loss: {train_loss:.4f}')
    print(f'Val loss: {val_loss:.4f}')
    print('Validation MAE:')
    for k, v in val_mae.items():
        print(f'  {k}: {v:.4f}')

    history['train_loss'].append(train_loss)
    history['val_loss'].append(val_loss)
    history['val_mae'].append(val_mae)

    # Checkpoint whenever validation loss improves.
    if val_loss < best_val_loss:
        # Save the underlying module, not the DataParallel wrapper: the
        # wrapper's state_dict prefixes every key with 'module.', which a
        # plain (unwrapped) model cannot load.
        model_to_save = model.module if isinstance(model, torch.nn.DataParallel) else model
        torch.save(model_to_save.state_dict(), 'best_model.bin')
        best_val_loss = val_loss

In [None]:
def predict_toxicity(text, model, tokenizer, device, max_len=128):
    """Score a single text and return a ``{target column: score}`` mapping.

    Column names are taken from the module-level ``TARGET_COLS``; they are
    paired with the model outputs by position.

    Args:
        text: The text to score; coerced with ``str()`` before tokenizing.
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
            It is switched to eval mode and left there.
        tokenizer: Callable tokenizer (e.g. a Hugging Face ``BertTokenizer``).
        device: Device the encoded tensors are moved to.
        max_len: Length the encoded sequence is padded / truncated to.

    Returns:
        dict mapping each target column to its predicted score (NumPy scalar).
    """
    model.eval()

    # Calling the tokenizer directly is the documented replacement for the
    # deprecated ``encode_plus`` and accepts the same keyword arguments.
    encoding = tokenizer(
        str(text),
        add_special_tokens=True,
        max_length=max_len,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

    # Drop the size-1 batch axis so each score pairs with one column name.
    outputs = outputs.cpu().flatten().numpy()
    return dict(zip(TARGET_COLS, outputs))

# Example usage: score one sentence and list every target column.
sample_text = "this is a toxic comment"
predictions = predict_toxicity(sample_text, model, tokenizer, device)

print("Toxicity Predictions:")
for column, score in predictions.items():
    print(f"{column}: {score:.4f}")

SAVE THE MODEL FOR REUSE

In [None]:
import os
import json
import torch

# Make sure the output directory exists (no error if it already does).
os.makedirs('/kaggle/working/toxicity_model', exist_ok=True)

# 1. Save model weights. DataParallel keeps the real model under `.module`,
#    so unwrap it once and use the same object for the weights and the config.
base_model = model.module if isinstance(model, torch.nn.DataParallel) else model
torch.save(base_model.state_dict(), '/kaggle/working/toxicity_model/pytorch_model.bin')
bert_config = base_model.bert.config

# 2. Save the encoder configuration next to the weights.
bert_config.save_pretrained('/kaggle/working/toxicity_model')

# 3. Save the tokenizer files.
tokenizer.save_pretrained('/kaggle/working/toxicity_model')

# 4. Save the target column names; their order matches the model outputs.
with open('/kaggle/working/toxicity_model/target_columns.json', 'w') as f:
    json.dump(TARGET_COLS, f)

print("Model saved successfully in '/kaggle/working/toxicity_model' directory")

LOAD THE MODEL

In [None]:
from transformers import BertConfig, BertTokenizer
import json
import torch
# import pandas as pd
# import numpy as np
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import mean_absolute_error
from transformers import BertTokenizer, BertModel
# from torch.optim import AdamW
# from torch.utils.data import Dataset, DataLoader
# from tqdm import tqdm

class ToxicityRegressor(torch.nn.Module):
    """BERT encoder with a linear head that regresses ``n_classes`` scores in (0, 1).

    Args:
        n_classes: Number of output scores (one per target column).
        model_name: Checkpoint name or path handed to
            ``BertModel.from_pretrained``. Defaults to ``'bert-base-uncased'``.
        dropout: Dropout probability applied to the pooled encoder output.
        config: Optional ``BertConfig``. When given, the encoder is built
            directly from it (no pretrained weights are fetched) and
            ``model_name`` is ignored. Intended for callers that restore the
            weights afterwards with ``load_state_dict``.
    """

    def __init__(self, n_classes, model_name='bert-base-uncased', dropout=0.2, config=None):
        super().__init__()
        if config is not None:
            self.bert = BertModel(config)
        else:
            self.bert = BertModel.from_pretrained(model_name)
        self.dropout = torch.nn.Dropout(p=dropout)
        self.linear = torch.nn.Linear(self.bert.config.hidden_size, n_classes)
        # Sigmoid bounds every output to (0, 1); targets are assumed to lie in [0, 1].
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """Return one sigmoid-activated score per class for each input sequence."""
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        pooled_output = outputs.pooler_output
        output = self.dropout(pooled_output)
        output = self.linear(output)
        return self.sigmoid(output)


def load_toxicity_model(model_dir, device):
    """Rebuild the regressor, tokenizer and target columns saved in ``model_dir``.

    ``model_dir`` must contain the saved BERT config, the tokenizer files,
    ``target_columns.json`` and ``pytorch_model.bin``.

    Args:
        model_dir: Directory holding the saved artefacts.
        device: Device the weights are mapped to and the model is moved to.

    Returns:
        Tuple ``(model, tokenizer, target_cols)``. The model is in eval mode
        and is wrapped in ``DataParallel`` when more than one GPU is visible.
    """
    # 1. Load configuration
    config = BertConfig.from_pretrained(model_dir)

    # 2. Load tokenizer
    tokenizer = BertTokenizer.from_pretrained(model_dir)

    # 3. Load target columns
    with open(f'{model_dir}/target_columns.json', 'r') as f:
        target_cols = json.load(f)

    # 4. Build the architecture from the saved config. Every weight is
    #    overwritten by the state dict below, so fetching the pretrained
    #    checkpoint first would be wasted work.
    model = ToxicityRegressor(len(target_cols), config=config)

    # 5. Load weights (map to device if needed)
    state_dict = torch.load(f'{model_dir}/pytorch_model.bin', map_location=device)
    # A state dict saved from a DataParallel wrapper carries a 'module.'
    # prefix on every key; strip it so either form loads.
    prefix = 'module.'
    state_dict = {
        (key[len(prefix):] if key.startswith(prefix) else key): value
        for key, value in state_dict.items()
    }
    model.load_state_dict(state_dict)

    n_gpus = torch.cuda.device_count()
    print(f"Number of GPUS: {n_gpus}")
    if n_gpus > 1:
        print(f"Using {n_gpus} GPUs!")
        model = torch.nn.DataParallel(model)

    model = model.to(device)
    model.eval()

    return model, tokenizer, target_cols

# Usage: pick the device, then restore the model, tokenizer and target columns
# from the directory written by the save cell.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model, tokenizer, TARGET_COLS = load_toxicity_model('/kaggle/working/toxicity_model', device)

In [None]:
def predict_batch(texts, model, tokenizer, device, batch_size=16, max_len=128):
    """Score several texts, tokenizing and predicting ``batch_size`` at a time.

    Column names are taken from the module-level ``TARGET_COLS``; they are
    paired with the model outputs by position.

    Args:
        texts: Iterable of strings (list, tuple, array, Series). A single
            string is treated as one text rather than a sequence of characters.
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
            It is switched to eval mode and left there.
        tokenizer: Callable tokenizer (e.g. a Hugging Face ``BertTokenizer``).
        device: Device the encoded tensors are moved to.
        batch_size: Number of texts per forward pass.
        max_len: Length every encoded sequence is padded / truncated to.

    Returns:
        List with one ``{target column: score}`` dict per input text, in input
        order. Empty input yields an empty list.
    """
    model.eval()

    # Materialise as a list so slicing is positional for any input container.
    texts = [texts] if isinstance(texts, str) else list(texts)
    all_predictions = []

    for start in range(0, len(texts), batch_size):
        batch_texts = texts[start:start + batch_size]

        # Calling the tokenizer directly is the documented replacement for the
        # deprecated ``batch_encode_plus`` and accepts the same keyword arguments.
        encoding = tokenizer(
            batch_texts,
            add_special_tokens=True,
            max_length=max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        input_ids = encoding['input_ids'].to(device)
        attention_mask = encoding['attention_mask'].to(device)

        with torch.no_grad():
            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask)

        # The model returns a single (batch, n_targets) tensor, also when it is
        # wrapped in DataParallel (which gathers the per-GPU outputs), so no
        # special-casing is needed. Indexing ``outputs[0]`` would keep only the
        # first row of the batch.
        batch_preds = outputs.cpu().numpy()

        all_predictions.extend(dict(zip(TARGET_COLS, row)) for row in batch_preds)

    return all_predictions

# Example batch usage
texts = [
    "I love this product!",
    "Go die in a hole, you worthless scum",
    "The weather is nice today",
    "People like you should be exterminated",
    "Citizens of US are very dumb",
]

results = predict_batch(texts, model, tokenizer, device)

# Print each text (first 50 characters) followed by the scores above 0.3.
for text, pred in zip(texts, results):
    print(f"\nText: {text[:50]}...")
    significant = [(metric, score) for metric, score in pred.items() if score > 0.3]
    for metric, score in significant:
        print(f"{metric:>16}: {score:.4f}")

SAVE AND LOAD: A SIMPLER ALTERNATIVE

In [None]:
# Saving: bundle weights, tokenizer, target columns and encoder config into a
# single file. DataParallel keeps the real model under `.module`, so unwrap once.
base_model = model.module if isinstance(model, torch.nn.DataParallel) else model

save_dict = {
    'state_dict': base_model.state_dict(),
    'tokenizer': tokenizer,
    'target_cols': TARGET_COLS,
    'config': base_model.bert.config,
}

torch.save(save_dict, 'toxicity_model_full.pt')

In [None]:
# Loading
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The checkpoint bundles pickled Python objects (the tokenizer and the BERT
# config) alongside the tensors. torch.load's restricted `weights_only=True`
# mode (the default from PyTorch 2.6) rejects such objects, so full unpickling
# is requested explicitly.
# SECURITY: full unpickling can execute arbitrary code — only load checkpoint
# files you created yourself or otherwise trust.
checkpoint = torch.load('toxicity_model_full.pt', map_location=device, weights_only=False)

model = ToxicityRegressor(len(checkpoint['target_cols']))
model.load_state_dict(checkpoint['state_dict'])
model.to(device)
# Inference only from here on: eval mode disables dropout so predictions are
# deterministic.
model.eval()
tokenizer = checkpoint['tokenizer']
TARGET_COLS = checkpoint['target_cols']

In [None]:
def predict_batch(texts, model, tokenizer, device, batch_size=16, max_length=128):
    """Score several texts: tokenize them all at once, then predict in batches.

    NOTE: this redefines ``predict_batch`` from an earlier cell; the length
    parameter is called ``max_length`` here (``max_len`` in the earlier one).

    Column names are taken from the module-level ``TARGET_COLS``; they are
    paired with the model outputs by position.

    Args:
        texts: Iterable of strings (list, tuple, array, Series). A single
            string is treated as one text rather than a sequence of characters.
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
            It is switched to eval mode and left there.
        tokenizer: Callable tokenizer (e.g. a Hugging Face ``BertTokenizer``).
        device: Device each batch is moved to.
        batch_size: Number of texts per forward pass.
        max_length: Length every encoded sequence is padded / truncated to.

    Returns:
        List with one ``{target column: score}`` dict per input text, in input
        order. Empty input yields an empty list.
    """
    # Without eval mode dropout stays active and predictions vary run to run.
    model.eval()

    texts = [texts] if isinstance(texts, str) else list(texts)
    if not texts:
        # Nothing to tokenize; avoid calling the tokenizer with an empty batch.
        return []

    # Tokenize all texts
    inputs = tokenizer(
        texts,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
        add_special_tokens=True
    )

    # Create dataset
    dataset = torch.utils.data.TensorDataset(
        inputs['input_ids'],
        inputs['attention_mask']
    )
    # Default shuffle=False keeps results aligned with the input order.
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size)

    all_scores = []

    # Predict in batches
    with torch.no_grad():
        for input_ids, attention_mask in dataloader:
            outputs = model(
                input_ids=input_ids.to(device),
                attention_mask=attention_mask.to(device)
            )
            all_scores.extend(outputs.cpu().numpy())

    # Convert to list of dictionaries
    return [dict(zip(TARGET_COLS, scores)) for scores in all_scores]

# Example batch usage
texts = [
    "I love this product!",
    "Go die in a hole, you worthless scum",
    "The weather is nice today",
    "People like you should be exterminated",
    "Citizens of US are very dumb",
]

batch_results = predict_batch(texts, model, tokenizer, device)

# Print each text in full, followed by the scores above 0.3.
for text, scores in zip(texts, batch_results):
    print(f"\nText: {text}")
    significant = [(metric, score) for metric, score in scores.items() if score > 0.3]
    for metric, score in significant:
        print(f"{metric:>20}: {score:.4f}")

In [None]:
rm /kaggle/working/best_model.bin