In [1]:
%pip install torch transformers scikit-learn pandas numpy matplotlib seaborn tqdm nlpaug nltk datasets tf-keras accelerate

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
# Data Loading and Inspection
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.utils import resample
import re
import string
from nlpaug.augmenter.word import SynonymAug
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
data_path = '../../Data/Dataset_2.csv'
df = pd.read_csv(data_path, encoding='latin1', delimiter=',', quotechar='"')

# Inspect the dataset
print(df.head())
print(df.info())
print(df.describe())

# Check for missing values
print(df.isnull().sum())

# Drop rows with missing values
df.dropna(subset=['tweet'], inplace=True)

# Map classes to binary labels
df['label'] = df['class'].apply(lambda x: 1 if x == 0 else 0)  # 1: Hate Speech, 0: Non-Hate Speech

# Drop unnecessary columns
df = df[['tweet', 'label']]

# Check class distribution
print(df['label'].value_counts())

# Data Preprocessing
def preprocess_text(text):
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove mentions (@user)
    text = re.sub(r'@\w+', '', text)
    # Remove hashtags (#hashtag)
    text = re.sub(r'#\w+', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Convert to lowercase
    text = text.lower()
    return text

df['clean_tweet'] = df['tweet'].apply(preprocess_text)

# Separate majority and minority classes
df_majority = df[df.label == 0]  # Non-Hate Speech
df_minority = df[df.label == 1]  # Hate Speech

# Data Balancing
# Upsample minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # Sample with replacement
                                 n_samples=len(df_majority),    # Match majority class
                                 random_state=42)  # Reproducible results

# Combine majority class with upsampled minority class
df_balanced = pd.concat([df_majority, df_minority_upsampled])

# Verify balanced class distribution
print(df_balanced['label'].value_counts())

# Tokenization
# Initialize tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Function to tokenize and encode data
def encode_data(texts, tokenizer, max_length=128):
    encodings = tokenizer(list(texts), truncation=True, padding=True, max_length=max_length, return_tensors='pt')
    return encodings

# Encode training and testing data
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df_balanced['clean_tweet'], df_balanced['label'], test_size=0.2, random_state=42, stratify=df_balanced['label']
)

train_encodings = encode_data(train_texts, tokenizer)
val_encodings = encode_data(val_texts, tokenizer)

# Dataset Creation for BERT
class HateSpeechDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels.iloc[idx], dtype=torch.long)
        return item

# Create datasets
train_dataset = HateSpeechDataset(train_encodings, train_labels)
val_dataset = HateSpeechDataset(val_encodings, val_labels)

  from .autonotebook import tqdm as notebook_tqdm



   Unnamed: 0  count  hate_speech  offensive_language  neither  class  \
0           0      3            0                   0        3      2   
1           1      3            0                   3        0      1   
2           2      3            0                   3        0      1   
3           3      3            0                   2        1      1   
4           4      6            0                   6        0      1   

                                               tweet  
0  !!! RT @mayasolovely: As a woman you shouldn't...  
1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...  
2  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...  
3  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...  
4  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Unnamed: 0          100 no



In [3]:
# BERT Model Training
# Initialize model
model_bert = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2, use_cache=False, gradient_checkpointing=True)

# Define training arguments
training_args_bert = TrainingArguments(
    output_dir='./results_bert',
    evaluation_strategy='epoch',
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir='./logs',
    fp16=True, 
    fp16_full_eval=True,
)

# Initialize trainer
trainer_bert = Trainer(
    model=model_bert,
    args=training_args_bert,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Train the model
trainer_bert.train()

# Evaluate the model
results_bert = trainer_bert.evaluate()
print(results_bert)

# BERT Model Evaluation
# Predict on validation set
predictions_bert = trainer_bert.predict(val_dataset)
pred_labels_bert = predictions_bert.predictions.argmax(-1)

# Generate classification report
print(classification_report(val_labels, pred_labels_bert, target_names=['Non-Hate Speech', 'Hate Speech']))


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
                                                
 10%|█         | 10/100 [00:38<04:49,  3.21s/it]

{'eval_loss': 0.11268638074398041, 'eval_runtime': 1.1951, 'eval_samples_per_second': 33.469, 'eval_steps_per_second': 2.51, 'epoch': 1.0}


                                                
 20%|██        | 20/100 [01:13<04:30,  3.38s/it]

{'eval_loss': 0.01480932254344225, 'eval_runtime': 1.1261, 'eval_samples_per_second': 35.52, 'eval_steps_per_second': 2.664, 'epoch': 2.0}


                                                
 30%|███       | 30/100 [01:46<03:34,  3.07s/it]

{'eval_loss': 0.0034730557817965746, 'eval_runtime': 1.124, 'eval_samples_per_second': 35.588, 'eval_steps_per_second': 2.669, 'epoch': 3.0}


                                                
 40%|████      | 40/100 [02:19<03:05,  3.09s/it]

{'eval_loss': 0.0016525499522686005, 'eval_runtime': 1.1832, 'eval_samples_per_second': 33.806, 'eval_steps_per_second': 2.535, 'epoch': 4.0}


                                                
 50%|█████     | 50/100 [02:51<02:28,  2.98s/it]

{'eval_loss': 0.001128064701333642, 'eval_runtime': 1.1659, 'eval_samples_per_second': 34.307, 'eval_steps_per_second': 2.573, 'epoch': 5.0}


                                                
 60%|██████    | 60/100 [03:24<01:59,  3.00s/it]

{'eval_loss': 0.0008961202693171799, 'eval_runtime': 1.2386, 'eval_samples_per_second': 32.294, 'eval_steps_per_second': 2.422, 'epoch': 6.0}


                                                
 70%|███████   | 70/100 [03:56<01:28,  2.96s/it]

{'eval_loss': 0.0007861878839321434, 'eval_runtime': 1.1954, 'eval_samples_per_second': 33.461, 'eval_steps_per_second': 2.51, 'epoch': 7.0}


                                                
 80%|████████  | 80/100 [04:28<00:59,  3.00s/it]

{'eval_loss': 0.0007242898573167622, 'eval_runtime': 1.1902, 'eval_samples_per_second': 33.609, 'eval_steps_per_second': 2.521, 'epoch': 8.0}


                                                
 90%|█████████ | 90/100 [05:05<00:36,  3.67s/it]

{'eval_loss': 0.0006911147502250969, 'eval_runtime': 1.5685, 'eval_samples_per_second': 25.502, 'eval_steps_per_second': 1.913, 'epoch': 9.0}


                                                 
100%|██████████| 100/100 [06:18<00:00,  3.79s/it]


{'eval_loss': 0.0006797086098231375, 'eval_runtime': 1.6065, 'eval_samples_per_second': 24.899, 'eval_steps_per_second': 1.867, 'epoch': 10.0}
{'train_runtime': 378.5218, 'train_samples_per_second': 4.121, 'train_steps_per_second': 0.264, 'train_loss': 0.04100136280059814, 'epoch': 10.0}


100%|██████████| 3/3 [00:03<00:00,  1.13s/it]


{'eval_loss': 0.0009760856628417969, 'eval_runtime': 5.8284, 'eval_samples_per_second': 6.863, 'eval_steps_per_second': 0.515, 'epoch': 10.0}


100%|██████████| 3/3 [00:03<00:00,  1.07s/it]


                 precision    recall  f1-score   support

Non-Hate Speech       1.00      1.00      1.00        20
    Hate Speech       1.00      1.00      1.00        20

       accuracy                           1.00        40
      macro avg       1.00      1.00      1.00        40
   weighted avg       1.00      1.00      1.00        40



In [4]:
# LSTM + CNN Model Definition
import torch.nn as nn
import torch.nn.functional as F

class LSTM_CNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, lstm_hidden_dim, cnn_hidden_dim, num_classes, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, lstm_hidden_dim, batch_first=True)
        self.conv1d = nn.Conv1d(lstm_hidden_dim, cnn_hidden_dim, kernel_size=3, padding=1)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(cnn_hidden_dim, num_classes)
    
    def forward(self, x):
        # Handle both float and long inputs
        if x.dtype == torch.float32:
            x = x.long()
        
        # Regular forward pass
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        
        # Reshape for CNN
        conv_in = lstm_out.transpose(1, 2)
        conv_out = self.conv1d(conv_in)
        
        # Global max pooling
        pooled = torch.max(conv_out, dim=2)[0]
        
        # Final classification
        dropped = self.dropout(pooled)
        output = self.fc(dropped)
        return output

In [5]:
# Vocabulary Building and Text Sequencing
from collections import Counter

# Build vocabulary
def build_vocab(texts, max_vocab_size=10000):
    counter = Counter()
    for text in texts:
        tokens = text.split()  # Simple whitespace-based tokenization
        counter.update(tokens)
    vocab = {word: idx + 1 for idx, (word, _) in enumerate(counter.most_common(max_vocab_size))}
    vocab[' '] = 0  # Padding token
    return vocab

# Convert text to sequences
def text_to_sequence(text, vocab, max_len=100):
    tokens = text.split()
    sequence = [vocab.get(token, 0) for token in tokens]  # Use 0 for unknown tokens
    sequence = sequence[:max_len]  # Truncate if longer than max_len 
    sequence += [0] * (max_len - len(sequence))  # Pad if shorter than max_len
    return sequence

# Build vocabulary from the dataset
vocab = build_vocab(df_balanced['clean_tweet'], max_vocab_size=10000)

# Convert all texts to sequences
X_sequences = [text_to_sequence(text, vocab, max_len=100) for text in df_balanced['clean_tweet']]
X_sequences = torch.tensor(X_sequences, dtype=torch.long)


# Split data (convert labels to NumPy array)
X_train, X_val, y_train, y_val = train_test_split(
    X_sequences, 
    df_balanced['label'].values,  # Convert to NumPy array
    test_size=0.2, 
    random_state=42, 
    stratify=df_balanced['label']
)

# Create datasets for LSTM + CNN
class HateSpeechDatasetLSTM_CNN(Dataset):
    def __init__(self, sequences, labels):
        self.sequences = sequences
        # Convert labels to a numpy array if it's a pandas Series
        self.labels = torch.tensor(labels.to_numpy() if isinstance(labels, pd.Series) else labels)
    
    def __len__(self):
        return len(self.sequences)
    
    def __getitem__(self, idx):
        return self.sequences[idx], self.labels[idx]

# Create datasets for LSTM + CNN
train_dataset_lstm_cnn = HateSpeechDatasetLSTM_CNN(X_train, y_train)
val_dataset_lstm_cnn = HateSpeechDatasetLSTM_CNN(X_val, y_val)

In [6]:
# Define hyperparameters for LSTM + CNN
vocab_size = len(vocab)
embed_dim = 100
lstm_hidden_dim = 128
cnn_hidden_dim = 128
num_classes = 2
dropout = 0.5

# Initialize model for LSTM + CNN
model_lstm_cnn = LSTM_CNN(vocab_size, embed_dim, lstm_hidden_dim, cnn_hidden_dim, num_classes, dropout)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model_lstm_cnn.parameters(), lr=1e-3)

# Define training loop for LSTM + CNN
def train_model(model, criterion, optimizer, train_loader, val_loader, num_epochs=10):
    model.train()
    for epoch in range(num_epochs):
        epoch_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids, labels = batch
            outputs = model(input_ids)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss/len(train_loader)}')
        
        # Evaluate on validation set
        model.eval()
        val_loss = 0
        correct = 0
        total = 0
        with torch.no_grad():
            for batch in val_loader:
                input_ids, labels = batch
                outputs = model(input_ids)
                loss = criterion(outputs, labels)
                val_loss += loss.item()
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
        print(f'Validation Loss: {val_loss/len(val_loader)}, Accuracy: {correct/total}')

# Create DataLoader for LSTM + CNN
train_loader = DataLoader(train_dataset_lstm_cnn, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset_lstm_cnn, batch_size=16, shuffle=False)

# Train the LSTM + CNN model
train_model(model_lstm_cnn, criterion, optimizer, train_loader, val_loader, num_epochs=10)

Epoch 1/10, Loss: 0.5468898057937622
Validation Loss: 0.3733650545279185, Accuracy: 1.0
Epoch 2/10, Loss: 0.2108850184828043
Validation Loss: 0.060985119392474495, Accuracy: 1.0
Epoch 3/10, Loss: 0.021182815986685456
Validation Loss: 0.003503371961414814, Accuracy: 1.0
Epoch 4/10, Loss: 0.0009577138407621533
Validation Loss: 0.0005716304876841605, Accuracy: 1.0
Epoch 5/10, Loss: 0.00018391310150036588
Validation Loss: 0.000246094382115795, Accuracy: 1.0
Epoch 6/10, Loss: 9.0331923274789e-05
Validation Loss: 0.00017013044271152467, Accuracy: 1.0
Epoch 7/10, Loss: 6.602233697776683e-05
Validation Loss: 0.00014498387827188708, Accuracy: 1.0
Epoch 8/10, Loss: 5.61921480766614e-05
Validation Loss: 0.00013390692765824497, Accuracy: 1.0
Epoch 9/10, Loss: 5.420879570010584e-05
Validation Loss: 0.0001285583627274415, Accuracy: 1.0
Epoch 10/10, Loss: 4.906716603727546e-05
Validation Loss: 0.00012418506715524322, Accuracy: 1.0


In [7]:
# Data Augmentation
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nlpaug.augmenter.word import SynonymAug

aug = SynonymAug(aug_src='wordnet')

def augment_text(text, augmenter, num_aug=5):
    try:
        augmented = augmenter.augment(text, n=num_aug)
        return augmented if augmented else []
    except Exception as e:
        print(f"Augmentation failed for text: {text}. Error: {str(e)}")
        return []

# Augment the dataset
augmented_texts = []
augmented_labels = []

# Reset index to ensure contiguous integer indexing
df_balanced = df_balanced.reset_index(drop=True)  # Critical fix

for i, text in enumerate(df_balanced['clean_tweet']):
    try:
        augmented = augment_text(text, aug)
        if len(augmented) > 0:
            augmented_texts.extend(augmented)
            augmented_labels.extend([df_balanced['label'].iloc[i]] * len(augmented))
    except Exception as e:
        print(f"Error augmenting text: {text}. Error: {e}")

# Verify lengths match
print(f"Augmented texts: {len(augmented_texts)}, Augmented labels: {len(augmented_labels)}")

# Create augmented DataFrame
df_augmented = pd.DataFrame({'clean_tweet': augmented_texts, 'label': augmented_labels})

# Combine and split data
df_combined = pd.concat([df_balanced, df_augmented]).reset_index(drop=True)

# Convert combined texts to sequences
X_combined = [text_to_sequence(text, vocab, max_len=100) for text in df_combined['clean_tweet']]
X_combined = torch.tensor(X_combined, dtype=torch.long)

# Split data with error handling
try:
    X_train_combined, X_val_combined, y_train_combined, y_val_combined = train_test_split(
        X_combined, df_combined['label'], test_size=0.2, random_state=42, stratify=df_combined['label']
    )
except ValueError:
    # Fallback if stratify fails
    X_train_combined, X_val_combined, y_train_combined, y_val_combined = train_test_split(
        X_combined, df_combined['label'], test_size=0.2, random_state=42
    )

# Create datasets
train_dataset_combined = HateSpeechDatasetLSTM_CNN(X_train_combined, y_train_combined)
val_dataset_combined = HateSpeechDatasetLSTM_CNN(X_val_combined, y_val_combined)

# DataLoader
train_loader_combined = DataLoader(train_dataset_combined, batch_size=16, shuffle=True)
val_loader_combined = DataLoader(val_dataset_combined, batch_size=16, shuffle=False)

# Train with augmented data
train_model(model_lstm_cnn, criterion, optimizer, train_loader_combined, val_loader_combined, num_epochs=10)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Aditya Mohan
[nltk_data]     Khade\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to C:\Users\Aditya Mohan
[nltk_data]     Khade\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\Aditya Mohan
[nltk_data]     Khade\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Augmented texts: 980, Augmented labels: 980
Epoch 1/10, Loss: 0.002300557940836401
Validation Loss: 1.699403934859826e-05, Accuracy: 1.0
Epoch 2/10, Loss: 1.1309421925959167e-05
Validation Loss: 8.718175179941074e-06, Accuracy: 1.0
Epoch 3/10, Loss: 7.449379115884875e-06
Validation Loss: 6.47841126616792e-06, Accuracy: 1.0
Epoch 4/10, Loss: 5.7099986641764715e-06
Validation Loss: 5.26134607904775e-06, Accuracy: 1.0
Epoch 5/10, Loss: 4.685210910793103e-06
Validation Loss: 4.446768434718251e-06, Accuracy: 1.0
Epoch 6/10, Loss: 3.909315938024951e-06
Validation Loss: 3.816298211252919e-06, Accuracy: 1.0
Epoch 7/10, Loss: 3.3507437116618273e-06
Validation Loss: 3.3113258192922028e-06, Accuracy: 1.0
Epoch 8/10, Loss: 2.8941619147791258e-06
Validation Loss: 2.889797159847755e-06, Accuracy: 1.0
Epoch 9/10, Loss: 2.57029895985428e-06
Validation Loss: 2.5846604520059675e-06, Accuracy: 1.0
Epoch 10/10, Loss: 2.254601613111521e-06
Validation Loss: 2.2979014564346772e-06, Accuracy: 1.0


In [8]:
# Data Augmentation
import nltk
import copy
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')

from nlpaug.augmenter.word import SynonymAug

aug = SynonymAug(aug_src='wordnet')

def augment_text(text, augmenter, num_aug=5):
    try:
        augmented = augmenter.augment(text, n=num_aug)
        return augmented if augmented else []
    except Exception as e:
        print(f"Augmentation failed for text: {text}. Error: {str(e)}")
        return []

# Augment the dataset
augmented_texts = []
augmented_labels = []

# Reset index to ensure contiguous integer indexing
df_balanced = df_balanced.reset_index(drop=True)  # Critical fix

for i, text in enumerate(df_balanced['clean_tweet']):
    try:
        augmented = augment_text(text, aug)
        if len(augmented) > 0:
            augmented_texts.extend(augmented)
            augmented_labels.extend([df_balanced['label'].iloc[i]] * len(augmented))
    except Exception as e:
        print(f"Error augmenting text: {text}. Error: {e}")

# Verify lengths match
print(f"Augmented texts: {len(augmented_texts)}, Augmented labels: {len(augmented_labels)}")

# Create augmented DataFrame
df_augmented = pd.DataFrame({'clean_tweet': augmented_texts, 'label': augmented_labels})

# Combine and split data
df_combined = pd.concat([df_balanced, df_augmented]).reset_index(drop=True)

# Convert combined texts to sequences
X_combined = [text_to_sequence(text, vocab, max_len=100) for text in df_combined['clean_tweet']]
X_combined = torch.tensor(X_combined, dtype=torch.long)

# Split data with error handling
try:
    X_train_combined, X_val_combined, y_train_combined, y_val_combined = train_test_split(
        X_combined, df_combined['label'], test_size=0.2, random_state=42, stratify=df_combined['label']
    )
except ValueError:
    # Fallback if stratify fails
    X_train_combined, X_val_combined, y_train_combined, y_val_combined = train_test_split(
        X_combined, df_combined['label'], test_size=0.2, random_state=42
    )

# Create datasets for LSTM + CNN
train_dataset_combined = HateSpeechDatasetLSTM_CNN(X_train_combined, y_train_combined)
val_dataset_combined = HateSpeechDatasetLSTM_CNN(X_val_combined, y_val_combined)

# DataLoader
train_loader_combined = DataLoader(train_dataset_combined, batch_size=16, shuffle=True)
val_loader_combined = DataLoader(val_dataset_combined, batch_size=16, shuffle=False)

# Define hyperparameters for LSTM + CNN
vocab_size = len(vocab)
embed_dim = 100
lstm_hidden_dim = 128
cnn_hidden_dim = 128
num_classes = 2
dropout = 0.5

# Initialize model for LSTM + CNN
model_lstm_cnn = LSTM_CNN(vocab_size, embed_dim, lstm_hidden_dim, cnn_hidden_dim, num_classes, dropout)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model_lstm_cnn.parameters(), lr=1e-3)

# Define training loop for LSTM + CNN
def train_model(model, criterion, optimizer, train_loader, val_loader, num_epochs=10):
    model.train()
    for epoch in range(num_epochs):
        epoch_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids, labels = batch
            outputs = model(input_ids)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss/len(train_loader)}')
        
        # Evaluate on validation set
        model.eval()
        val_loss = 0
        correct = 0
        total = 0
        with torch.no_grad():
            for batch in val_loader:
                input_ids, labels = batch
                outputs = model(input_ids)
                loss = criterion(outputs, labels)
                val_loss += loss.item()
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
        print(f'Validation Loss: {val_loss/len(val_loader)}, Accuracy: {correct/total}')

# Train with augmented data
train_model(model_lstm_cnn, criterion, optimizer, train_loader_combined, val_loader_combined, num_epochs=10)

# Adversarial Training
from torch.autograd import Variable

def adversarial_train(model, criterion, optimizer, train_loader, val_loader, epsilon=0.01, num_epochs=10):
    model.train()
    for epoch in range(num_epochs):
        epoch_loss = 0
        for batch in train_loader:
            input_ids, labels = batch
            
            # STEP 1: Regular forward pass and loss calculation
            outputs = model(input_ids)
            loss = criterion(outputs, labels)
            
            # STEP 2: Detach everything to avoid backward graph issues
            loss_value = loss.item()  # Get the scalar value
            
            # STEP 3: Create a copy of the model for adversarial example generation
            adv_model = copy.deepcopy(model)
            
            # STEP 4: Regular backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            
            # STEP 5: Generate adversarial examples using the copied model
            # Apply perturbation to parameters of the copied model
            for p_orig, p_adv in zip(model.parameters(), adv_model.parameters()):
                if p_orig.grad is not None:
                    # Apply perturbation to the adversarial model
                    p_adv.data = p_orig.data + epsilon * torch.sign(p_orig.grad)
            
            # STEP 6: Forward pass with adversarial model
            adv_outputs = adv_model(input_ids)
            adv_loss = criterion(adv_outputs, labels)
            adv_loss_value = adv_loss.item()  # Get the scalar value
            
            # STEP 7: Update the original model with combined loss
            # We'll scale the gradients by (1 + adv_loss_value/loss_value)
            # This mimics the effect of adding the adversarial loss
            if loss_value > 0:
                scale_factor = 1.0 + (adv_loss_value / loss_value)
                for p in model.parameters():
                    if p.grad is not None:
                        p.grad.data *= scale_factor
            
            # STEP 8: Update parameters
            optimizer.step()
            
            # Track total loss
            epoch_loss += (loss_value + adv_loss_value)
        
        avg_loss = epoch_loss / len(train_loader)
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.6f}')
        
        # Evaluation
        model.eval()
        val_loss = 0
        correct = 0
        total = 0
        with torch.no_grad():
            for batch in val_loader:
                input_ids, labels = batch
                outputs = model(input_ids)
                loss = criterion(outputs, labels)
                val_loss += loss.item()
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
        
        val_avg_loss = val_loss / len(val_loader)
        accuracy = correct / total
        print(f'Validation Loss: {val_avg_loss:.6f}, Accuracy: {accuracy:.4f}')

# Adversarial train LSTM + CNN
# Adversarial train LSTM + CNN
adversarial_train(model_lstm_cnn, criterion, optimizer, train_loader_combined, val_loader_combined, epsilon=0.01, num_epochs=10)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Aditya Mohan
[nltk_data]     Khade\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to C:\Users\Aditya Mohan
[nltk_data]     Khade\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\Aditya Mohan
[nltk_data]     Khade\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Augmented texts: 980, Augmented labels: 980
Epoch 1/10, Loss: 0.21434751825438716
Validation Loss: 0.01193646084672461, Accuracy: 0.9957627118644068
Epoch 2/10, Loss: 0.004503156672137142
Validation Loss: 0.0017961252519550423, Accuracy: 1.0
Epoch 3/10, Loss: 0.0011189450382578614
Validation Loss: 0.0008884680554425965, Accuracy: 1.0
Epoch 4/10, Loss: 0.0005553946715026849
Validation Loss: 0.0004826804336820108, Accuracy: 1.0
Epoch 5/10, Loss: 0.0003329137062653406
Validation Loss: 0.0003304954358706406, Accuracy: 1.0
Epoch 6/10, Loss: 0.0002258159727994175
Validation Loss: 0.00023949839160195552, Accuracy: 1.0
Epoch 7/10, Loss: 0.0001592193924483643
Validation Loss: 0.0001831476777927795, Accuracy: 1.0
Epoch 8/10, Loss: 0.00011962741236703104
Validation Loss: 0.00014103800955732974, Accuracy: 1.0
Epoch 9/10, Loss: 9.309392093416398e-05
Validation Loss: 0.00011454651915604094, Accuracy: 1.0
Epoch 10/10, Loss: 7.40914558449176e-05
Validation Loss: 9.437453033266744e-05, Accuracy: 1.0
Ep