In [2]:
pip install nlpaug

Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/410.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━[0m [32m194.6/410.5 kB[0m [31m5.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: nlpaug
Successfully installed nlpaug-1.1.11


In [3]:
pip install transformers

Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m39.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m32.3 MB/s[0m eta [36m0:00:00[0m
Col

In [4]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import nlpaug.augmenter.word as naw
import re

from transformers import BertTokenizer, BertForSequenceClassification

# Load the training and test datasets
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Define the preprocessing function
def preprocess(text):
    text = text.lower()
    text = re.sub(r'[AP]{1}M', ' ', text)
    text = re.sub(r'@\S+', ' ', text)
    text = re.sub(r'https*://\S+', ' ', text)
    text = re.sub(r'[&]+', ' ', text)
    text = re.sub(r"jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec", ' ', text)
    text = re.sub(r'A\S+:', ' ', text)
    punctuations = '!"#$%&()*+,-./:;?@[]^_`{|}~'
    for p in punctuations:
        text = text.replace(p, " ")
    text = re.sub(r'lo+l', 'laughing out loud', text)
    text = re.sub(r'coo+l', 'cool', text)
    text = re.sub(r'go+a+l+', 'goal', text)
    text = re.sub(r'so+', 'so', text)
    text = re.sub(r'bo+h+o*b', 'oh', text)
    text = re.sub(r'\d+', ' ', text)
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    text = re.sub(r'cant', 'can not', text)
    text = re.sub(r'wont', 'will not', text)
    text = re.sub(r'imb', 'I am', text)
    text = re.sub(r'didnt', 'did not', text)
    text = re.sub(r'couldnt', 'could not', text)
    text = re.sub(r'isnt', 'is not', text)
    text = re.sub(r'dont', 'do not', text)
    text = re.sub(r'lil', 'little', text)
    text = re.sub(r'alil', 'a little', text)
    text = re.sub(r'view and download video', ' ', text)
    text = re.sub(r'viaZ', ' ', text)
    text = ' '.join([word for word in text.split() if len(word) > 1])
    return text

# Preprocess the text data
train_df['text'] = train_df['text'].apply(preprocess)
test_df['text'] = test_df['text'].apply(preprocess)

# Initialize the BERT tokenizer and load the pre-trained BERT model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Data Augmentation
aug = naw.SynonymAug(aug_src='wordnet')

# Augment the training data
augmented_texts = [aug.augment(text) for text in train_df['text']]
augmented_labels = train_df['target']

# Combine original and augmented data
X = train_df['text'].tolist() + augmented_texts
y = train_df['target'].tolist() + augmented_labels.tolist()

X_test = test_df['text'].values

# Tokenize the text data
X_tokenized = [tokenizer.encode(text, add_special_tokens=True) for text in X]
X_test_tokenized = [tokenizer.encode(text, add_special_tokens=True) for text in X_test]

# Determine the maximum sequence length in both training and test data
max_seq_length_train = max(len(seq) for seq in X_tokenized)
max_seq_length_test = max(len(seq) for seq in X_test_tokenized)

# Pad sequences in both training and test data to have the same length as the maximum sequence length
X_padded = np.array([seq + [0] * (max_seq_length_train - len(seq)) for seq in X_tokenized], dtype=np.int64)
X_test_padded = np.array([seq + [0] * (max_seq_length_test - len(seq)) for seq in X_test_tokenized], dtype=np.int64)

X_tensor = torch.tensor(X_padded)
X_test_tensor = torch.tensor(X_test_padded, dtype=torch.long)
y_tensor = torch.tensor(y)

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_tensor, y_tensor, test_size=0.3, random_state=42)

# Create data loaders for training and validation
batch_size = 16
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_dataset = TensorDataset(X_val, y_val)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# Define the model, optimizer, and loss function
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Adding Dropout to the Last Transformer Layer (Layer 11)
model.bert.encoder.layer[11].output.dropout = nn.Dropout(0.5)

# Weight Regularization (L2 regularization)
weight_decay = 0.01
optimizer = optim.AdamW(model.parameters(), lr=1e-5, weight_decay=weight_decay)

loss_fn = nn.CrossEntropyLoss()

# Early Stopping Parameters
max_patience = 3  # Number of epochs with no improvement on validation loss
best_val_loss = float('inf')
current_patience = 0

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        inputs, labels = batch
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Validation
    model.eval()
    val_loss = 0
    val_predictions = []
    val_true_labels = []
    with torch.no_grad():
        for batch in val_loader:
            inputs, labels = batch
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs, labels=labels)
            logits = outputs.logits
            probabilities = torch.softmax(logits, dim=1)
            predicted_class = torch.argmax(probabilities, dim=1)
            val_predictions.extend(predicted_class.cpu().numpy())
            val_true_labels.extend(labels.cpu().numpy())
            val_loss += loss_fn(logits, labels).item()

    val_accuracy = accuracy_score(val_true_labels, val_predictions)
    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss / len(train_loader)}, Validation Loss: {val_loss / len(val_loader)}, Validation Accuracy: {val_accuracy}')

    # Early Stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        current_patience = 0
        torch.save(model.state_dict(), 'best_model.pth')
    else:
        current_patience += 1
        if current_patience >= max_patience:
            print(f'Early stopping at epoch {epoch}.')
            break

# Load the best model
model.load_state_dict(torch.load('best_model.pth'))

# Save predictions to a CSV file
submission_df = pd.DataFrame({'id': test_df['id'], 'target': y_pred})
submission_df.to_csv('submission.csv', index=False)


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


KeyboardInterrupt: ignored

In [9]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import nltk
nltk.download('punkt')
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from nltk.tokenize import word_tokenize
import nlpaug.augmenter.word as naw
import re

from transformers import BertTokenizer, BertForSequenceClassification

# Load the training and test datasets
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Define the preprocessing function
def preprocess(text):
    text = text.lower()
    text = re.sub(r'[AP]{1}M', ' ', text)
    text = re.sub(r'@\S+', ' ', text)
    text = re.sub(r'https*://\S+', ' ', text)
    text = re.sub(r'[&]+', ' ', text)
    text = re.sub(r"jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec", ' ', text)
    text = re.sub(r'A\S+:', ' ', text)
    punctuations = '!"#$%&()*+,-./:;?@[]^_`{|}~'
    for p in punctuations:
        text = text.replace(p, " ")
    text = re.sub(r'lo+l', 'laughing out loud', text)
    text = re.sub(r'coo+l', 'cool', text)
    text = re.sub(r'go+a+l+', 'goal', text)
    text = re.sub(r'so+', 'so', text)
    text = re.sub(r'bo+h+o*b', 'oh', text)
    text = re.sub(r'\d+', ' ', text)
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    text = re.sub(r'cant', 'can not', text)
    text = re.sub(r'wont', 'will not', text)
    text = re.sub(r'imb', 'I am', text)
    text = re.sub(r'didnt', 'did not', text)
    text = re.sub(r'couldnt', 'could not', text)
    text = re.sub(r'isnt', 'is not', text)
    text = re.sub(r'dont', 'do not', text)
    text = re.sub(r'lil', 'little', text)
    text = re.sub(r'alil', 'a little', text)
    text = re.sub(r'view and download video', ' ', text)
    text = re.sub(r'viaZ', ' ', text)

    # Tokenize text using NLTK
    tokens = word_tokenize(text)

    text = ' '.join([word for word in text.split() if len(word) > 1])
    return text

# Preprocess the text data
train_df['text'] = train_df['text'].apply(preprocess)
test_df['text'] = test_df['text'].apply(preprocess)

# Initialize the BERT tokenizer and load the pre-trained BERT model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Data Augmentation
aug = naw.SynonymAug(aug_src='wordnet')

# Augment the training data
augmented_texts = [aug.augment(text) for text in train_df['text']]
augmented_labels = train_df['target']

# Combine original and augmented data
X = train_df['text'].tolist() + augmented_texts
y = train_df['target'].tolist() + augmented_labels.tolist()

X_test = test_df['text'].values

# Tokenize the text data
X_tokenized = [tokenizer.encode(text, add_special_tokens=True) for text in X]
X_test_tokenized = [tokenizer.encode(text, add_special_tokens=True) for text in X_test]

# Determine the maximum sequence length in both training and test data
max_seq_length_train = max(len(seq) for seq in X_tokenized)
max_seq_length_test = max(len(seq) for seq in X_test_tokenized)

# Pad sequences in both training and test data to have the same length as the maximum sequence length
X_padded = np.array([seq + [0] * (max_seq_length_train - len(seq)) for seq in X_tokenized], dtype=np.int64)
X_test_padded = np.array([seq + [0] * (max_seq_length_test - len(seq)) for seq in X_test_tokenized], dtype=np.int64)

X_tensor = torch.tensor(X_padded)
X_test_tensor = torch.tensor(X_test_padded, dtype=torch.long)
y_tensor = torch.tensor(y)

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_tensor, y_tensor, test_size=0.2, random_state=42)

# Create data loaders for training and validation
batch_size = 16
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_dataset = TensorDataset(X_val, y_val)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# Define the model, optimizer, and loss function
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Adding Dropout to the Last Transformer Layer (Layer 11)
model.bert.encoder.layer[11].output.dropout = nn.Dropout(0.6)

# Weight Regularization (L2 regularization)
weight_decay = 0.01
optimizer = optim.AdamW(model.parameters(), lr=2e-5, weight_decay=weight_decay)

loss_fn = nn.CrossEntropyLoss()

# Early Stopping Parameters
max_patience = 3  # Number of epochs with no improvement on validation loss
best_val_loss = float('inf')
current_patience = 0

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        inputs, labels = batch
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Validation
    model.eval()
    val_loss = 0
    val_predictions = []
    val_true_labels = []
    with torch.no_grad():
        for batch in val_loader:
            inputs, labels = batch
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs, labels=labels)
            logits = outputs.logits
            probabilities = torch.softmax(logits, dim=1)
            predicted_class = torch.argmax(probabilities, dim=1)
            val_predictions.extend(predicted_class.cpu().numpy())
            val_true_labels.extend(labels.cpu().numpy())
            val_loss += loss_fn(logits, labels).item()

    val_accuracy = accuracy_score(val_true_labels, val_predictions)
    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss / len(train_loader)}, Validation Loss: {val_loss / len(val_loader)}, Validation Accuracy: {val_accuracy}')

    # Early Stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        current_patience = 0
        torch.save(model.state_dict(), 'best_model.pth')
    else:
        current_patience += 1
        if current_patience >= max_patience:
            print(f'Early stopping at epoch {epoch}.')
            break

# Load the best model
model.load_state_dict(torch.load('best_model.pth'))

# Save predictions to a CSV file
submission_df = pd.DataFrame({'id': test_df['id'], 'target': y_test_pred})
submission_df.to_csv('submission.csv', index=False)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10, Loss: 0.569419539037339, Validation Loss: 0.5576421042699464, Validation Accuracy: 0.6841759684832567
Epoch 2/10, Loss: 0.5061556425072703, Validation Loss: 0.555724503560216, Validation Accuracy: 0.6927117531188444
Epoch 3/10, Loss: 0.4591769597974662, Validation Loss: 0.5893488096473105, Validation Accuracy: 0.6808929743926461
Epoch 4/10, Loss: 0.419699182191233, Validation Loss: 0.6490518901672663, Validation Accuracy: 0.6785948785292186
Epoch 5/10, Loss: 0.39660721851067904, Validation Loss: 0.6846659844302382, Validation Accuracy: 0.6710439921208142
Early stopping at epoch 4.
