### Fine-tuning a Pre-trained ALBERT Model

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold

import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim

# Hub identifier shared by the classifier weights and the matching tokenizer.
# NOTE(review): the name suggests an ALBERT base v2 checkpoint already tuned on
# IMDB reviews — confirm on the model card.
model_name = 'textattack/albert-base-v2-imdb'

# The two downloads are independent of each other; both must come from the
# same checkpoint so the vocabulary matches the embedding table.
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text/label pair per lookup.

    Args:
        texts: Texts to classify. Either a pandas object (read by position
            through ``.iloc``) or any positionally indexable sequence such as
            a list or tuple.
        labels: Integer class ids aligned position-by-position with ``texts``;
            same container rules as ``texts``.
        tokenizer: Callable tokenizer that accepts the keyword arguments used
            in ``__getitem__`` and returns a mapping with ``input_ids`` and
            ``attention_mask`` tensors (presumably a Hugging Face tokenizer).
        max_len: Length every encoded sequence is padded or truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        # Fail at construction time instead of with an IndexError mid-epoch.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    @staticmethod
    def _item_at(values, idx):
        """Return the element stored at position ``idx`` of ``values``."""
        # pandas objects must be read with .iloc: plain [] on a Series looks
        # up by index label, and the labels are no longer 0..n-1 once the
        # frame has been sampled or split.
        if hasattr(values, 'iloc'):
            return values.iloc[idx]
        return values[idx]

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Encode the example at position ``idx``.

        Returns:
            dict with the raw ``text``, 1-D ``input_ids`` and
            ``attention_mask`` tensors, and the ``label`` as a ``torch.long``
            scalar tensor.
        """
        text = str(self._item_at(self.texts, idx))
        label = self._item_at(self.labels, idx)

        # Calling the tokenizer directly replaces the deprecated encode_plus.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'text': text,
            # Flatten so the DataLoader can stack examples into 2-D batches.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long),
        }

In [None]:
# Maximum sequence length in tokens. This value is passed as `max_len` to
# every CustomDataset, which pads shorter reviews and truncates longer ones
# to exactly this length.
MAX_LEN = 128

In [None]:
# Configure input and output locations.
import os

data_path = '../input/amazon-customerreviews-polarity'
train_data_path = data_path + '/train.csv'
test_data_path = data_path + '/test.csv'
output_path = '../working/'
model_path = output_path + 'model/'
output_file_path = output_path + 'file/'

# Create the output folders from Python rather than with `!mkdir`: the cell
# can then be re-run without a "File exists" error, missing parent folders
# are created too, and the code no longer depends on IPython shell magics.
for directory in (model_path, output_file_path):
    os.makedirs(directory, exist_ok=True)

In [None]:
# Load both splits, then give them the same three column names.
# NOTE(review): read_csv consumes the first line of each file as a header
# before the names are overwritten below — confirm the CSVs really start with
# a header row, otherwise the first record of each file is lost.
column_names = ['polarity', 'title', 'text']
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (train_data_path, test_data_path)
)
for frame in (train_df, test_df):
    frame.columns = column_names

In [None]:
# Function to convert score to sentiment
def to_sentiment(rating):
    """Convert a raw polarity rating into a zero-based class id.

    Args:
        rating: Raw rating value; anything ``int()`` accepts (``1``, ``2.0``,
            ``"2"``, ...).

    Returns:
        ``0`` for rating 1 (negative) and ``1`` for rating 2 (positive).

    Raises:
        ValueError: If the rating is neither 1 nor 2, or cannot be converted
            by ``int()``.
    """
    class_by_rating = {1: 0, 2: 1}  # 1 = negative, 2 = positive

    rating = int(rating)
    if rating not in class_by_rating:
        # The previous fallback returned class id 3, which is out of range
        # for a two-class head and only surfaced later as an opaque failure
        # inside the loss; reject the value where it is read instead.
        raise ValueError(
            f"unexpected polarity rating {rating!r}; expected 1 or 2"
        )
    return class_by_rating[rating]

# Replace the raw ratings in both splits with zero-based class ids.
for frame in (train_df, test_df):
    frame['polarity'] = frame['polarity'].apply(to_sentiment)

In [None]:
# Report the number of missing values per column for each split.
for banner, frame in (
    ('---------Training Data-----------', train_df),
    ('---------Test Data-----------', test_df),
):
    print(banner)
    print(frame.isnull().sum())

In [None]:
# Replace missing titles with a single space in both splits.
for frame in (train_df, test_df):
    frame['title'] = frame['title'].fillna(' ')

# Print the per-column missing-value counts again to confirm the fill.
for banner, frame in (
    ('---------Training Data-----------', train_df),
    ('---------Test Data-----------', test_df),
):
    print(banner)
    print(frame.isnull().sum())

In [None]:
# Build the model input: title and body joined by one space. astype(str)
# turns any remaining non-string value (including NaN) into its string form.
for frame in (train_df, test_df):
    title_part = frame['title'].astype(str)
    body_part = frame['text'].astype(str)
    frame['review'] = title_part + ' ' + body_part

# Show the resulting frame sizes.
for banner, frame in (
    ('---------Training Data Shape-----------', train_df),
    ('---------Test Data Shape-----------', test_df),
):
    print(banner)
    print(frame.shape)

In [None]:
# Randomly sample 100,000 (1 lakh) training rows and 10,000 test rows.
# - A fixed random_state makes the subset reproducible between runs; it uses
#   the same seed value as the train/validation split.
# - n is capped at the frame size because DataFrame.sample raises when asked
#   for more rows than exist (sampling is without replacement).
train_df = train_df.sample(n=min(100000, len(train_df)), random_state=4)
test_df = test_df.sample(n=min(10000, len(test_df)), random_state=4)

# Each shape is printed once (it was previously printed twice per line).
print(train_df.shape)
print(test_df.shape)

In [None]:
# Separate model inputs (X) from targets (Y). Every result stays a DataFrame
# so the columns can later be picked by name.
X_test = test_df.drop(columns=['polarity'])
Y_test = test_df.drop(columns=['title', 'text', 'review'])

# The training features keep only the combined 'review' column.
all_train_features = train_df.drop(columns=['polarity', 'title', 'text'])
all_train_targets = train_df.drop(columns=['title', 'text', 'review'])

# Hold out 33% for validation; stratifying on the target keeps the class
# balance the same in both parts.
X_train, X_val, Y_train, Y_val = train_test_split(
    all_train_features,
    all_train_targets,
    test_size=0.33,
    random_state=4,
    stratify=all_train_targets,
)

In [None]:
# Wrap the train, validation and test splits in tokenizing datasets.
def _build_dataset(features, targets):
    """Pair the 'review' texts with their 'polarity' labels in a CustomDataset."""
    return CustomDataset(features['review'], targets['polarity'], tokenizer, MAX_LEN)


train_dataset = _build_dataset(X_train, Y_train)
val_dataset = _build_dataset(X_val, Y_val)
test_dataset = _build_dataset(X_test, Y_test)

In [None]:
# Training hyper-parameters: batch size, number of epochs, learning rate.
batch_size, epochs, lr = 32, 10, 2e-5

# Adam over all model parameters (torch.optim is imported as `optim`).
optimizer = optim.Adam(model.parameters(), lr=lr)

In [None]:
# Only the training batches are shuffled; the evaluation loaders keep the
# dataset order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader, test_loader = (
    DataLoader(split, batch_size=batch_size, shuffle=False)
    for split in (val_dataset, test_dataset)
)

In [None]:
# Pick the compute device (GPU when one is available) and move the model to it.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

model = model.to(device)
print(device)

In [None]:
# Train only the classification head: a parameter keeps gradients enabled
# exactly when it belongs to model.classifier; everything else is frozen.
head_param_ids = {id(weight) for weight in model.classifier.parameters()}
for weight in model.parameters():
    weight.requires_grad = id(weight) in head_param_ids

In [None]:
def calculate_accuracy(model, loader, device):
    """Return the percentage of examples in ``loader`` that ``model`` gets right.

    Args:
        model: Object with ``eval()`` that is callable as
            ``model(input_ids, attention_mask=...)`` and returns something
            with a ``.logits`` tensor whose dim 1 indexes the classes. It is
            switched to eval mode and left in that mode.
        loader: Iterable of batches, each a dict with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        Accuracy as a float between 0 and 100; ``0.0`` when ``loader`` yields
        no examples (previously this raised ``ZeroDivisionError``).
    """
    model.eval()
    correct, total = 0, 0
    # No gradients are needed for evaluation.
    with torch.no_grad():
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            # The predicted class is the index of the largest logit.
            predictions = torch.argmax(outputs.logits, dim=1)
            correct += (predictions == labels).sum().item()
            total += labels.size(0)

    if total == 0:
        return 0.0
    return 100 * correct / total

In [None]:
# Fine-tune for `epochs` passes over the training data, keeping on disk the
# weights of the epoch with the lowest average validation loss.
checkpoint_file = model_path + 'albert_finetune_best_model.pth'


def _batch_loss(batch):
    """Run one forward pass with labels on `device` and return the loss tensor."""
    outputs = model(
        batch['input_ids'].to(device),
        attention_mask=batch['attention_mask'].to(device),
        labels=batch['label'].to(device),
    )
    return outputs.loss


best_val_loss = float('inf')  # any real loss is lower than this
best_epoch = -1  # sentinel: no epoch has been recorded yet

print('---------------Training Started------------')
for epoch in range(epochs):
    # ---- optimisation pass over the training batches ----
    model.train()
    total_train_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        loss = _batch_loss(batch)
        loss.backward()
        optimizer.step()
        total_train_loss += loss.item()
    avg_train_loss = total_train_loss / len(train_loader)

    # ---- validation pass: eval mode, no gradients ----
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            loss = _batch_loss(batch)
            total_val_loss += loss.item()
    avg_val_loss = total_val_loss / len(val_loader)

    # Save a checkpoint only when the validation loss improves.
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        best_epoch = epoch
        torch.save(model.state_dict(), checkpoint_file)

    print(f"Epoch {epoch + 1}/{epochs}, Training Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}")

# Summarise the best epoch (reported 1-based).
print(f"The lowest validation loss was {best_val_loss:.4f} at epoch {best_epoch + 1}")

# Restore the best checkpoint and measure its accuracy on both splits.
model.load_state_dict(torch.load(checkpoint_file))
train_accuracy = calculate_accuracy(model, train_loader, device)
val_accuracy = calculate_accuracy(model, val_loader, device)

print(f'Best Model Training Accuracy: {train_accuracy:.2f}%')
print(f'Best Model Validation Accuracy: {val_accuracy:.2f}%')

In [None]:
# Reload the best checkpoint and score it on the held-out test split.
best_weights = torch.load(model_path + 'albert_finetune_best_model.pth')
model.load_state_dict(best_weights)

test_accuracy = calculate_accuracy(model, test_loader, device)
print(f'Test Accuracy: {test_accuracy}%')