# BERT MODEL WITH DATA AUGMENTATION

In [None]:
import os
import torch
import pandas as pd
import nltk
import random
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, matthews_corrcoef
from nltk.corpus import wordnet
from sklearn.model_selection import StratifiedKFold

In [None]:
# Fetch the NLTK data used by the augmentation step:
#   wordnet                      -> synonym lookup (nltk.corpus.wordnet)
#   averaged_perceptron_tagger   -> nltk.pos_tag
#   punkt                        -> nltk.word_tokenize (was not downloaded before,
#                                   so tokenisation fails on a clean environment)
# NOTE(review): recent NLTK releases look for the tokenizer/tagger data under the
# 'punkt_tab' / 'averaged_perceptron_tagger_eng' ids instead; they are requested
# as well so either generation of NLTK finds what it needs. An id unknown to the
# download index is expected to be reported as a failed download rather than
# raise -- confirm against the installed NLTK version.
for _nltk_resource in (
    'wordnet',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'punkt',
    'punkt_tab',
):
    nltk.download(_nltk_resource)

# DATA PREPROCESSING

In [None]:
# Read the review CSV and keep only the rows that have review text
df = pd.read_csv("Womens Clothing E-Commerce Reviews.csv").dropna(subset=['Review Text'])

# Binarise 'Rating': ratings of 4 or more become 1, everything else becomes 0
# This is the treatment variable
df['Rating'] = (df['Rating'] >= 4).astype(int)

In [None]:
# Draw 1000 random rows where 'Recommended IND' is 1
recommended_1 = df[df['Recommended IND'] == 1].sample(n=1000, random_state=42)

# Draw 1000 random rows where 'Recommended IND' is 0.
# Sampling with replacement produces exact duplicate reviews; the shuffled
# cross-validation split used later can then place copies of the same review in
# both the training and the validation part of a fold, which inflates the
# validation scores. Replacement is therefore only enabled when this class has
# fewer than 1000 rows and the request could not be satisfied otherwise.
not_recommended = df[df['Recommended IND'] == 0]
recommended_0 = not_recommended.sample(
    n=1000,
    replace=len(not_recommended) < 1000,
    random_state=42,
)

# Combine the two DataFrames
data_file = pd.concat([recommended_1, recommended_0])

# DATA AUGMENTATION 

In [None]:
#Define a function to get synonyms of a word
def get_synonyms(word, pos):
    """Return the WordNet synonyms of ``word`` for one part of speech.

    Every lemma name of every synset found for ``word`` is normalised:
    underscores and hyphens become spaces, the text is lower-cased, and any
    character that is neither alphanumeric nor whitespace is removed.

    Args:
        word: Token to look up.
        pos: Part-of-speech value forwarded to ``wordnet.synsets``.

    Returns:
        A sorted list of distinct normalised synonyms. The normalised form of
        ``word`` itself and empty strings are never included; the list is
        empty when there is nothing to offer.
    """
    def normalise(text):
        # Same clean-up for candidates and for the original word, so that the
        # original is recognised even when it contains hyphens or punctuation.
        text = text.replace("_", " ").replace("-", " ").lower()
        return "".join(char for char in text if char.isalnum() or char.isspace()).strip()

    synonyms = set()
    for syn in wordnet.synsets(word, pos=pos):
        for lemma in syn.lemmas():
            candidate = normalise(lemma.name())
            # A lemma made only of punctuation normalises to '' -- skip it so a
            # token can never be replaced by an empty string.
            if candidate:
                synonyms.add(candidate)

    # Remove the original word from synonyms (both spellings, in case the
    # caller passed a form that normalisation changes).
    synonyms.discard(word.lower())
    synonyms.discard(normalise(word))

    # Sorted so the order does not depend on set iteration order, which lets a
    # seeded random.choice pick the same synonym on every run.
    return sorted(synonyms)

In [None]:
# Auxiliary and modal verbs that must never be swapped for a synonym.
# The inflected forms are listed explicitly: the tagger labels "is", "was",
# "has", "did", ... as verbs, and WordNet would otherwise replace them with
# unrelated full verbs.
_SYNONYM_EXCLUSIONS = frozenset([
    'do', 'does', 'did', 'doing', 'done',
    'have', 'has', 'had', 'having',
    'be', 'am', 'is', 'are', 'was', 'were', 'been', 'being',
    'should', 'can', 'will', 'would', 'could',
    'may', 'might', 'must', 'shall',
])


#Define a function to perform synonym replacement on a sentence
def synonym_replacement(sentence):
    """Return ``sentence`` with verbs and adjectives replaced by random synonyms.

    The sentence is tokenised with ``nltk.word_tokenize`` and tagged with
    ``nltk.pos_tag``. Each token whose tag starts with 'V' (verb) or 'J'
    (adjective), and which is not an excluded auxiliary/modal verb, is replaced
    by a random entry from ``get_synonyms``; when no synonym is available the
    token is kept. All other tokens are kept unchanged.

    Args:
        sentence: Text to augment.

    Returns:
        The resulting tokens joined with single spaces.
    """
    augmented_tokens = []
    # pos_tag already yields (token, tag) pairs, so no separate zip is needed
    for token, tag in nltk.pos_tag(nltk.word_tokenize(sentence)):
        synonyms = []
        if tag.startswith(('V', 'J')) and token.lower() not in _SYNONYM_EXCLUSIONS:
            # Convert the POS tag to the WordNet POS value
            wordnet_pos = 'v' if tag.startswith('V') else 'a'
            try:
                synonyms = get_synonyms(token, pos=wordnet_pos)
            except KeyError:
                # Best effort: keep the original token if the lookup rejects the tag
                synonyms = []
        augmented_tokens.append(random.choice(synonyms) if synonyms else token)
    #join the sentence back together
    return ' '.join(augmented_tokens)

In [None]:
#Define the number of augmentations to be performed on each sentence
num_augmentations = 3

# Seed Python's RNG so the random synonym picks are repeatable, in line with
# the fixed random_state=42 used for sampling and fold splitting in this file.
# (Repeatability also requires the synonym candidates to come back in a stable
# order from get_synonyms.)
random.seed(42)

#Perform data augmentation
#This list will store the augmented sentences generated through synonym replacement.
augmented_texts = []

#Create a list to store corresponding labels for augmented sentences
augmented_labels = []

# Each pass produces one freshly augmented copy of every sampled review; the
# labels are appended in the same row order so texts and labels stay aligned.
for _ in range(num_augmentations):
    augmented_texts.extend(data_file['Review Text'].apply(synonym_replacement))
    augmented_labels.extend(data_file['Recommended IND'].values)

#Create a new DataFrame with augmented texts and corresponding labels
data_aug = pd.DataFrame({'Review Text': augmented_texts, 'Recommended IND': augmented_labels})

In [None]:
# Stack the sampled reviews on top of their augmented copies (row-wise) and
# renumber the rows from 0.
# NOTE(review): originals and their augmented variants share one pool here and
# the later StratifiedKFold split is row-wise, so a review and its paraphrases
# can land in different folds -- confirm this is intended.
combined_data = pd.concat(
    [data_file.loc[:, ['Review Text', 'Recommended IND']], data_aug],
    ignore_index=True,
)


# PREPROCESSING FOR TRAINING

In [None]:
def load_data(data_file):
    """Split a review DataFrame into parallel lists of texts and labels.

    Args:
        data_file: DataFrame with 'Review Text' and 'Recommended IND' columns.

    Returns:
        ``(texts, labels)``: the two columns converted to Python lists, in row
        order.
    """
    wanted_columns = ('Review Text', 'Recommended IND')
    texts, labels = (data_file[column].tolist() for column in wanted_columns)
    return texts, labels

In [None]:
# Convert the combined (original + augmented) DataFrame into two parallel lists:
# 'texts' holds the 'Review Text' values and 'labels' the 'Recommended IND' values
texts, labels = load_data(combined_data)

# CLASS DEFINITIONS FOR BERT 

In [None]:
# Dataset that tokenizes one text per access and pairs it with its label
class TextClassificationDataset(Dataset):
    """Map-style dataset yielding tokenized texts together with their labels.

    Args:
        texts: Indexable collection of input texts.
        labels: Indexable collection of labels, aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors='pt',
            max_length=..., padding='max_length', truncation=True)`` that
            returns a mapping with 'input_ids' and 'attention_mask' tensors.
        max_length: Length passed to the tokenizer for padding/truncation.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at ``idx`` and return it with its label.

        Returns:
            Dict with flattened 'input_ids' and 'attention_mask' tensors and a
            'labels' tensor built from the label at ``idx``.
        """
        encoded = self.tokenizer(
            self.texts[idx],
            return_tensors='pt',
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
        )
        # The tokenizer output is flattened to 1-D so the DataLoader can stack
        # items into a batch
        sample = {key: encoded[key].reshape(-1) for key in ('input_ids', 'attention_mask')}
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [None]:
# BERT encoder followed by dropout and a linear classification layer
class BERTClassifier(nn.Module):
    """Sequence classifier: pretrained BERT -> dropout -> linear layer.

    Args:
        bert_model_name: Name handed to ``BertModel.from_pretrained``.
        num_classes: Number of output logits per example.
    """

    def __init__(self, bert_model_name, num_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        # The linear head maps the encoder's hidden size to the class logits
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the logits computed from the encoder's pooled output.

        Args:
            input_ids: Token id tensor forwarded to the BERT encoder.
            attention_mask: Attention mask tensor forwarded to the BERT encoder.

        Returns:
            Output of the linear layer applied to the dropped-out
            ``pooler_output`` of the encoder.
        """
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.fc(self.dropout(encoder_out.pooler_output))

# DEFINE BERT VARIABLES

In [None]:
# Hyperparameters used by the tokenizer, model, datasets and training loop
bert_model_name = 'bert-base-uncased'  # name passed to the tokenizer's and model's from_pretrained
num_classes = 2  # size of the classifier's output layer
max_length = 256  # padding/truncation length used by the datasets
batch_size = 16  # examples per DataLoader batch
num_epochs = 1  # training passes over each fold's training split
learning_rate = 2e-5  # learning rate given to the optimizer

In [None]:
# Define the number of folds for cross-validation
num_folds = 5

# Initialize the stratified cross-validation splitter; shuffling uses a fixed
# random_state so the fold assignment is the same on every run
skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)

In [None]:
# Define the tokenizer, loaded from the same pretrained name as the model
tokenizer = BertTokenizer.from_pretrained(bert_model_name)

In [None]:
# Define the device: GPU when CUDA is available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): the cross-validation loop rebinds `model` to a fresh
# BERTClassifier at the start of every fold, so within this file the instance
# created here is never trained or evaluated -- confirm whether it is needed.
model = BERTClassifier(bert_model_name, num_classes).to(device)

In [None]:
# Lists to store evaluation results; the cross-validation loop appends one
# entry per fold to each of them
accuracy_scores = []
classification_reports = []
mcc_scores = []

# TRAINING AND EVALUATION FUNCTIONS

In [None]:
# Run one training pass over a data loader
def train(model, data_loader, optimizer, scheduler, device):
    """Train ``model`` for one pass over ``data_loader``.

    For every batch the gradients are cleared, the cross-entropy loss between
    the model output and ``batch['labels']`` is back-propagated, and then the
    optimizer and the scheduler each take one step.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: Iterable of dicts with 'input_ids', 'attention_mask' and
            'labels' tensors.
        optimizer: Optimizer updating the model parameters.
        scheduler: Learning-rate scheduler, stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        None. The model is left in training mode.
    """
    criterion = nn.CrossEntropyLoss()
    model.train()
    for batch in data_loader:
        optimizer.zero_grad()
        targets = batch['labels'].to(device)
        logits = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        )
        criterion(logits, targets).backward()
        optimizer.step()
        scheduler.step()

In [None]:
# Score the model on a data loader
def evaluate(model, data_loader, device):
    """Compute accuracy and a classification report over ``data_loader``.

    The model is switched to evaluation mode and run without gradient
    tracking; the predicted class of each example is the index of its largest
    output value.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: Iterable of dicts with 'input_ids', 'attention_mask' and
            'labels' tensors.
        device: Device the batch tensors are moved to.

    Returns:
        ``(accuracy, report)`` as produced by ``accuracy_score`` and
        ``classification_report`` for the collected labels and predictions.
    """
    model.eval()
    y_pred, y_true = [], []
    with torch.no_grad():
        for batch in data_loader:
            targets = batch['labels'].to(device)
            logits = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            )
            y_pred += logits.argmax(dim=1).cpu().tolist()
            y_true += targets.cpu().tolist()
    accuracy = accuracy_score(y_true, y_pred)
    report = classification_report(y_true, y_pred)
    return accuracy, report

In [None]:
def predict(text, model, tokenizer, device, max_length=128):
    """Return the predicted class index for a single text.

    Args:
        text: Input text to classify.
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        tokenizer: Callable invoked with ``return_tensors='pt'``,
            ``padding='max_length'`` and ``truncation=True`` that returns a
            mapping with 'input_ids' and 'attention_mask' tensors.
        device: Device the encoded tensors are moved to.
        max_length: Padding/truncation length passed to the tokenizer.

    Returns:
        The index of the largest model output, as a Python int. The model is
        left in evaluation mode.
    """
    model.eval()
    encoded = tokenizer(
        text,
        return_tensors='pt',
        max_length=max_length,
        padding='max_length',
        truncation=True,
    )
    with torch.no_grad():
        logits = model(
            input_ids=encoded['input_ids'].to(device),
            attention_mask=encoded['attention_mask'].to(device),
        )
    return logits.argmax(dim=1).item()

# RESULTS

In [None]:
# Stratified k-fold cross-validation: for every fold a fresh classifier is
# trained on the training split and scored on the validation split. Accuracy,
# the classification report and MCC are appended to the module-level result
# lists and printed.
for fold, (train_index, val_index) in enumerate(skf.split(texts, labels)):
    print(f"Fold {fold + 1}/{num_folds}")

    # Split data into train and validation sets for this fold
    train_texts = [texts[i] for i in train_index]
    train_labels = [labels[i] for i in train_index]

    val_texts = [texts[i] for i in val_index]
    val_labels = [labels[i] for i in val_index]

    train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer, max_length)
    val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer, max_length)
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

    # Initialize and train model for this fold (new weights, optimizer and
    # schedule every time so no state carries over between folds)
    model = BERTClassifier(bert_model_name, num_classes).to(device)
    optimizer = AdamW(model.parameters(), lr=learning_rate, no_deprecation_warning=True)
    # The linear schedule decays over every optimizer step of this fold
    total_steps = len(train_dataloader) * num_epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
    for epoch in range(num_epochs):
        train(model, train_dataloader, optimizer, scheduler, device)

    # Generate one prediction per validation text.
    # max_length is passed explicitly: predict() defaults to 128, whereas the
    # model was trained and is evaluated below on inputs truncated to
    # max_length, so the default would score MCC on differently truncated text.
    predicted_labels = [
        predict(text, model, tokenizer, device, max_length=max_length)
        for text in val_texts
    ]

    # Calculate MCC
    mcc = matthews_corrcoef(val_labels, predicted_labels)
    mcc_scores.append(mcc)

    # Evaluate model for this fold
    accuracy, report = evaluate(model, val_dataloader, device)
    accuracy_scores.append(accuracy)
    classification_reports.append(report)

    print(f"Validation Accuracy: {accuracy:.4f}")
    print(report)
    print(f"MCC: {mcc:.4f}")
