# Dependencies

In [None]:
!pip install torch
!pip install pandas
!pip install numpy
!pip install nltk
!pip install scikit-learn
!pip install torch torchvision torchaudio
!pip install tqdm
!pip install ipython
!pip install matplotlib
!pip install wandb
!pip install torchviz

# Imports


In [None]:
#dataframe
import pandas as pd
import numpy as np

#normalising characters
import unicodedata as ucd

#preprocessing data
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from nltk.stem import WordNetLemmatizer
nltk.download('punkt_tab')

#organising train/test datasets
from sklearn.model_selection import train_test_split, StratifiedKFold, StratifiedShuffleSplit, KFold
from sklearn.utils.class_weight import compute_class_weight

#ml framework imports
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam, RMSprop

#dataset setup
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.preprocessing import LabelEncoder

# notebook display
from tqdm import tqdm

#for clearer printing
from IPython.display import Markdown, display
from enum import Enum

#evaluation metrics
from sklearn.metrics import f1_score, accuracy_score, precision_score, confusion_matrix, recall_score

#visualisation
import matplotlib.pyplot as plt
import wandb
from torchviz import make_dot

from copy import deepcopy

print(torch.__version__)

## Stylised Printing

In [None]:
class Colours(Enum):
    """CSS colour names that ``printmd`` can apply to the rendered text."""

    RED = "red"
    BLUE = "blue"
    YELLOW = "yellow"
    GREEN = "green"

class MdTypes(Enum):
    """Markdown decorations that can be wrapped around a piece of text.

    Each member stores a ``str.format`` template and is callable, so
    ``MdTypes.BOLD("x")`` returns ``"**x**"``.

    The values used to be lambdas.  Functions assigned in an ``Enum`` body are
    treated as methods rather than members, so the enum had no members at all
    and ``MdTypes.BOLD`` was a bare function, not an ``MdTypes`` instance.
    Storing templates keeps the same call syntax while making the members real.
    """

    HEADING = "# {}"
    S_HEADING = "## {}"
    SS_HEADING = "### {}"
    SSS_HEADING = "#### {}"
    BOLD = "**{}**"
    ITALIC = "*{}*"
    NO_MARK = "{}"

    def __call__(self, x):
        """Return ``x`` rendered as a string with this member's markup applied."""
        return self.value.format(x)
    
def printmd(string, extra = None, md:MdTypes = MdTypes.NO_MARK, colour:Colours = Colours.RED):
    """Display ``string`` as coloured Markdown in the notebook output.

    Args:
        string: Text to colour and decorate.
        extra: Optional trailing content appended after the decorated text.
            A list has every element appended as ``", " + str(element)``;
            any other truthy value is appended as ``str(extra)``.
        md: Callable that adds the Markdown decoration to the coloured span.
        colour: Colour whose ``value`` is written into the span's style.
    """
    coloured = "<span style='color:{}'>{}</span>".format(colour.value, string)
    rendered = md(coloured)
    if extra:
        if isinstance(extra, list):
            rendered += "".join(", " + str(item) for item in extra)
        else:
            rendered += str(extra)
    display(Markdown(rendered))

# Data loading and formatting

### Load data from csv

In [None]:
df = pd.read_csv("training_data/formatted_data_REMOVED_CHARS.csv")

# Drop the helper columns one at a time.  The previous try/except wrapped both
# drops in a bare ``except``: when 'Unnamed: 0' was missing, 'total_time' was
# never dropped, and any unrelated error was silently swallowed as well.
for unwanted in ('Unnamed: 0', 'total_time'):
    if unwanted in df.columns:
        df = df.drop(columns=unwanted)
    else:
        print("attempted to delete non existent collumn", unwanted)

# Column names that decompose_text iterates over.
cols = df.columns
df.head()

### Save data to csv

In [None]:
#df.to_csv("formatted_data_REMOVED_CHARS_total_time.csv")

# Preprocessing

#### Methods to gather information about character spread and remove unnecessary characters

In [None]:
def get_unique_chars(row):
    """Record every character of ``row`` not yet in the global ``unique_chars``.

    Cells that are ``int`` or ``float`` are ignored.  A list cell contributes
    the characters of each of its elements; any other cell is iterated
    directly.  Characters are appended in first-seen order.  Returns ``None``.
    """
    for cell in row:
        if isinstance(cell, (int, float)):
            continue
        # Treat a non-list cell as a one-element list so both cases share a loop.
        pieces = cell if isinstance(cell, list) else [cell]
        for piece in pieces:
            for ch in piece:
                if ch in unique_chars:
                    continue
                unique_chars.append(ch)
    return

def get_unique_chars_merged(row):
    """Append each unseen character of ``row["merged_text"]`` to ``unique_chars``.

    ``unique_chars`` is a module-level list; characters are appended in
    first-seen order.  Returns ``None``.
    """
    characters = (ch for piece in row["merged_text"] for ch in piece)
    for ch in characters:
        if ch in unique_chars:
            continue
        unique_chars.append(ch)
    return

#MANUALLY SELECTED CHARACTERS TO REPLACE
# Each table maps a replacement string to the list of substrings it replaces.
#
# Whitespace-like characters.  The first entry used to be 'r\t' (the letter "r"
# followed by a tab), which only matched a tab preceded by "r" and then deleted
# that letter.  Both the real tab and the literal backslash-t sequence are
# listed instead, mirroring the existing r'\n' entry.
rep_space = {" ": ['\t', r'\t', r'\n', '\r', '_', '\xad', '®', '°', '\u200b', '\u2028', '\u2060']}
rep_x = {"x": ['×']}
rep_i = {"i": ['ı']}
# Quote-like characters.  A 13-character run of combining marks used to sit in
# this list; those marks are handled one by one through rep_empty below.
# NOTE(review): confirm that run was a stray paste and not an intended entry.
rep_apos = {"'": ['ʹ', 'ʼ', '‘', '’', '“', '”', '′']}
# Combining diacritics left behind by NFKD decomposition; removed outright.
rep_empty = {"":['̀', '́', '̂', '̃', '̄', '̈', '̉', '̊', '̌', '̛', '̣', '̧', '̨']}
rep_fslash = {"/": ['⁄']}
#REPLACEMENT_DICT IS USED TO REPLACE ALL IDENTIFIED CHARS WITH RELEVANT ALTERNATIVES
# rep_empty was defined but never merged in, so the combining marks survived
# normalisation; it is now included.
replacement_dict = {**rep_space, **rep_x, **rep_i, **rep_apos, **rep_fslash, **rep_empty}

#NORMALIZES UNICODE CHARACTERS
def _normalise_and_replace(value):
    """Return ``str(value)`` NFKD-normalised with every ``replacement_dict`` substitution applied."""
    text = ucd.normalize('NFKD', str(value))
    for replacement, targets in replacement_dict.items():
        for target in targets:
            text = text.replace(target, replacement)
    return text


def decompose_text(row):
    """Normalise every non-integer field of ``row`` named in the global ``cols``.

    List fields are normalised element by element, in place; every other
    non-``int`` field is replaced by its normalised string form.

    The list branch used to read ``replacement_list``, a name that is never
    defined, and so raised ``NameError`` for any row holding a list.  Both
    branches now go through the same helper and use ``replacement_dict``.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) indexable by the names
            in ``cols``.

    Returns:
        The same ``row`` object after modification.
    """
    for each in cols:
        value = row[each]
        if isinstance(value, int):
            continue
        if isinstance(value, list):
            for i, item in enumerate(value):
                value[i] = _normalise_and_replace(item)
        else:
            row[each] = _normalise_and_replace(value)
    return row

def decompose_merged(field):
    """Return ``str(field)`` NFKD-normalised with the manual replacements applied.

    Every substring listed in the module-level ``replacement_dict`` is swapped
    for the key it is listed under, in the dictionary's iteration order.
    """
    text = ucd.normalize('NFKD', str(field))
    for substitute, targets in replacement_dict.items():
        for target in targets:
            text = text.replace(target, substitute)
    return text

# English vocabulary from the nltk "words" corpus, built once for O(1) lookups.
words = set(nltk.corpus.words.words())
def nltk_rem_non_eng(row):
    """Remove alphabetic tokens that are not English words from ``row["merged_text"]``.

    Tokens that are not purely alphabetic (numbers, punctuation) are kept.

    Args:
        row: Record with a string ``"merged_text"`` entry.

    Returns:
        The modified ``row``.  The function previously returned nothing, so
        ``df.apply(nltk_rem_non_eng, axis=1)`` produced only ``None`` values
        and the cleaned text was lost.
    """
    sent = row["merged_text"]
    row["merged_text"] = " ".join(w for w in nltk.wordpunct_tokenize(sent) if w.lower() in words or not w.isalpha())
    return row

### Creating new dataframe with columns for model usage

In [None]:
def merged_text_and_token_list_df(df):
    """Build the model-facing frame from the raw recipe frame.

    Args:
        df: Frame with the columns ``title``, ``description``, ``category``,
            ``total_time2``, ``ingredients``, ``instruction`` and
            ``difficulty``.

    Returns:
        A new DataFrame with the columns
        ``merged_text`` (all text fields joined, one labelled field per line),
        ``token_list`` and ``numerical_token`` (both ``None`` placeholders),
        ``difficulty`` (copied) and ``label`` (one-hot list for
        easy / medium / challenging; NaN for any other difficulty value).
    """
    merged_text = (
        "Title: " + df['title'].astype(str) + "\n" +
        "Description: " + df['description'].astype(str) + "\n" +
        "Category: " + df['category'].astype(str) + "\n" +
        "Total Time: " + df['total_time2'].astype(str) + "\n" +
        # The separator after the ingredients was missing, which glued the end
        # of the ingredients to "Instruction:" and produced a fused token.
        "Ingredients: " + df['ingredients'].astype(str) + "\n" +
        "Instruction: " + df['instruction'].astype(str)
    )
    difficulty_mapping = {
        'easy': [1, 0, 0],
        'medium': [0, 1, 0],
        'challenging': [0, 0, 1],
    }
    new_df = pd.DataFrame({
        'merged_text': merged_text,
        'token_list': None,
        'numerical_token': None,
        'difficulty': df['difficulty'],
        'label': df['difficulty'].map(difficulty_mapping),
    })
    return new_df

### Reviewing chars and updating the text

In [None]:
##LIGATURES HAVE ALREADY BEEN REMOVED PREVIOUSLY##
"""
review_df1 = merged_text_and_token_list_df(df)
review_df2 = merged_text_and_token_list_df(df)

unique_chars = []
review_df1.apply(get_unique_chars_merged,axis=1)
print("Initial Characters:\n", sorted(unique_chars))

review_df1.apply(decompose_merged,axis=1)
unique_chars = []
review_df1.apply(get_unique_chars_merged, axis=1)
print("Manual char removal:\n", sorted(unique_chars))
"""

### tokenization and preprocessing over dataframe and single input

In [None]:
#GLOBAL DEF OF PREPROCESSING COMPONENTS
# Built once at module level so the per-row lambdas do not rebuild them.
stop_words = {*stopwords.words('english')}
lemmatizer = WordNetLemmatizer()
def lemmatize_tokens(tokens):
    """Return a new list holding the lemmatised form of every token, in order."""
    return list(map(lemmatizer.lemmatize, tokens))
    
def tokenize_and_process(tok_df):
    """Run the full text-preprocessing pipeline over ``tok_df`` in place.

    Steps, in order: manual character normalisation of ``merged_text``,
    tokenisation into ``token_list``, lowercasing, stop-word removal and
    lemmatisation.  After each step the first row is printed through
    ``printmd`` so the effect can be inspected, and finally the number of
    first-row tokens changed by lemmatisation is reported.

    Args:
        tok_df: Frame with ``merged_text`` and ``token_list`` columns.

    Returns:
        The same ``tok_df`` object, modified.
    """
    def show(caption, column):
        # Print the first row of ``column`` under the given caption.
        printmd(caption, tok_df[column].iloc[0])

    tok_df["merged_text"] = tok_df["merged_text"].apply(decompose_merged)
    show("Merged text Index 0 --manual char removal--\n", "merged_text")

    # Tokenise only when the merged text really is a string.
    if isinstance(tok_df["merged_text"].iloc[0], str):
        tok_df["token_list"] = tok_df["merged_text"].apply(word_tokenize)
    show("\nMerged text Index 0 --tokenisation--\n", "merged_text")
    show("\nToken list Index 0 --tokenisation--\n", "token_list")

    # Lowercase every token.
    tok_df["token_list"] = tok_df["token_list"].apply(
        lambda tokens: [tok.lower() for tok in tokens]
    )
    show("\nToken list Index 0 --lowercasing--\n", "token_list")

    # Drop stop words.
    tok_df["token_list"] = tok_df["token_list"].apply(
        lambda tokens: [tok for tok in tokens if tok not in stop_words]
    )
    show("\nToken list Index 0 --Stop Word Removal--\n", "token_list")

    before_lemmas = tok_df["token_list"].iloc[0]

    # Lemmatise.
    tok_df["token_list"] = tok_df["token_list"].apply(lemmatize_tokens)
    show("\nToken list Index 0 --lemmatization--\n", "token_list")

    # Lemmatisation maps tokens one-to-one, so the two lists can be paired up.
    after_lemmas = tok_df["token_list"].iloc[0]
    count = sum(1 for old, new in zip(before_lemmas, after_lemmas) if old != new)
    printmd("\n there are", [str(count) ," differences after lemmatisation"],colour = Colours.BLUE)
    return tok_df

def t_and_p_new_input(input_v):
    """Preprocess one raw input the same way ``tokenize_and_process`` treats a row.

    The text is normalised, tokenised, lowercased, stripped of stop words and
    lemmatised.  Returns the resulting list of tokens.
    """
    cleaned = decompose_merged(input_v)
    lowered = [tok.lower() for tok in word_tokenize(cleaned)]
    kept = [tok for tok in lowered if tok not in stop_words]
    return lemmatize_tokens(kept)

##### UNUSED PREPROCESSING METHODS #####
#Remove whitespace?
#Remove frequent words (not relevant due to the number of techniques that are mentioned within models).
#Spelling correction
#Remove punctuation (not useful due to list nature)
#POS tagging (research why that may be useful)
#Named entity recognition - (useful for ingredients maybe)
#Phrase normalisation (research more)

In [None]:
# Build the model-facing frame from the raw data, then run the preprocessing
# pipeline on it.  tokenize_and_process modifies and returns the frame it is
# given, so to_proc_df and res_df refer to the same object afterwards.
to_proc_df = merged_text_and_token_list_df(df)
res_df = tokenize_and_process(to_proc_df)
res_df.head()

## Resampling

In [None]:
def undersample(df, sample_sizes, random_state=None):
    """Cap the number of rows per ``difficulty`` class.

    Args:
        df: Frame with a ``difficulty`` column.
        sample_sizes: Mapping from class value to the maximum number of rows
            to keep.  Classes larger than their target are sampled down
            without replacement; smaller classes are kept whole.  Classes
            missing from the mapping are dropped from the result.
        random_state: Optional seed passed to ``DataFrame.sample`` so the
            selection can be reproduced.  Defaults to unseeded sampling, which
            matches the previous behaviour.

    Returns:
        A new frame with a fresh 0..n-1 index.  If no class of ``df`` appears
        in ``sample_sizes`` an empty frame with the same columns is returned
        (``pd.concat`` used to raise on the empty list).
    """
    kept = []
    for cls in df['difficulty'].unique():
        if cls not in sample_sizes:
            continue
        class_df = df[df['difficulty'] == cls]
        target = sample_sizes[cls]
        if target < len(class_df):
            class_df = class_df.sample(target, replace=False, random_state=random_state)
        kept.append(class_df)

    if not kept:
        return df.iloc[0:0].reset_index(drop=True)
    return pd.concat(kept).reset_index(drop=True)

def oversample(df, sample_sizes, random_state=None):
    """Grow under-represented ``difficulty`` classes by sampling with replacement.

    Args:
        df: Frame with a ``difficulty`` column.
        sample_sizes: Mapping from class value to the desired minimum number
            of rows.  Classes smaller than their target are resampled with
            replacement up to the target; larger classes are kept whole.
            Classes missing from the mapping are dropped from the result.
        random_state: Optional seed passed to ``DataFrame.sample`` so the
            selection can be reproduced.  Defaults to unseeded sampling, which
            matches the previous behaviour.

    Returns:
        A new frame with a fresh 0..n-1 index.  If no class of ``df`` appears
        in ``sample_sizes`` an empty frame with the same columns is returned
        (``pd.concat`` used to raise on the empty list).
    """
    kept = []
    for cls in df['difficulty'].unique():
        if cls not in sample_sizes:
            continue
        class_df = df[df['difficulty'] == cls]
        target = sample_sizes[cls]
        if target > len(class_df):
            class_df = class_df.sample(target, replace=True, random_state=random_state)
        kept.append(class_df)

    if not kept:
        return df.iloc[0:0].reset_index(drop=True)
    return pd.concat(kept).reset_index(drop=True)

## Splitting Data

In [None]:
# Hold out 10% of the preprocessed data, stratified on difficulty so both
# parts keep the original class proportions; the seed makes the split repeatable.
printmd("STRATIFIED CLASS DISTRIBUTION with")
strat_train_df,strat_test_df = train_test_split(res_df,test_size=0.1,stratify=res_df['difficulty'],random_state=11)
print(f'PROPORTION OF TARGET IN THE ORIGINAL DATA\n{res_df["difficulty"].value_counts() / len(res_df)}\n\n'+
      f'PROPORTION OF TARGET IN THE TRAINING SET\n{strat_train_df["difficulty"].value_counts() / len(strat_train_df)}\n\n'+
      f'PROPORTION OF TARGET IN THE TEST SET\n{strat_test_df["difficulty"].value_counts() / len(strat_test_df)}')

from collections import Counter
# Class names listed in the same order as the one-hot label positions
# produced by merged_text_and_token_list_df (easy, medium, challenging).
classes = np.array(["easy","medium","challenging"])  # Example classes
c_labels = strat_train_df["difficulty"] # Labels from the dataset
# 'balanced' weights are computed from the training split BEFORE any
# resampling below; train_loop reads CLASS_WEIGHTS as a global.
CLASS_WEIGHTS = compute_class_weight(class_weight='balanced', classes=classes, y=c_labels)
print("Class weights", CLASS_WEIGHTS)

# Define desired sample sizes for each class
# (undersample caps a class at its size, oversample raises a class to its size)
undersample_sizes = {'easy': 3000, 'medium': 1250, 'challenging': 331}
oversample_sizes = {'easy': 3000, 'medium': 1250, 'challenging': 600}

print('Initial training dataset shape %s' % Counter(strat_train_df["difficulty"]))
# Apply undersampling
under_strat_train_df = undersample(strat_train_df, undersample_sizes)
print('Undersampled dataset shape %s' % Counter(under_strat_train_df["difficulty"]))
# Apply oversampling
# (runs on the undersampled frame, so only classes still below their
# oversample target are grown)
fin_strat_train_df  = oversample(under_strat_train_df, oversample_sizes)
print('Final dataset shape %s' % Counter(fin_strat_train_df["difficulty"]))

# Only the training split is resampled; strat_test_df keeps its original distribution.
printmd("OVER/UNDERSAMPLED CLASS DISTRIBUTION with")
print(f'PROPORTION OF TARGET IN THE TRAINING SET\n{fin_strat_train_df["difficulty"].value_counts() / len(fin_strat_train_df)}\n\n')

# Preparation for Model Training

### Dataset Definition

In [None]:
class WordEmbeddingDataset(Dataset):
    """Dataset yielding ``(token_index_tensor, label_tensor)`` pairs.

    Args:
        p_df: Frame with a ``token_list`` column (list of tokens per row) and
            a ``label`` column (numeric label per row, e.g. a one-hot list).
        word_to_idx: Mapping from token to integer index.
    """

    def __init__(self, p_df, word_to_idx):
        self.p_df = p_df
        self.word_to_idx = word_to_idx

    def __len__(self):
        return len(self.p_df)

    def __getitem__(self, idx):
        """Return the encoded sentence and float label at position ``idx``.

        Raises:
            KeyError: If a token of the sentence is not in ``word_to_idx``.
        """
        sentence = self.p_df['token_list'].iloc[idx]
        label = self.p_df['label'].iloc[idx]
        numerical_seq = [self.word_to_idx[word] for word in sentence]
        # Force an integer dtype: torch.tensor([]) would otherwise infer
        # float32 for an empty sentence, which nn.Embedding rejects.
        encoded_seq = torch.tensor(numerical_seq, dtype=torch.long)
        # float32 labels are what BCELoss expects.
        encoded_label = torch.tensor(label, dtype=torch.float32)
        return encoded_seq, encoded_label
    
# Vocabulary index.  Index 0 is reserved for padding because pad_sequence is
# called with padding_value=0; previously an arbitrary real word also had
# index 0, so padding could not be told apart from that word.  The words are
# sorted because set iteration order can differ between interpreter runs,
# which would make a saved checkpoint's embedding rows refer to other words.
PAD_TOKEN = "<PAD>"
_vocabulary = {word for sentence in res_df["token_list"] for word in sentence}
_vocabulary.discard(PAD_TOKEN)
WORD_TO_IDX = {PAD_TOKEN: 0}
WORD_TO_IDX.update((word, idx) for idx, word in enumerate(sorted(_vocabulary), start=1))
# Longest token list in the whole corpus.
MAX_SEQ_LEN = max(len(sentence) for sentence in res_df["token_list"])

## Model Definition

In [None]:
class RNNClassifierT(nn.Module):
    """Embedding -> (multi-layer, optionally bidirectional) RNN -> FC stack -> softmax.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each embedding vector.
        hidden_dim: Hidden size of the RNN (per direction).
        num_classes: Number of output classes.
        fc_layers: Total number of fully connected layers including the final
            one; ``fc_layers - 1`` square hidden layers come before it.
        num_layers: Number of stacked RNN layers.
        bidirectional: Whether the RNN runs in both directions.
        dropout: Dropout probability used inside the RNN and after each
            hidden fully connected layer.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes, fc_layers = 1, num_layers=1, bidirectional=False, dropout=0.2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            dropout=dropout,
            batch_first=True,
            bidirectional=bidirectional,
        )

        # A bidirectional RNN yields a forward and a backward state per layer.
        rnn_output_dim = hidden_dim * (2 if bidirectional else 1)

        # Hidden fully connected layers (none when fc_layers <= 1).
        self.fc = nn.ModuleList(
            [nn.Linear(rnn_output_dim, rnn_output_dim) for _ in range(fc_layers - 1)]
        )
        self.final_fc = nn.Linear(rnn_output_dim, num_classes)
        self.dropout = nn.Dropout(p=dropout)
        print("model: ",self)

    @staticmethod
    def _trace(caption, tensor):
        """Print ``caption`` and the tensor's shape when the global DEBUG flag is set."""
        if DEBUG:
            print(caption, tensor.shape)

    def forward(self, x):
        """Return class probabilities of shape ``(batch_size, num_classes)``.

        ``x`` holds token indices of shape ``(batch_size, seq_len)``.
        """
        embedded = self.embedding(x)
        self._trace("embedded shape:", embedded)

        # Only the final hidden states are used, not the per-step outputs.
        _, hidden = self.rnn(embedded)
        self._trace("RNN output shape:", hidden)

        if self.rnn.bidirectional:
            # The last two entries are the last layer's forward and backward states.
            features = torch.cat((hidden[-2], hidden[-1]), dim=1)
        else:
            features = hidden[-1]
        self._trace("After processing hidden state shape:", features)

        for layer in self.fc:
            features = F.relu(layer(features))
            self._trace("After FC shape:", features)
            features = self.dropout(features)
            self._trace("After dropout shape:", features)

        output = self.final_fc(features)
        self._trace("After final FC (output) shape:", output)

        return torch.softmax(output, dim=-1)

In [None]:
class LSTMClassifier(nn.Module):
    """Embedding -> (multi-layer, optionally bidirectional) LSTM -> FC stack -> softmax.

    The previous implementation was a per-token tagger for one unbatched
    sentence.  It reshaped a ``(batch, seq)`` input into
    ``(batch, 1, seq * embedding_dim)``, which does not match the LSTM input
    size.  It returned per-token log-probabilities, which ``BCELoss`` rejects
    because they are negative.  It also ignored ``num_layers``,
    ``bidirectional`` and ``fc_layers`` and printed every tensor on each call.
    It now follows the same contract as ``RNNClassifierT``.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each embedding vector.
        hidden_dim: Hidden size of the LSTM (per direction).
        num_classes: Number of output classes.
        fc_layers: Total number of fully connected layers including the final
            one; ``fc_layers - 1`` square hidden layers come before it.
        num_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM runs in both directions.
        dropout: Dropout probability after each hidden fully connected layer
            and, when ``num_layers > 1``, between LSTM layers.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim=64, num_classes=3, fc_layers = 1, num_layers=1, bidirectional=False, dropout=0.2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers,
            # Inter-layer dropout only exists when layers are stacked; passing
            # a non-zero value with a single layer just triggers a warning.
            dropout=dropout if num_layers > 1 else 0.0,
            batch_first=True,
            bidirectional=bidirectional,
        )
        lstm_output_dim = hidden_dim * (2 if bidirectional else 1)
        self.fc = nn.ModuleList(
            [nn.Linear(lstm_output_dim, lstm_output_dim) for _ in range(max(fc_layers - 1, 0))]
        )
        self.final_fc = nn.Linear(lstm_output_dim, num_classes)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, sentence):
        """Return class probabilities of shape ``(batch_size, num_classes)``.

        Args:
            sentence: Token indices of shape ``(batch_size, seq_len)``.  A 1-D
                tensor is treated as a batch containing one sentence.
        """
        if sentence.dim() == 1:
            sentence = sentence.unsqueeze(0)
        embeds = self.embedding(sentence)

        # Only the final hidden state is needed for classification.
        _, (h_n, _) = self.lstm(embeds)
        if self.lstm.bidirectional:
            # The last two entries are the last layer's forward and backward states.
            features = torch.cat((h_n[-2], h_n[-1]), dim=1)
        else:
            features = h_n[-1]

        for layer in self.fc:
            features = self.dropout(F.relu(layer(features)))

        return torch.softmax(self.final_fc(features), dim=-1)

# Early Stopping

In [None]:
class EarlyStopping:
    """Stop training when a combined validation score stops improving.

    The monitored metrics are folded into one higher-is-better score.
    Metrics whose name ends in ``"loss"`` are negated before combining, so a
    lower loss and a higher F1 / recall both raise the score.

    The whole combined value used to be negated whenever ``"val_loss"`` was
    monitored.  With a mixed monitor list such as
    ``["val_loss", "f1_macro"]`` that made a higher F1 count as a worse score.
    """

    def __init__(self, patience=4, delta=0, path='checkpoint.pt', verbose=False, monitor=["val_loss"], combine_metrics="weighted_average"):
        """
        Args:
            patience (int): How many epochs to wait after the last improvement in the monitored metric.
            delta (float): Minimum change in the monitored metric to qualify as an improvement.
            path (str): Path to save the best model.
            verbose (bool): If True, prints updates when metrics improve.
            monitor (list): List of metrics to monitor, e.g., ["val_loss", "f1_macro", "f1_weighted"].
            combine_metrics (str): Method for combining metrics. Options: "weighted_average", "average".
        """
        self.patience = patience
        self.delta = delta
        self.path = path
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.best_metric = None
        # Copy so the shared default list (or a caller's list) is never aliased.
        self.monitor = list(monitor)
        self.combine_metrics = combine_metrics

    @staticmethod
    def _lower_is_better(metric_name):
        """Return True for loss-like metrics, identified by a name ending in "loss"."""
        return metric_name.endswith("loss")

    def __call__(self, metrics, model):
        """Update the stopping state with this epoch's metrics.

        Args:
            metrics (dict): Dictionary of metrics to monitor, e.g., {'val_loss': 0.5, 'f1_macro': 0.75, 'f1_weighted': 0.65}.
            model (nn.Module): The model being trained.

        Raises:
            ValueError: If a monitored metric is missing from ``metrics``.
        """
        metric_values = {metric: metrics.get(metric) for metric in self.monitor}

        if any(value is None for value in metric_values.values()):
            raise ValueError(f"One or more of the monitored metrics: {self.monitor} is missing in the provided metrics.")

        # Orient every metric so that larger is better before combining them.
        oriented = {
            name: (-value if self._lower_is_better(name) else value)
            for name, value in metric_values.items()
        }
        score = self.combine_metrics_func(oriented)

        if self.best_score is None:
            # First call: anything is the best so far.
            self.best_score = score
            self.save_checkpoint(metrics, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            if self.verbose:
                print(f"EarlyStopping counter: {self.counter} out of {self.patience}")
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(metrics, model)
            self.counter = 0

    def combine_metrics_func(self, metric_values):
        """
        Combines the given metrics into a single score.
        - 'weighted_average': Calculates a weighted average of the metrics.
          Metrics without an entry in the weight table get weight 1.
        - 'average': Takes the average of the metrics.

        Raises:
            ValueError: If ``combine_metrics`` names an unknown method.
        """
        if self.combine_metrics == "weighted_average":
            # Emphasise F1 and recall so minority classes influence the score.
            weights = {"f1_macro": 0.4, "f1_weighted": 0.4, "recall_macro": 0.2}
            weighted_sum = sum(value * weights.get(name, 1) for name, value in metric_values.items())
            total_weight = sum(weights.get(name, 1) for name in metric_values)
            combined = weighted_sum / total_weight if total_weight > 0 else 0
            print(f"Current Score:{combined}")
            return combined

        elif self.combine_metrics == "average":
            return sum(metric_values.values()) / len(metric_values)

        else:
            raise ValueError(f"Unknown combination method: {self.combine_metrics}")

    def save_checkpoint(self, metrics, model):
        """Saves the model if the monitored metric improves."""
        if self.verbose:
            print(f"Metrics improved. Saving model with metrics: {metrics}")
        torch.save(model.state_dict(), self.path)
        self.best_metric = metrics


# Training

In [None]:
def train_loop(word_to_idx, d_loader, v_loader, max_epoch = 1, rnn_layers = 1, rnn_d = 0, wandb_run = None, fc_layers=1, 
               embed_dim = 100, hidden_dim = 64, bidirect = False, lr = 0.001, op = "Adam",model_n="RNN"):
    """Train a classifier with early stopping and return the best checkpoint.

    Args:
        word_to_idx: Vocabulary mapping; its length sets the embedding size.
        d_loader: Training DataLoader yielding ``(inputs, labels)`` batches.
        v_loader: Validation DataLoader yielding ``(inputs, labels)`` batches.
        max_epoch: Maximum number of epochs.
        rnn_layers: Number of stacked recurrent layers.
        rnn_d: Dropout probability passed to the model.
        wandb_run: Optional Weights & Biases run; metrics are logged per epoch.
        fc_layers: Number of fully connected layers in the model head.
        embed_dim: Embedding size.
        hidden_dim: Recurrent hidden size.
        bidirect: Whether the recurrent layer is bidirectional.
        lr: Learning rate.
        op: ``"Adam"`` selects Adam; any other value selects RMSprop.
        model_n: ``"RNN"`` or ``"LSTM"``.

    Returns:
        ``(model, train_losses, val_losses)``: the model with the best
        checkpointed weights loaded, and one average loss per completed epoch.

    Raises:
        ValueError: If ``model_n`` is not a known model name.

    Fixes relative to the previous version:
      * the loss histories are no longer reset at the start of every epoch,
        so they cover the whole run instead of only the last epoch;
      * the running training loss is accumulated as a float, not as a tensor
        that kept every batch's autograd graph alive;
      * the class weights are cast to float32 to match the model outputs;
      * the checkpoint is read back from ``early_stopping.path``, not a
        hard-coded file name, and only if one was written;
      * the end-of-epoch summary reports the right epoch number and averages;
      * W&B logging happens before the early-stop check, so the final epoch
        is logged too;
      * an unknown ``model_n`` raises instead of failing later on ``None``.
    """
    model_classes = {"RNN": RNNClassifierT, "LSTM": LSTMClassifier}
    if model_n not in model_classes:
        raise ValueError(f"Unknown model_n {model_n!r}; expected one of {sorted(model_classes)}")

    vocab_size = len(word_to_idx)

    # Initialize EarlyStopping with metrics to monitor
    early_stopping = EarlyStopping(
        patience=2,
        monitor=["val_loss", "f1_macro", "f1_weighted", "recall_macro"],
        combine_metrics="weighted_average",
        verbose=True
    )

    t_model = model_classes[model_n](vocab_size,
                                     embedding_dim=embed_dim,
                                     hidden_dim=hidden_dim,
                                     num_classes=3,
                                     num_layers=rnn_layers,
                                     fc_layers=fc_layers,
                                     dropout=rnn_d,
                                     bidirectional=bidirect)

    # CLASS_WEIGHTS comes from scikit-learn as a float64 array; BCELoss needs
    # the weight dtype to match the float32 model outputs.
    criterion = nn.BCELoss(weight=torch.tensor(CLASS_WEIGHTS, dtype=torch.float32))
    optimizer = Adam(t_model.parameters(), lr=lr) if op == "Adam" else RMSprop(t_model.parameters(), lr=lr)

    def as_batch(inputs):
        # A collate function that already pads hands over one tensor; pad only
        # when given a sequence of variable-length tensors.
        if isinstance(inputs, torch.Tensor):
            return inputs
        return pad_sequence(inputs, batch_first=True, padding_value=0)

    t_losses = []
    v_losses = []
    for epoch in range(1, max_epoch + 1):
        # ---- training ----
        t_model.train()
        print(f"Epoch {epoch}")
        train_loss = 0.0
        for inputs, labels in tqdm(d_loader):
            optimizer.zero_grad()
            outputs = t_model(as_batch(inputs))
            loss = criterion(outputs, labels.float())
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        average_train_loss = train_loss / len(d_loader)
        t_losses.append(average_train_loss)

        # ---- validation ----
        t_model.eval()
        val_loss = 0.0
        batch_predictions = []
        batch_labels = []
        with torch.no_grad():
            for inputs, labels in v_loader:
                v_outputs = t_model(as_batch(inputs))
                val_loss += criterion(v_outputs, labels.float()).item()
                batch_predictions.append(v_outputs.cpu())
                batch_labels.append(labels.cpu())
        average_val_loss = val_loss / len(v_loader)
        v_losses.append(average_val_loss)
        print(f"Training loss: {average_train_loss}, Validation loss: {average_val_loss}")

        # One-hot predictions vs one-hot labels (multilabel-indicator format).
        all_predictions_hot, _ = predictions_to_one_hot(torch.cat(batch_predictions))
        y_pred = all_predictions_hot.numpy()
        y_true = torch.cat(batch_labels).numpy()

        f1_macro = f1_score(y_true, y_pred, average='macro')
        f1_weighted = f1_score(y_true, y_pred, average='weighted')
        recall_macro = recall_score(y_true, y_pred, average='macro')
        accuracy = accuracy_score(y_true, y_pred)

        # Pack metrics into a dictionary
        metrics = {
            'val_loss': average_val_loss,
            'f1_macro': f1_macro,
            'f1_weighted': f1_weighted,
            'recall_macro': recall_macro,
            'accuracy': accuracy
        }

        # Log metrics to W&B
        if wandb_run:
            wandb_run.log({
                "train_loss": average_train_loss,
                'val_loss': average_val_loss,
                'f1_macro': f1_macro,
                'f1_weighted': f1_weighted,
                'recall_macro': recall_macro,
                'accuracy': accuracy,
                "epoch": epoch,
            })

        printmd(f"Epoch {epoch}: Train Loss = {average_train_loss}, Validation Loss = {average_val_loss}")

        early_stopping(metrics, t_model)
        if early_stopping.early_stop:
            print("Early stopping triggered")
            break

    # Load the best model weights (none exist when no epoch ran).
    if early_stopping.best_score is not None:
        t_model.load_state_dict(torch.load(early_stopping.path))
    return t_model, t_losses, v_losses

# Evaluation

In [None]:
def predictions_to_one_hot(predictions):
    """Convert per-class scores to one-hot rows and class indices.

    Args:
        predictions: 2-D tensor with one row of class scores per sample.

    Returns:
        ``(one_hot, classes)``: a float tensor shaped like ``predictions``
        holding a single 1 per row at the arg-max position, and the 1-D
        tensor of arg-max indices.
    """
    n_rows, n_classes = predictions.size(0), predictions.size(1)
    winners = torch.argmax(predictions, dim=1)
    # Start from zeros and write a 1 into each row's winning column.
    one_hot = torch.zeros(n_rows, n_classes)
    one_hot.scatter_(1, winners.unsqueeze(1), 1)
    return one_hot, winners

def eval_loop(e_d_loader, e_model, epoch=0):
    """Evaluate ``e_model`` on ``e_d_loader`` and print a metric report.

    Args:
        e_d_loader: DataLoader yielding ``(inputs, one_hot_labels)`` batches.
        e_model: Model returning class probabilities per sample.
        epoch: Zero-based epoch number, used only in the printed heading.

    Returns:
        Dict with the computed values (``loss``, ``accuracy``, the F1 and
        precision variants, and ``confusion_matrix`` as a DataFrame).  The
        function used to return nothing, so callers that ignore the result
        are unaffected.
    """
    e_model.eval()
    criterion = nn.BCELoss()
    total_loss = 0.0
    batch_predictions = []
    batch_labels = []

    with torch.no_grad():
        for inputs, labels in e_d_loader:
            # Batches from a padding collate function are already one tensor.
            if isinstance(inputs, torch.Tensor):
                padded_inputs = inputs
            else:
                padded_inputs = pad_sequence(inputs, batch_first=True, padding_value=0)
            outputs = e_model(padded_inputs)
            total_loss += criterion(outputs, labels.float()).item()
            batch_predictions.append(outputs.cpu())
            batch_labels.append(labels.cpu())

    # One-hot and integer representations of predictions and labels.
    all_predictions_hot, predicted_classes = predictions_to_one_hot(torch.cat(batch_predictions))
    all_labels_t = torch.cat(batch_labels).float()
    label_classes = torch.argmax(all_labels_t, dim=1)

    avg_loss = total_loss / len(e_d_loader)

    y_true = all_labels_t.numpy()
    y_pred = all_predictions_hot.numpy()

    # F1 on the one-hot (multilabel-indicator) representation.
    f1_score_macro = f1_score(y_true, y_pred, average='macro')
    f1_score_micro = f1_score(y_true, y_pred, average='micro')
    f1_score_weighted = f1_score(y_true, y_pred, average='weighted')
    f1_score_samples = f1_score(y_true, y_pred, average='samples')

    accuracy = accuracy_score(y_true, y_pred)

    precision_macro = precision_score(y_true, y_pred, average='macro', zero_division=0)
    precision_micro = precision_score(y_true, y_pred, average='micro', zero_division=0)
    precision_weighted = precision_score(y_true, y_pred, average='weighted', zero_division=0)

    # Confusion matrix.  ``labels`` pins the matrix to 3x3: without it, a
    # class missing from both the labels and the predictions shrinks the
    # matrix and building the 3x3 DataFrame below raises.
    class_names = ["Easy", "Medium", "Challenging"]
    cm = confusion_matrix(label_classes.numpy(), predicted_classes.numpy(),
                          labels=list(range(len(class_names))))
    cm_df = pd.DataFrame(cm, index=class_names, columns=class_names)

    # Print metrics
    printmd(f"Epoch {epoch+1}")
    printmd(f"Validation Loss: {avg_loss:.4f}\n")
    printmd(f"Accuracy: {accuracy:.4f}")
    printmd(f"Precision (Macro): {precision_macro:.4f}, Precision (Micro): {precision_micro:.4f}")
    printmd(f"Precision (Weighted): {precision_weighted:.4f}\n")
    printmd(f"F1-Score (Macro): {f1_score_macro:.4f}, F1-Score (Micro): {f1_score_micro:.4f}")
    printmd(f"F1-Score (Weighted): {f1_score_weighted:.4f}, F1-Score (Samples): {f1_score_samples:.4f}\n")
    printmd(f"Multilabel Confusion Matrix:\n{cm_df}\n")

    return {
        "loss": avg_loss,
        "accuracy": accuracy,
        "f1_macro": f1_score_macro,
        "f1_micro": f1_score_micro,
        "f1_weighted": f1_score_weighted,
        "f1_samples": f1_score_samples,
        "precision_macro": precision_macro,
        "precision_micro": precision_micro,
        "precision_weighted": precision_weighted,
        "confusion_matrix": cm_df,
    }
        

### Method in dataloader to pad sequences

In [None]:
def pad_collate(batch):
    """Collate ``(sequence, label)`` samples into one padded batch.

    Sequences are right-padded with 0 to the longest sequence in the batch
    and stacked batch-first; labels are stacked along a new first dimension.

    Returns:
        ``(padded_inputs, stacked_labels)``.
    """
    sequences, targets = map(list, zip(*batch))
    return (
        pad_sequence(sequences, batch_first=True, padding_value=0),
        torch.stack(targets, dim=0),
    )

### Train and Evaluate model

In [None]:
# Wrap the resampled training split and the untouched held-out split.
train_dataset_h = WordEmbeddingDataset(fin_strat_train_df, WORD_TO_IDX)
test_dataset_h =  WordEmbeddingDataset(strat_test_df, WORD_TO_IDX)  

# Create the DataLoader
# pad_collate pads every batch to its own longest sequence.
train_dataloader = DataLoader(train_dataset_h, batch_size=32, shuffle=True,collate_fn=pad_collate)
# NOTE(review): the held-out test split is used as the validation set here, so
# early stopping is tuned on the same data that eval_loop reports on below.
val_dataloader = DataLoader(test_dataset_h, batch_size=32, shuffle=True, collate_fn=pad_collate)
# Global flag read by RNNClassifierT.forward; must be defined before training.
DEBUG = False  
rm, _, _ = train_loop(WORD_TO_IDX, train_dataloader, val_dataloader, embed_dim =100,hidden_dim = 64,
                      max_epoch = 10,rnn_layers = 3, rnn_d=0.2, fc_layers=3, bidirect=True, lr = 0.001, op="Adam", model_n="RNN")

# Report metrics for the model returned by train_loop.
eval_loop(val_dataloader, rm)


# Hyper Parameter Tuning

In [None]:
# Define the hyperparameter space
# Every key maps to the list of values to try; iterate_combinations walks the
# full Cartesian product of these lists.
HYPER_PARAMS = {
    #"embedding_dim": [50, 100, 200],
    "hidden_dim": [32, 64, 128],
    "num_layers": [1, 2, 3],
    "bidirectional": [True, False],
    "dropout": [0.3],
    "learning_rate": [1e-3, 1e-4],
    "batch_size": [16, 32, 64],
    "fc_layers": [1,2,3],
    "optimizer": ["Adam", "RMSprop"]
}

# Datasets shared by every combination; the DataLoaders are rebuilt per
# combination because batch_size is one of the tuned parameters.
train_dataset = WordEmbeddingDataset(fin_strat_train_df, WORD_TO_IDX)
test_dataset =  WordEmbeddingDataset(strat_test_df, WORD_TO_IDX)  
# Create the DataLoader

# Iterate through combinations
def iterate_combinations(params_dict):
    """
    Run a full grid search: train one model per hyperparameter combination.

    Every combination of the candidate values in ``params_dict`` is trained
    with ``train_loop`` on the module-level ``train_dataset`` /
    ``test_dataset`` and logged to a single Weights & Biases run.

    Args:
        params_dict: mapping of hyperparameter name -> list of candidate
            values. Must provide the keys "hidden_dim", "num_layers",
            "bidirectional", "dropout", "learning_rate", "batch_size",
            "optimizer" and "fc_layers".

    Returns:
        list[dict]: one entry per combination with the keys "combination",
        "train_loss" and "val_loss" -- the structure ``visualize_results``
        reads.
    """
    keys = list(params_dict.keys())

    total_combinations = 1
    for candidates in params_dict.values():
        total_combinations *= len(candidates)

    print(f"Total combinations: {total_combinations}")

    # Initialize visualization tools
    wandb.init(project="hyperparameter_tuning", name="all_combinations")

    # Store results for final visualization
    results = []

    # Generate parameter combinations without itertools
    def generate_combinations(idx=0, current_combination=None):
        # A fresh dict per top-level call instead of a shared mutable default.
        if current_combination is None:
            current_combination = {}

        if idx == len(keys):
            # The working dict is mutated in place while recursing, so hand
            # out an independent snapshot.
            yield deepcopy(current_combination)
            return

        key = keys[idx]
        for value in params_dict[key]:
            current_combination[key] = value
            yield from generate_combinations(idx + 1, current_combination)

    for combination in generate_combinations():
        # Extract hyperparameters
        hidden_dim = combination["hidden_dim"]
        num_layers = combination["num_layers"]
        bidirectional = combination["bidirectional"]
        dropout = combination["dropout"]
        learning_rate = combination["learning_rate"]
        batch_size = combination["batch_size"]
        optimizer_name = combination["optimizer"]
        fc_layers = combination["fc_layers"]

        print(f"Current Combination: {combination}")

        # Log hyperparameters to W&B. The same keys are rewritten for every
        # combination; allow_val_change=True makes that explicit so the update
        # is also accepted when this code runs outside a notebook.
        wandb.config.update(combination, allow_val_change=True)

        # The batch size is tuned, so the loaders are rebuilt per combination.
        train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=pad_collate)
        val_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, collate_fn=pad_collate)

        # The wandb module itself is handed to train_loop as its run handle.
        rm, train_losses, val_losses = train_loop(WORD_TO_IDX,
                        train_dataloader,
                        val_dataloader,
                        max_epoch = 10,
                        rnn_layers = num_layers,
                        rnn_d = dropout,
                        wandb_run = wandb,
                        fc_layers = fc_layers,
                        embed_dim = 100,
                        hidden_dim = hidden_dim,
                        bidirect = bidirectional,
                        lr = learning_rate,
                        op = optimizer_name)

        wandb.log({
            "final_train_loss": train_losses[-1],
            "final_val_loss": val_losses[-1],
        })

        # Keep the loss histories so they can be plotted with visualize_results.
        results.append({
            "combination": combination,
            "train_loss": train_losses,
            "val_loss": val_losses,
        })

    return results

def visualize_results(results):
    """
    Plot the training and validation loss series of every recorded run.

    Args:
        results: iterable of dicts, each providing "combination" (used as
            the legend label) plus "train_loss" and "val_loss" (the series
            that get plotted).
    """
    plt.figure(figsize=(12, 6))

    for run in results:
        params, train_curve, val_curve = (
            run["combination"],
            run["train_loss"],
            run["val_loss"],
        )
        tag = f"{params}"
        plt.plot(train_curve, label=f"Train - {tag}")
        plt.plot(val_curve, label=f"Val - {tag}")

    # Axis labels, title and legend are applied once, after all curves are drawn.
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.title("Hyperparameter Tuning Loss Trends")
    plt.legend(loc="best")
    plt.show()

# Run the grid search over every combination in HYPER_PARAMS
# (one full train_loop call per combination).
iterate_combinations(HYPER_PARAMS)

## (OLD) Prediction method

In [None]:
def predict(pmodel, text):
    """
    Predict the class index for a single raw text input.

    The text is tokenised with ``t_and_p_new_input``, mapped to vocabulary
    indices through ``WORD_TO_IDX`` and fed to the model as a batch of one.

    Args:
        pmodel: trained torch model; it is called with a (1, seq_len) tensor
            of token indices and must return one row of class scores.
        text: raw input string.

    Returns:
        int: index of the highest-scoring class.
    """
    # Tokenize the input text
    tokens = t_and_p_new_input(text)

    # Convert tokens to indices; 0 stands for unknown words (and is also the
    # padding value used by pad_collate).
    numerical_seq = [WORD_TO_IDX.get(word, 0) for word in tokens]
    if not numerical_seq:
        # Keep at least one time step so the model never gets an empty sequence.
        numerical_seq = [0]

    # Batches are padded only to their own longest sequence, so a batch of one
    # needs no padding at all. (The previous version padded to
    # len(WORD_TO_IDX), i.e. the vocabulary size, not a sequence length.)
    batch = torch.tensor(numerical_seq, dtype=torch.long).unsqueeze(0)

    # Run the input on whichever device the model's weights live on.
    first_param = next(pmodel.parameters(), None)
    if first_param is not None:
        batch = batch.to(first_param.device)

    # Set the model to evaluation mode
    pmodel.eval()
    with torch.no_grad():
        output = pmodel(batch)
        print("model output", output)
        predicted_class = torch.argmax(output, dim=1)
        print("predicted_class", predicted_class)
        return predicted_class.item()

# Example usage:

##########################################
######## UPDATE FOR NEW PADDING ####################################################
################################################## UPDATE FOR NEW PADDING ##########
                                          ##########################################  
med_input = "Title: Veal sweetbreads with ras el hanout, cauliflower purée, watercress and coriander\nDescription: Marcus Eaves' beautiful sweetbread dish masterfully balances delicately flavoured sweetbreads and cauliflower with bright and bold coriander, pomegranate and mint.\nCategory: Starter\nTotal Time: 90\nIngredients: 100g of veal sweetbreads, 70g of T45 flour, 5g of ras el hanout, 1 garlic clove, 1 sprig of thyme, 1 tbsp of butter, plus beurre noisette butter, 1 tsp pine nuts, parsley, chopped, lemon juice, vegetable oil, salt, pepper, 200g of cauliflower, finely chopped, 150g of milk, 100g of double cream, 3g of salt, 120ml of lemon olive oil, 30g of lemon vinegar, salt to taste, 150g of cauliflower, grated on a microplane, salt, 2 tbsp of golden raisins, soaked, 1 tbsp of pine nuts, 1 tbsp of pomegranate seeds, mint leaves, chopped, coriander, finely chopped, 1 cauliflower, butter, 30ml of chicken stock, baby watercress, washed and picked, coriander, olive oilInstruction: Start by making the cauliflower crisp to garnish the dish. Use a mandoline to cut 2mm thick slices of half of the cauliflower set aside for garnish. Lay the slices out onto a non-stick mat and place in a dehydrator at 60°C until the cauliflower is crisp, for approximately 4-6 hours. For the cauliflower purée, add the milk, double cream and salt to a pan and bring to a simmer. Add the cauliflower and cook until tender and soft to the touch, this will take 5-10 minutes. Allow to cook until about half of the liquid has evaporated, then tip the contents of the pan into a strainer, saving the liquid. Blend the cauliflower with half of the remaining liquid, adding more to achieve the desired consistency. Pass through a fine sieve and set aside until ready to serve. For the lemon vinaigrette, simply whisk together the ingredients until smooth. Ensure to mix well again before using later. 
For the cauliflower couscous, cook the cauliflower in salted boiling water until tender for 2-3 minutes, strain and refresh in ice water. Drain and mix in the golden raisins and pine nuts. Toss to combine and season to taste with the lemon vinaigrette. Set aside. To cook the sweetbreads, preheat a water bath to 63 ̊C. Season evenly with salt and pepper and vacuum seal. Cook in the water bath for 11 minutes and cool in ice cold water. Then, remove from the pouch and pat dry. Season the cooked sweetbreads and roll in a mix of the flour and ras el hanout. Add a generous film of olive oil to a pan and place over a high heat. Once hot, caramelise the sweetbreads on one side and turn over. Add a tablespoon of the butter, the garlic and the thyme sprigs and use a spoon to continuously baste the sweetbreads in the foaming butter for another 1-2 minutes. Remove the sweetbreads from the pan, rest for 45 seconds then carve. Drain the fat from the pan, wipe dry with kitchen paper and return to the heat. Make a beurre noisette with another knob of butter and add the pine nuts, chopped parsley and a generous squeeze of lemon juice. Remove from the heat and set aside. Cut the remaining cauliflower half for garnish into medium sized florets. Blanch the pieces of cauliflower in salted boiling water for up to 1 minute, then remove and set aside to dry. Place a small pan over a medium heat and add a knob of butter and the chicken stock. Reduce to a glaze, stirring with a wooden spoon to maintain its texture. Remove from heat and place in a small bowl. Roll the blanched cauliflower through this chicken stock and butter emulsion until well-coated. To serve, add the fresh pomegranate, mint and coriander to the cauliflower couscous. Place one heaped tablespoon of the couscous on each plate and place the buttered cauliflower around the edge. Add slices of sweetbread to cover the cauliflower couscous. 
Dress the sweetbreads with the beurre noisette and decorate the plate with baby watercress, picked coriander leaves and dehydrated cauliflower. Finish with quenelles of the cauliflower purée and a drizzle of olive oil"
chal_input = "Title: Cured sea trout with garden pea, nasturtium and langoustine dashi\nDescription: Phil Fanning celebrates spring's finest fish with great style in this recipe, featuring cured fillets and a sea trout tortellini \nCategory: Main\nTotal Time: 150\nIngredients: 250g of 00 flour, 150g of egg yolk, 15g of milk, 13g of olive oil, 1g of ground sea salt, 2 bunches of fresh tarragon, 200g of caster sugar, 200g of grey salt, 50g of dried shiitake mushrooms, 500g of sea trout, skinless, 88g of sea trout, frozen, 7g of Maldon salt, 15g of egg white, 150g of whipping cream, 100g of frozen peas, 500g of langoustine shells, washed and crushed, 500g of fish stock, 100g of white chicken stock, 50g of leek, washed and sliced, 150g of banana shallot, sliced, 70g of sake, 1 kombu, sheet, 25g of shiitake mushrooms, 1 dash of soy sauce, to taste, 1 bunch of fresh tarragon, to refresh, rice vinegar, dash, egg white, to clarify, 30g of garlic, peeled, 500 morels, 600g of white wine, 340g of white balsamic vinegar, 900ml of ice cold water, 40g of kombu, 150g of caster sugar, 10g of black peppercorns, 24g of fresh thyme, 8g of fresh tarragon, 8 bay leaves, 1 handful of nasturtium flowers and leaves, 1 handful of fresh peas, trout roe, to garnishInstruction: To make the pasta, bring all of the ingredients together and knead well for 10 minutes. Place the dough in a bag and rest in the fridge for 30 minutes. Place the tarragon, sugar, salt and shiitakes in a food processor and blitz for 1-2 minutes. Rub the marinade into the fish and leave to cure for 1 hour. After 1 hour, wash the marinade off and pat dry\r\n. Portion the fish and reserve until ready to cook. To make the trout filling, blitz the frozen trout in a food processor to a fine powder. Scrape down the edges of the bowl, then add the egg white and continue to blitz until you have a smooth trout purée. Transfer this purée to a bowl over ice and fold the cream in without over-working. Add the salt and mix well. 
Place in the fridge for 10 minutes, then fold in the peas and chill. To pickle the morels, bring all of the wet ingredients to a boil in a medium sized pan. Add the sugar and spices and stir to combine. Add the garlic, remove from the heat and leave to cool. Add the fresh herbs and mushrooms, seal in a vac pac bag and leave to sit for a couple of hours, or until ready to plate. To make the langoustine dashi, add the fish stock to a medium sized pan. Reduce by half over a medium heat and set aside until ready to use. Drain the langoustine shells well and pan-fry in a medium pan in a little oil until they have dried and are starting to colour. Add the shallots and leeks to the crushed shells and cook for 5 minutes until the onions have softened. Add the sake and cook for a further minute until all of the alcohol has been burnt off. Add the stocks to a large pan over a medium heat and, 1/3 at a time, add the shells, stirring to combine. Add the kombu, shiitake, rice vinegar and soy, bring to a simmer and remove from the heat. Clairfy with egg white, pass through a muslin cloth and refresh with tarragon leaves. Set aside until ready to use. Remove the pasta from the fridge and roll flat with a rolling pin. Using a pasta machine, continue to progressively stretch and thin your pasta dough until it is as thin as a piece of paper. Using a 5 inch diameter metal cutter, cut circles out of the sheet of pasta. Remove the trout filling from the fridge and while still cold, add a tablespoon of the mousse to each round of pasta, being careful not to overfill the tortellini. Lightly dip your finger into water and run it along the edge of the round pasta shape to moisten the edges. Fold the dough over to form an inflated 'D' and then pull the edges together to form a crossed bow. Dust with flour and set aside on a baking tray dusted liberally with flour. Preheat the oven to 160°C/gas mark 3. 
Lightly dress the cured trout fillets in oil and place in the oven for 15-20 minutes, until lightly baked and flaking when touched. Bring a pan of salted water to the boil, reduce to a delicate simmer and carefully add the tortellini. Simmer for 2-3 minutes or until al dente. Carefully remove from the water and drain. When ready to plate, reheat the dashi at a simmer. Place the trout in the middle of the plate, add the tortellini and a pickled morel. Add a dash of warm dashi and finish with fresh peas, roe and nasturtium flowers"
# Classify the `chal_input` sample with the trained model `rm`.
predicted_class = predict(rm, chal_input)
print("Predicted Class:", predicted_class)

## Model Architecture Visualisation

In [None]:
# Run a dummy batch through the trained model so that the autograd graph of
# `out` can be rendered by make_dot in the next cell.
batch_size = 32
VOCAB_SIZE = len(WORD_TO_IDX)

# ENSURING DEVICES ARE THE SAME
# This has to happen *before* the forward pass: running the model on an input
# that lives on a different device fails, so aligning afterwards is too late.
res_model = rm.to('cpu')  # or 'cuda' if using GPU

# Create a dummy input tensor of random token indices, shape (batch_size, 100)
dummy_input = torch.randint(0, VOCAB_SIZE, (batch_size, 100)).to('cpu')

# Forward pass. Autograd is deliberately left enabled (no torch.no_grad()):
# the graph attached to `out` is what gets visualised.
out = res_model(dummy_input)

# ENSURING VALS ARE CORRECT SHAPE
print(dummy_input.shape)  # Should be (batch_size, 100)
print(rm)          # Check model structure to confirm expected input shape


In [None]:
# Build the computation graph of `out`, labelling nodes with the model's
# parameter names, then write it out as a PNG.
graph = make_dot(
    out,
    params=dict(res_model.named_parameters()),
)
graph.render("RNNClassifier_Architecture", format="png")