# IMPORTS

In [9]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import os
import logging
import typing
import gc
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig, AutoTokenizer, AutoModelForMaskedLM
# Load model directly
from transformers import AutoTokenizer, AutoModelForMaskedLM

In [2]:
# Move the working directory one level up. This is correct when the notebook
# is run from 'home/dev/enefit/notebook'; otherwise adjust the path so that it
# points to the root of the project. Relative paths used further down
# (e.g. "data/03_primary") are resolved against this directory.
os.chdir('..')

In [9]:
%%capture output
# The cell magic above captures everything this cell prints into `output`
# instead of displaying it.

# Load the Kedro IPython extension and reload it.
# NOTE(review): presumably this is what provides the `catalog` object used in
# the next cell -- confirm against the Kedro documentation.
%load_ext kedro.ipython
%reload_kedro

# Display the captured output only when an `output` object exists and its
# stderr contains the substring 'error'.
if 'output' in locals() and 'error' in output.stderr:
    output.show()

In [11]:
%%capture output
# Load the train and test datasets registered in the catalog under the names
# 'train_data_final' and 'test_data_final'.
# NOTE(review): `catalog` is not defined in the visible cells; presumably it
# is injected by the Kedro IPython extension loaded above -- confirm.
train = catalog.load('train_data_final')
test = catalog.load('test_data_final')

In [None]:
def drop_useless(**kwargs) -> None:
    """Reduce each dataframe to its useful columns and store it as parquet.

    Every keyword argument is handled independently: the columns
    'Narrative', 'Anomaly' and 'Synopsis' are selected and the result is
    written to ``data/03_primary/<keyword>_prim.parquet``.

    Args:
        **kwargs: Mapping of dataset name (used to build the file name)
            to the dataframe holding that dataset.
    """
    log = logging.getLogger(__name__)
    kept_columns = ['Narrative', 'Anomaly', 'Synopsis']

    for dataset_name, frame in kwargs.items():
        target_file = f"{dataset_name}_prim.parquet"
        # Column selection happens outside the try block, so a missing
        # column still propagates to the caller.
        subset = frame[kept_columns]
        try:
            log.info("Storing the new dataset from %s in parquet format.", target_file)
            destination = os.path.join("data/03_primary", target_file)
            subset.to_parquet(destination)
        except AttributeError as err:
            # Only attribute errors are logged and swallowed.
            log.error("Error occured : %s", err)

In [13]:
def dropna_row(df:pd.DataFrame, sub:list) -> pd.DataFrame:
    """Remove the rows that have a missing value in any column of ``sub``.

    Args:
        df (pd.DataFrame): The dataframe to filter.
        sub (list): Columns in which missing values are looked for.

    Returns:
        pd.DataFrame: A new dataframe without the rows that had a missing
        value in at least one of the ``sub`` columns.
    """
    return df.dropna(subset=sub, axis="index")

In [23]:
def encode_cell(cell:str, labels:list) -> pd.Series:
    """Encode one multilabel cell as a 0/1 indicator per label.

    The cell is split on ';' and every piece is stripped of surrounding
    whitespace. A label is considered present when at least one piece
    starts with that label.

    Args:
        cell (str): Content of one target cell, with the individual
            anomalies separated by ';'.
        labels (list): Actual list of labels to classify.

    Returns:
        pd.Series: Series indexed by ``labels`` (same order, same length)
        holding 1 where the label is present in the cell and 0 otherwise.
    """
    cell_anomalies = [item.strip() for item in cell.split(';')]
    # Compare against the current `label`, not the whole `labels` collection:
    # str.startswith() rejects a list and, given a tuple, would mark every
    # label as present as soon as any single label matched.
    indicators = {
        label: int(any(item.startswith(label) for item in cell_anomalies))
        for label in labels
    }
    return pd.Series(indicators)

In [None]:
def target_encoder(df:pd.DataFrame, target:str, labels:list) -> pd.DataFrame :
    """Encode the multilabel target column as one indicator list per row.

    Each cell of ``df[target]`` is passed to ``encode_cell`` and replaced
    by a list with one entry per element of ``labels``.

    Args:
        df (pd.DataFrame): Task dataframe containing the multilabel target.
            It is left untouched.
        target (str): The multilabel target column in df.
        labels (list): Actual list of labels to classify.

    Returns:
        pd.DataFrame: A copy of ``df`` whose ``target`` column holds the
        encoded target.
    """
    # Work on a copy: a plain `data = df` only aliases the input, so the
    # caller's dataframe would be overwritten as a side effect.
    data = df.copy()
    # apply() with a Series-returning function yields one column per label.
    encoded = data[target].apply(encode_cell, args=(labels,))
    # Collapse the per-label columns back into a single list per row.
    data[target] = encoded.values.tolist()
    return data

In [None]:
class CustomDataset(Dataset):
    """Dataset exposing the rows of a dataframe as tokenized training samples.

    A PyTorch DataLoader can wrap this class to iterate over the samples.
    The dataframe must provide a ``Narrative`` column (text) and an
    ``Anomaly`` column (target).
    """
    def __init__(self,
                 dataframe: pd.DataFrame,
                 tokenizer: BertTokenizer,
                 max_len: int) -> None:
        """Store the dataframe, the tokenizer and the maximum length.

        Args:
            dataframe (pd.DataFrame): Source data with ``Narrative`` and
                ``Anomaly`` columns.
            tokenizer (BertTokenizer): Tokenizer called on each narrative.
            max_len (int): Value passed to the tokenizer as ``max_length``.
        """
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.narrative = self.data.Narrative
        self.targets = self.data.Anomaly

    def __len__(self) -> int:
        """Return the number of narratives, i.e. the number of samples.

        Returns:
            int: number of samples in the dataframe
        """
        return len(self.narrative)

    def __getitem__(self, index:int) -> dict:
        """Tokenize the narrative at position ``index`` and build one sample.

        Args:
            index (int): Positional index of the row.

        Returns:
            dict: Tensors under the keys 'ids', 'mask', 'token_type_ids'
            (all ``torch.long``) and 'targets' (``torch.float``).
        """
        raw_text = str(self.narrative.iloc[index])
        # Collapse every run of whitespace into a single space.
        text = " ".join(raw_text.split())

        encoded = self.tokenizer(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )

        # Output key -> key in the tokenizer result.
        field_map = (
            ('ids', 'input_ids'),
            ('mask', 'attention_mask'),
            ('token_type_ids', 'token_type_ids'),
        )
        sample = {
            out_key: torch.tensor(encoded[src_key], dtype=torch.long)
            for out_key, src_key in field_map
        }
        sample['targets'] = torch.tensor(self.targets.iloc[index], dtype=torch.float)
        return sample

In [None]:
class BERTClass(torch.nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    The forward pass returns the raw output of the linear layer (no
    sigmoid is applied here).
    """
    def __init__(self,
                 n_labels: int = 14,
                 dropout: float = 0.3,
                 pretrained_name: str = 'bert-base-uncased') -> None:
        """Build the encoder, the dropout layer and the linear head.

        All parameters are optional; calling ``BERTClass()`` builds the same
        network as before (bert-base-uncased, dropout 0.3, 14 outputs).

        Args:
            n_labels (int): Number of outputs of the linear head.
            dropout (float): Dropout probability applied to the pooled output.
            pretrained_name (str): Checkpoint passed to
                ``BertModel.from_pretrained``.
        """
        super().__init__()
        # The attribute names l1/l2/l3 are part of the state-dict keys and
        # are therefore kept unchanged.
        self.l1 = BertModel.from_pretrained(pretrained_name)
        self.l2 = nn.Dropout(dropout)
        # Read the encoder width from its config instead of hard-coding 768,
        # so that checkpoints with another hidden size work as well.
        self.l3 = nn.Linear(self.l1.config.hidden_size, n_labels)

    def forward(self, ids, mask, token_type_ids):
        """Run the encoder and classify its pooled output.

        Args:
            ids: Token ids passed to the encoder.
            mask: Attention mask passed to the encoder.
            token_type_ids: Token type ids passed to the encoder.

        Returns:
            Output of the linear head applied to the dropped-out pooled
            encoder output.
        """
        encoder_outputs = self.l1(ids,
                                  attention_mask = mask,
                                  token_type_ids = token_type_ids,
                                  return_dict=False)
        # With return_dict=False the encoder returns a tuple whose second
        # element is the pooled output; indexing (rather than unpacking two
        # values) keeps working if the tuple carries extra elements.
        pooled = encoder_outputs[1]
        return self.l3(self.l2(pooled))

In [None]:
def train_bert_model(model,
                     loss_func,
                     optimizer,
                     epochs,
                     dataloader,
                     device) :
    """Train ``model`` for ``epochs`` passes over ``dataloader``.

    Every batch must be a dict with the keys 'ids', 'mask',
    'token_type_ids' and 'targets'. The loss of every 1000th batch
    (starting with the first one) is printed.

    Args:
        model: Module called as ``model(ids, mask, token_type_ids)``.
        loss_func: Callable computing the loss from ``(outputs, targets)``.
        optimizer: Optimizer updating the parameters of ``model``.
        epochs (int): Number of passes over the data.
        dataloader: Iterable of batches exposing a sized ``dataset``.
        device: Device the batch tensors are moved to.
    """
    # Loop invariant, only used for the progress message.
    size = len(dataloader.dataset)
    for epoch in range(1, epochs + 1):
        model.train() # tell PyTorch the model is being trained
        for batch, data in enumerate(dataloader):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)

            # Reset the gradients once per step (the second, redundant
            # zero_grad() call was removed).
            optimizer.zero_grad()
            outputs = model(ids, mask, token_type_ids)
            loss = loss_func(outputs, targets)
            if batch % 1000 == 0:
                current = (batch + 1) * len(targets)
                print(f"Epoch: {epoch}, loss: {loss.item():>7f}  [{current:>5d}/{size:>5d}]")
            loss.backward()
            optimizer.step()

In [None]:
def eval_pt_model(model,
                  epochs,
                  testingloader,
                  device):
    """Evaluate ``model`` on ``testingloader`` and print the metrics.

    The sigmoid of the model outputs is thresholded at 0.5 to obtain the
    predictions. Accuracy, micro F1 and macro F1 are computed over all
    batches and printed. The whole evaluation is repeated ``epochs`` times.

    Args:
        model: Module called as ``model(ids, mask, token_type_ids)``.
        epochs (int): Number of times the evaluation is run.
        testingloader: Iterable of batches; every batch is a dict with the
            keys 'ids', 'mask', 'token_type_ids' and 'targets'.
        device: Device the batch tensors are moved to.
    """
    for _ in range(epochs):
        model.eval()
        fin_targets=[]
        fin_outputs=[]
        with torch.no_grad():
            for data in testingloader:
                ids = data['ids'].to(device, dtype = torch.long)
                mask = data['mask'].to(device, dtype = torch.long)
                token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
                targets = data['targets'].to(device, dtype = torch.float)
                outputs = model(ids, mask, token_type_ids)
                fin_targets.extend(targets.cpu().detach().numpy().tolist())
                fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
        fin_outputs = np.array(fin_outputs) >= 0.5
        # Score the accumulated targets and thresholded predictions of ALL
        # batches. The per-batch `targets`/`outputs` variables only hold the
        # last batch, and `outputs` is still raw, un-thresholded logits.
        accuracy = metrics.accuracy_score(fin_targets, fin_outputs)
        f1_score_micro = metrics.f1_score(fin_targets, fin_outputs, average='micro')
        f1_score_macro = metrics.f1_score(fin_targets, fin_outputs, average='macro')
        print(f"Accuracy Score = {accuracy}")
        print(f"F1 Score (Micro) = {f1_score_micro}")
        print(f"F1 Score (Macro) = {f1_score_macro}")

In [None]:

# Load two pretrained tokenizer/model pairs by checkpoint name:
# "NASA-AIML/MIKA_SafeAeroBERT" (with a masked-language-modelling head) and
# 'bert-base-uncased' (bare BertModel).
# NOTE(review): none of these four names is referenced by the other visible
# cells; the dataset cell uses a `tokenizer` variable that is not defined in
# the visible code -- confirm which tokenizer is intended.
safeAeroTokenizer = AutoTokenizer.from_pretrained("NASA-AIML/MIKA_SafeAeroBERT")
safeAeroModel = AutoModelForMaskedLM.from_pretrained("NASA-AIML/MIKA_SafeAeroBERT")
bertTokeniser = BertTokenizer.from_pretrained('bert-base-uncased')
bertModel = BertModel.from_pretrained('bert-base-uncased')

In [None]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 8
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 8
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [None]:
training_set = CustomDataset(train_df, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_df, tokenizer, MAX_LEN)

In [None]:
# Instantiate the classifier with its default configuration and move its
# parameters to `device`.
# NOTE(review): `device` is not defined in the visible cells -- confirm it is
# set before this cell runs.
model = BERTClass()
model.to(device)

In [None]:
# BCEWithLogitsLoss expects raw logits, which matches BERTClass.forward (it
# returns the linear layer output without applying a sigmoid).
loss_fn = torch.nn.BCEWithLogitsLoss()
# Adam over all model parameters.
# NOTE(review): `LEARNING_RATE` is not defined in the visible cells -- confirm
# it is set before this cell runs.
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

