In [None]:
#!pip install accelerate==0.25.0
#!pip install bertopic==0.15.0
#!pip install datasets==2.14.4
#!pip install faiss-cpu==1.7.4
#!pip install langchain==0.0.348
#!pip install langchainhub==0.1.14
#!pip install sentence-transformers==2.2.2
#!pip install sentencepiece==0.1.99
#!pip install transformers==4.24.0
!pip install wandb

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import typing as tp

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
import transformers

from transformers import AutoTokenizer, AutoModelForSequenceClassification, PreTrainedTokenizer, PreTrainedTokenizerFast
from datasets import load_dataset
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input tree and print the full path of every file found in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = [os.path.join(root, name) for name in names]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
class CustomDataset(Dataset):
    """
    Map-style dataset that tokenizes one text per index for classification.

    Following the pytorch Dataset convention, a map-style dataset
    implements at least __len__ and __getitem__.

    The constructor only stores its arguments:
    - texts: indexable collection of input texts; its length is the dataset size
    - labels: indexable collection of labels, read at the same index as texts
    - tokenizer: callable invoked as
      tokenizer(text, truncation=..., padding=..., max_length=..., return_tensors='pt')
      and expected to return a mapping with "input_ids" and "attention_mask"
    - max_length: length every sample is padded / truncated to
    """

    def __init__(
        self, texts, labels, tokenizer, max_length
    ) -> None:
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self) -> int:
        """
        Returns the number of rows in the data (the length of texts).
        """
        return len(self.texts)

    def __getitem__(self, idx: int) -> tp.Dict[str, tp.Any]:
        """
        Tokenizes the text at position idx and pairs it with its label.

        Returns a dict with the keys "input_ids", "attention_mask" and "label".
        "label" is a torch.long tensor built from labels[idx].
        """
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )

        # The tokenizer is given a single text, so its tensors are expected to
        # carry a leading batch axis of size 1. Remove exactly that axis:
        # a bare squeeze() would also drop the sequence axis when
        # max_length == 1 and hand back a 0-d tensor.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(label, dtype=torch.long),
        }


class ModelTrainer:
    """
    Runs an experiment with a provided transformers classification model.

    It is meant to include the following components:
    - load data
    - load and configure a model and its artifacts
    - train model
    - validate model
    - save model
    - compute metrics
    - run_experiment (the main entrypoint that executes the whole flow)

    Attention: this class intentionally does not support model inference or
    model serving. It is good practice to keep train and inference classes
    separate, otherwise the code becomes hard to maintain.

    Most of the steps are still unimplemented stubs (marked TODO) that
    return None.
    """

    def __init__(self, model_name: str, dataset_name: str) -> None:
        """
        Stores the model name and the dataset name for later steps.
        """
        self.model_name = model_name
        self.dataset_name = dataset_name

    def configure_optimizer(self, optimizer, params: tp.Dict) -> None:
        """
        Stores the chosen optimizer as self.optimizer and its params
        as self.optimizer_params.
        """
        self.optimizer = optimizer
        self.optimizer_params = params

    def configure_scheduler(self, scheduler) -> None:
        """
        Stores the chosen scheduler (e.g. ReduceLROnPlateau) as self.scheduler.
        """
        self.scheduler = scheduler

    def apply_data_parallel(self) -> None:
        """
        Checks the number of available cuda devices; if there is more than one,
        replaces self.model with a DataParallel wrapper around it.

        self.model must already be set when more than one GPU is visible.
        With zero or one GPU this method does nothing.
        """
        if torch.cuda.device_count() > 1:
            # Rebind self.model: wrapping into a throwaway local would leave
            # the trainer holding the unwrapped model.
            self.model = torch.nn.DataParallel(self.model)

    def load_data(self, filename: str, split: str) -> pd.DataFrame:
        """
        Intended to use the Datasets library to load a dataset
        (e.g. "imdb") for the given split into pandas.
        """
        # TODO: implement; currently returns None.
        pass

    def train(self, dataset: "CustomDataset") -> None:
        """
        Intended to train the model on the given dataset.
        """
        # TODO: implement.
        pass

    def validate(self, dataset: "CustomDataset") -> tp.Dict[str, tp.Iterable]:
        """
        Intended to run a trained model on validation data and return a dict
        with the keys "valid_labels" and "valid_preds" and corresponding values.
        """
        # TODO: implement; currently returns None.
        pass

    def compute_metrics_report(
        self, labels: tp.Iterable, predictions: tp.Iterable
    ) -> tp.Any:
        """
        Intended to compute a classification metric (or several metrics)
        for the given task.
        """
        # TODO: implement; currently returns None.
        pass

    def save_model(self, dst_path: str) -> None:
        """
        Intended to save the model to dst_path. Be careful to check whether the
        model is in a DataParallel state; if it is, it needs to be processed
        accordingly.
        """
        # TODO: implement.
        pass

    def run_experiment(self):
        """
        Main entrypoint.
        Intended to run the flow from loading data to computing metrics.
        """
        # TODO: implement.
        pass


if __name__ == "__main__":
    """run experiment"""
    # NOTE(review): `...` is a placeholder. ModelTrainer.__init__ requires two
    # arguments (model_name, dataset_name), so this call raises TypeError
    # until real values are supplied.
    model_trainer = ModelTrainer(...)
    # run_experiment is the main entrypoint; it is currently an unimplemented stub.
    model_trainer.run_experiment()