# Fine-tune DistilBERT for text classification

This notebook trains `text-classification` models based on the `distilbert-base-uncased` model. To do so, we use [Transformers 🤗🤗](https://huggingface.co/docs/transformers/index).

### Considerations
- The dataset must have a "text" column that contains all the input questions
- An `S3 Instance` is required to correctly store the model

#### Install required libs   📥📥

In [None]:
!pip install transformers datasets evaluate accelerate  mlflow tf-keras seaborn optimum[openvino,nncf,exporters] psutil pynvml -q

## Dataset manipulation & env preparation

In [None]:
import sys
from pathlib import Path

# Make the directory two levels above the notebook's working directory importable,
# so that project-local packages can be imported from this notebook.
notebook_dir = Path.cwd().resolve()
project_root = notebook_dir.parents[1]
sys.path.append(str(project_root))

In [None]:
from nlp import Trainer

In [None]:
import pandas as pd
import os
import json

# --- Dataset and label-mapping configuration ---
input_column_name = "text"
# Column holding the human-readable class name of every row.
# NOTE(review): this name was used below but never defined, so the cell failed with a
# NameError. "intent" is a placeholder guess; set it to the real column name of your CSV.
output_column_name = "intent"
labeled_dataset = "datasets/dataset.csv"

df = pd.read_csv(labeled_dataset)

# Fail early, with a readable message, when an expected column is absent.
missing_columns = {input_column_name, output_column_name} - set(df.columns)
if missing_columns:
    raise KeyError(f"{labeled_dataset} is missing required column(s): {sorted(missing_columns)}")

# Context managers close the JSON files as soon as they have been parsed.
with open('datasets/label2id.json', encoding="utf-8") as file_label2id:
    label2id = json.load(file_label2id)
with open('datasets/id2label.json', encoding="utf-8") as file_id2label:
    id2label = json.load(file_id2label)

# Every class name in the dataset must have an id; otherwise the 'label' column would
# silently contain unmapped values.
unknown_labels = set(df[output_column_name].unique()) - set(label2id)
if unknown_labels:
    raise ValueError(
        f"Labels without an entry in label2id: {sorted(map(str, unknown_labels))}"
    )

# Encode the class names as ids in a new 'label' column.
df['label'] = df[output_column_name].map(label2id)

print(f"The label2id json loaded correctly: {label2id}")
print(f"The id2label json loaded correctly: {id2label}")

# Last expression of the cell, so the preview is actually rendered.
df.head(3)

## Give a name to your model and version  🧙‍♂️🧙‍♂️

This step is crucial because a `text-classification` model can be used for many different purposes, so the name and version should make clear which one this model serves.

In [None]:
# Checkpoint that is fine-tuned.
base_model = "distilbert-base-uncased"

# Name given to the fine-tuned model (also used for its local directory and
# as the registered model name in MLflow).
model_name = "intents-copa"

# MLflow experiment and run under which training is tracked.
MLFLOW_EXPERIMENT = "showcases"
MLFLOW_RUN_NAME = "V1 Intents for copa model"

In [None]:
# Build the project-local trainer. Every argument is passed positionally: the previous
# form mixed `dataset=df` with positional arguments after it, which is a SyntaxError.
# NOTE(review): assumes `dataset` is the first parameter of `nlp.Trainer` — confirm
# against its signature.
trainer = Trainer(
    df,
    label2id,
    id2label,
    base_model,
    model_name,
    MLFLOW_EXPERIMENT,
    MLFLOW_RUN_NAME,
)

In [None]:
# Fine-tune the model.
# NOTE(review): `Trainer` is the project-local class imported from `nlp`; what `train()`
# logs or writes to disk is not visible from this notebook — confirm in that module.

trainer.train()

### Batch size per epoch

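If the dataset contains `total_dataset` examples and you train with a given `batch_size` (for example, 20), then

$$n = \frac{\text{total\_dataset}}{\text{batch\_size}}$$

where $n$ is the number of batches per epoch (rounded up when the dataset size is not a multiple of the batch size).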

### How many training steps will my model run?

`n * epochs`, i.e. the number of batches per epoch multiplied by the number of epochs.

In [None]:
# Save the fine-tuned PyTorch model under the local path `model_name`; later cells
# load it from that path for the ONNX export and finally delete it.
trainer.save_model(model_name)

In [None]:
# Imported here so this cell also works after "Restart Kernel -> Run All"
# (the other mlflow import in this notebook lives in a later cell).
import mlflow

# Look up the experiment by the name configured earlier in the notebook.
# `get_experiment_by_name` returns None when no such experiment exists.
experiment = mlflow.get_experiment_by_name(MLFLOW_EXPERIMENT)
if experiment is None:
    raise LookupError(f"MLflow experiment '{MLFLOW_EXPERIMENT}' does not exist.")

# Runs store their display name in the `mlflow.runName` tag.
filter_string = f"tags.mlflow.runName = '{MLFLOW_RUN_NAME}'"
runs = mlflow.search_runs(
    experiment_ids=[experiment.experiment_id],
    filter_string=filter_string,
)

# Extract the run_id from the DataFrame. Stop here when nothing matches, because
# the following cells need `previous_run_id` to attach artifacts to the run.
if runs.empty:
    raise LookupError("No run found with the specified name.")

previous_run_id = runs.iloc[0]['run_id']
print(f"Run ID: {previous_run_id}")

In [None]:
import mlflow.data
from mlflow.data.pandas_dataset import PandasDataset
from optimum.onnxruntime import ORTModelForSequenceClassification
from pathlib import Path
import onnx

In [None]:
def uploadModel(run_id: str):
    """Export the fine-tuned model to ONNX and log it to an existing MLflow run.

    The model stored under `model_name` is exported to `<model_name>_onnx`, the
    tokenizer is saved next to it, and the directory, the confusion-matrix image,
    the ONNX model (registered as `model_name`) and both datasets are logged to
    the run identified by `run_id`.

    Args:
        run_id: ID of the MLflow run to resume; everything is attached to it.

    Returns:
        The value returned by `mlflow.onnx.log_model`.

    NOTE(review): relies on the notebook globals `model_name`, `df_train`, `df_test`
    and `tokenizer`. The last three are not defined in any cell visible here —
    confirm where they come from before running.
    """
    onnx_dir = Path(f"{model_name}_onnx")
    train_dataset: PandasDataset = mlflow.data.from_pandas(df_train, source="Label Studio")
    test_dataset: PandasDataset = mlflow.data.from_pandas(df_test, source="Label Studio")

    # Resume the run given by the caller (previously the argument was ignored in
    # favour of the global `previous_run_id`). The context manager ends the run,
    # so no explicit `mlflow.end_run()` is needed.
    with mlflow.start_run(run_id=run_id):
        ORTModelForSequenceClassification.from_pretrained(model_name, export=True).save_pretrained(onnx_dir)
        # Save the tokenizer BEFORE uploading the directory so its files are part
        # of the logged artifacts.
        tokenizer.save_pretrained(onnx_dir)

        mlflow.log_artifacts(str(onnx_dir), artifact_path=model_name)
        mlflow.log_artifact("confusion_matrix.png", artifact_path=model_name)

        onnx_model = onnx.load_model(str(onnx_dir / "model.onnx"))
        model_info = mlflow.onnx.log_model(onnx_model, model_name, registered_model_name=model_name)

        mlflow.log_input(train_dataset, context="training")
        mlflow.log_input(test_dataset, context="validation")

    return model_info

In [None]:
# Export the model to ONNX and log it to MLflow, passing the run id resolved earlier.
uploadModel(previous_run_id)

In [None]:
# End the currently active MLflow run, if any, so none is left open in this kernel.
mlflow.end_run()

In [None]:
## Delete directories in Jupyter Notebook
# WARNING: this also deletes the labeled dataset and the label-mapping files.
import shutil
from pathlib import Path

# Local directories created while training / exporting.
# NOTE(review): the original code removed an undefined `run_name` directory; this
# assumes the training output directory is named after MLFLOW_RUN_NAME — confirm.
directories_to_remove = [
    model_name,
    MLFLOW_RUN_NAME,
    f"{model_name}_onnx",
]

# Individual files consumed or produced by the notebook.
files_to_remove = [
    labeled_dataset,
    "datasets/label2id.json",
    "datasets/id2label.json",
    "./confusion_matrix.png",
]

# Skip paths that are already gone so the cell can be re-run and one missing
# entry does not abort the rest of the cleanup.
for directory in directories_to_remove:
    if Path(directory).is_dir():
        shutil.rmtree(directory)

for file_path in files_to_remove:
    Path(file_path).unlink(missing_ok=True)

---