### Alquimia text classification training 🤗🤗

#### Setup credentials & dependencies

In [1]:
!pip install transformers==4.28.0 datasets evaluate accelerate ipywidgets jupyterlab_widgets -q

In [2]:
# Enable git's `store` credential helper globally
# (presumably so the Hub pushes later in this notebook can authenticate -- confirm).
!git config --global credential.helper store

In [4]:
!pip install mlflow -q

In [5]:
import mlflow,os

In [None]:
# Log in to the Hugging Face Hub from the notebook.
# Access tokens are created at https://huggingface.co/settings/tokens
from huggingface_hub import notebook_login

notebook_login()

In [None]:
## Load some important variables
import ipywidgets as widgets
from IPython.display import display

# Creating the widgets

# Fraction of rows held out for evaluation. scikit-learn's train_test_split
# rejects a float test_size of exactly 0 or 1, so the slider stops short of both.
test_size_slider = widgets.FloatSlider(
    value=0.2,
    min=0.05,
    max=0.95,
    step=0.05,
    description='Test Size:',
    continuous_update=False
)

# Number of target classes. A value of 1 would make the sequence-classification
# head behave as a regressor, so at least 2 labels are required.
number_of_labels_spinner = widgets.BoundedIntText(
    value=2,
    min=2,
    max=100,
    step=1,
    description='Number of Labels:',
    style={'description_width': 'initial'}  # keep the long description from being cut off
)

# Name of the CSV column that holds the category of each example.
label_column_name_text = widgets.Text(
    value='label',
    placeholder='Type something',
    description='Label Column:',
    disabled=False
)

# Name of the CSV column that holds the text to classify.
text_column_name_text = widgets.Text(
    value='text',
    placeholder='Type something',
    description='Text Column:',
    disabled=False
)

# Output area for the callback's feedback. Text printed from a widget callback
# is not reliably shown under the cell (JupyterLab sends it to the log console),
# so it is captured in an Output widget that is displayed with the controls.
variables_output = widgets.Output()

# Button to set the variables
def set_variables(button):
    """Copy the current widget values into the notebook-level variables.

    Sets the globals ``test_size``, ``number_of_labels``, ``label_column_name``
    and ``text_column_name`` and prints a confirmation into ``variables_output``.

    Args:
        button: The clicked button instance passed by ``on_click`` (unused).
    """
    global test_size, number_of_labels, label_column_name, text_column_name
    test_size = test_size_slider.value
    number_of_labels = number_of_labels_spinner.value
    # Column names are looked up verbatim later on; stray spaces would break the lookup.
    label_column_name = label_column_name_text.value.strip()
    text_column_name = text_column_name_text.value.strip()
    variables_output.clear_output(wait=True)
    with variables_output:
        print(f"Set values: Test Size: {test_size}, Number of Labels: {number_of_labels}, Label Column: {label_column_name}, Text Column: {text_column_name}")

set_button = widgets.Button(description="Set Variables")
set_button.on_click(set_variables)

# Displaying the widgets
display(test_size_slider, number_of_labels_spinner, label_column_name_text, text_column_name_text, set_button, variables_output)

In [None]:
# Echo the values chosen through the widgets. The four names only exist once
# the 'Set Variables' callback has assigned them, hence the NameError guard.
try:
    print(f"Test Size: {test_size}")
    print(f"Number of Labels: {number_of_labels}")
    print(f"Label Column Name: {label_column_name}")
    print(f"Text Column Name: {text_column_name}")
except NameError:
    print("Variables not set yet. Please set them using the widgets and the 'Set Variables' button in the previous cell.")

In [None]:
import ipywidgets as widgets
from IPython.display import display

# Single-file picker restricted to .csv uploads
upload = widgets.FileUpload(accept='.csv', multiple=False)

display(upload)

In [None]:
## Once the first csv was uploaded we retrieve its data and store it in a dataframe
import pandas as pd
import io

# Check if a file has been uploaded
if upload.value:
    # ipywidgets 7 exposes ``value`` as {filename: file_info}, while ipywidgets 8
    # exposes a tuple of file_info mappings that carry their own 'name' entry.
    if isinstance(upload.value, dict):
        uploaded_filename, file_info = next(iter(upload.value.items()))
    else:
        file_info = upload.value[0]
        uploaded_filename = file_info['name']
    content = file_info['content']

    # The payload can arrive as a memoryview or as raw bytes depending on the
    # ipywidgets version; decode either one to text before parsing.
    if isinstance(content, (memoryview, bytes, bytearray)):
        content = bytes(content).decode('utf-8')

    df = pd.read_csv(io.StringIO(content))
    print("File uploaded successfully!")
else:
    # ``df`` stays undefined in this case; upload a file and re-run this cell.
    print("No file uploaded.")

In [None]:
## Show the first three rows of the uploaded data
df.head(3)

### Define mapping 

Here you have to define a mapping from category names to integer labels so the model can be trained properly. Let's see an example:

```python
category_to_label = {
    'availability': 0,
    'irrelevant': 1,
    'post sale': 2,
    'invoice':3,
    'service':4,
    'pricing':5,
    'general':6,
    'cancelation policy':7,
    'cancel reservation':8
}
```

In [None]:
import ipywidgets as widgets
from IPython.display import display, clear_output

def create_mapping_interactively():
    """Render widgets for building the category -> integer label mapping.

    "Add" stores the current category/label pair and lists the mapping built so
    far. "Generate Dictionary" publishes a snapshot of that mapping as the
    global ``category_to_label``.
    """
    # Create text box for categories and labels
    category_text = widgets.Text(value='', placeholder='Enter category', description="Category:")
    label_text = widgets.IntText(value=0, description="Label:")
    add_button = widgets.Button(description="Add")
    generate_button = widgets.Button(description="Generate Dictionary")
    output = widgets.Output()

    # Temporary storage for categories and labels
    temp_storage = {}

    # Function to handle adding category and label
    def add_category_label(b):
        """Record the entered pair (unless the category is blank) and list the mapping."""
        with output:
            clear_output(wait=True)
            if category_text.value.strip():
                temp_storage[category_text.value] = label_text.value
            else:
                # A blank key can never match a real category in the data.
                print("Category name is empty; nothing was added.")
            print("Current Mapping:")
            for key, val in temp_storage.items():
                print(f"{key} -> {val}")
            category_text.value = ''  # Clear the category input box for new entry

    # Function to finalize the dictionary
    def generate_dictionary(b):
        """Publish a copy of the mapping built so far as ``category_to_label``."""
        global category_to_label
        # Store a copy: sharing the working dict would let later "Add" clicks
        # change the published mapping without "Generate Dictionary" being pressed.
        category_to_label = dict(temp_storage)
        with output:
            clear_output(wait=True)
            print("Final Dictionary:")
            print(category_to_label)

    # Link button actions to respective functions
    add_button.on_click(add_category_label)
    generate_button.on_click(generate_dictionary)

    # Display the widgets
    display(category_text, label_text, add_button, generate_button, output)

create_mapping_interactively()


In [None]:
# Display the mapping saved by the "Generate Dictionary" button
category_to_label

In [None]:
# Translate the category names in the chosen column into integer labels.
mapped_labels = df[label_column_name].replace(category_to_label)

# Fail early when a category has no entry in the mapping: values that are still
# strings after the replacement cannot be used as class ids during training.
unmapped_categories = sorted({value for value in mapped_labels.unique() if isinstance(value, str)})
if unmapped_categories:
    raise ValueError(f"No label defined in category_to_label for: {unmapped_categories}")

df['label'] = mapped_labels

# Drop the bookkeeping columns that are not used for training. errors='ignore'
# keeps this cell re-runnable and tolerates files that lack some of the columns.
df = df.drop(
    columns=['annotation_id', 'annotator', 'created_at', 'id', 'lead_time', 'updated_at'],
    errors='ignore',
)
df.head(3)

### Train/Test split

In [None]:
from sklearn.model_selection import train_test_split


# A fixed random_state makes the split (and therefore the reported metrics)
# reproducible across kernel restarts.
df_train, df_test = train_test_split(df, test_size=test_size, random_state=42)

### Convert to HuggingFace dataset

In [None]:
from datasets import Dataset

# Wrap each pandas split in a Hugging Face Dataset
train_dataset, test_dataset = (
    Dataset.from_pandas(split_frame) for split_frame in (df_train, df_test)
)

In [None]:
from transformers import AutoTokenizer
# Base checkpoint name; the tokenizer is loaded from it here
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
def preprocess_function(examples):
    """Tokenize the text column of a batch, truncating over-long inputs.

    Args:
        examples: Mapping that is indexed with ``text_column_name`` to obtain
            the texts to tokenize.

    Returns:
        Whatever ``tokenizer`` returns for those texts with ``truncation=True``.
    """
    texts = examples[text_column_name]
    return tokenizer(texts, truncation=True)

In [None]:
# Tokenize both splits in batches
tokenized_train, tokenized_test = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, test_dataset)
)

### PyTorch model

In [None]:
from transformers import AutoModelForSequenceClassification

# Load the base checkpoint with a classification head sized to the chosen label count
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=number_of_labels,
)

In [None]:
from transformers import TrainingArguments, Trainer
import evaluate
import numpy as np
from transformers import DataCollatorWithPadding

# Collator built around the tokenizer; it is handed to the Trainer below
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score one evaluation pass with the loaded accuracy metric.

    Args:
        eval_pred: A ``(logits, labels)`` pair, unpacked below.

    Returns:
        The result of ``metric.compute`` for the arg-max class predictions.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=references)

In [None]:
# Settings read by the MLflow callback, see
# https://huggingface.co/docs/transformers/v4.34.0/en/main_classes/callback#transformers.integrations.MLflowCallback
os.environ.update(
    {
        "MLFLOW_EXPERIMENT_NAME": "text-classification",
        "MLFLOW_FLATTEN_PARAMS": "1",
    }
)

In [None]:
import ipywidgets as widgets
from IPython.display import display

# Free-text field for the model name
model_name_widget = widgets.Text(
    description='Model Name:',
    placeholder='Enter model name',
    value='',
    disabled=False,
)

# Confirmation button; button_style may be 'success', 'info', 'warning', 'danger', or ''
confirm_button = widgets.Button(
    description='Confirm',
    tooltip='Click to confirm the model name',
    icon='check',  # FontAwesome icon name without `fa-`
    button_style='',
    disabled=False,
)

# Output area for the callback's feedback. Text printed from a widget callback
# is not reliably shown under the cell (JupyterLab sends it to the log console),
# so it is captured in an Output widget that is displayed with the controls.
confirm_output = widgets.Output()

# Function to handle the input once the button is clicked
def on_button_click(button):
    """Store the entered model name in the global ``hf_model_name``.

    Blank input is rejected with a message instead of being saved, because the
    name is later passed to ``TrainingArguments`` as ``hub_model_id``.

    Args:
        button: The clicked button instance passed by ``on_click`` (unused).
    """
    global hf_model_name
    entered_name = model_name_widget.value.strip()
    confirm_output.clear_output(wait=True)
    with confirm_output:
        if not entered_name:
            print("Model name is empty. Please enter a name before confirming.")
            return
        hf_model_name = entered_name
        print(f"Saved model name: {hf_model_name}")

# Attach the button click event to the function
confirm_button.on_click(on_button_click)

# Display the widget and button
display(model_name_widget, confirm_button, confirm_output)


In [None]:
# Echo the confirmed model name
print(f"My model name is {hf_model_name}")

In [None]:
# Hyper-parameters and Hub settings for the fine-tuning run
training_args = TrainingArguments(
    output_dir="./output",
    # optimisation
    learning_rate=2e-4,
    weight_decay=0.01,
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # evaluate and log once per epoch
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    # Hugging Face Hub
    push_to_hub=True,
    hub_model_id=hf_model_name,
)

# Bundle the model, data, collator and metric function for training
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
### Fine-tune the model with the Trainer configured above
trainer.train()

In [None]:
## End the active MLflow run
mlflow.end_run()

In [None]:
### Upload the trained model to the Hugging Face Hub

trainer.push_to_hub()

In [None]:
%%sh
# Abort on the first failing command so the git commands never run outside
# ./output (for example when the directory is missing and `cd` fails).
set -e
cp -r mlruns output
cd output
git add mlruns
git commit -m 'Add MLFlow run'
git push