# Try out gradio

**Load prerequisites**

In [2]:
# Prerequisites
from tabulate import tabulate
from transformers import pipeline
import json
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import os
import time
import math

# get candidate labels
with open("packing_label_structure.json", "r") as file:
    candidate_labels = json.load(file)
keys_list = list(candidate_labels.keys())

# Load test data (in list of dictionaries)
with open("test_data.json", "r") as file:
    packing_data = json.load(file)
# Extract all trip descriptions and trip_types
trip_descriptions = [trip['description'] for trip in packing_data]
trip_types = [trip['trip_types'] for trip in packing_data]

# Access the first trip description
first_trip = trip_descriptions[0]
# Get the packing list for the secondfirst trip
first_trip_type = trip_types[0]

print(f"First trip: {first_trip} \n")
print(f"Trip type: {first_trip_type}")

First trip: I am planning a trip to Greece with my boyfriend, where we will visit two islands. We have booked an apartment on each island for a few days and plan to spend most of our time relaxing. Our main goals are to enjoy the beach, try delicious local food, and possibly go on a hike—if it’s not too hot. We will be relying solely on public transport. We’re in our late 20s and traveling from the Netherlands. 

Trip type: ['beach vacation', ['swimming', 'going to the beach', 'relaxing', 'hiking'], 'warm destination / summer', 'lightweight (but comfortable)', 'casual', 'indoor', 'no own vehicle', 'no special conditions', '7+ days']


In [3]:
# function that returns pandas data frame with predictions

cut_off = 0.5  # used to choose which activities are relevant

def pred_trip(model_name, trip_descr, trip_type, cut_off):
    classifier = pipeline("zero-shot-classification", model=model_name)
    # Create an empty DataFrame with specified columns
    df = pd.DataFrame(columns=['superclass', 'pred_class'])
    for i, key in enumerate(keys_list):
        if key == 'activities':
            result = classifier(trip_descr, candidate_labels[key], multi_label=True)
            indices = [i for i, score in enumerate(result['scores']) if score > cut_off]
            classes = [result['labels'][i] for i in indices]
        else:
            result = classifier(trip_descr, candidate_labels[key])
            classes = result["labels"][0]
        print(result)
        print(classes)
        print(i)
        df.loc[i] = [key, classes]
    df['true_class'] = trip_type
    return df

In [4]:
# function for accuracy, perc true classes identified and perc wrong pred classes

def perf_measure(df):
    df['same_value'] = df['pred_class'] == df['true_class']
    correct = sum(df.loc[df.index != 1, 'same_value'])
    total = len(df['same_value'])
    accuracy = correct/total
    pred_class = df.loc[df.index == 1, 'pred_class'].iloc[0]
    true_class = df.loc[df.index == 1, 'true_class'].iloc[0]
    correct = [label for label in pred_class if label in true_class]
    num_correct = len(correct)
    correct_perc = num_correct/len(true_class)
    num_pred = len(pred_class)
    if num_pred == 0:
        wrong_perc = math.nan
    else:
        wrong_perc = (num_pred - num_correct)/num_pred
    df_perf = pd.DataFrame({
    'accuracy': [accuracy],
    'true_ident': [correct_perc],
    'false_pred': [wrong_perc]
    })
    return(df_perf)

Provide a list of candidate models and apply them to the test data

In [5]:
# List of Hugging Face model names
model_names = [
    #"facebook/bart-large-mnli",
    #"MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli",
    #"cross-encoder/nli-deberta-v3-base",
    #"cross-encoder/nli-deberta-v3-large",
    #"MoritzLaurer/mDeBERTa-v3-base-mnli-xnli",
    #"joeddav/bart-large-mnli-yahoo-answers",
    #"MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli",
    #"MoritzLaurer/deberta-v3-large-zeroshot-v2.0",
    #"valhalla/distilbart-mnli-12-1",
    #"joeddav/xlm-roberta-large-xnli" # keeps giving errors
]

# Apply each model to the test data
for model_name in model_names:
    print(f"\nUsing model: {model_name}")
    result_list = []
    performance = pd.DataFrame(columns=['accuracy', 'true_ident', 'false_pred'])
    
    start_time = time.time()
    for i in range(len(trip_descriptions)):
        current_trip = trip_descriptions[i]
        current_type = trip_types[i]
        df = pred_trip(model_name, current_trip, current_type, cut_off = 0.5)
        print(df)
        # accuracy, perc true classes identified and perc wrong pred classes
        performance = pd.concat([performance, perf_measure(df)])
        print(performance)
        
        result_list.append(df)
    end_time = time.time()
    elapsed_time = end_time - start_time
    # Extract "same_value" column from each DataFrame
    sv_columns = [df['same_value'] for df in result_list]  # 'same' needs to be changed
    sv_columns.insert(0, result_list[0]['superclass'])
    # Combine into a new DataFrame (columns side-by-side)
    sv_df = pd.concat(sv_columns, axis=1)
    print(sv_df)
    # Compute accuracy per superclass (row means of same_value matrix excluding the first column)
    row_means = sv_df.iloc[:, 1:].mean(axis=1)
    df_row_means = pd.DataFrame({
        'superclass': sv_df['superclass'],
        'accuracy': row_means
    })
    print(df_row_means)
    # Compute performance measures per trip (mean for each column of performance table)
    column_means = performance.mean()
    print(column_means)
    # save results
    model = model_name.replace("/", "-")
    model_result = {
        'model': model,
        'predictions': result_list,
        'performance': performance,
        'perf_summary': column_means,
        'perf_superclass': df_row_means,
        'elapsed_time': elapsed_time
    }
    # File path with folder
    filename = os.path.join('results', f'{model}_results.pkl')
    # Save the object
    with open(filename, 'wb') as f:
        pickle.dump(model_result, f)






Using model: joeddav/xlm-roberta-large-xnli


Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


ValueError: Converting from Tiktoken failed, if a converter for SentencePiece is available, provide a model path with a SentencePiece tokenizer.model file.Currently available slow->fast convertors: ['AlbertTokenizer', 'BartTokenizer', 'BarthezTokenizer', 'BertTokenizer', 'BigBirdTokenizer', 'BlenderbotTokenizer', 'CamembertTokenizer', 'CLIPTokenizer', 'CodeGenTokenizer', 'ConvBertTokenizer', 'DebertaTokenizer', 'DebertaV2Tokenizer', 'DistilBertTokenizer', 'DPRReaderTokenizer', 'DPRQuestionEncoderTokenizer', 'DPRContextEncoderTokenizer', 'ElectraTokenizer', 'FNetTokenizer', 'FunnelTokenizer', 'GPT2Tokenizer', 'HerbertTokenizer', 'LayoutLMTokenizer', 'LayoutLMv2Tokenizer', 'LayoutLMv3Tokenizer', 'LayoutXLMTokenizer', 'LongformerTokenizer', 'LEDTokenizer', 'LxmertTokenizer', 'MarkupLMTokenizer', 'MBartTokenizer', 'MBart50Tokenizer', 'MPNetTokenizer', 'MobileBertTokenizer', 'MvpTokenizer', 'NllbTokenizer', 'OpenAIGPTTokenizer', 'PegasusTokenizer', 'Qwen2Tokenizer', 'RealmTokenizer', 'ReformerTokenizer', 'RemBertTokenizer', 'RetriBertTokenizer', 'RobertaTokenizer', 'RoFormerTokenizer', 'SeamlessM4TTokenizer', 'SqueezeBertTokenizer', 'T5Tokenizer', 'UdopTokenizer', 'WhisperTokenizer', 'XLMRobertaTokenizer', 'XLNetTokenizer', 'SplinterTokenizer', 'XGLMTokenizer', 'LlamaTokenizer', 'CodeLlamaTokenizer', 'GemmaTokenizer', 'Phi3Tokenizer']

**Load and compare results**

In [5]:
# Folder where .pkl files are saved
results_dir = 'results'

# Dictionary to store all loaded results
all_results = {}

# Loop through all .pkl files in the folder
for filename in os.listdir(results_dir):
    if filename.endswith('.pkl'):
        model_name = filename.replace('_results.pkl', '')  # Extract model name
        file_path = os.path.join(results_dir, filename)
        
        # Load the result
        with open(file_path, 'rb') as f:
            result = pickle.load(f)
            all_results[model_name] = result

# Compare performance across models
for model, data in all_results.items():
    print(f"Model: {model}")
    print(f"Performance Summary:\n{data['perf_summary']}")
    print("-" * 40)

# Compare performance across models
for model, data in all_results.items():
    print(f"Model: {model}")
    print(f"Performance Summary:\n{data['perf_superclass']}")
    print("-" * 40)

Model: cross-encoder-nli-deberta-v3-base
Performance Summary:
accuracy      0.444444
true_ident    0.533333
false_pred    0.712500
dtype: float64
----------------------------------------
Model: joeddav-bart-large-mnli-yahoo-answers
Performance Summary:
accuracy      0.311111
true_ident    0.650000
false_pred    0.553792
dtype: float64
----------------------------------------
Model: cross-encoder-nli-deberta-v3-large
Performance Summary:
accuracy      0.466667
true_ident    0.566667
false_pred    0.541667
dtype: float64
----------------------------------------
Model: MoritzLaurer-DeBERTa-v3-large-mnli-fever-anli-ling-wanli
Performance Summary:
accuracy      0.566667
true_ident    0.841667
false_pred    0.546667
dtype: float64
----------------------------------------
Model: MoritzLaurer-mDeBERTa-v3-base-mnli-xnli
Performance Summary:
accuracy      0.466667
true_ident    0.408333
false_pred    0.481250
dtype: float64
----------------------------------------
Model: MoritzLaurer-deberta-v3-

**Identify trips that are difficult to predict**

Per model

In [54]:
def get_difficult_trips(model_result, cut_off = 0.6):
    # model_result is a dict with dict_keys(['model', 'predictions', 
    # 'performance', 'perf_summary', 'perf_superclass', 'elapsed_time'])
    # get performance dataframe and repair index
    df = model_result['performance'].reset_index(drop=True)
    # find index of trips whose accuracy is below cut_off
    index_result = df[df['accuracy'] < cut_off].index
    return(index_result)

# dictionary of trips that have accuracy below cut_off default
difficult_trips_dict = {}
for model, data in all_results.items():
    difficult_trips_dict[data["model"]] = get_difficult_trips(data)

for key, value in difficult_trips_dict.items():
    print(f"{key}: {value}\n")

cross-encoder-nli-deberta-v3-base: Index([0, 2, 3, 4, 5, 6, 7, 8, 9], dtype='int64')

joeddav-bart-large-mnli-yahoo-answers: RangeIndex(start=0, stop=10, step=1)

cross-encoder-nli-deberta-v3-large: Index([0, 1, 2, 3, 4, 6, 7, 8, 9], dtype='int64')

MoritzLaurer-DeBERTa-v3-large-mnli-fever-anli-ling-wanli: Index([2, 5, 6, 7, 8, 9], dtype='int64')

MoritzLaurer-mDeBERTa-v3-base-mnli-xnli: RangeIndex(start=0, stop=10, step=1)

MoritzLaurer-deberta-v3-large-zeroshot-v2.0: Index([0, 1, 2, 3, 4, 5, 6, 7, 9], dtype='int64')

facebook-bart-large-mnli: RangeIndex(start=0, stop=10, step=1)

valhalla-distilbart-mnli-12-1: Index([0, 1, 2, 3, 4, 7, 9], dtype='int64')

MoritzLaurer-DeBERTa-v3-base-mnli-fever-anli: Index([0, 2, 3, 4, 6, 7], dtype='int64')

My partner and I are traveling to the Netherlands and Germany to spend Christmas with our family. We are in our late twenties and will start our journey with a two-hour flight to the Netherlands. From there, we will take a 5.5-hour train ride to n

For all models

In [55]:
# Which trips are difficult for all models
common = set.intersection(*(set(v) for v in difficult_trips_dict.values()))
for i in common:
    print(trip_descriptions[i], "\n")

My partner and I are traveling to the Netherlands and Germany to spend Christmas with our family. We are in our late twenties and will start our journey with a two-hour flight to the Netherlands. From there, we will take a 5.5-hour train ride to northern Germany. 

We will go to Sweden in the winter, to go for a yoga and sauna/wellness retreat. I prefer lightweight packing and also want clothes to go for fancy dinners and maybe on a winter hike. We stay in hotels. 



**Identify superclasses that are difficult to predict**

Per model

In [66]:
def get_difficult_superclasses(model_result, cut_off = 0.5):
    # model_result is a dict with dict_keys(['model', 'predictions', 
    # 'performance', 'perf_summary', 'perf_superclass', 'elapsed_time'])
    df = model_result["perf_superclass"]
    # find superclass whose accuracy is below cut_off
    diff_spc = list(df[df['accuracy'] < cut_off]["superclass"])
    return(diff_spc)

# dictionary of superclasses that have accuracy below cut_off default
difficult_superclass_dict = {}
for model, data in all_results.items():
    difficult_superclass_dict[data["model"]] = get_difficult_superclasses(data)

for key, value in difficult_superclass_dict.items():
    print(f"{key}: {value}\n")

           superclass  accuracy
0       activity_type       0.7
1          activities       0.0
2   climate_or_season       0.4
3    style_or_comfort       0.4
4          dress_code       0.7
5       accommodation       0.6
6      transportation       0.6
7  special_conditions       0.0
8    trip_length_days       0.6
           superclass  accuracy
0       activity_type       0.7
1          activities       0.1
2   climate_or_season       0.5
3    style_or_comfort       0.3
4          dress_code       0.1
5       accommodation       0.3
6      transportation       0.3
7  special_conditions       0.0
8    trip_length_days       0.6
           superclass  accuracy
0       activity_type       0.6
1          activities       0.1
2   climate_or_season       0.5
3    style_or_comfort       0.4
4          dress_code       0.7
5       accommodation       0.7
6      transportation       0.4
7  special_conditions       0.3
8    trip_length_days       0.6
           superclass  accuracy
0       

For all models

In [64]:
# Which trips are difficult for all models
common = set.intersection(*(set(v) for v in difficult_superclass_dict.values()))
print(common)

{'activities', 'special_conditions'}


In [68]:
print(all_results["cross-encoder-nli-deberta-v3-base"])


{'model': 'cross-encoder-nli-deberta-v3-base', 'predictions': [           superclass                                    pred_class  \
0       activity_type                micro-adventure / weekend trip   
1          activities                                            []   
2   climate_or_season            variable weather / spring / autumn   
3    style_or_comfort                                    minimalist   
4          dress_code                                        casual   
5       accommodation                                        indoor   
6      transportation                                no own vehicle   
7  special_conditions  self-supported (bring your own food/cooking)   
8    trip_length_days                                       7+ days   

                                         true_class  same_value  
0                                    beach vacation       False  
1  [swimming, going to the beach, relaxing, hiking]       False  
2                         wa

**Comparing models**

In [None]:
# Make table of 'perf_summary' for all models inlcude time elapsed
# Make ranking from that table for each category


**Use gradio for user input**

In [66]:
# use model with gradio
from transformers import pipeline
import gradio as gr

# make a function for what I am doing
def classify(text):
    df = pd.DataFrame(columns=['Superclass', 'class'])
    for i, key in enumerate(keys_list):
        # Run the classification (ca 30 seconds classifying)
        if key == 'activities':
            result = classifier(text, candidate_labels[key], multi_label=True)
            classes = [result['labels'][i] for i in indices]
        else:
            result = classifier(text, candidate_labels[key])
            classes = result["labels"][0]
        print(i)
        df.loc[i] = [key, classes]

    return df

demo = gr.Interface(
    fn=classify,
    inputs="text",
    outputs="dataframe",
    title="Zero-Shot Classification",
    description="Enter a text describing your trip",
)

# Launch the Gradio app
if __name__ == "__main__":
    demo.launch(share=True)

Running on local URL:  http://127.0.0.1:7861
Running on public URL: https://aa06d5d85ffadaa92b.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8


Use model with gradio

In [4]:
# Define the Gradio interface
def classify(text):
    return classifier(text, class_labels)

demo = gr.Interface(
    fn=classify,
    inputs="text",
    outputs="json",
    title="Zero-Shot Classification",
    description="Enter a text describing your trip",
)

# Launch the Gradio app
if __name__ == "__main__":
    demo.launch(share=True)

Running on local URL:  http://127.0.0.1:7861
Running on public URL: https://0f70ba5369d721cf8f.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)
