In [None]:
!pip uninstall -y huggingface_hub transformers
!pip install -U setfit==1.1.0 transformers==4.44.0

----------

# Intent Detection | Text Classification

----------


In [1]:
import os
import random
import shutil

import pandas as pd
from datasets import DatasetDict, Dataset
from setfit import SetFitModel, Trainer, TrainingArguments

# Local directory the fine-tuned model is written to; the zip archive created
# later uses the same path plus a ".zip" suffix.
# NOTE(review): "/content" looks like a Colab-specific absolute path — adjust when running elsewhere.
save_to = "/content/football_sft_model"

## Dataset

In [None]:
# Example utterances grouped by intent.
# Keys are the intent names (they become the classifier labels, in insertion
# order); values are the example sentences for that intent. Most sentences
# come in English/French pairs.
full_data = {
    # Greetings.
    'greet-hi': [
                            'Hi',
                            'Hoooo',
                            'Hello buddy',
                            'Salut',
                            'Hey',
                            'Good morning',
                            'Bonsoir',
                            'Salut mon ami'
                        ],
    # Questions about the assistant itself / requests for help.
    'greet-who_are_you': [
                        'qui êtes-vous',
                        'who are you',
                        'how can you help me',
                        "comment pouvez vous m'aider",
                        'why do I need you',
                        "pourquoi j'ai besoin de toi",
                        'help please',
                        "Aidez-moi, s'il vous plaît",
                        'I can not understand you',
                        'je ne te comprends pas'
                    ],
    # Farewells and explicit stop/exit commands.
    'greet-good_bye': ['good bye',
                    'au revoir',
                    'See you later',
                    'A plus tard',
                    'stop',
                    'arrêter',
                    'exit',
                    'sortir'],
    # Upcoming fixtures of a single team.
    'matches-team_next_match': [
                        'when will be the matches of Liverpool',
                        'quand auront lieu les matchs de Liverpool',
                        'when will be the matches of Aston Villa',
                        'quand auront lieu les matchs de Aston Villa',
                        'man city matches',
                        'Arsenal matches',
                        'Arsenal matchs',
                        'man city matchs',
                        'what the calendar of chelsea matches',
                        'à quoi correspond le calendrier de chelsea',
                        'Will Liverpool play any matches soon',
                        'Est-ce que Liverpool jouera bientôt des matchs',
                        'Will Liverpool play any matches next days',
                        'Est-ce que Liverpool jouera des matchs les jours prochains',
                        'upcoming games for Manchester United',
                        'prochains matchs de Tottenham Hotspur',
                        'Barcelona match schedule',
                        'Real Madrid fixtures',
                        'when are the matches of Bayern Munich',
                        'calendrier des matchs pour Paris Saint-Germain',
                        'any upcoming fixtures for Juventus',
                        'próximos partidos de Atletico Madrid',  # the only Spanish example in the set
                        'which teams will AC Milan face next',
                        'quels sont les prochains matchs de Borussia Dortmund'
                    ],
    # Kick-off time of a specific match between two teams.
    'matches-match_time': [
                            'when liverpool will play with man city',
                            'quand liverpool jouera avec man city',
                            'crystal place vs chelsea',  # NOTE(review): probably "crystal palace" — confirm whether the typo is intentional
                            'Bournemouth vs Brentford',
                            'Bournemouth vs Brentford',  # NOTE(review): exact duplicate of the previous entry; it can land in both train and dev
                            'Norwich City vs Newcastle United',
                            'Southampton vs West Ham United',
                            'tell me the time of Everton vs Leicester City',
                            'tell me the time of Wolverhampton vs Chelsea',
                            'will Manchester United play with chelsea',
                            'will Brentford play with Bournemouth',
                            'Dis-moi le temps de Everton contre Leicester City',
                            'dis-moi le temps de Wolverhampton vs Chelsea',
                            'est-ce que Manchester United jouera avec chelsea',
                            'est-ce que Brentford jouera avec Bournemouth'
                        ],
    # Score / winner of a match.
    'matches-match_result': [
                            'what is the score of Brentford match',
                            'what is the score of Wolverhampton match',
                            'score of Everton match',
                            'who won in Norwich City vs West Ham United match',
                            'who won in Bournemouth vs West Ham United match',
                            'who won in Liverpool vs Newcastle United match',
                            'did Liverpool defeted man city',  # NOTE(review): "defeted" looks like a typo for "defeated" — confirm
                            'Liverpool and West Ham result',
                            'Chelsea and Norwich final result',
                            'quel est le score du match Brentford',
                            'quel est le score du match Wolverhampton',
                            'score du match Everton',
                            # NOTE(review): both teams are "Norwich City" here, while the English
                            # counterpart above reads "Norwich City vs West Ham United" — confirm.
                            'qui a gagné le match Norwich City contre Norwich City',
                            'qui a gagné le match Bournemouth contre West Ham United',
                            'qui a gagné le match Liverpool contre Newcastle United',
                            'est-ce que Liverpool a vaincu man city',
                            'Résultat de Liverpool et West Ham',
                            'Résultat final de Chelsea et Norwich'
                        ]
}

In [3]:
## ===============================
## format the dataset
## ===============================
#
# Builds the train/dev splits from `full_data`:
#   * every intent contributes its last `dev_per_label` sentences (after a
#     seeded shuffle) to the dev split and the rest to the train split;
#   * each record carries a running `idx`, the `sentence`, and the integer
#     `label` (the position of the intent name in `all_labels`).

dev_per_label = 3   # sentences held out per intent for evaluation
split_seed = 42     # fixed seed so the split is reproducible

train_data, dev_data = [], []
all_labels = list(full_data.keys())
idx = 0

for label_id, category in enumerate(all_labels):

    # Shuffle a *copy*: shuffling the list stored in `full_data` in place
    # would make this cell non-idempotent — a second run would re-shuffle the
    # already shuffled list and silently produce a different train/dev split.
    sentences = list(full_data[category])
    random.Random(split_seed).shuffle(sentences)

    train_sentences = sentences[:-dev_per_label]
    dev_sentences = sentences[-dev_per_label:]

    # Train records are numbered before dev records within each intent.
    for split_rows, split_sentences in ((train_data, train_sentences),
                                        (dev_data, dev_sentences)):
        for sentence in split_sentences:
            split_rows.append({
                'idx': idx,
                'sentence': sentence,
                'label': label_id
            })

            idx += 1

# every sentence must end up in exactly one split
assert len(train_data) + len(dev_data) == sum(len(s) for s in full_data.values())

# shuffle the data
random.Random(split_seed).shuffle(train_data)
random.Random(split_seed).shuffle(dev_data)

# create the full_dataset dictionary
full_dataset = DatasetDict({
    'train': Dataset.from_pandas(pd.DataFrame(train_data)),
    'dev': Dataset.from_pandas(pd.DataFrame(dev_data))
})

In [4]:
# Display the DatasetDict summary: split names, column names and row counts.
full_dataset

DatasetDict({
    train: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 65
    })
    dev: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 18
    })
})

In [5]:
# Peek at one dev record to sanity-check the layout (idx / sentence / label).
full_dataset["dev"][6]

{'idx': 81, 'sentence': 'what is the score of Brentford match', 'label': 5}

In [7]:
# Show the intent names; the integer `label` of a record is the position of
# its intent name in this list.
print("Labels")
all_labels

Labels


['greet-hi',
 'greet-who_are_you',
 'greet-good_bye',
 'matches-team_next_match',
 'matches-match_time',
 'matches-match_result']

## Train the Model

In [None]:
## ===============================
## Load a SetFit model from Hub
## ===============================

# Start from a multilingual sentence-transformers checkpoint, since the
# training sentences mix English and French.
# `labels` hands the intent names to the model — presumably so that
# predictions are reported as names rather than integer ids (TODO confirm).
base_model = SetFitModel.from_pretrained(
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    labels = all_labels,
)

In [None]:
# Training configuration: evaluate and checkpoint once per epoch so that the
# best checkpoint can be restored when training finishes.
args = TrainingArguments(
    batch_size=4,
    num_epochs=4,
    logging_steps=10,
    # `eval_strategy` is the current argument name in the pinned setfit
    # release; the older `evaluation_strategy` spelling is deprecated.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# The datasets use a "sentence" column, so it is mapped onto the "text"
# column name passed to the trainer; "label" is passed through unchanged.
# NOTE(review): the dev split is used both for checkpoint selection here and
# for the final reported metric — with 3 sentences per intent the score is
# only a smoke test, not an unbiased estimate.
trainer = Trainer(
    model=base_model,
    args=args,
    train_dataset=full_dataset["train"],
    eval_dataset=full_dataset["dev"],
    metric="accuracy",
    column_mapping={"sentence": "text", "label": "label"}
)

## ===============================
## Train and Evaluate
## ===============================
trainer.train()

In [None]:
# Score the trained model on the dev split; the result is a dict keyed by the
# metric name configured on the trainer ("accuracy").
metrics = trainer.evaluate(full_dataset["dev"])
print(metrics)

Applying column mapping to the evaluation dataset
***** Running evaluation *****


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

{'accuracy': 1.0}


In [None]:
## ===============================
## save to local
## ===============================

# Write the trained model into the `save_to` directory.
trainer.model.save_pretrained(save_to)

## ===============================
## zip the save_to directory
## ===============================

# First argument is the archive base name (".zip" is appended), third is the
# directory whose contents are archived — so this produces `<save_to>.zip`
# next to the model directory.
shutil.make_archive(save_to, 'zip', save_to)

'/content/football_sft_model.zip'

In [None]:
# Upload the trained model to a private Hugging Face Hub repository.
# The access token is read from the HF_TOKEN environment variable instead of
# being pasted into the notebook, so it cannot leak through a saved or shared
# copy. When the variable is unset, `None` is passed and the library falls
# back to locally stored credentials (e.g. from `huggingface-cli login`).
trainer.model.push_to_hub(repo_id="botpress_football_sft_model",
                          private=True,
                          token=os.environ.get("HF_TOKEN"))

unigram.json:   0%|          | 0.00/14.8M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

model_head.pkl:   0%|          | 0.00/19.4k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/bakrianoo/botpress_football_sft_model/commit/a46bd28986737c930c3f6dc7b5d7f0aad691a4da', commit_message='Push model using huggingface_hub.', commit_description='', oid='a46bd28986737c930c3f6dc7b5d7f0aad691a4da', pr_url=None, pr_revision=None, pr_num=None)

## Model Inference

In [None]:
# Load the fine-tuned model back from the (private) Hub repository.
# The access token comes from the HF_TOKEN environment variable rather than a
# string literal in the notebook; when it is unset, `None` is passed and the
# library falls back to locally stored credentials.
sft_model = SetFitModel.from_pretrained(
    "bakrianoo/botpress_football_sft_model",
    token=os.environ.get("HF_TOKEN"),
)

In [None]:
# Classify a single unseen sentence and print the predicted intent.
preds = sft_model.predict("Hi everybody")

print(preds)

greet-hi


In [None]:
# Per-intent scores for the same sentence, converted to a plain Python list
# so it prints as ordinary floats (one value per entry of the label list).
preds = sft_model.predict_proba("Hi everybody").tolist()
print(preds)

[0.9447020455963554, 0.016481113318798188, 0.009387945259545815, 0.009768482763498613, 0.008602172576037749, 0.011058240485764374]


----------

# Named Entity Recognition | Entity Detection

----------


In [None]:
!pip install gliner==0.2.7

In [None]:
from gliner import GLiNER
import pprint

# multilingual checkpoint — used here because the samples below are not all English
ner_model = GLiNER.from_pretrained("urchade/gliner_multi-v2.1")

# Alternative checkpoints (uncomment one to swap the model):
# English only - small
# ner_model = GLiNER.from_pretrained("urchade/gliner_small-v2.1")

# English only - large
# ner_model = GLiNER.from_pretrained("urchade/gliner_large-v2.1")

In [None]:
# Two test sentences (English and Arabic) asking who won a match.
samples = [
    "who won in Norwich City vs West Ham United match",
     "من الذي فاز في مباراة الاهلي و الزمالك ؟",
]

# Entity types to extract; the label is passed to the model as free text.
labels = ["team_name"]

# Run extraction per sentence and pretty-print the entity dicts
# (text, label, score and character start/end offsets).
for text in samples:
    entities = ner_model.predict_entities(text, labels)
    pprint.pprint(entities)
    print()


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[{'end': 23,
  'label': 'team_name',
  'score': 0.8178487420082092,
  'start': 11,
  'text': 'Norwich City'},
 {'end': 42,
  'label': 'team_name',
  'score': 0.7780051827430725,
  'start': 27,
  'text': 'West Ham United'}]

[{'end': 28,
  'label': 'team_name',
  'score': 0.597097635269165,
  'start': 22,
  'text': 'الاهلي'},
 {'end': 38,
  'label': 'team_name',
  'score': 0.5380057692527771,
  'start': 31,
  'text': 'الزمالك'}]



In [None]:
# Entity types requested for this example: teams, people and events.
labels = ["team_name", "person_name", "event"]

# A single news sentence; the surrounding blank lines of the literal are
# stripped so the reported character offsets start at the first word.
text = """

Germany coach Julian Nagelsmann made two changes to his lineup with Jonathan Tah and Emre Can coming in to the team to face an unchanged Spain in their Euro 2024 quarter-final clash on Friday.

""".strip()

# Extract the entities and pretty-print the resulting list of dicts.
entities = ner_model.predict_entities(text, labels)
pprint.pprint(entities)

[{'end': 31,
  'label': 'person_name',
  'score': 0.8375915884971619,
  'start': 14,
  'text': 'Julian Nagelsmann'},
 {'end': 80,
  'label': 'person_name',
  'score': 0.9372616410255432,
  'start': 68,
  'text': 'Jonathan Tah'},
 {'end': 93,
  'label': 'person_name',
  'score': 0.9378184676170349,
  'start': 85,
  'text': 'Emre Can'},
 {'end': 161,
  'label': 'event',
  'score': 0.8798717856407166,
  'start': 152,
  'text': 'Euro 2024'}]
