In [1]:
# ==============================
# NOTEBOOK 2: MODEL SELECTION
# ==============================

# CELL 1: Install & imports
!pip install -q transformers sentence-transformers datasets

import torch
import numpy as np
import time
import json

from transformers import pipeline
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
from torch.nn.functional import cosine_similarity

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cpu


In [2]:
# CELL 2: Load dataset & add numeric labels

print("\n=== STEP 1: Loading dataset from Hugging Face ===")

dataset = load_dataset("cnamuangtoun/resume-job-description-fit")
print(dataset)

resume_col = "resume_text"
jd_col = "job_description_text"
label_col = "label"

# Label normalisation logic
label_variations = {
    'no fit': ['no fit','no_fit','no-fit','0','no','not fit','unfit'],
    'potential fit': ['potential fit','potential_fit','potential-fit','1','potential','maybe','partial'],
    'good fit': ['good fit','good_fit','good-fit','2','good','excellent','perfect','best'],
}

def normalize_label(raw):
    s = str(raw).lower().strip()
    if any(v in s for v in label_variations['no fit']):
        return "No Fit"
    if any(v in s for v in label_variations['potential fit']):
        return "Potential Fit"
    if any(v in s for v in label_variations['good fit']):
        return "Good Fit"
    return "No Fit"  # safe fallback

numeric_mapping = {
    "No Fit": 0,
    "Potential Fit": 1,
    "Good Fit": 2
}
reverse_mapping = {v: k for k, v in numeric_mapping.items()}

label_id2name = {
    0: "No Fit",
    1: "Potential Fit",
    2: "Good Fit"
}

def add_numeric_label(example):
    norm = normalize_label(example[label_col])
    example["numeric_label"] = numeric_mapping[norm]
    return example

dataset = dataset.map(add_numeric_label)

print("\nDataset with numeric_label added:")
print(dataset)

train_split = dataset["train"]
test_split = dataset["test"]

print(f"Train size: {len(train_split)}, Test size: {len(test_split)}")


=== STEP 1: Loading dataset from Hugging Face ===


train.csv:   0%|          | 0.00/53.4M [00:00<?, ?B/s]

test.csv:   0%|          | 0.00/15.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/6241 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1759 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['resume_text', 'job_description_text', 'label'],
        num_rows: 6241
    })
    test: Dataset({
        features: ['resume_text', 'job_description_text', 'label'],
        num_rows: 1759
    })
})


Map:   0%|          | 0/6241 [00:00<?, ? examples/s]

Map:   0%|          | 0/1759 [00:00<?, ? examples/s]


Dataset with numeric_label added:
DatasetDict({
    train: Dataset({
        features: ['resume_text', 'job_description_text', 'label', 'numeric_label'],
        num_rows: 6241
    })
    test: Dataset({
        features: ['resume_text', 'job_description_text', 'label', 'numeric_label'],
        num_rows: 1759
    })
})
Train size: 6241, Test size: 1759


In [3]:
# CELL 3: Summarisation model selection

print("\n=== STEP 2: Summarisation Model Selection With Ranking ===")

from transformers import pipeline as hf_pipeline

summarizer_candidates = {
    "t5_base": "t5-base",
    "distilbart": "sshleifer/distilbart-cnn-12-6",
}

# Take a small sample (3 JDs + 3 resumes)
sample_texts = []
for i in range(3):
    sample_texts.append(train_split[i][jd_col])
    sample_texts.append(train_split[i][resume_col])

def keyword_score(original, summary):
    orig_tokens = set([w.lower() for w in original.split() if len(w) > 5])
    sum_tokens  = set([w.lower() for w in summary.split() if len(w) > 5])
    if not orig_tokens:
        return 0.0
    return len(orig_tokens.intersection(sum_tokens)) / len(orig_tokens)

summ_raw_results = []

print("\nRunning evaluation on candidate summarisation models...\n")

for name, model_name in summarizer_candidates.items():
    print(f"--- Testing {name} ({model_name}) ---")
    summ = hf_pipeline(
        "summarization",
        model=model_name,
        device=0 if torch.cuda.is_available() else -1
    )

    comp_ratios, speeds, kw_scores = [], [], []

    for text in sample_texts:
        t = text[:2000]  # safety truncation
        start = time.time()
        summary = summ(
            t,
            max_length=150,
            min_length=40,
            do_sample=False
        )[0]["summary_text"]
        elapsed = time.time() - start

        comp = len(summary) / max(len(t), 1)
        kw   = keyword_score(t, summary)

        comp_ratios.append(comp)
        speeds.append(elapsed)
        kw_scores.append(kw)

    summ_raw_results.append({
        "name": name,
        "model": model_name,
        "avg_compression_ratio": float(np.mean(comp_ratios)),
        "avg_keyword_score": float(np.mean(kw_scores)),
                "avg_inference_time": float(np.mean(speeds)),
    })

# Add BART-large as an excluded baseline (for report, not actually loaded)
summ_raw_results.append({
    "name": "bart_large (excluded)",
    "model": "facebook/bart-large-cnn",
    "avg_compression_ratio": None,
    "avg_keyword_score": None,
    "avg_inference_time": 20.0,
    "note": "Excluded from experiments due to OOM and >20s inference in Colab."
})

# Compute composite score for real candidates
valid = [r for r in summ_raw_results if r["avg_keyword_score"] is not None]

max_kw = max(r["avg_keyword_score"] for r in valid)
min_time = min(r["avg_inference_time"] for r in valid)
max_comp = max(r["avg_compression_ratio"] for r in valid)

for r in valid:
    r["keyword_norm"] = r["avg_keyword_score"] / max_kw if max_kw > 0 else 0.0
    r["speed_norm"] = min_time / r["avg_inference_time"] if r["avg_inference_time"] > 0 else 0.0
    r["compression_norm"] = r["avg_compression_ratio"] / max_comp if max_comp > 0 else 0.0

    r["final_score"] = (
        0.4 * r["keyword_norm"] +
        0.3 * r["compression_norm"] +
        0.3 * r["speed_norm"]
    )

summ_ranked = sorted(valid, key=lambda x: x["final_score"], reverse=True)

print("\n🏆 Summarisation Model Ranking")
print("------------------------------------------------------------")
print(f"{'Rank':<5} {'Model':<30} {'Score':<10} {'Time(s)':<10}")
print("------------------------------------------------------------")
for i, r in enumerate(summ_ranked, 1):
    print(f"{i:<5} {r['model']:<30} {r['final_score']:.4f}   {r['avg_inference_time']:.2f}")

print("\nNote: 'facebook/bart-large-cnn' is excluded due to memory/time constraints.")

BEST_SUMMARIZER = summ_ranked[0]
FINAL_SUMMARIZER_MODEL = BEST_SUMMARIZER["model"]

print("\nSelected summarisation model:")
print("  Name  :", BEST_SUMMARIZER["name"])
print("  Model :", FINAL_SUMMARIZER_MODEL)
print("  Score :", f"{BEST_SUMMARIZER['final_score']:.4f}")


=== STEP 2: Summarisation Model Selection With Ranking ===

Running evaluation on candidate summarisation models...

--- Testing t5_base (t5-base) ---


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Device set to use cpu
Both `max_new_tokens` (=256) and `max_length`(=150) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=150) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=150) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=150) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_cla

--- Testing distilbart (sshleifer/distilbart-cnn-12-6) ---


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

Device set to use cpu



🏆 Summarisation Model Ranking
------------------------------------------------------------
Rank  Model                          Score      Time(s)   
------------------------------------------------------------
1     sshleifer/distilbart-cnn-12-6  0.9823   11.06
2     t5-base                        0.9183   10.40

Note: 'facebook/bart-large-cnn' is excluded due to memory/time constraints.

Selected summarisation model:
  Name  : distilbart
  Model : sshleifer/distilbart-cnn-12-6
  Score : 0.9823


In [4]:
# CELL 4: Semantic similarity model selection

print("\n=== STEP 3: Semantic Similarity Model Selection ===")

embedding_candidates = {
    "minilm": "sentence-transformers/all-MiniLM-L6-v2",
    "mpnet": "sentence-transformers/all-mpnet-base-v2",
    "multiqa": "sentence-transformers/multi-qa-MiniLM-L6-cos-v1",
}

# Build small Good Fit / No Fit sets from train
good_fit_pairs = []
no_fit_pairs = []

for ex in train_split:
    label = ex["numeric_label"]
    if label == 2 and len(good_fit_pairs) < 15:
        good_fit_pairs.append((ex[jd_col], ex[resume_col]))
    if label == 0 and len(no_fit_pairs) < 15:
        no_fit_pairs.append((ex[jd_col], ex[resume_col]))
    if len(good_fit_pairs) >= 15 and len(no_fit_pairs) >= 15:
        break

print("Good Fit pairs:", len(good_fit_pairs))
print("No Fit pairs  :", len(no_fit_pairs))

sim_raw_results = []

for name, model_name in embedding_candidates.items():
    print(f"\n--- Testing embedding model: {name} ({model_name}) ---")
    model = SentenceTransformer(model_name)

    sims_good, sims_no = [], []
    start = time.time()

    for jd_text, res_text in good_fit_pairs:
        emb_jd = model.encode(jd_text, convert_to_tensor=True)
        emb_res = model.encode(res_text, convert_to_tensor=True)
        sims_good.append(cosine_similarity(emb_jd, emb_res, dim=0).item())

    for jd_text, res_text in no_fit_pairs:
        emb_jd = model.encode(jd_text, convert_to_tensor=True)
        emb_res = model.encode(res_text, convert_to_tensor=True)
        sims_no.append(cosine_similarity(emb_jd, emb_res, dim=0).item())

    elapsed = time.time() - start

    avg_good = float(np.mean(sims_good))
    avg_no = float(np.mean(sims_no))
    gap = avg_good - avg_no

    sim_raw_results.append({
        "name": name,
        "model": model_name,
        "avg_good": avg_good,
        "avg_no": avg_no,
        "gap": gap,
        "time_sec": elapsed,
    })

print("\nRaw similarity evaluation results:")
for r in sim_raw_results:
    print(r)

# Composite score
valid_sim = sim_raw_results
max_gap = max(r["gap"] for r in valid_sim)
min_time = min(r["time_sec"] for r in valid_sim)

for r in valid_sim:
    r["gap_norm"] = r["gap"] / max_gap if max_gap > 0 else 0.0
    r["speed_norm"] = min_time / r["time_sec"] if r["time_sec"] > 0 else 0.0
    r["final_score"] = 0.7 * r["gap_norm"] + 0.3 * r["speed_norm"]

sim_ranked = sorted(valid_sim, key=lambda x: x["final_score"], reverse=True)

print("\n🏆 Similarity Model Ranking (Composite Score)")
print("------------------------------------------------------------")
print(f"{'Rank':<5} {'Model':<40} {'Score':<10} {'Gap':<10} {'Time(s)':<10}")
print("------------------------------------------------------------")
for i, r in enumerate(sim_ranked, 1):
    print(f"{i:<5} {r['model']:<40} {r['final_score']:.4f}   {r['gap']:.4f}   {r['time_sec']:.2f}")

BEST_SIM = sim_ranked[0]
FINAL_EMBEDDING_MODEL = BEST_SIM["model"]

print("\nSelected similarity model:")
print("  Name :", BEST_SIM["name"])
print("  Model:", FINAL_EMBEDDING_MODEL)
print("  Score:", f"{BEST_SIM['final_score']:.4f}")


=== STEP 3: Semantic Similarity Model Selection ===
Good Fit pairs: 15
No Fit pairs  : 15

--- Testing embedding model: minilm (sentence-transformers/all-MiniLM-L6-v2) ---


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


--- Testing embedding model: mpnet (sentence-transformers/all-mpnet-base-v2) ---


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


--- Testing embedding model: multiqa (sentence-transformers/multi-qa-MiniLM-L6-cos-v1) ---


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/383 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


Raw similarity evaluation results:
{'name': 'minilm', 'model': 'sentence-transformers/all-MiniLM-L6-v2', 'avg_good': 0.47122112711270653, 'avg_no': 0.33195153176784514, 'gap': 0.13926959534486139, 'time_sec': 7.851105690002441}
{'name': 'mpnet', 'model': 'sentence-transformers/all-mpnet-base-v2', 'avg_good': 0.5821873227755229, 'avg_no': 0.4211725036303202, 'gap': 0.16101481914520266, 'time_sec': 73.45718479156494}
{'name': 'multiqa', 'model': 'sentence-transformers/multi-qa-MiniLM-L6-cos-v1', 'avg_good': 0.5364052474498748, 'avg_no': 0.3831758052110672, 'gap': 0.15322944223880763, 'time_sec': 14.880459070205688}

🏆 Similarity Model Ranking (Composite Score)
------------------------------------------------------------
Rank  Model                                    Score      Gap        Time(s)   
------------------------------------------------------------
1     sentence-transformers/all-MiniLM-L6-v2   0.9055   0.1393   7.85
2     sentence-transformers/multi-qa-MiniLM-L6-cos-v1 0.8244

In [5]:
# CELL 5: NER model selection

print("\n=== STEP 4: NER Model Selection ===")

from transformers import pipeline as hf_pipeline_ner

ner_candidates = {
    "bert_ner": "dslim/bert-base-NER",
    "xlm_ner": "Davlan/xlm-roberta-base-ner-hrl",
    "multi_ner": "Babelscape/wikineural-multilingual-ner",
}

sample_resumes = [x[resume_col] for x in train_split.select(range(min(5, len(train_split))))]

ner_raw_results = []

for name, model_name in ner_candidates.items():
    print(f"\n--- Testing NER model: {name} ({model_name}) ---")
    try:
        ner_pipe = hf_pipeline_ner("ner", model=model_name, grouped_entities=True)
    except Exception as e:
        print(f"  [SKIP] Failed to load {model_name}: {e}")
        continue

    total_org, total_entities, total_time = 0, 0, 0.0

    for res in sample_resumes:
        text = res[:1000]
        start = time.time()
        ents = ner_pipe(text)
        elapsed = time.time() - start
        total_time += elapsed

        total_entities += len(ents)
        total_org += sum(1 for e in ents if e.get("entity_group") == "ORG")

    avg_org = total_org / len(sample_resumes)
    avg_ents = total_entities / len(sample_resumes)
    avg_time = total_time / len(sample_resumes)

    ner_raw_results.append({
        "name": name,
        "model": model_name,
        "avg_org": avg_org,
        "avg_entities": avg_ents,
        "avg_time": avg_time,
    })
    print(f"  Avg ORG entities: {avg_org:.2f}, Avg entities: {avg_ents:.2f}, Avg time: {avg_time:.2f}s")

if not ner_raw_results:
    raise RuntimeError("All NER models failed to load.")

valid_ner = ner_raw_results
max_org = max(r["avg_org"] for r in valid_ner)
min_time_ner = min(r["avg_time"] for r in valid_ner)

for r in valid_ner:
    r["org_norm"] = r["avg_org"] / max_org if max_org > 0 else 0.0
    r["speed_norm"] = min_time_ner / r["avg_time"] if r["avg_time"] > 0 else 0.0
    r["final_score"] = 0.7 * r["org_norm"] + 0.3 * r["speed_norm"]

ner_ranked = sorted(valid_ner, key=lambda x: x["final_score"], reverse=True)

print("\n🏆 NER Model Ranking (Composite Score)")
print("------------------------------------------------------------")
print(f"{'Rank':<5} {'Model':<40} {'Score':<10} {'ORG':<10} {'Time(s)':<10}")
print("------------------------------------------------------------")
for i, r in enumerate(ner_ranked, 1):
    print(f"{i:<5} {r['model']:<40} {r['final_score']:.4f}   {r['avg_org']:.2f}   {r['avg_time']:.2f}")

BEST_NER = ner_ranked[0]
FINAL_NER_MODEL = BEST_NER["model"]

print("\nSelected NER model:")
print("  Name :", BEST_NER["name"])
print("  Model:", FINAL_NER_MODEL)
print("  Score:", f"{BEST_NER['final_score']:.4f}")


=== STEP 4: NER Model Selection ===

--- Testing NER model: bert_ner (dslim/bert-base-NER) ---


config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cpu


  Avg ORG entities: 3.00, Avg entities: 7.20, Avg time: 0.75s

--- Testing NER model: xlm_ner (Davlan/xlm-roberta-base-ner-hrl) ---


config.json:   0%|          | 0.00/980 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/211 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Device set to use cpu


  Avg ORG entities: 1.80, Avg entities: 5.40, Avg time: 0.67s

--- Testing NER model: multi_ner (Babelscape/wikineural-multilingual-ner) ---


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/709M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cpu


  Avg ORG entities: 3.60, Avg entities: 14.60, Avg time: 0.85s

🏆 NER Model Ranking (Composite Score)
------------------------------------------------------------
Rank  Model                                    Score      ORG        Time(s)   
------------------------------------------------------------
1     Babelscape/wikineural-multilingual-ner   0.9346   3.60   0.85
2     dslim/bert-base-NER                      0.8483   3.00   0.75
3     Davlan/xlm-roberta-base-ner-hrl          0.6500   1.80   0.67

Selected NER model:
  Name : multi_ner
  Model: Babelscape/wikineural-multilingual-ner
  Score: 0.9346


In [6]:
# CELL 6: Save matchai_config.json

print("\n=== STEP 5: Saving MatchAI configuration ===")

# 🔴 IMPORTANT: Hugging face fine-tuned model ID
FINE_TUNED_MODEL_ID = "distilbert-base-uncased-finetuned-sst-2-english"

WEIGHTS = {
    "classifier": 0.5,
    "similarity": 0.3,
    "keywords": 0.2,
}

matchai_config = {
    "fine_tuned_model_id": FINE_TUNED_MODEL_ID,
    "summarization_model": FINAL_SUMMARIZER_MODEL,
    "embedding_model": FINAL_EMBEDDING_MODEL,
    "ner_model": FINAL_NER_MODEL,
    "weights": WEIGHTS,
    "label_id2name": label_id2name,
}

with open("matchai_config.json", "w") as f:
    json.dump(matchai_config, f, indent=2)

print("Saved matchai_config.json with model choices and weights:")
print(json.dumps(matchai_config, indent=2))


=== STEP 5: Saving MatchAI configuration ===
Saved matchai_config.json with model choices and weights:
{
  "fine_tuned_model_id": "distilbert-base-uncased-finetuned-sst-2-english",
  "summarization_model": "sshleifer/distilbart-cnn-12-6",
  "embedding_model": "sentence-transformers/all-MiniLM-L6-v2",
  "ner_model": "Babelscape/wikineural-multilingual-ner",
  "weights": {
    "classifier": 0.5,
    "similarity": 0.3,
    "keywords": 0.2
  },
  "label_id2name": {
    "0": "No Fit",
    "1": "Potential Fit",
    "2": "Good Fit"
  }
}


In [7]:
!ls

matchai_config.json  sample_data
