## Dependencies

In [None]:
!pip install transformers accelerate sentence-transformers einops bitsandbytes

In [3]:
import os
os.environ['HF_TOKEN'] = "HUGGINGFACEHUB_API_TOKEN"
os.environ['HUGGINGFACEHUB_API_TOKEN'] = "HUGGINGFACEHUB_API_TOKEN"

In [4]:
model_name = "sentence-transformers/distiluse-base-multilingual-cased-v2"


In [None]:
!pip install datasets
from datasets import Dataset
import torch
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, losses
from datasets import Dataset  # <-- Add this import
import json

with open("dataset.json", "r", encoding="utf-8") as f:
    data = json.load(f)

faq_items = data["dataset1"]  # a list of {id, category, question, answer}
questions = [item["question"] for item in faq_items]
answers = [item["answer"] for item in faq_items]


In [None]:
!pip install torch sentence-transformers


## Training

In [7]:
from sentence_transformers import InputExample

train_samples = []
for item in faq_items:
    q = item["question"]
    a = item["answer"]
    train_samples.append(InputExample(texts=[q, a]))



In [8]:
from sentence_transformers import SentenceTransformer, losses
from torch.utils.data import DataLoader

model_name = "sentence-transformers/distiluse-base-multilingual-cased-v2"
model = SentenceTransformer(model_name)


In [9]:
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=8)


In [10]:
train_loss = losses.MultipleNegativesRankingLoss(model)


In [None]:
!pip install --upgrade sentence-transformers datasets


In [12]:
num_epochs = 6

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=num_epochs,
    warmup_steps=100,  # or suitable number
    show_progress_bar=True
)


                                                                     

Step,Training Loss


In [13]:
model.save("fine_tuned_faq_model")


In [14]:
import torch
from sentence_transformers import SentenceTransformer

# Load your fine-tuned model
model = SentenceTransformer("fine_tuned_faq_model")

# Build question list & answer list
questions = [item["question"] for item in faq_items]
answers = [item["answer"] for item in faq_items]

# Compute embeddings for all questions
question_embeddings = model.encode(questions, convert_to_tensor=True)


In [15]:
def get_best_answer(user_question):

    # 1) Encode the incoming user question
    user_embedding = model.encode([user_question], convert_to_tensor=True)

    # 2) Compute cosine similarities
    cos_scores = torch.nn.functional.cosine_similarity(question_embeddings, user_embedding[0], dim=1)

    # 3) Find the question with the highest similarity
    top_idx = torch.argmax(cos_scores).item()
    best_score = cos_scores[top_idx].item()

    # Apply the threshold
    if best_score < 0.3:
        return "Je suis là pour vous aider. Pouvez-vous reformuler votre question ?", "Je suis là pour vous aider. Pouvez-vous reformuler votre question ?", None

    best_match_question = questions[top_idx]
    best_answer = answers[top_idx]

    return best_match_question, best_answer, best_score


## Test samples

In [16]:
user_query = "Pouvez-vous me dire quelle est la mission de l'établissement ?"
matched_q, matched_a, sim_score = get_best_answer(user_query)

print("User Query:", user_query)
print("Matched FAQ Question:", matched_q)
print("Answer:", matched_a)
print("Similarity Score:", sim_score)


User Query: Pouvez-vous me dire quelle est la mission de l'établissement ?
Matched FAQ Question: Quelle est la mission de notre établissement ?
Answer: Notre mission est de fournir une éducation de qualité, accessible à tous, et de promouvoir l'innovation et la recherche.
Similarity Score: 0.8176456689834595


In [17]:
user_query = "Bonjour, Pouvez-vous me dire quelle est la mission de l'établissement ?"
matched_q, matched_a, sim_score = get_best_answer(user_query)

print("User Query:", user_query)
print("Matched FAQ Question:", matched_q)
print("Answer:", matched_a)
print("Similarity Score:", sim_score)

User Query: Bonjour, Pouvez-vous me dire quelle est la mission de l'établissement ?
Matched FAQ Question: Bonjour
Answer: Bonjour, comment puis-je vous aider ?
Similarity Score: 0.8234753012657166


In [18]:
user_query = "quelles sont les mastères possibles ?"
matched_q, matched_a, sim_score = get_best_answer(user_query)

print("User Query:", user_query)
print("Matched FAQ Question:", matched_q)
print("Answer:", matched_a)
print("Similarity Score:", sim_score)

User Query: quelles sont les mastères possibles ?
Matched FAQ Question: Y a-t-il des opportunités de stages pour les étudiants en master ?
Answer: Oui, la plupart des masters incluent des stages obligatoires ou optionnels.
Similarity Score: 0.34152695536613464


In [19]:
user_query = "salem"
matched_q, matched_a, sim_score = get_best_answer(user_query)

print("User Query:", user_query)
print("Matched FAQ Question:", matched_q)
print("Answer:", matched_a)
print("Similarity Score:", sim_score)

User Query: salem
Matched FAQ Question: Salem
Answer: Salem, comment ça va ?
Similarity Score: 0.8908565044403076


In [20]:
user_query = "Quels sont les événements culturels les plus populaires parmi les étudiants ?"
matched_q, matched_a, sim_score = get_best_answer(user_query)

print("User Query:", user_query)
print("Matched FAQ Question:", matched_q)
print("Answer:", matched_a)
print("Similarity Score:", sim_score)

User Query: Quels sont les événements culturels les plus populaires parmi les étudiants ?
Matched FAQ Question: Quels sont les événements culturels les plus populaires parmi les étudiants ?
Answer: Les événements les plus populaires incluent le festival culturel annuel et les compétitions sportives interuniversitaires.
Similarity Score: 0.9999998807907104
