In [None]:
!pip install transformers langchain chromadb

Collecting chromadb
  Downloading chromadb-0.5.23-py3-none-any.whl.metadata (6.8 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.115.6-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.7.4-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.20.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.29.0-py3

**Cleaning and splitting the dataset**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Location of the raw question/answer table in the Colab workspace.
file_path = "/content/dataset.csv"


def _strip_text_column(column):
    """Trim surrounding whitespace in object-dtype columns; return others untouched."""
    if column.dtype == "object":
        return column.str.strip()
    return column


# Clean in one pass: drop incomplete rows, trim text cells, drop exact
# duplicates, then shuffle reproducibly and renumber the rows from zero.
df = (
    pd.read_csv(file_path)
    .dropna()
    .apply(_strip_text_column)
    .drop_duplicates()
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Hold out 20% of the rows for evaluation (fixed seed for reproducibility).
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Write both splits to disk without the pandas index column.
for split_df, split_path in ((train_df, "/content/train.csv"), (test_df, "/content/test.csv")):
    split_df.to_csv(split_path, index=False)

print("Dataset processed and ready for training.")

Dataset processed and ready for training.


In [None]:
pip install transformers langchain huggingface_hub torch faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.9.0.post1


In [None]:
pip install langchain_community

Collecting langchain_community
  Downloading langchain_community-0.3.12-py3-none-any.whl.metadata (2.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain_community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain<0.4.0,>=0.3.12 (from langchain_community)
  Downloading langchain-0.3.12-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.4.0,>=0.3.25 (from langchain_community)
  Downloading langchain_core-0.3.25-py3-none-any.whl.metadata (6.3 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain_community)
  Downloading pydantic_settings-2.7.0-py3-none-any.whl.metadata (3.5 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading marshmallow-3.23.1-py3-none-any.whl.metadata (7.5 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-

**Imports**

In [None]:
import os
from langchain.document_loaders import CSVLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.agents import initialize_agent, AgentType
from langchain.llms import HuggingFacePipeline
from transformers import pipeline
from typing import List, Dict

In [None]:
import pandas as pd
import torch
import faiss
import numpy as np
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from langchain.agents import initialize_agent, Tool
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from transformers import AutoModel, AutoTokenizer


**Configuration**

In [None]:

# Central configuration dictionary for the notebook.
# In the cells visible here, "MODEL_NAME" is read when building the
# HuggingFaceEmbeddings object and "VECTOR_SAVE_PATH" is used as the Chroma
# persist directory; the train/test paths match the files written by the
# cleaning cell.
# NOTE(review): "MAX_TOKENS", "TEMPERATURE", "SUPPORTED_LANGUAGES" and
# "CONFIG_SAVE_PATH" are not referenced by any cell visible here — confirm
# whether they are still needed.
Config = {
    "MODEL_NAME": "xlm-roberta-base",  # Pretrained multilingual checkpoint name
    "MODEL_SAVE_PATH": "./fine_tuned_xlm_roberta",  # Directory for the fine-tuned model
    "VECTOR_SAVE_PATH": "./Vectors",  # Directory where the vector store is persisted
    "SUPPORTED_LANGUAGES": ["fr", "ar"],  # Language codes: French and Arabic
    "MAX_TOKENS": 512,  # Intended cap on generated text length
    "TEMPERATURE": 0.7,  # Intended sampling temperature (higher = more varied output)
    "TRAIN_DATASET_PATH": "/content/train.csv",
    "TEST_DATASET_PATH": "/content/test.csv",
    "CONFIG_SAVE_PATH": "./chatbot_config.txt"
}

# Prompt template (French). It instructs the model to act as a multilingual
# travel assistant for Greece and to answer in French or Arabic; the only
# placeholder is {question}.
PROMPT_TEMPLATE = """Vous êtes un agent de conversation multilingue spécialisé dans les voyages en Grèce.
Vous pouvez répondre à des questions sur les destinations, les transports, les réservations, l'histoire, la culture, et d'autres conseils utiles.
Répondez de manière détaillée et engageante en français ou en arabe selon la langue de la question.
Ajoutez des conseils pratiques et des informations pertinentes si nécessaire.

Langues supportées : Français et Arabe.
Soyez utile, informatif et amical.

Question : {question}
Réponse :"""


**Training**

In [None]:
!pip install datasets
# Install required libraries
!pip install transformers datasets accelerate -q



In [None]:


# Import libraries
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset

# Read the raw table and discard incomplete or duplicated rows.
file_path = "dataset.csv"  # Replace with your file path
df = pd.read_csv(file_path).dropna().drop_duplicates()

# Turn each distinct answer string into an integer class id.
# NOTE(review): every distinct `Réponse` becomes its own class, so answers
# that appear only once cannot be present in both the train and test split;
# the recorded run below reports 0.0 accuracy. Confirm that classification
# over answers is really the intended task.
label_encoder = LabelEncoder()
df = df.assign(labels=label_encoder.fit_transform(df['Réponse']))

# 80/20 split with a fixed seed, then wrap each part as a Hugging Face Dataset.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_dataset, test_dataset = (Dataset.from_pandas(part) for part in (train_df, test_df))

# Tokenizer and classification head sized to the number of encoded classes.
MODEL_NAME = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(label_encoder.classes_),
)

# Batch tokenizer used with Dataset.map
def preprocess_function(examples):
    """Tokenize the 'Question' field of a batch.

    Every sequence is truncated and padded to exactly 128 tokens using the
    module-level `tokenizer`.
    """
    questions = examples['Question']
    encoded = tokenizer(
        questions,
        max_length=128,
        padding="max_length",
        truncation=True,
    )
    return encoded

# Tokenize both splits in batches.
train_dataset = train_dataset.map(preprocess_function, batched=True)
test_dataset = test_dataset.map(preprocess_function, batched=True)

# Mixed precision needs a CUDA device; asking for it on a CPU-only runtime
# makes TrainingArguments raise, so enable it only when a GPU is present.
use_fp16 = torch.cuda.is_available()

# Training arguments: evaluate and checkpoint once per epoch.
# NOTE(review): newer transformers releases name the first strategy argument
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,  # Small per-device batch to limit memory use
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=use_fp16,  # Mixed precision only when CUDA is available
    logging_dir="./logs",
    logging_steps=10,
    gradient_accumulation_steps=4,  # Effective batch = 4 x 4 per device
    save_total_limit=2,  # Keep at most two checkpoints on disk
)

# Trade compute for memory by recomputing activations in the backward pass.
model.gradient_checkpointing_enable()

# Compute metrics for evaluation
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for a Trainer evaluation.

    Args:
        eval_pred: pair `(logits, labels)` as passed by `Trainer`.

    Returns:
        dict with the keys "accuracy", "f1", "precision" and "recall".
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # zero_division=0 keeps the value scikit-learn already falls back to for
    # classes with no predicted or true samples, but silences the
    # UndefinedMetricWarning it would otherwise emit on every evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, predictions)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

# Assemble the Trainer from the model, arguments, tokenized splits and metric hook.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

# Fine-tune.
trainer.train()

# Write the fine-tuned weights and the tokenizer files to the same directory.
save_dir = "./fine_tuned_xlm_roberta"
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_dir)

# Final evaluation on the held-out split.
print("Evaluating the model...")
results = trainer.evaluate()
print(results)

# Persist the label encoder so predicted class ids can be mapped back to answers.
import pickle
with open("label_encoder.pkl", "wb") as encoder_file:
    pickle.dump(label_encoder, encoder_file)

print("Training and evaluation complete. Model and tokenizer saved!")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/350 [00:00<?, ? examples/s]

Map:   0%|          | 0/88 [00:00<?, ? examples/s]

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mmoussaemna[0m ([33mmoussaemna-istic[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,6.1275,6.135941,0.0,0.0,0.0,0.0
2,6.0832,6.168249,0.0,0.0,0.0,0.0
3,6.0825,6.153408,0.0,0.0,0.0,0.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Evaluating the model...


{'eval_loss': 6.153407573699951, 'eval_accuracy': 0.0, 'eval_f1': 0.0, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_runtime': 35.6085, 'eval_samples_per_second': 2.471, 'eval_steps_per_second': 0.618, 'epoch': 3.0}
Training and evaluation complete. Model and tokenizer saved!


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


**LangChain**

In [None]:
# Install required libraries
!pip install pandas scikit-learn langdetect -q

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m32.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone


In [None]:


# Import libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException
import re

# Fix langdetect's seed so repeated detections of the same text agree.
DetectorFactory.seed = 0

# Read the dataset, keep only rows that have a question, an answer and notes,
# renumber them, then drop exact duplicate rows.
file_path = "/content/dataset.csv"  # Replace with your file path
df = (
    pd.read_csv(file_path)
    .dropna(subset=['Question', 'Réponse', 'Notes'])
    .reset_index(drop=True)
    .drop_duplicates()
)

# Text that user questions are matched against: the answer followed by its notes.
# NOTE(review): the stored `Question` column is not part of the matched text —
# confirm this is intentional.
df['combined_text'] = df['Réponse'].astype(str).str.cat(df['Notes'].astype(str), sep=" ")

# Small hand-picked stop-word lists for the two supported languages.
french_stop_words = ["le", "la", "les", "et", "est", "un", "une", "des", "à", "en", "de", "pour", "que"]
arabic_stop_words = ["و", "في", "من", "إلى", "على", "التي", "هو", "هي", "ما", "هذا", "ذلك", "بها"]

def remove_stopwords(text, lang):
    """Drop stop words from `text` for French ("fr") or Arabic ("ar").

    Words are split on whitespace and compared in lowercase; the surviving
    words are re-joined with single spaces. For any other language code the
    text is returned unchanged.
    """
    if lang not in ("fr", "ar"):
        return text  # unsupported language: leave the text as it is

    stop_words = french_stop_words if lang == "fr" else arabic_stop_words
    return " ".join(filter(lambda word: word.lower() not in stop_words, text.split()))

# Normalise one dataset entry before vectorisation
def preprocess_text(text):
    """Detect the language of `text`, strip punctuation and remove stop words.

    If langdetect cannot identify a language the text is returned untouched.
    """
    try:
        lang = detect(text)
    except LangDetectException:
        return text  # undetectable language: keep the original text
    without_punctuation = re.sub(r'[^\w\s]', '', text)  # keep word characters and whitespace only
    return remove_stopwords(without_punctuation, lang)

df['processed_text'] = df['combined_text'].map(preprocess_text)

# Fit the TF-IDF vocabulary on the processed entries and vectorise them.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['processed_text'])

# Same normalisation as the dataset text, with the detected language echoed to the user
def preprocess_user_input(user_input):
    """Clean a user question and report the detected language.

    Prints the detected language code, strips punctuation and removes stop
    words. When detection fails, a notice is printed and the input is
    returned unchanged.
    """
    try:
        lang = detect(user_input)
    except LangDetectException:
        print("Langue non reconnue, traitement par défaut...")
        return user_input
    print(f"Langue détectée : {lang}")
    cleaned_input = re.sub(r'[^\w\s]', '', user_input)
    return remove_stopwords(cleaned_input, lang)

# Function to find the most relevant response
def get_relevant_response(user_question, top_n=1):
    """Print the dataset entries most similar to `user_question`.

    The question is preprocessed, projected into the fitted TF-IDF space and
    compared with every dataset entry by cosine similarity.

    Args:
        user_question: question typed by the user.
        top_n: maximum number of entries to print.

    Returns:
        None; results (or an apology when nothing matches) are printed.
    """
    # Preprocess user input
    user_question_processed = preprocess_user_input(user_question)

    # Transform user input into TF-IDF vector
    user_tfidf = vectorizer.transform([user_question_processed])

    # Similarity of the question to each dataset row
    scores = cosine_similarity(user_tfidf, tfidf_matrix)[0]

    # Best-first positions, keeping only rows that share at least one term
    # with the question: with top_n > 1 the ranking would otherwise pad the
    # answer with rows whose similarity is exactly zero.
    relevant_indices = [idx for idx in scores.argsort()[-top_n:][::-1] if scores[idx] > 0]

    if not relevant_indices:  # No similarity found
        print("Désolé, je n'ai pas trouvé de réponse pertinente.")
        return

    print("\nVoici la réponse la plus pertinente :\n")
    # tfidf_matrix rows are positional, hence iloc rather than loc.
    for _, row in df.iloc[relevant_indices].iterrows():
        print(f"Réponse: {row['Réponse']}")
        print(f"Notes: {row['Notes']}")
        print("-" * 50)

# Read questions until the user types "exit" (case-insensitive), answering each one.
print("Bienvenue! Posez une question sur vos voyages en Grèce (tapez 'exit' pour quitter).")
user_question = input("Votre question: ")
while user_question.lower() != "exit":
    get_relevant_response(user_question)
    user_question = input("Votre question: ")
print("Merci, à bientôt!")


Bienvenue! Posez une question sur vos voyages en Grèce (tapez 'exit' pour quitter).
Votre question: Quels parcs nationaux visiter en Grèce?
Langue détectée : fr

Voici la réponse la plus pertinente :

Réponse: Les parcs nationaux comme Vikos-Aoos, le Parc national du Mont Olympe, et Samaria sont parfaits pour les amoureux de la nature.
Notes: Chaque parc offre des sentiers de randonnée et des paysages uniques.
--------------------------------------------------
Votre question: ما هي أفضل المواقع لمشاهدة غروب الشمس في اليونان؟
Langue détectée : ar

Voici la réponse la plus pertinente :

Réponse: أشهر مواقع غروب الشمس هي سانتوريني، أويا، وميكونوس.
Notes: أويا في سانتوريني مشهورة بشكل خاص بغروب الشمس الرائع.
--------------------------------------------------
Votre question: Quels festivals culturels ont lieu en été en Grèce ?
Langue détectée : fr

Voici la réponse la plus pertinente :

Réponse: Périclès était un homme politique et général athénien qui a dirigé pendant l'âge d'or d'Athènes 

**Streamlit app**

In [None]:
!pip install streamlit

Collecting streamlit
  Downloading streamlit-1.41.1-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.41.1-py2.py3-none-any.whl (9.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m37.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[

In [None]:
%%writefile app.py
import streamlit as st
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException
import re

# Ensure consistent language detection
DetectorFactory.seed = 0

# Small hand-picked stop-word lists for the two supported languages
FRENCH_STOP_WORDS = ["le", "la", "les", "et", "est", "un", "une", "des", "à", "en", "de", "pour", "que"]
ARABIC_STOP_WORDS = ["و", "في", "من", "إلى", "على", "التي", "هو", "هي", "ما", "هذا", "ذلك", "بها"]

# Stop-word filter keyed by language code
def remove_stopwords(text, lang):
    """Return `text` without French ("fr") or Arabic ("ar") stop words.

    Words are split on whitespace, compared in lowercase and re-joined with
    single spaces. Text in any other language is returned unchanged.
    """
    for code, stop_words in (("fr", FRENCH_STOP_WORDS), ("ar", ARABIC_STOP_WORDS)):
        if lang == code:
            break
    else:
        # No stop-word list for this language.
        return text

    kept = []
    for word in text.split():
        if word.lower() in stop_words:
            continue
        kept.append(word)
    return " ".join(kept)

# Text normalisation shared by dataset entries and user questions
def preprocess_text(text):
    """Detect the language of `text`, strip punctuation and remove stop words.

    Returns the text unchanged when langdetect cannot identify a language.
    """
    try:
        lang = detect(text)
    except LangDetectException:
        return text
    without_punctuation = re.sub(r"[^\w\s]", "", text)  # keep word characters and whitespace only
    return remove_stopwords(without_punctuation, lang)

# Function to find the most relevant response
def get_relevant_response(user_question, vectorizer, tfidf_matrix, df, top_n=1):
    """Return the dataset entries most similar to `user_question` as Markdown.

    Args:
        user_question: question typed by the user.
        vectorizer: fitted TF-IDF vectorizer.
        tfidf_matrix: TF-IDF matrix of the dataset, row-aligned with `df`.
        df: dataset holding the 'Réponse' and 'Notes' columns.
        top_n: maximum number of entries to return.

    Returns:
        A string with one "Réponse / Notes" section per match, separated by
        horizontal rules, or an apology when nothing matches.
    """
    user_question_processed = preprocess_text(user_question)
    user_tfidf = vectorizer.transform([user_question_processed])
    scores = cosine_similarity(user_tfidf, tfidf_matrix)[0]

    # Best-first positions, keeping only rows that share at least one term
    # with the question: with top_n > 1 the ranking would otherwise pad the
    # answer with rows whose similarity is exactly zero.
    relevant_indices = [idx for idx in scores.argsort()[-top_n:][::-1] if scores[idx] > 0]

    if not relevant_indices:
        return "Désolé, je n'ai pas trouvé de réponse pertinente."

    results = []
    for idx in relevant_indices:
        # tfidf_matrix rows are positional, hence iloc rather than loc.
        response = df.iloc[idx]
        results.append(f"**Réponse:** {response['Réponse']}\n\n**Notes:** {response['Notes']}")

    return "\n\n---\n\n".join(results)

# Streamlit re-executes the whole script on every interaction; caching keeps
# the CSV load and TF-IDF fit from being repeated on each rerun.
@st.cache_resource
def _load_index(file_path="/content/dataset.csv"):
    """Load the dataset and fit the TF-IDF index (cached across reruns).

    Returns:
        Tuple `(df, vectorizer, tfidf_matrix)`; the matrix rows are aligned
        positionally with `df`.
    """
    df = pd.read_csv(file_path)
    df = df.dropna(subset=["Question", "Réponse", "Notes"]).reset_index(drop=True)
    df = df.drop_duplicates()

    # Text each question is matched against: the answer followed by its notes.
    df["combined_text"] = df["Réponse"].astype(str) + " " + df["Notes"].astype(str)
    df["processed_text"] = df["combined_text"].apply(preprocess_text)

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(df["processed_text"])
    return df, vectorizer, tfidf_matrix


# Streamlit app main function
def main():
    """Render the travel-assistant page: question form plus the latest answer."""
    st.title("🧳 Assistant de Voyage")
    st.write("Posez vos questions sur vos voyages en Grèce!")

    df, vectorizer, tfidf_matrix = _load_index()

    # The last answer must survive reruns, so it lives in session state.
    if "response" not in st.session_state:
        st.session_state.response = ""

    # A form with clear_on_submit resets the text box after each submission.
    # Re-assigning the widget's initial `value` does not clear a keyed
    # text_input, because the widget keeps its own state between reruns.
    with st.form("question_form", clear_on_submit=True):
        question = st.text_input("❓ Votre question:", key="question_input")
        submitted = st.form_submit_button("🔍 Poser la question")

    if submitted:
        if question:
            st.session_state.response = get_relevant_response(question, vectorizer, tfidf_matrix, df)
        else:
            st.warning("Veuillez entrer une question avant de poser.")

    # Display the response
    if st.session_state.response:
        st.text_area("📋 Résultats:", st.session_state.response, height=250)
# Run the app
if __name__ == "__main__":
    main()


Writing app.py


In [None]:
!pip install pyngrok


Collecting pyngrok
  Downloading pyngrok-7.2.2-py3-none-any.whl.metadata (8.4 kB)
Downloading pyngrok-7.2.2-py3-none-any.whl (22 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.2


In [None]:
!ngrok config add-authtoken 2qMBELHeTjrev41KOyP190VXgJ1_3nmcYb6dwf62iRseQSBQ8


Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:
from pyngrok import ngrok

# Configure and start ngrok: open a tunnel to local port 8502, print its
# public URL, then start the Streamlit app on that same port so the tunnel
# reaches it.
public_url = ngrok.connect(8502)
print(f"Application accessible à l'URL : {public_url}")
!streamlit run app.py --server.port 8502

Application accessible à l'URL : NgrokTunnel: "https://1fb8-35-230-174-1.ngrok-free.app" -> "http://localhost:8502"

Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8502[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8502[0m
[34m  External URL: [0m[1mhttp://35.230.174.1:8502[0m
[0m


**Vectors**

In [None]:
class CSVLoader:
    """Minimal loader that turns each dataset row into a plain dict.

    NOTE(review): this class shadows the `CSVLoader` imported from
    `langchain.document_loaders` earlier in the notebook — confirm that the
    override is intended.
    """

    # Output key -> dataset column it is read from.
    _FIELD_TO_COLUMN = {
        "category": 'Catégorie',
        "question": 'Question',
        "answer": 'Réponse',
        "source": 'Source',
        "language": 'Langue',
        "notes": 'Notes',
    }

    def __init__(self, file_path):
        """Remember the path of the delimited text file to load."""
        self.file_path = file_path

    def load(self):
        """Read the file and return one dict per row.

        Returns:
            list of dicts with the keys "category", "question", "answer",
            "source", "language" and "notes".

        Raises:
            KeyError: if one of the expected columns is missing.
        """
        import pandas as pd

        # The other cells read this same file as comma-separated, so a
        # hard-coded tab separator collapses every row into a single column.
        # sep=None with the Python engine sniffs the delimiter from the
        # header and therefore accepts both CSV and TSV input.
        df = pd.read_csv(self.file_path, sep=None, engine="python")

        missing = [column for column in self._FIELD_TO_COLUMN.values() if column not in df.columns]
        if missing:
            raise KeyError(f"Missing column(s) in {self.file_path}: {missing}")

        return [
            {field: record[column] for field, column in self._FIELD_TO_COLUMN.items()}
            for record in df.to_dict(orient="records")
        ]

In [None]:

from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings


class DocumentManager:
    """Load the dataset, split it into chunks and build a Chroma vector store."""

    def __init__(self):
        """Create the embedding model and text splitter, then load the documents."""
        self.embeddings = HuggingFaceEmbeddings(model_name=Config["MODEL_NAME"])
        self.text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        self.documents = self.load_documents()

    def load_documents(self):
        """Return the dataset as chunked documents ready for indexing.

        Each row becomes one "Question / Réponse" text, which is then split
        by the character splitter.
        """
        # This uses the notebook's own CSVLoader, whose load() returns dicts
        # with "question" / "answer" keys.
        csv_loader = CSVLoader(file_path="/content/dataset.csv")
        docs = csv_loader.load()
        # Create a list of text documents for the vector store
        text_docs = [f"Question: {doc['question']}\nRéponse: {doc['answer']}" for doc in docs]
        # create_documents accepts raw strings and wraps the chunks in
        # Document objects; split_documents expects Document inputs and
        # fails on plain strings.
        return self.text_splitter.create_documents(text_docs)

    def setup_vectorstore(self):
        """Embed the loaded documents into a Chroma store persisted on disk."""
        vectorstore = Chroma.from_documents(
            self.documents,
            self.embeddings,
            persist_directory=Config["VECTOR_SAVE_PATH"]
        )
        return vectorstore


**Tools**

In [None]:
!pip install typing


Collecting typing
  Downloading typing-3.7.4.3.tar.gz (78 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/78.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: typing
  Building wheel for typing (setup.py) ... [?25l[?25hdone
  Created wheel for typing: filename=typing-3.7.4.3-py3-none-any.whl size=26303 sha256=8d02c38728716556146bd19abd6b9c8fda7e3b2ef9cd1bd9c60189740ec5eda9
  Stored in directory: /root/.cache/pip/wheels/7c/d0/9e/1f26ebb66d9e1732e4098bc5a6c2d91f6c9a529838f0284890
Successfully built typing
Installing collected packages: typing
Successfully installed typing-3.7.4.3


In [None]:
def load_dataset(file_path: str):
    """Read a comma-separated file and return its rows as a list of dicts.

    Each dict maps column name to the cell value of one row.
    """
    records = pd.read_csv(file_path).to_dict("records")
    return records

# Define the TravelTools class
class TravelTools:
    """Keyword search over the dataset rows, exposed as LangChain tools."""

    def __init__(self, dataset: List[Dict]):
        """Store the dataset rows (one dict per row) as `self.offers`."""
        self.offers = dataset

    def search_offers(self, query: str) -> List[Dict]:
        """Return the rows whose answer contains any word of `query`.

        Matching is a case-insensitive substring test against 'Réponse'.

        Returns:
            list of dicts with the keys "Catégorie" and "Réponse".
        """
        query_terms = query.lower().split()

        # Rows are keyed by the dataset column names, so the category lives
        # under 'Catégorie'; looking it up as 'category' raised KeyError as
        # soon as anything matched.
        return [
            {"Catégorie": offer['Catégorie'], "Réponse": offer['Réponse']}
            for offer in self.offers
            if any(term in offer['Réponse'].lower() for term in query_terms)
        ]

    def check_availability(self, destination: str, dates: str) -> bool:
        """Placeholder: always returns False, as the dataset has no availability data."""
        return False

    def get_tools(self):
        """Return the search and availability helpers wrapped as `Tool` objects."""
        return [
            Tool(
                name="SearchOffers",
                func=self.search_offers,
                description="Search for travel offers"
            ),
            Tool(
                name="CheckAvailability",
                func=self.check_availability,
                description="Check availability"
            )
        ]

**Agent config**

In [None]:
!pip install langdetect
!pip install langchain_huggingface

Collecting langchain_huggingface
  Downloading langchain_huggingface-0.1.2-py3-none-any.whl.metadata (1.3 kB)
Downloading langchain_huggingface-0.1.2-py3-none-any.whl (21 kB)
Installing collected packages: langchain_huggingface
Successfully installed langchain_huggingface-0.1.2


In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain.agents import initialize_agent, AgentType
from langchain.tools import Tool
from langchain_huggingface import HuggingFacePipeline
from typing import List, Dict
import pandas as pd
import traceback


# Load dataset function
def load_dataset(file_path: str) -> List[Dict]:
    """Load the dataset from a TSV or CSV file.

    The delimiter is sniffed from the header line, so both tab- and
    comma-separated files load correctly. A fixed tab separator collapsed the
    comma-separated `/content/dataset.csv` into a single column.

    Returns:
        One dict per row, mapping column name to cell value.
    """
    # sep=None requires the Python parsing engine.
    df = pd.read_csv(file_path, sep=None, engine="python")
    return df.to_dict(orient='records')  # Convert to list of dictionaries


# Define the TravelTools class
class TravelTools:
    """Keyword search over the dataset rows, exposed as LangChain tools."""

    # Columns copied into every search result, in output order.
    _RESULT_FIELDS = ("Catégorie", "Question", "Réponse", "Source", "Langue", "Notes")

    def __init__(self, dataset: List[Dict]):
        """Store the dataset rows (one dict per row) as `self.offers`."""
        self.offers = dataset

    def search_offers(self, query: str) -> List[Dict]:
        """Search for travel offers based on user query.

        A row matches when its 'Réponse' text contains any whitespace-separated
        word of `query` (case-insensitive substring test).

        Returns:
            list of dicts holding the six dataset columns of each match.
        """
        query_terms = query.lower().split()
        matches = []

        for offer in self.offers:
            answer = offer['Réponse']
            # Empty CSV cells are loaded as NaN (a float); skip those rows
            # instead of crashing on `.lower()`.
            if not isinstance(answer, str):
                continue
            answer_lower = answer.lower()
            if any(term in answer_lower for term in query_terms):
                matches.append({field: offer[field] for field in self._RESULT_FIELDS})

        return matches

    def check_availability(self, destination: str, dates: str) -> bool:
        """Check if an offer is available for a specific destination and dates (placeholder)."""
        # Placeholder implementation: always returns False
        return False

    def get_tools(self):
        """Return the tools as a list for the agent."""
        return [
            Tool(
                name="SearchOffers",
                func=self.search_offers,
                description="Search for travel offers based on a query."
            ),
            Tool(
                name="CheckAvailability",
                func=self.check_availability,
                description="Check availability for a specific destination and dates."
            )
        ]


import re

class TravelAgent:
    """Travel assistant that answers questions by keyword lookup in the dataset.

    The constructor also builds a Hugging Face text-generation pipeline and a
    LangChain agent, but `get_response` only searches the dataset rows.
    """

    def __init__(self, dataset: List[Dict]):
        """Load the model, build the pipeline, the tools and the agent.

        Args:
            dataset: dataset rows, one dict per row.
        """
        # Load the tokenizer and model
        self.tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # Load the model with is_decoder=True
        self.model = AutoModelForCausalLM.from_pretrained("xlm-roberta-base", is_decoder=True)

        # Create the pipeline
        self.pipeline = self._create_pipeline()

        # Keep the TravelTools instance itself: get_response reads its
        # `offers`, and without this attribute every call failed with
        # AttributeError.
        self.travel_tools = TravelTools(dataset)
        self.tools = self.travel_tools.get_tools()

        # Initialize the agent
        # NOTE(review): `agent_type` and the keys inside `agent_kwargs` are
        # passed through unchanged — confirm they match the parameter names
        # of the installed LangChain `initialize_agent`.
        self.agent = initialize_agent(
            llm=self.pipeline,
            tools=self.tools,
            agent_type=AgentType.CONVERSATIONAL_REACT_DESCRIPTION,
            verbose=True,
            handle_parsing_errors=True,
            agent_kwargs={
                "input_keys": ["input", "context"],
                "max_iterations": 3,
                "max_execution_time": 30
            }
        )

    def _create_pipeline(self):
        """Create a text-generation pipeline wrapped for LangChain."""
        model_pipeline = pipeline(
            task="text-generation",
            model=self.model,
            tokenizer=self.tokenizer,
            device=self.device,
            max_new_tokens=100,
            temperature=0.7,
            num_return_sequences=1,
            do_sample=True,
            truncation=True
        )
        return HuggingFacePipeline(pipeline=model_pipeline)

    def get_response(self, user_input: str, context: list) -> str:
        """
        Generate a response by searching for keywords in the 'Réponse' and 'Notes' columns
        and combining them for both French and Arabic text.

        Args:
            user_input: the user's question; each whitespace-separated word is
                matched as a whole word, case-insensitively.
            context: accepted for interface compatibility; not used by the lookup.

        Returns:
            The matching "Réponse:" / "Notes:" lines joined by newlines, a
            "no match" message, or a generic error message if the lookup fails.
        """
        try:
            # One whole-word pattern per query term, compiled once rather
            # than for every row.
            patterns = [
                re.compile(r'\b' + re.escape(term) + r'\b')
                for term in user_input.lower().split()
            ]

            combined_response = []

            for offer in self.travel_tools.offers:
                fields = (("Réponse", offer['Réponse']), ("Notes", offer['Notes']))
                # Empty CSV cells are loaded as NaN (a float): ignore such
                # fields instead of failing on `.lower()`.
                texts = [(label, value) for label, value in fields if isinstance(value, str)]

                if any(pattern.search(value.lower()) for _, value in texts for pattern in patterns):
                    combined_response.extend(f"{label}: {value}" for label, value in texts)

            if combined_response:
                return "\n".join(combined_response)
            return "Désolé, aucune correspondance n'a été trouvée."

        except Exception as e:
            # Report the failure in detail but keep the caller running.
            print(f"Erreur dans get_response: {type(e).__name__}: {str(e)}")
            print("Trace complète:", traceback.format_exc())
            return "Une erreur est survenue. Veuillez réessayer."

    def _prepare_input(self, text: str, max_length: int) -> str:
        """
        Truncate the text to respect the specified maximum length.

        The text is encoded with the tokenizer, cut to the first `max_length`
        token ids and decoded back to a string.
        """
        tokens = self.tokenizer.encode(text)
        truncated_tokens = tokens[:max_length]
        return self.tokenizer.decode(truncated_tokens)


**Test**

In [None]:
# Question to ask, plus the conversational context passed alongside it.
user_input = "Quels sont les meilleurs mois pour visiter les îles grecques?"
context = ["L'utilisateur veut savoir les meilleurs mois pour visiter la Grèce."]

# Build the agent on top of the dataset rows.
dataset = load_dataset("/content/dataset.csv")  # Adjust the path as necessary
travel_agent = TravelAgent(dataset)

# Ask the question and show the answer.
response = travel_agent.get_response(user_input, context)
print("Réponse :", response)


Erreur dans get_response: AttributeError: 'TravelAgent' object has no attribute 'travel_tools'
Trace complète: Traceback (most recent call last):
  File "<ipython-input-57-ee65a3468a14>", line 121, in get_response
    for offer in self.travel_tools.offers:  # Use self.travel_tools.offers instead of self.tools[0].offers
AttributeError: 'TravelAgent' object has no attribute 'travel_tools'

Réponse : Une erreur est survenue. Veuillez réessayer.


In [None]:
def search_faiss(query, top_k=3, faiss_index=None):
    """
    Return the `combined` text of the rows closest to `query` in a FAISS index.

    Args:
        query: Text to embed with the notebook-level `model` and search for.
        top_k: Maximum number of neighbours to request from the index.
        faiss_index: Index to search. When None, the notebook-level `index`
            variable is used, so existing `search_faiss(query)` calls keep
            working unchanged.

    Returns:
        A list of strings from the `combined` column of the notebook-level
        `df`, in the order the index returned them. The list can hold fewer
        than `top_k` items when the index has fewer neighbours to offer.
    """
    active_index = index if faiss_index is None else faiss_index

    # Convert query into embedding; FAISS searches a float32 matrix with one row per query
    query_embedding = np.asarray(model.encode([query]), dtype=np.float32)

    # Search FAISS index (distances are not needed here)
    _, indices = active_index.search(query_embedding, top_k)

    # FAISS fills missing neighbours with -1; without this filter `iloc`
    # would silently resolve -1 to the last row of `df`.
    row_positions = [int(pos) for pos in indices[0] if pos >= 0]

    # Retrieve top results
    return df.iloc[row_positions]["combined"].tolist()

# Example query
query = "Quels sont les parcs naturels en Grèce ?"  # Replace with your question
faiss_index = faiss.read_index("/content/vectors.index")  # Reload FAISS index
# Pass the reloaded index explicitly so the search really runs against it
answers = search_faiss(query, faiss_index=faiss_index)
print("Top Results:")
for ans in answers:
    print(ans)


Top Results:
ما هي الحدائق الوطنية التي يجب زيارتها في اليونان؟ حديقة جبل أوليمبوس وحديقة فيكوس-آوس. توفر هذه الحدائق مناظر طبيعية خلابة.
ما هي المنتزهات الطبيعية التي يجب زيارتها في اليونان؟ منتزه فيكوس-آؤوس الوطني ومنتزه ساماريا الطبيعي. تقدم هذه المنتزهات مناظر طبيعية خلابة وتنوعًا بيولوجيًا غنيًا.
Quels sont les parcs nationaux de Grèce à visiter ? Le parc national du Mont Olympe et le parc national de Samaria. Ces parcs offrent des paysages magnifiques et sont idéaux pour les randonnées.


**Answer generation + LangChain**

In [None]:
!pip install -U langchain langchain-community langchain-huggingface


Collecting langchain-huggingface
  Downloading langchain_huggingface-0.1.2-py3-none-any.whl.metadata (1.3 kB)
Downloading langchain_huggingface-0.1.2-py3-none-any.whl (21 kB)
Installing collected packages: langchain-huggingface
Successfully installed langchain-huggingface-0.1.2


In [None]:
import pandas as pd
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document

# Read the training split written earlier in the notebook
df = pd.read_csv("/content/train.csv")  # replace with path to your dataset

# Sentence-embedding model used to vectorise every document
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-xlm-r-multilingual-v1")

# One Document per dataset row: question and answer merged into the text,
# with the row's source and language carried along as metadata
documents = [
    Document(
        page_content=f"Question: {row['Question']} Answer: {row['Réponse']}",
        metadata={"source": row['Source'], "langue": row['Langue']},
    )
    for _, row in df.iterrows()
]

# Embed the documents and build the FAISS vector store from them
vectorstore = FAISS.from_documents(documents, embedding_model)

# Persist the vector store under /content/
vectorstore.save_local("/content/")
print("FAISS index saved successfully!")


FAISS index saved successfully!


In [None]:
# Query from the user
query = "Quels sont les meilleurs sites touristiques en Grèce ?"

# Perform a similarity search in the FAISS index
# `vectorstore` is not created in this cell; it must already exist in the
# kernel (a FAISS vector store is built in an earlier cell).
results = vectorstore.similarity_search(query, k=3)  # k = 3 for top 3 matches

# Display the most similar documents (answers)
# Each hit exposes its text through `page_content`.
# NOTE(review): the recorded output lists Arabic hits for this French query,
# so results are not restricted to the language of the question.
for result in results:
    print(result.page_content)

Question: Quels sont les meilleurs spots pour faire de la randonnée en Grèce? Answer: Le Mont Olympe, le massif du Pindus, et les gorges de Vikos sont parmi les meilleurs endroits pour randonner en Grèce.
Question: ما هي أشهر المواقع الأثرية في اليونان؟ Answer: دلفي، وأولمبيا، وموكناي.
Question: ما هي أفضل الأماكن للتنزه في اليونان؟ Answer: جبل أوليمبوس، وديان الساماريا في كريت، وجبل أثوس (للرجال فقط) من بين أفضل الأماكن للتنزه.
