In [None]:
import os
import re
import uuid
import fitz
import chromadb
import tiktoken
from tqdm import tqdm

from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
from chromadb.config import Settings

enc = tiktoken.get_encoding("o200k_base")


class BDDChunks:
    """
    Chunk the text of a PDF, embed the chunks and store them in ChromaDB.

    This class provides methods to:
    - Read text from PDF files.
    - Split the text into token-based chunks.
    - Create (or reopen) a ChromaDB collection bound to an embedding function.
    - Add the chunks to that collection in batches.
    """

    def __init__(self, nom: str, embedding_model: str, db: str):
        """
        Initialize a BDDChunks instance.

        Args:
            nom (str): Label for this instance; stored as-is and not read by
                the other methods of the class.
            embedding_model (str): The name of the sentence-transformers model
                used to generate embeddings.
            db (str): Path of the PDF to process; its base name is also used
                to derive the ChromaDB collection name.
        """
        self.nom = nom
        # NOTE(review): the original signature was left unfinished (`db:`) and
        # the body read an undefined `path`. The third argument is assumed to
        # be the document path -- confirm against the callers.
        self.path = db
        self.chunks: list[str] | None = None
        self.client = chromadb.PersistentClient(
            path="./ChromaDB", settings=Settings(anonymized_telemetry=False)
        )
        self.embedding_name = embedding_model
        self.embeddings = SentenceTransformerEmbeddingFunction(
            model_name=embedding_model
        )
        # Set by _create_collection(); None until then.
        self.chroma_db = None

    def _create_collection(self, path: str) -> None:
        """
        Create (or reopen) the ChromaDB collection used to store the chunks.

        The collection name is derived from the base name of ``path``: at most
        50 characters, wrapped in a leading and trailing "a", with whitespace
        runs replaced by hyphens.

        Args:
            path (str): Path whose base name is turned into the collection name.
        """
        # TODO: check that switching to another path does not give access to
        # the data of the other collections.
        file_name = "a" + os.path.basename(path)[0:50].strip() + "a"
        file_name = re.sub(r"\s+", "-", file_name)
        # Expected collection name that (1) contains 3-63 characters, (2) starts and ends with an alphanumeric character, (3) otherwise contains only alphanumeric characters, underscores or hyphens (-), (4) contains no two consecutive periods (..)
        self.chroma_db = self.client.get_or_create_collection(name=file_name, embedding_function=self.embeddings, metadata={"hnsw:space": "cosine"})  # type: ignore

    def read_pdf(self, file_path: str) -> str:
        """
        Read the text of every page of a PDF file.

        Args:
            file_path (str): The path to the PDF file.

        Returns:
            str: The text of all pages, concatenated in page order.
        """
        # The context manager closes the document even if extraction fails.
        with fitz.open(file_path) as doc:
            return "".join(page.get_text() for page in doc)  # type: ignore

    def split_text_into_chunks(self, corpus: str, chunk_size: int = 500) -> list[str]:
        """
        Split a text corpus into chunks of at most ``chunk_size`` tokens.

        The corpus is tokenized with the module-level ``enc`` encoding, cut
        into consecutive token windows, and each window is decoded back to text.

        Args:
            corpus (str): The input text corpus to be split into chunks.
            chunk_size (int, optional): Number of tokens per chunk. Defaults to 500.

        Returns:
            list[str]: A list of text chunks (empty when the corpus is empty).

        Raises:
            ValueError: If ``chunk_size`` is not strictly positive.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")

        tokenized_corpus = enc.encode(corpus)
        return [
            enc.decode(tokenized_corpus[start : start + chunk_size])
            for start in tqdm(range(0, len(tokenized_corpus), chunk_size))
        ]

    def add_embeddings(self, list_chunks: list[str], batch_size: int = 100) -> None:
        """
        Add text chunks to the ChromaDB collection, batch by batch.

        Each chunk is stored under a freshly generated UUID4 identifier.
        ``_create_collection`` must have been called beforehand.

        Args:
            list_chunks (list[str]): A list of text chunks to embed and add to the collection.
            batch_size (int, optional): Maximum number of documents per call to
                the collection. Defaults to 100.

        Raises:
            ValueError: If ``batch_size`` is not strictly positive.

        Note:
            The original author's note states that ChromaDB supports a maximum
            of 166 documents per batch, hence the batching.
        """
        if not list_chunks:
            # Nothing to add; also avoids a zero step in range() below.
            return
        if batch_size <= 0:
            raise ValueError("batch_size must be a positive integer")

        for start in tqdm(range(0, len(list_chunks), batch_size)):
            batch_documents = list_chunks[start : start + batch_size]
            batch_ids = [str(uuid.uuid4()) for _ in batch_documents]
            self.chroma_db.add(documents=batch_documents, ids=batch_ids)  # type: ignore

    def __call__(self) -> None:
        """
        Execute the entire process of reading, chunking, creating a collection, and adding embeddings.

        This method:
        1. Reads the text from the PDF file at ``self.path``.
        2. Splits the text into chunks (also kept in ``self.chunks``).
        3. Creates a ChromaDB collection for storing the embeddings.
        4. Adds the text chunks to the ChromaDB collection.
        """
        corpus = self.read_pdf(file_path=self.path)
        chunks = self.split_text_into_chunks(corpus=corpus)
        self.chunks = chunks
        self._create_collection(path=self.path)
        self.add_embeddings(list_chunks=chunks)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import pandas as pd
from sqlalchemy.orm import Session
from typing import List
from model import models, schemas
from utils import database
import math

# Database dependency
def get_db():
    """Yield a database session and close it once the consumer is done with it."""
    session = database.SessionLocal()
    try:
        yield session
    finally:
        # Runs whether the consumer finished normally or raised.
        session.close()


def join_restaurant_avis_date(db: Session) -> pd.DataFrame:
    """
    Query reviews joined with their restaurant and return restaurant fields.

    The original body only built the query and implicitly returned ``None``;
    it now executes the query and builds the DataFrame announced by the
    return annotation.

    Args:
        db (Session): Open SQLAlchemy session.

    Returns:
        pd.DataFrame: One row per query result with the columns
        ``restaurant``, ``type_cuisines``, ``fonctionnalites``,
        ``infos_pratiques`` and ``classement``.
    """
    # NOTE(review): DimDate is selected without any join condition, which
    # looks like it multiplies the result rows -- confirm the intended join key.
    query = db.query(models.FaitAvis, models.DimRestaurant, models.DimDate).join(
        models.DimRestaurant,
        models.FaitAvis.id_restaurant == models.DimRestaurant.id_restaurant,
    )

    results = query.all()

    # Only the restaurant entity of each result tuple is used.
    return pd.DataFrame([
        {
            "restaurant": restaurant.nom,
            "type_cuisines": restaurant.type_cuisines,
            "fonctionnalites": restaurant.fonctionnalites,
            "infos_pratiques": restaurant.infos_pratiques,
            "classement": restaurant.classement,
        }
        for _avis, restaurant, _date in results
    ])

In [26]:
import math

def haversine(lat1: float, lon1: float, lat2: float, lon2: float) -> float:
    """
    Compute the distance between two geographic points with the Haversine formula.

    :param lat1: Latitude of the first point (in degrees).
    :param lon1: Longitude of the first point (in degrees).
    :param lat2: Latitude of the second point (in degrees).
    :param lon2: Longitude of the second point (in degrees).
    :return: Distance between the two points (in kilometres).
    """
    # Earth radius in kilometres
    R = 6371.0

    # Convert degrees to radians
    lat1_rad = math.radians(lat1)
    lon1_rad = math.radians(lon1)
    lat2_rad = math.radians(lat2)
    lon2_rad = math.radians(lon2)

    # Latitude and longitude differences
    delta_lat = lat2_rad - lat1_rad
    delta_lon = lon2_rad - lon1_rad

    # Haversine formula
    a = math.sin(delta_lat / 2)**2 + math.cos(lat1_rad) * math.cos(lat2_rad) * math.sin(delta_lon / 2)**2
    # Floating-point rounding can push `a` marginally outside [0, 1] (e.g. for
    # near-antipodal points), which would make sqrt(1 - a) raise.
    a = min(1.0, max(0.0, a))
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    distance = R * c

    return distance

# Example usage
lat1, lon1 = 48.8566, 2.3522  # Paris
lat2, lon2 = 51.5074, -0.1278  # London

distance = haversine(lat1, lon1, lat2, lon2)
print(f"La distance entre Paris et Londres est d'environ {distance:.2f} km.")


La distance entre Paris et Londres est d'environ 343.56 km.


In [3]:
from sentence_transformers import SentenceTransformer, util

# Load a model used to generate embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')  # Small, fast and effective model

def calculate_similarity(text1: str, text2: str) -> float:
    """
    Return the cosine similarity between the embeddings of two texts.

    Each text is encoded separately with the module-level ``model`` and the
    two resulting tensors are compared with ``util.cos_sim``.
    """
    # Encode first text, then second, each as a tensor
    vec_a, vec_b = (
        model.encode(texte, convert_to_tensor=True) for texte in (text1, text2)
    )

    # cos_sim returns a tensor; .item() extracts its single value as a Python number
    return util.cos_sim(vec_a, vec_b).item()

# Text fields
# text1 = input("Entrez le premier texte : ")
# text2 = input("Entrez le deuxième texte : ")
text1 = "Bonjour, je suis un texte de test je veux  1  franx"
text2 = "Bonjour, je suis un autre texte de test"
# Compute the similarity
similarity_score = calculate_similarity(text1, text2)

print(f"La similarité entre les deux textes est : {similarity_score:.2f}")


  from .autonotebook import tqdm as notebook_tqdm


La similarité entre les deux textes est : 0.88


In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import pandas as pd
from sqlalchemy.orm import Session
from typing import List
from model import models, schemas
from utils import database
import math

# Database dependency
def get_db():
    """Yield a database session and close it once the consumer is done with it."""
    session = database.SessionLocal()
    try:
        yield session
    finally:
        # Runs whether the consumer finished normally or raised.
        session.close()


def join_restaurant_avis_date(db: Session) -> pd.DataFrame:
    """
    Query reviews joined with their restaurant and return restaurant fields.

    Args:
        db (Session): Open SQLAlchemy session.

    Returns:
        pd.DataFrame: One row per query result with the columns
        ``restaurant``, ``type_cuisines``, ``fonctionnalites``,
        ``infos_pratiques`` and ``classement``.
    """
    # NOTE(review): DimDate is selected without any join condition, which
    # looks like it multiplies the result rows -- confirm the intended join key.
    query = db.query(models.FaitAvis, models.DimRestaurant, models.DimDate).join(
        models.DimRestaurant,
        models.FaitAvis.id_restaurant == models.DimRestaurant.id_restaurant,
    )

    # Only the restaurant entity of each (avis, restaurant, date) tuple is used.
    rows = []
    for _avis, restaurant, _date in query.all():
        rows.append({
            "restaurant": restaurant.nom,
            "type_cuisines": restaurant.type_cuisines,
            "fonctionnalites": restaurant.fonctionnalites,
            "infos_pratiques": restaurant.infos_pratiques,
            "classement": restaurant.classement,
        })

    return pd.DataFrame(rows)






# Lemmatizer instance shared by every call to nettoyage_doc
lem = WordNetLemmatizer()

# French stopwords, filtered out by nettoyage_doc
mots_vides = stopwords.words("french")

# Punctuation characters stripped by nettoyage_doc; `chiffres` is only
# referenced by a commented-out line in that function
ponctuations = string.punctuation
chiffres = string.digits

# Function to clean and tokenize document
def nettoyage_doc(doc_param):
    """
    Clean a document and return its list of tokens.

    Steps: lowercase, strip punctuation, tokenize, lemmatize, drop stopwords,
    then drop tokens shorter than 3 characters unless they are numeric.
    Digits are not removed.
    """
    # Lowercase and delete every punctuation character in one pass
    sans_ponctuation = doc_param.lower().translate(
        str.maketrans("", "", ponctuations)
    )

    # Tokenize, then lemmatize each token
    lemmes = [lem.lemmatize(jeton) for jeton in word_tokenize(sans_ponctuation)]

    # Keep tokens that are not stopwords and are long enough or numeric
    return [
        mot
        for mot in lemmes
        if mot not in mots_vides and (len(mot) >= 3 or mot.isnumeric())
    ]



def haversine(lat1: float, lon1: float, lat2: float, lon2: float) -> float:
    """
    Compute the distance between two geographic points with the Haversine formula.

    :param lat1: Latitude of the first point (in degrees).
    :param lon1: Longitude of the first point (in degrees).
    :param lat2: Latitude of the second point (in degrees).
    :param lon2: Longitude of the second point (in degrees).
    :return: Distance between the two points (in kilometres).
    """
    # Earth radius in kilometres
    R = 6371.0

    # Convert degrees to radians
    lat1_rad = math.radians(lat1)
    lon1_rad = math.radians(lon1)
    lat2_rad = math.radians(lat2)
    lon2_rad = math.radians(lon2)

    # Latitude and longitude differences
    delta_lat = lat2_rad - lat1_rad
    delta_lon = lon2_rad - lon1_rad

    # Haversine formula
    a = math.sin(delta_lat / 2)**2 + math.cos(lat1_rad) * math.cos(lat2_rad) * math.sin(delta_lon / 2)**2
    # Floating-point rounding can push `a` marginally outside [0, 1] (e.g. for
    # near-antipodal points), which would make sqrt(1 - a) raise.
    a = min(1.0, max(0.0, a))
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    distance = R * c

    return distance

def calculate_similarity_notes(note1: float, note2: float) -> float:
    """
    Turn the gap between two ratings into a similarity score.

    Identical ratings score 1; the score decreases towards 0 as the absolute
    difference between the ratings grows.
    """
    gap = abs(note1 - note2)
    return 1 / (1 + gap)

def calculate_similarity_texte(text1: str, text2: str) -> float:
    """
    Return the TF-IDF cosine similarity between two texts.

    Both texts are cleaned with ``nettoyage_doc``, vectorized together with a
    fresh ``TfidfVectorizer`` and compared with cosine similarity.

    Args:
        text1 (str): First text.
        text2 (str): Second text.

    Returns:
        float: The similarity score; 0.0 when either text has no usable token
        left after cleaning.
    """
    # Clean and tokenize the texts
    doc1 = nettoyage_doc(text1)
    doc2 = nettoyage_doc(text2)

    # A text with no token left shares no term with anything; bail out early
    # instead of letting the vectorizer fail on an empty vocabulary.
    if not doc1 or not doc2:
        return 0.0

    # Join tokens back to strings
    text1 = " ".join(doc1)
    text2 = " ".join(doc2)

    # Create TF-IDF vectors
    vectorizer = TfidfVectorizer()
    try:
        tfidf_matrix = vectorizer.fit_transform([text1, text2])
    except ValueError:
        # The vectorizer applies its own tokenization and raises when that
        # leaves it with an empty vocabulary.
        return 0.0

    # Calculate cosine similarity and return it as a plain Python float
    similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
    return float(similarity[0][0])



def build_similarity_matrix(data, method) -> pd.DataFrame:
    """
    Compute a pairwise score for every unordered pair of items in ``data``.

    Args:
        data: Indexable sequence of items. Items are texts for ``"texte"``,
            numbers for ``"notes"`` and ``(latitude, longitude)`` pairs for
            ``"geographique"``.
        method: One of ``"texte"``, ``"notes"`` or ``"geographique"``.

    Returns:
        pd.DataFrame: One row per pair ``(i, j)`` with ``i < j`` and the
        columns ``restaurant1``, ``restaurant2`` and ``similarity``. For
        ``"geographique"`` the ``similarity`` column holds the value returned
        by ``haversine`` (a distance), not a normalized similarity.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    supported = ("texte", "notes", "geographique")
    if method not in supported:
        # Previously an unknown method failed later on an unbound local.
        raise ValueError(f"Unknown method {method!r}; expected one of {supported}")

    # Collect rows in a list and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    rows = []
    for i in range(len(data)):
        for j in range(i + 1, len(data)):
            if method == "texte":
                similarity = calculate_similarity_texte(data[i], data[j])
            elif method == "notes":
                similarity = calculate_similarity_notes(data[i], data[j])
            else:  # "geographique"
                similarity = haversine(data[i][0], data[i][1], data[j][0], data[j][1])
            rows.append({
                "restaurant1": data[i],
                "restaurant2": data[j],
                "similarity": similarity,
            })

    return pd.DataFrame(rows, columns=["restaurant1", "restaurant2", "similarity"])


# def construireMesMatrix(db):
#     data = join_restaurant_avis_date( db)
#     # print(data)
#     return data

# # Build the similarity matrix
# def build_similarity_matrix(texts: List[str]) -> pd.DataFrame:
#     # Initialize an empty DataFrame
#     df = pd.DataFrame()
    
#     # Loop through all pairs of texts
#     for i in range(len(texts)):
#         for j in range(i+1, len(texts)):
#             # Calculate similarity between the two texts
#             similarity = calculate_similarity(texts[i], texts[j])
            
#             # Add the similarity to the DataFrame
#             df = df.append({
#                 "text1": texts[i],
#                 "text2": texts[j],
#                 "similarity": similarity
#             }, ignore_index=True)
    
#     return df

# Example texts
# text1 = "Bonjour, je suis un texte de test"
# text2 = "Bonjour, je suis un autre texte de test"



# Calculate similarity
# similarity_score = calculate_similarity(text1, text2)
# print(f"La similarité entre les deux textes est : {similarity_score:.2f}")
# db = next(get_db())
# contruireDF = join_restaurant_avis_date(db)
# print(contruireDF.head())

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import pandas as pd
from sqlalchemy.orm import Session
from typing import List
from model import models, schemas
from utils import database


# Database dependency
def get_db():
    """Yield a database session and close it once the consumer is done with it."""
    session = database.SessionLocal()
    try:
        yield session
    finally:
        # Runs whether the consumer finished normally or raised.
        session.close()

def join_restaurant_avis_date(db: Session) -> pd.DataFrame:
    """
    Query reviews joined with their restaurant and return them as a DataFrame.

    Args:
        db (Session): Open SQLAlchemy session.

    Returns:
        pd.DataFrame: One row per (review, restaurant) result with the columns
        ``avis``, ``date`` and ``restaurant``.
    """
    # No explicit ON clause is given; the join condition is left to SQLAlchemy.
    pairs = db.query(models.FactAvis, models.DimRestaurant).join(models.DimRestaurant)

    # Execute the query and flatten each (review, restaurant) pair into a row
    rows = []
    for avis, restaurant in pairs.all():
        rows.append({
            "avis": avis.commentaire,
            "date": avis.date,
            "restaurant": restaurant.nom,
        })

    return pd.DataFrame(rows)



# Lemmatizer instance shared by every call to nettoyage_doc
lem = WordNetLemmatizer()

# French stopwords, filtered out by nettoyage_doc
mots_vides = stopwords.words("french")

# Punctuation characters stripped by nettoyage_doc; `chiffres` is only
# referenced by a commented-out line in that function
ponctuations = string.punctuation
chiffres = string.digits

# Function to clean and tokenize document
def nettoyage_doc(doc_param):
    """
    Clean a document and return its list of tokens.

    Steps: lowercase, strip punctuation, tokenize, lemmatize, drop stopwords,
    then drop tokens shorter than 3 characters unless they are numeric.
    Digits are not removed.
    """
    # Lowercase and delete every punctuation character in one pass
    sans_ponctuation = doc_param.lower().translate(
        str.maketrans("", "", ponctuations)
    )

    # Tokenize, then lemmatize each token
    lemmes = [lem.lemmatize(jeton) for jeton in word_tokenize(sans_ponctuation)]

    # Keep tokens that are not stopwords and are long enough or numeric
    return [
        mot
        for mot in lemmes
        if mot not in mots_vides and (len(mot) >= 3 or mot.isnumeric())
    ]

def calculate_similarity(text1: str, text2: str) -> float:
    """
    Return the TF-IDF cosine similarity between two texts.

    Both texts are cleaned with ``nettoyage_doc`` (the cleaned versions are
    printed), vectorized together with a fresh ``TfidfVectorizer`` and
    compared with cosine similarity.

    Args:
        text1 (str): First text.
        text2 (str): Second text.

    Returns:
        float: The similarity score; 0.0 when either text has no usable token
        left after cleaning.
    """
    # Clean and tokenize the texts
    doc1 = nettoyage_doc(text1)
    doc2 = nettoyage_doc(text2)

    # Join tokens back to strings
    text1 = " ".join(doc1)
    text2 = " ".join(doc2)

    print(f"Texte 1 nettoyé : {text1}")
    print(f"Texte 2 nettoyé : {text2}")

    # A text with no token left shares no term with anything; bail out early
    # instead of letting the vectorizer fail on an empty vocabulary.
    if not doc1 or not doc2:
        return 0.0

    # Create TF-IDF vectors
    vectorizer = TfidfVectorizer()
    try:
        tfidf_matrix = vectorizer.fit_transform([text1, text2])
    except ValueError:
        # The vectorizer applies its own tokenization and raises when that
        # leaves it with an empty vocabulary.
        return 0.0

    # Calculate cosine similarity and return it as a plain Python float
    similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
    return float(similarity[0][0])

# Example texts (in French, matching the French stopword list loaded above)
text1 = "Bonjour, je suis un texte de test"
text2 = "Bonjour, je suis un autre texte de test"

# Calculate similarity (also prints both cleaned texts)
similarity_score = calculate_similarity(text1, text2)

print(f"La similarité entre les deux textes est : {similarity_score:.2f}")

Texte 1 nettoyé : bonjour texte test
Texte 2 nettoyé : bonjour autre texte test
La similarité entre les deux textes est : 0.78
