In [1]:
from model import models, schemas
from utils import database
import os
import uuid
import sqlite3
from tqdm import tqdm


class BDDChunksSQLite:
    """
    A class to process reviews from a SQLite database and store them as chunks with embeddings.
    
    Each review is considered a single chunk.
    """

    def __init__(self, db = database.SessionLocal()):
        """
        Initialize the BDDChunksSQLite instance.

        Args:
            sqlite_db_path (str): Path to the SQLite database file.
            reviews_table (str): Name of the table containing reviews and restaurant information.
            embeddings_table (str): Name of the table where embeddings will be stored.
        """
        # self.sqlite_db_path = sqlite_db_path
        # self.reviews_table = reviews_table
        # self.embeddings_table = embeddings_table
        self.db = self.get_db()

    def get_db(self):
        return database.SessionLocal()
    

    def fetch_reviews_from_db(self) -> list[tuple[str, str]]:
        """
        Fetch reviews and their associated restaurant names from the SQLite database.

        Returns:
            list[tuple[str, str]]: A list of tuples containing restaurant names and reviews.
        """
        conn = sqlite3.connect(self.sqlite_db_path)
        cursor = conn.cursor()
        query = f"SELECT restaurant_name, review FROM {self.reviews_table};"
        cursor.execute(query)
        data = cursor.fetchall()
        conn.close()
        return data

    def generate_fake_embedding(self, text: str) -> list[float]:
        """
        Generate a fake embedding for a given text. Replace this with your embedding logic.

        Args:
            text (str): The text for which to generate an embedding.

        Returns:
            list[float]: A list representing the embedding vector.
        """
        # Example: Return the length of each word in the text as a fake embedding.
        return [len(word) for word in text.split()]

    def store_chunk_in_db(self, restaurant_name: str, chunk: str, embedding: list[float]) -> None:
        """
        Store a chunk and its embedding in the SQLite database.

        Args:
            restaurant_name (str): The name of the associated restaurant.
            chunk (str): The text chunk (in this case, the full review).
            embedding (list[float]): The embedding vector for the chunk.
        """
        conn = sqlite3.connect(self.sqlite_db_path)
        cursor = conn.cursor()

        # Ensure the embeddings table exists
        cursor.execute(f"""
        CREATE TABLE IF NOT EXISTS {self.embeddings_table} (
            id TEXT PRIMARY KEY,
            restaurant_name TEXT,
            chunk TEXT,
            embedding TEXT
        );
        """)

        embedding_str = ",".join(map(str, embedding))
        cursor.execute(
            f"INSERT INTO {self.embeddings_table} (id, restaurant_name, chunk, embedding) VALUES (?, ?, ?, ?);",
            (str(uuid.uuid4()), restaurant_name, chunk, embedding_str),
        )
        conn.commit()
        conn.close()

    def process_reviews(self) -> None:
        """
        Process each review as a single chunk, generate embeddings, and store them in the database.

        This method:
        1. Fetches reviews and restaurant names from the SQLite database.
        2. Generates embeddings for each review.
        3. Stores the reviews and embeddings in the SQLite database.
        """
        data = self.fetch_reviews_from_db()

        for restaurant_name, review in tqdm(data, desc="Processing Reviews"):
            embedding = self.generate_fake_embedding(review)  # Generate embedding for the review (chunk)
            self.store_chunk_in_db(restaurant_name, review, embedding)


# Example usage
# sqlite_db_path = "path/to/your/database.sqlite"  # Path to your SQLite DB
# reviews_table = "reviews"  # Table containing reviews
# embeddings_table = "embeddings"  # Table to store embeddings

bdd_chunks = BDDChunksSQLite(sqlite_db_path, reviews_table, embeddings_table)
bdd_chunks.process_reviews()


TypeError: __init__() takes 1 positional argument but 4 were given

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import pandas as pd
from sqlalchemy.orm import Session
from typing import List
from model import models, schemas
from utils import database
import math

# Database dependency
def get_db():
    db = database.SessionLocal()
    try:
        yield db
    finally:
        db.close()


def join_restaurant_avis_date(db: Session) -> pd.DataFrame:
    # Join the DimRestaurant, FactAvis, and DimDate and select the fields
    query = db.query(models.DimLocation, models.DimRestaurant)\
        .join(models.DimRestaurant, models.DimLocation.id_restaurant == models.DimRestaurant.id_restaurant)

    # Execute the query
    results = query.all()
        # Create a DataFrame with the results
    df = pd.DataFrame([{
        "restaurant": restaurant.nom,
        "type_cuisines": restaurant.type_cuisines,
        "fonctionnalites": restaurant.fonctionnalites,
        "infos_pratiques": restaurant.infos_pratiques,
        "classement": restaurant.classement,
        'longitude': location.longitude,
        'latitude': location.latitude
        
    } for restaurant, location in results])
    
    return df


db = next(get_db())
df = join_restaurant_avis_date(db)
# df = df.drop(columns=['id_avis', 'id_restaurant', 'id_date'])
df.head()

AttributeError: type object 'DimLocation' has no attribute 'id_restaurant'

In [26]:
import math

def haversine(lat1, lon1, lat2, lon2):
    """
    Calcule la distance entre deux points géographiques en utilisant la formule de Haversine.

    :param lat1: Latitude du premier point (en degrés).
    :param lon1: Longitude du premier point (en degrés).
    :param lat2: Latitude du deuxième point (en degrés).
    :param lon2: Longitude du deuxième point (en degrés).
    :return: Distance entre les deux points (en kilomètres).
    """
    # Rayon de la Terre en kilomètres
    R = 6371.0

    # Convertir les degrés en radians
    lat1_rad = math.radians(lat1)
    lon1_rad = math.radians(lon1)
    lat2_rad = math.radians(lat2)
    lon2_rad = math.radians(lon2)

    # Différences de latitude et de longitude
    delta_lat = lat2_rad - lat1_rad
    delta_lon = lon2_rad - lon1_rad

    # Formule de Haversine
    a = math.sin(delta_lat / 2)**2 + math.cos(lat1_rad) * math.cos(lat2_rad) * math.sin(delta_lon / 2)**2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    distance = R * c

    return distance

# Exemple d'utilisation
lat1, lon1 = 48.8566, 2.3522  # Paris
lat2, lon2 = 51.5074, -0.1278  # Londres

distance = haversine(lat1, lon1, lat2, lon2)
print(f"La distance entre Paris et Londres est d'environ {distance:.2f} km.")


La distance entre Paris et Londres est d'environ 343.56 km.


In [3]:
from sentence_transformers import SentenceTransformer, util

# Charger un modèle pour générer des embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')  # Petit modèle rapide et efficace

def calculate_similarity(text1: str, text2: str) -> float:
    # Générer les embeddings pour les deux textes
    embedding1 = model.encode(text1, convert_to_tensor=True)
    embedding2 = model.encode(text2, convert_to_tensor=True)
    
    # Calculer la similarité cosinus
    similarity = util.cos_sim(embedding1, embedding2)
    
    return similarity.item()  # Retourne un score de similarité (0 à 1)

# Champs texte
# text1 = input("Entrez le premier texte : ")
# text2 = input("Entrez le deuxième texte : ")
text1 = "Bonjour, je suis un texte de test je veux  1  franx"
text2 = "Bonjour, je suis un autre texte de test"
# Calcul de la similarité
similarity_score = calculate_similarity(text1, text2)

print(f"La similarité entre les deux textes est : {similarity_score:.2f}")


  from .autonotebook import tqdm as notebook_tqdm


La similarité entre les deux textes est : 0.88


In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import pandas as pd
from sqlalchemy.orm import Session
from typing import List
from model import models, schemas
from utils import database
import math

# Database dependency
def get_db():
    db = database.SessionLocal()
    try:
        yield db
    finally:
        db.close()


def join_restaurant_avis_date(db: Session) -> pd.DataFrame:
    # Join the DimRestaurant, FactAvis, and DimDate and select the fields
    query = db.query(models.FaitAvis, models.DimRestaurant, models.DimDate)\
              .join(models.DimRestaurant, models.FaitAvis.id_restaurant == models.DimRestaurant.id_restaurant)
              

    # Execute the query
    results = query.all()
    
    # Create a DataFrame with the results
    df = pd.DataFrame([{
        "restaurant": restaurant.nom,
        "type_cuisines": restaurant.type_cuisines,
        "fonctionnalites": restaurant.fonctionnalites,
        "infos_pratiques": restaurant.infos_pratiques,
        "classement": restaurant.classement,
        
    } for avis, restaurant, date in results])
    
    return df






# Initialize lemmatizer
lem = WordNetLemmatizer()

# List of stopwords
mots_vides = stopwords.words("french")

# List of punctuations and digits
ponctuations = string.punctuation
chiffres = string.digits

# Function to clean and tokenize document
def nettoyage_doc(doc_param):
    # Convert to lowercase
    doc = doc_param.lower()
    # Remove punctuations
    doc = "".join([w for w in list(doc) if not w in ponctuations])
    # Remove digits
    doc = "".join([w for w in list(doc) if not w in chiffres])
    Tokenize the document
    doc = word_tokenize(doc)
    # Lemmatize each term
    doc = [lem.lemmatize(terme) for terme in doc]
    # Remove stopwords
    doc = [w for w in doc if not w in mots_vides]
    # Remove terms with less than 3 characters if they are not numeric
    doc = [w for w in doc if len(w) >= 3 or w.isnumeric()]
    return doc



def haversine(lat1, lon1, lat2, lon2):
    """
    Calcule la distance entre deux points géographiques en utilisant la formule de Haversine.

    :param lat1: Latitude du premier point (en degrés).
    :param lon1: Longitude du premier point (en degrés).
    :param lat2: Latitude du deuxième point (en degrés).
    :param lon2: Longitude du deuxième point (en degrés).
    :return: Distance entre les deux points (en kilomètres).
    """
    # Rayon de la Terre en kilomètres
    R = 6371.0

    # Convertir les degrés en radians
    lat1_rad = math.radians(lat1)
    lon1_rad = math.radians(lon1)
    lat2_rad = math.radians(lat2)
    lon2_rad = math.radians(lon2)

    # Différences de latitude et de longitude
    delta_lat = lat2_rad - lat1_rad
    delta_lon = lon2_rad - lon1_rad

    # Formule de Haversine
    a = math.sin(delta_lat / 2)**2 + math.cos(lat1_rad) * math.cos(lat2_rad) * math.sin(delta_lon / 2)**2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    distance = R * c

    return distance

def calculate_similarity_notes(note1: float, note2: float) -> float:
    # Calculate the absolute difference between the two notes
    diff = abs(note1 - note2)
    
    # Normalize the difference to a similarity score (0 to 1)
    similarity = 1 / (1 + diff)
    return similarity

def calculate_similarity_texte(text1: str, text2: str) -> float:
    # Clean and tokenize the texts
    doc1 = nettoyage_doc(text1)
    doc2 = nettoyage_doc(text2)
    
    # Join tokens back to strings
    text1 = " ".join(doc1)
    text2 = " ".join(doc2)
    
    # print(f"Texte 1 nettoyé : {text1}")
    # print(f"Texte 2 nettoyé : {text2}")

    # Create TF-IDF vectors
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([text1, text2])
    
    # Calculate cosine similarity
    similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
    return similarity[0][0]  # Return the similarity score (0 to 1)



def build_similarity_matrix(data , method) -> pd.DataFrame:
    # Initialize an empty DataFrame
    df = pd.DataFrame()
    
    # Loop through all pairs of texts
    for i in range(len(data)):
        for j in range(i+1, len(data)):
            # Calculate similarity between the two texts
            if method == "texte":
                similarity = calculate_similarity_texte(data[i], data[j])
            elif method == "notes":
                similarity = calculate_similarity_notes(data[i], data[j])
            elif method == "geographique":
                similarity = haversine(data[i][0], data[i][1], data[j][0], data[j][1])
            # Add the similarity to the DataFrame
            df = df.append({
                "restaurant1": data[i],
                "restaurant2": data[j],
                "similarity": similarity
            }, ignore_index=True)
    
    return df


# def construireMesMatrix(db):
#     data = join_restaurant_avis_date( db)
#     # print(data)
#     return data

# #cconstruction  de la matrice de similarité
# def build_similarity_matrix(texts: List[str]) -> pd.DataFrame:
#     # Initialize an empty DataFrame
#     df = pd.DataFrame()
    
#     # Loop through all pairs of texts
#     for i in range(len(texts)):
#         for j in range(i+1, len(texts)):
#             # Calculate similarity between the two texts
#             similarity = calculate_similarity(texts[i], texts[j])
            
#             # Add the similarity to the DataFrame
#             df = df.append({
#                 "text1": texts[i],
#                 "text2": texts[j],
#                 "similarity": similarity
#             }, ignore_index=True)
    
#     return df

# Example texts
# text1 = "Bonjour, je suis un texte de test"
# text2 = "Bonjour, je suis un autre texte de test"



# Calculate similarity
# similarity_score = calculate_similarity(text1, text2)
# print(f"La similarité entre les deux textes est : {similarity_score:.2f}")
# db = next(get_db())
# contruireDF = join_restaurant_avis_date(db)
# print(contruireDF.head())

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import pandas as pd
from sqlalchemy.orm import Session
from typing import List
from model import models, schemas
from utils import database


# Database dependency
def get_db():
    db = database.SessionLocal()
    try:
        yield db
    finally:
        db.close()

def join_restaurant_avis_date(db: Session) -> pd.DataFrame:
    # Join the DimRestaurant and FactAvis and data and select the fields
    query = db.query(models.FactAvis, models.DimRestaurant).join(models.DimRestaurant)

    # Execute the query
    
    
    results = query.all()
    
    # Create a DataFrame with the results
    df = pd.DataFrame([{
        "avis": avis.commentaire,
        "date": avis.date,
        "restaurant": restaurant.nom
    } for avis, restaurant in results])
    
    return df



# Initialize lemmatizer
lem = WordNetLemmatizer()

# List of stopwords
mots_vides = stopwords.words("french")

# List of punctuations and digits
ponctuations = string.punctuation
chiffres = string.digits

# Function to clean and tokenize document
def nettoyage_doc(doc_param):
    # Convert to lowercase
    doc = doc_param.lower()
    # Remove punctuations
    doc = "".join([w for w in list(doc) if not w in ponctuations])
    # Remove digits
    # doc = "".join([w for w in list(doc) if not w in chiffres])
    # Tokenize the document
    doc = word_tokenize(doc)
    # Lemmatize each term
    doc = [lem.lemmatize(terme) for terme in doc]
    # Remove stopwords
    doc = [w for w in doc if not w in mots_vides]
    # Remove terms with less than 3 characters if they are not numeric
    doc = [w for w in doc if len(w) >= 3 or w.isnumeric()]
    return doc

def calculate_similarity(text1: str, text2: str) -> float:
    # Clean and tokenize the texts
    doc1 = nettoyage_doc(text1)
    doc2 = nettoyage_doc(text2)
    
    # Join tokens back to strings
    text1 = " ".join(doc1)
    text2 = " ".join(doc2)
    
    print(f"Texte 1 nettoyé : {text1}")
    print(f"Texte 2 nettoyé : {text2}")

    # Create TF-IDF vectors
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([text1, text2])
    
    # Calculate cosine similarity
    similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
    return similarity[0][0]  # Return the similarity score (0 to 1)

# Example texts
text1 = "Bonjour, je suis un texte de test"
text2 = "Bonjour, je suis un autre texte de test"

# Calculate similarity
similarity_score = calculate_similarity(text1, text2)

print(f"La similarité entre les deux textes est : {similarity_score:.2f}")

Texte 1 nettoyé : bonjour texte test
Texte 2 nettoyé : bonjour autre texte test
La similarité entre les deux textes est : 0.78
