In [1]:
import random
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
import pandas as pd
import json
from pyspark.ml.feature import StringIndexer
from bson.objectid import ObjectId
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder
import numpy as np

# --- Chargement des données ---
class DataLoader:
    def __init__(self, places_path, reviews_path):
        """
        Initialise le chargeur de données avec l'ensemble des fichiers JSON.
        """
        # Charger les données des lieux
        with open(places_path, 'r') as f:
            self.places_data = json.load(f)

        # Charger les données des avis
        with open(reviews_path, 'r') as f:
            self.reviews_data = json.load(f)

    def get_places(self):
        return self.places_data

    def get_reviews(self):
        return self.reviews_data


# --- Génération des coordonnées ---
def generate_coordinates():
    """
    Génère une localisation aléatoire aux États-Unis.
    """
    min_lat, max_lat = 24.396308, 49.384358  # Du sud de la Floride à la frontière canadienne
    min_lon, max_lon = -125.0, -66.93457    # De la côte Pacifique à l'Atlantique

    lat = random.uniform(min_lat, max_lat)
    lon = random.uniform(min_lon, max_lon)
    return round(lat, 4), round(lon, 4)


# --- Gestion des lieux proches ---
def get_nearby_places(user_lat, user_lon, max_distance_km, places_data):
    lat_delta = max_distance_km / 111  # 1 degré de latitude = ~111 km
    lon_delta = max_distance_km / (111 * abs(user_lat))

    min_lat = user_lat - lat_delta
    max_lat = user_lat + lat_delta
    min_lon = user_lon - lon_delta
    max_lon = user_lon + lon_delta

    nearby_places = [
        place for place in places_data
        if min_lat <= place['latitude'] <= max_lat and min_lon <= place['longitude'] <= max_lon
    ]

    return [place["business_id"] for place in nearby_places]


# --- Gestion des avis ---
def get_reviews_for_places(place_ids, reviews_data):
    return [review for review in reviews_data if review["business_id"] in place_ids]


def get_places_from_ids(place_ids, places_data):
    return [place for place in places_data if place["business_id"] in place_ids]


# --- Filtrage collaboratif ---
def collaborative_filtering(user_lat, user_lon, max_distance_km, places_data, reviews_data):
    # Étape 1 : Récupérer les lieux à proximité
    nearby_places = get_nearby_places(user_lat, user_lon, max_distance_km, places_data)
    if not nearby_places:
        return {"recommendations": [], "rmse": None, "r_squared": None, "adjusted_r_squared": None}

    # Étape 2 : Filtrer les avis pour les lieux à proximité
    reviews = get_reviews_for_places(nearby_places, reviews_data)
    if not reviews:
        return {"recommendations": [], "rmse": None, "r_squared": None, "adjusted_r_squared": None}

    # Étape 3 : Créer une matrice utilisateur-élément
    reviews_df = pd.DataFrame(reviews)
    reviews_df = reviews_df[["user_id", "business_id", "stars"]]

    # Encoder les colonnes user_id et business_id
    user_encoder = LabelEncoder()
    business_encoder = LabelEncoder()
    reviews_df["user_index"] = user_encoder.fit_transform(reviews_df["user_id"])
    reviews_df["business_index"] = business_encoder.fit_transform(reviews_df["business_id"])

    # Créer une matrice utilisateur-élément (rows: users, cols: businesses)
    user_item_matrix = reviews_df.pivot_table(
        index="user_index", columns="business_index", values="stars", fill_value=0
    )

    # Étape 4 : Calculer la similarité entre les utilisateurs
    user_similarity = cosine_similarity(user_item_matrix)

    # Étape 5 : Prédire les scores pour chaque utilisateur
    user_id = reviews_df["user_id"].iloc[0]  # On prend le premier utilisateur comme exemple
    user_idx = user_encoder.transform([user_id])[0]

    # Calculer les scores de recommandation pour l'utilisateur
    user_ratings = user_item_matrix.loc[user_idx]
    weighted_scores = user_similarity[user_idx].dot(user_item_matrix) / np.array(
        [np.abs(user_similarity[user_idx]).sum()]
    )

    # Calcul des métriques
    actual_ratings = user_ratings[user_ratings != 0]
    predicted_ratings = weighted_scores[user_ratings != 0]
    
    metrics = {"rmse": None, "r_squared": None, "adjusted_r_squared": None}
    
    if len(actual_ratings) > 0:
        # Calcul du RMSE
        mse = np.mean((actual_ratings - predicted_ratings) ** 2)
        metrics["rmse"] = np.sqrt(mse)
        
        # Calcul du R²
        ss_res = np.sum((actual_ratings - predicted_ratings) ** 2)
        ss_tot = np.sum((actual_ratings - np.mean(actual_ratings)) ** 2)
        if ss_tot != 0:
            metrics["r_squared"] = 1 - (ss_res / ss_tot)
        else:
            metrics["r_squared"] = 1.0  # Cas où toutes les valeurs sont identiques
        
        # Calcul du R² ajusté
        n = len(actual_ratings)
        p = 1  # Nombre de prédicteurs (simplifié)
        if n > p + 1 and metrics["r_squared"] is not None:
            metrics["adjusted_r_squared"] = 1 - (1 - metrics["r_squared"]) * (n - 1) / (n - p - 1)

    # Trier les scores prédits pour obtenir les meilleures recommandations
    top_business_indices = np.argsort(weighted_scores)[::-1][:5]

    # Convertir les indices des business en IDs
    recommended_business_ids = business_encoder.inverse_transform(top_business_indices)

    # Récupérer les lieux correspondants
    recommended_places = get_places_from_ids(recommended_business_ids, places_data)

    return {
        "recommendations": [place["name"] for place in recommended_places],
        **metrics
    }
    
# --- Exécution principale ---
if __name__ == "__main__":
    places_path = "/kaggle/input/big-data/places.json"
    reviews_path = "/kaggle/input/big-data/reviews.json"

    # Chargement des données complètes
    data_loader = DataLoader(places_path, reviews_path)
    places_data = data_loader.get_places()
    reviews_data = data_loader.get_reviews()

    user_lat, user_lon = 32.223236, -110.880452
    max_distance_km = 200

    # Recommandations basées sur le filtrage collaboratif
    result = collaborative_filtering(user_lat, user_lon, max_distance_km, places_data, reviews_data)
    print("Recommandations :", result["recommendations"])
    
    if result["rmse"] is not None:
        print(f"\nMétriques d'évaluation:")
        print(f"RMSE: {result['rmse']:.4f}")
        print(f"R²: {result['r_squared']:.4f}")
        if result['adjusted_r_squared'] is not None:
            print(f"R² ajustée: {result['adjusted_r_squared']:.4f}")
        else:
            print("R² ajustée: Non calculable (nombre d'échantillons insuffisant)")
    else:
        print("\nAucune métrique disponible (absence d'évaluations réelles)")

Recommandations : ['Pita Jungle', 'Fresco Pizzeria', 'Falora', 'Tucson Botanical Gardens', 'Nimbus Brewing']

Métriques d'évaluation:
RMSE: 3.8341
R²: 1
R² ajustée: 0.4


Recommandations : ['Pita Jungle', 'Fresco Pizzeria', 'Falora', 'Tucson Botanical Gardens', 'Nimbus Brewing']

Métriques d'évaluation:
RMSE: 3.8341
R²: -10.5324
R² ajustée: -11.2532
