In [None]:
!pip install rapidfuzz
# Solo es necesario si no las tienes ya
!pip install kmodes plotly



In [None]:
# — Manipulación de datos —
import pandas as pd
import numpy as np

# — Cálculos de distancia y clustering jerárquico —
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster

# — Métricas de validación de clustering —
from sklearn.metrics import (
    silhouette_score,
    calinski_harabasz_score,
    davies_bouldin_score
)

# — Clustering “usuales” en Python —
from sklearn.cluster import (
    KMeans,
    AgglomerativeClustering,
    SpectralClustering
)
from kmodes.kprototypes import KPrototypes  # si usas clustering mixto

# — Reducción de dimensión y proyecciones —
from sklearn.manifold import MDS
from sklearn.decomposition import PCA

# — Normalización y transformaciones —
from sklearn.preprocessing import normalize, MinMaxScaler

# — Visualización —
import matplotlib.pyplot as plt
import plotly.express as px


In [None]:
# Read the review spreadsheet straight into a DataFrame
# (adjust the path if your environment differs).
data = pd.read_excel('/content/drive/MyDrive/proyecto_ingeneria/df.xlsx')

# Leave the frame as the last expression so the notebook renders it.
data


In [None]:
# Load the location classification table from a semicolon-separated CSV
# (adjust the path if your environment differs).
classification_locations = pd.read_csv('/content/drive/MyDrive/proyecto_ingeneria/Classification_locations.csv', sep=';')


# Leave the frame as the last expression so the notebook renders it.


classification_locations

In [None]:


import os
import sys
import csv
import time
import pickle
import random
import operator
import datetime
import numpy as np
import pandas as pd
from pathlib import Path
from typing import List, Tuple, Dict
from sklearn.manifold import MDS
from sklearn.decomposition import PCA
from sklearn.preprocessing import minmax_scale
from sklearn.metrics import silhouette_score
from sklearn.cluster import AgglomerativeClustering, KMeans, SpectralClustering
import plotly.express as px
import plotly.graph_objs as go
from sklearn.metrics import pairwise_distances, silhouette_score, silhouette_samples


In [None]:
#!pip install -U kaleido
!pip install -U plotly
!pip install kaleido==0.2.1


In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import pickle as pick
from sklearn.preprocessing import normalize
from sklearn.metrics import pairwise_distances, silhouette_score, silhouette_samples
from sklearn.manifold import MDS
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import plotly.express as px

class DatabasePreparation:
    """Load the raw review spreadsheet and clean it for trip construction."""

    def __init__(self):
        # DataFrame of reviews; stays None until loadingDatabase() is called.
        self.data = None

    def _report(self):
        """Print a preview, the schema summary and the per-column null percentage."""
        print(self.data.head())
        # DataFrame.info() writes its own report and returns None, so wrapping it
        # in print() would emit a stray "None" line after the summary.
        self.data.info()
        print(self.data.isnull().sum() * 100 / len(self.data))

    def loadingDatabase(self, url):
        """Read the Excel file at ``url`` and keep only the columns used downstream.

        Every column is read as ``str`` (``dtype=str``); dates are parsed later
        in :meth:`preprocessing`.

        Raises:
            KeyError: if any of the expected columns is missing from the sheet.
        """
        self.data = pd.read_excel(url, engine="openpyxl", dtype=str)
        self.data = self.data[[
            "user", "memberID", "user_location", "date_review",
            "nom", "reviewed_category", "reviewed_location",
            "comment", "score", "country", "id"
        ]]
        self._report()

    def preprocessing(self):
        """Parse review dates and drop rows lacking a date, member id or location id.

        Unparseable dates become ``NaT`` (``errors="coerce"``) and are removed
        by the same ``dropna``; the index is then reset to 0..n-1.
        """
        self.data["date_review"] = pd.to_datetime(self.data["date_review"], errors="coerce")
        self.data = self.data.dropna(subset=["date_review", "memberID", "id"])
        self.data = self.data.reset_index(drop=True)
        self._report()

    def filterTopLocations(self, seuil=20):
        """Return the ``seuil`` most frequently reviewed location ids, most reviewed first."""
        return self.data["id"].value_counts().nlargest(seuil).index.tolist()

class LocationsClassification:
    """Category lookup table for locations, read from a semicolon-separated CSV."""

    def __init__(self, path):
        # Path of the classification CSV.
        self.path = path
        # DataFrame with columns id / nom / categorie; None until load() is called.
        self.classification = None

    def load(self):
        """Read the CSV at ``self.path`` and keep the id, nom and categorie columns.

        Raises:
            KeyError: if any of the three expected columns is missing.
        """
        self.classification = pd.read_csv(self.path, sep=";")
        self.classification = self.classification[["id", "nom", "categorie"]]
        print(self.classification.head())
        # DataFrame.info() prints by itself and returns None; calling it bare
        # avoids the stray "None" line that print(df.info()) produces.
        self.classification.info()

    def check_match(self, top_ids):
        """Verify that every id in ``top_ids`` appears in the classification table.

        Ids are compared as strings on both sides, so int/str mismatches between
        the two sources do not cause false negatives.

        Raises:
            ValueError: listing the ids that have no classification entry.
        """
        classified_ids = set(self.classification["id"].astype(str))
        missing = set(map(str, top_ids)) - classified_ids
        if not missing:
            print("La diferencia entre la lista de monumentos clasificados y el top es nula")
        else:
            raise ValueError(f"IDs faltantes en clasificación: {missing}")

class Seasonality:
    """Map a calendar date to a season name using fixed day-of-year boundaries."""

    def detect(self, date):
        """Return "Winter", "Spring", "Summer" or "Autumn" for ``date``.

        ``date`` only needs ``month`` and ``day`` attributes. Boundaries:
        Winter runs from Dec 21 to Mar 19, Spring from Mar 20 to Jun 20,
        Summer from Jun 21 to Sep 21, and Autumn covers the remainder.
        """
        # Tuples compare lexicographically, so (month, day) orders dates
        # within a year without any per-month special casing.
        moment = (date.month, date.day)
        if moment >= (12, 21) or moment < (3, 20):
            return "Winter"
        if moment < (6, 21):
            return "Spring"
        if moment < (9, 22):
            return "Summer"
        return "Autumn"

class TripsList:
    """Group each member's reviews of top locations into trips."""

    def __init__(self, data):
        # Review table; create_trips reads its id, memberID and date_review columns.
        self.data = data

    def create_trips(self, top_ids, threshold=15):
        """Split every member's reviews into trips separated by long gaps.

        Only reviews whose ``id`` is in ``top_ids`` are considered. Within one
        member, reviews are ordered by date; a review starts a new trip when it
        falls more than ``threshold`` whole days after the previous review.
        Trips with a single review are discarded.

        Returns:
            dict mapping memberID to a list of trips, each trip being a list
            of ``(date_review, id)`` tuples. Members without any multi-review
            trip are omitted.
        """
        selected = self.data[self.data["id"].isin(top_ids)].copy()
        trips_by_member = {}

        for member, reviews in selected.groupby("memberID"):
            ordered = reviews.sort_values("date_review")
            completed = []
            current = []
            previous = None

            for when, place in zip(ordered["date_review"], ordered["id"]):
                starts_new_trip = previous is not None and not (
                    (when - previous).days <= threshold
                )
                if starts_new_trip:
                    # Close the running trip; keep it only if it has 2+ reviews.
                    if len(current) > 1:
                        completed.append(current)
                    current = []
                current.append((when, place))
                previous = when

            # The last running trip is never closed inside the loop.
            if len(current) > 1:
                completed.append(current)
            if completed:
                trips_by_member[member] = completed

        return trips_by_member

def construir_list_trips(trips_dict):
    """Flatten the per-member trips into one dict keyed by a running trip id.

    Args:
        trips_dict: mapping member -> list of trips, each trip a list of
            ``(date, location_id)`` tuples in chronological order.

    Returns:
        dict mapping trip id (0, 1, 2, ...) to
        ``(location_ids, season, duration_days, member)``, where the season is
        detected from the first review date and the duration is the whole-day
        difference between the last and first review.
    """
    season_detector = Seasonality()
    flattened = (
        (member, stay)
        for member, stays in trips_dict.items()
        for stay in stays
    )

    list_trips = {}
    for number, (member, stay) in enumerate(flattened):
        visited = [entry[1] for entry in stay]
        first_day, last_day = stay[0][0], stay[-1][0]
        length = (last_day - first_day).days
        season = season_detector.detect(first_day)
        list_trips[number] = (visited, season, length, member)

    print(f"\nIl y a {len(list_trips)} trips dans la base de données")
    return list_trips

def construir_base_numerica(list_trips):
    """Encode every trip as a numeric row: location counts, season code, duration.

    Args:
        list_trips: mapping trip id -> ``(location_ids, season, duration, user)``.

    Returns:
        DataFrame with one row per trip (in ``list_trips`` order). The first
        columns count visits per location (locations sorted), the
        second-to-last column is the season code and the last is the duration.
    """
    all_locations = sorted(set(loc for v in list_trips.values() for loc in v[0]))
    loc_idx = {loc: i for i, loc in enumerate(all_locations)}

    # Fixed season codes. The previous hash(season) % 1000 relied on Python's
    # string hash, which is randomized per process, so the feature (and every
    # distance derived from it) changed between kernel restarts.
    # NOTE(review): this is still an ordinal code, so Winter and Autumn end up
    # farthest apart; switch to one-hot columns if that matters.
    season_codes = {"Winter": 0, "Spring": 1, "Summer": 2, "Autumn": 3}
    # Labels outside the four known seasons get stable codes after the known
    # ones, assigned in sorted order so they are reproducible too.
    unknown = sorted(
        {season for _, season, _, _ in list_trips.values()} - set(season_codes),
        key=str,
    )
    for offset, label in enumerate(unknown):
        season_codes[label] = 4 + offset

    rows = []
    for locs, season, dur, _ in list_trips.values():
        vec = np.zeros(len(all_locations))
        for loc in locs:
            vec[loc_idx[loc]] += 1
        row = np.concatenate([vec, [season_codes[season], dur]])
        rows.append(row)

    df = pd.DataFrame(rows)
    print("\n--- database_num.head() ---\n", df.head())
    # DataFrame.info() prints on its own and returns None; passing it to
    # print() used to emit the report before the header, followed by "None".
    print("\n--- database_num.info() ---")
    df.info()
    return df

def main():
    """Run the full trip-clustering pipeline and persist every artefact.

    Steps: load and clean the reviews, keep the most-reviewed locations, build
    trips per member, encode them numerically, blend the content and context
    distance matrices, plot the silhouette score over a range of k, ask the
    user for the final k, then write plots, cluster labels and summaries to
    ``res/<region>/todos_los_anios``.

    Raises:
        ValueError: if the top locations are not all classified, or if fewer
            than three trips are available (silhouette needs 2 <= k <= n - 1).
    """
    path_data = "/content/drive/MyDrive/proyecto_ingeneria/df.xlsx"
    path_class = "/content/drive/MyDrive/proyecto_ingeneria/Classification_locations.csv"
    region = "america"
    a, b = 0.8, 0.2  # weights of the content and context distance matrices
    seuil_top = 20  # number of most-reviewed locations to keep
    threshold_trip = 7  # max gap in days between reviews of the same trip
    output_dir = Path(f"res/{region}/todos_los_anios")
    output_dir.mkdir(parents=True, exist_ok=True)

    db = DatabasePreparation()
    db.loadingDatabase(path_data)
    db.preprocessing()
    data = db.data

    cl = LocationsClassification(path_class)
    cl.load()
    top_ids = db.filterTopLocations(seuil_top)
    cl.check_match(top_ids)

    trips = TripsList(data)
    trips_dict = trips.create_trips(top_ids, threshold_trip)
    list_trips = construir_list_trips(trips_dict)
    database_num = construir_base_numerica(list_trips)

    # KMeans needs k <= n_samples and silhouette_score needs k <= n_samples - 1,
    # so fail early with a clear message instead of deep inside scikit-learn.
    n_trips = len(database_num)
    if n_trips < 3:
        raise ValueError(
            f"Se necesitan al menos 3 trips para agrupar; se encontraron {n_trips}."
        )
    max_valid_k = n_trips - 1

    # Content = location-count columns; context = last two columns (season, duration).
    content_matrix = pairwise_distances(database_num.iloc[:, :-2])
    context_matrix = pairwise_distances(database_num.iloc[:, -2:])
    combined = normalize(a * content_matrix + b * context_matrix, axis=1, norm="l1")

    with open(output_dir / f"{a:.1f}_{b:.1f}.pkl", "wb") as f:
        pick.dump(combined, f)

    # ======== Automatic silhouette sweep for k = 2 .. 100 =========
    # The upper bound is capped by the sample count so small datasets do not
    # crash the sweep.
    print("📈 Generando gráfico de silhouette…")
    silhouette_scores = []
    k_range = range(2, min(100, max_valid_k) + 1)
    for k in k_range:
        kmeans = KMeans(n_clusters=k, random_state=0).fit(combined)
        score = silhouette_score(combined, kmeans.labels_)
        silhouette_scores.append(score)

    plt.figure(figsize=(10, 5))
    plt.plot(k_range, silhouette_scores, marker='o', linestyle='-')
    plt.xlabel("k")
    plt.ylabel("silhouette")
    plt.title("Silhouette Score vs k")
    plt.grid(True)
    plt.savefig(output_dir / "silhouette_k_comparison.png")
    plt.show()

    # ======== Ask for the final value of k ========
    # Only ValueError (non-integer text) is retried. A bare except would also
    # swallow KeyboardInterrupt/EOFError and make the prompt impossible to abort.
    while True:
        try:
            k = int(input("Quel est la valeur de k ? "))
        except ValueError:
            print("Entrada inválida.")
            continue
        if 2 <= k <= max_valid_k:
            break
        print(f"k debe estar entre 2 y {max_valid_k}.")

    kmeans = KMeans(n_clusters=k, random_state=0).fit(combined)
    labels = kmeans.labels_
    silhouette_avg = silhouette_score(combined, labels)
    silhouette_vals = silhouette_samples(combined, labels)

    # Final per-sample silhouette plot
    fig_sil = px.bar(
        x=np.arange(len(silhouette_vals)),
        y=silhouette_vals,
        color=labels.astype(str),
        title=f"Silhouette plot (a={a}, b={b})"
    )
    fig_sil.write_html(output_dir / f"silhouette_{a:.1f}_{b:.1f}.html")
    # Static image export goes through plotly's image backend (kaleido is
    # installed in an earlier cell).
    fig_sil.write_image(output_dir / f"silhouette_{a:.1f}_{b:.1f}.png")

    # MDS projection
    mds = MDS(random_state=0, dissimilarity='precomputed')
    # NOTE(review): `combined` is a full square matrix that was row-normalized,
    # so this expression symmetrizes it by summing it with its transpose;
    # confirm that is the intended dissimilarity.
    last_combined = combined + combined.T - np.diag(np.diag(combined))
    projection = mds.fit_transform(last_combined)

    fig_proj = px.scatter(x=projection[:, 0], y=projection[:, 1],
                          color=labels.astype(str),
                          title=f"Proyección (a={a:.1f}, b={b:.1f})")
    fig_proj.write_html(output_dir / f"{a:.1f}_{b:.1f}_projection.html")

    # trip_id is the row position, which matches the keys of list_trips (0..n-1).
    df_clusters = pd.DataFrame({"trip_id": range(len(labels)), "cluster": labels})
    df_clusters.to_csv(output_dir / f"{a:.1f}_{b:.1f}_clusters.csv", index=False)
    df_clusters.to_excel(output_dir / f"{a:.1f}_{b:.1f}_clusters.xlsx", index=False)

    with open(output_dir / f"{a:.1f}_{b:.1f}_clusters.pkl", "wb") as f:
        pick.dump(labels, f)

    with open(output_dir / f"{a:.1f}_{b:.1f}_summary.pkl", "wb") as f:
        pick.dump({
            "a": a, "b": b, "k": k, "silhouette": silhouette_avg
        }, f)

    database_num.to_csv(output_dir / "base_numerica.csv", index=False)
    with open(output_dir / "list_trips.pkl", "wb") as f:
        pick.dump(list_trips, f)

    print(f"\n✅ Silhouette final: {silhouette_avg:.4f}")
    print("🎉 TODOS LOS RESULTADOS GUARDADOS.")
    print("DONE ✅")

if __name__ == "__main__":
    main()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from pathlib import Path
from collections import defaultdict

def generar_visualizacion_global(
    region="america",
    a=0.8,
    b=0.2,
    classification_path="/content/drive/MyDrive/proyecto_ingeneria/Classification_locations.csv",
):
    """Render the per-cluster seasonality, category and summary figures.

    Reads ``list_trips.pkl`` and ``<a>_<b>_clusters.csv`` from
    ``res/<region>/todos_los_anios`` and writes three PNG files to
    ``analisis/todos_los_anios``.

    Args:
        region: sub-directory of ``res`` holding the clustering outputs.
        a, b: weights used in the clustering run; they select the cluster file
            and are embedded in the output file names.
        classification_path: semicolon-separated CSV with ``id`` and
            ``categorie`` columns (defaults to the original hard-coded path).

    Any exception is caught and reported with a printed message rather than
    raised, so a failure does not stop the notebook.
    """
    base_path = Path("res") / region / "todos_los_anios"
    output_dir = Path("analisis") / "todos_los_anios"
    output_dir.mkdir(parents=True, exist_ok=True)

    try:
        print("📊 Procesando visualización global para todos los años…")

        # list_trips.pkl is a pickle: only load files produced by this notebook.
        df_trips = pd.read_pickle(base_path / "list_trips.pkl")
        df_clusters = pd.read_csv(base_path / f"{a:.1f}_{b:.1f}_clusters.csv")
        classification = pd.read_csv(classification_path, sep=";")

        df_trips_df = pd.DataFrame.from_dict(
            df_trips, orient="index",
            columns=["locations", "season", "duration", "user"]
        ).reset_index().rename(columns={'index': 'trip_id'})

        # Assigned by index alignment: row i of the cluster file labels trip i.
        df_trips_df["cluster"] = df_clusters["cluster"]

        # ========== SEASONALITY ==========
        season_order = ["Summer", "Autumn", "Spring", "Winter"]
        pivot_season = pd.crosstab(df_trips_df["cluster"], df_trips_df["season"], normalize='index') * 100

        # Make sure every season is present as a column (even at 0%).
        for season in season_order:
            if season not in pivot_season.columns:
                pivot_season[season] = 0

        pivot_season = pivot_season[season_order]

        cluster_order = sorted(df_trips_df["cluster"].unique())  # consistent cluster order
        pivot_season = pivot_season.loc[cluster_order]

        fig, ax = plt.subplots(figsize=(10, 3))
        pivot_season.plot(kind='bar', stacked=True, ax=ax, colormap="tab20c")
        ax.set_ylabel("Percentage")
        ax.set_xlabel("Cluster")
        ax.legend(title="Season", bbox_to_anchor=(1.05, 1), loc="upper left")
        plt.tight_layout()
        plt.savefig(output_dir / f"seasonality_{a:.1f}_{b:.1f}.png")
        plt.close()

        # ========== CATEGORIES ==========
        classification_categ = classification.copy()

        def normalizar_categoria(cat):
            """Collapse spelling variants of a category into one canonical label."""
            cat = str(cat).lower().strip()
            if "shops/restaurants and bars/gastronomy" in cat:
                return "Shops/restaurants and bars/gastronomy"
            elif "urbanism" in cat:
                return "Urbanism"
            else:
                return cat.title()

        classification_categ["categorie"] = classification_categ["categorie"].apply(normalizar_categoria)
        id_to_cat = dict(zip(classification_categ["id"].astype(str), classification_categ["categorie"]))

        # cluster -> category -> number of reviewed locations
        cluster_cat_counts = defaultdict(lambda: defaultdict(int))
        for trip_id, (locs, _, _, _) in df_trips.items():
            clus = df_clusters.loc[trip_id, "cluster"]
            for loc in locs:
                cat = id_to_cat.get(str(loc), "Unknown")
                cluster_cat_counts[clus][cat] += 1

        # Columns are clusters, rows are categories; normalize each cluster to 100%.
        cat_df = pd.DataFrame(cluster_cat_counts).fillna(0).sort_index(axis=1).sort_index()
        cat_pct = cat_df.div(cat_df.sum(axis=0), axis=1) * 100

        fig, ax = plt.subplots(figsize=(11, 4))
        cat_pct.T.plot(kind='bar', stacked=True, ax=ax, colormap="tab20")
        ax.set_ylabel("Percentage")
        ax.set_xlabel("Cluster")
        ax.legend(title="Category", bbox_to_anchor=(1.05, 1), loc="upper left")
        plt.tight_layout()
        plt.savefig(output_dir / f"categories_{a:.1f}_{b:.1f}.png")
        plt.close()

        # ========== SUMMARY ==========
        # Each entry of `locations` is one review, so the number of reviews in
        # a stay is the length of its location list. Counting the `user` column
        # only counted stays again, which made reviews-per-stay always 1 ± 0.
        df_trips_df["n_reviews"] = df_trips_df["locations"].apply(len)

        resumen = df_trips_df.groupby("cluster").agg(
            duration_mean=("duration", "mean"),
            duration_std=("duration", "std"),
            n_stays=("duration", "count"),
            n_reviews=("n_reviews", "sum"),
            reviews_std=("n_reviews", "std"),
        )
        resumen["reviews_per_stay"] = resumen["n_reviews"] / resumen["n_stays"]

        dataset_row = {
            "duration_mean": df_trips_df["duration"].mean(),
            "duration_std": df_trips_df["duration"].std(),
            "n_stays": df_trips_df.shape[0],
            "n_reviews": df_trips_df["n_reviews"].sum(),
            "reviews_std": df_trips_df["n_reviews"].std(),
            "reviews_per_stay": df_trips_df["n_reviews"].sum() / df_trips_df.shape[0],
        }
        # Build the row in the frame's own column order.
        resumen.loc["Dataset"] = [dataset_row[col] for col in resumen.columns]

        def fmt(mean, std):
            """Format "mean ± std"; a missing std (single-stay cluster) shows as 0."""
            return f"{mean:.3g} ± {std:.2g}" if pd.notnull(std) else f"{mean:.3g} ± 0"

        table_fmt = pd.DataFrame({
            "Average duration + std": resumen.apply(lambda x: fmt(x["duration_mean"], x["duration_std"]), axis=1),
            "No. of stays + %": resumen["n_stays"].apply(lambda x: f"{int(x)}") + " ± " +
                                 resumen["n_stays"].div(resumen.loc["Dataset", "n_stays"]).mul(100).round(1).astype(str),
            "Reviews per stay means + Std": resumen.apply(lambda x: fmt(x["reviews_per_stay"], x["reviews_std"]), axis=1)
        })

        fig, ax = plt.subplots(figsize=(8, 4))
        ax.axis('off')
        tbl = ax.table(cellText=table_fmt.values,
                       colLabels=table_fmt.columns,
                       rowLabels=table_fmt.index,
                       loc='center')
        tbl.auto_set_font_size(False)
        tbl.set_fontsize(9)
        plt.tight_layout()
        plt.savefig(output_dir / f"resumen_{a:.1f}_{b:.1f}.png")
        plt.close()

        print("✅ Visualizaciones globales generadas correctamente.")

    except Exception as e:
        print(f"⚠️ Error en visualización global: {e}")


In [None]:
# Generate the global figures using the function's default arguments.
generar_visualizacion_global()


In [None]:
!pip install umap-learn


In [None]:
import umap
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from pathlib import Path

def visualizar_clusters_umap(region="america", a=0.8, b=0.2):
    """Project the numeric trip table with UMAP and plot it coloured by cluster.

    Reads ``base_numerica.csv`` and ``<a>_<b>_clusters.csv`` from
    ``res/<region>/todos_los_anios``, saves the scatter plot as a PNG under
    ``analisis/todos_los_anios`` and shows it. Any exception is caught and
    reported with a printed message.
    """
    source_dir = Path("res") / region / "todos_los_anios"
    target_dir = Path("analisis") / "todos_los_anios"
    target_dir.mkdir(parents=True, exist_ok=True)

    try:
        print("📊 Generando proyección UMAP…")
        features = pd.read_csv(source_dir / "base_numerica.csv")
        assignments = pd.read_csv(source_dir / f"{a:.1f}_{b:.1f}_clusters.csv")

        # Fixed seed so the 2-D embedding is repeatable.
        embedding = umap.UMAP(random_state=42).fit_transform(features)

        points = pd.DataFrame(embedding, columns=["x", "y"])
        # Cluster labels as strings so seaborn treats the hue as categorical.
        points["cluster"] = assignments["cluster"].astype(str)

        plt.figure(figsize=(10, 6))
        sns.scatterplot(
            data=points,
            x="x",
            y="y",
            hue="cluster",
            palette="tab10",
            s=10,
            alpha=0.7,
        )
        plt.title(f"Visualización de clusters con UMAP (a={a}, b={b})")
        plt.tight_layout()
        plt.savefig(target_dir / f"umap_clusters_{a:.1f}_{b:.1f}.png")
        plt.show()

        print("✅ Gráfico de clusters generado y guardado.")

    except Exception as e:
        print(f"⚠️ Error generando gráfico UMAP: {e}")



In [None]:
# Draw the UMAP cluster plot using the function's default arguments.
visualizar_clusters_umap()



In [None]:
!pip install wordcloud
!python -m spacy download es_core_news_sm


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sklearn.cluster import KMeans

# Run parameters: they select the results directory and the cluster file name.
region = "america"
a, b = 0.8, 0.2
input_dir = Path("res") / region / "todos_los_anios"

# Load the numeric trip table and the cluster label of every trip.
database_num = pd.read_csv(input_dir / "base_numerica.csv")
df_clusters = pd.read_csv(input_dir / f"{a:.1f}_{b:.1f}_clusters.csv")

# Reduce the table to two dimensions with UMAP (fixed seed).
import umap
reducer = umap.UMAP(random_state=42)
projection = reducer.fit_transform(database_num)

# Build the analysis frame: 2-D coordinates plus the cluster label as a string.
df_plot = pd.DataFrame(projection, columns=["x", "y"])
df_plot["cluster"] = df_clusters["cluster"].astype(str)

# Group the projected points with KMeans on the UMAP coordinates.
# NOTE(review): n_clusters=4 is hard-coded; confirm how this value was chosen.
kmeans_vis = KMeans(n_clusters=4, random_state=42)
df_plot["group"] = kmeans_vis.fit_predict(df_plot[["x", "y"]])

# For every visual group, list the distinct cluster labels found in it.
group_to_clusters = df_plot.groupby("group")["cluster"].unique().to_dict()
group_to_clusters


In [None]:
import umap
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from pathlib import Path

def visualizar_clusters_umap(region="america", a=0.8, b=0.2, margin=2):
    """Project the numeric trip table with UMAP and plot it coloured by cluster.

    Reads ``base_numerica.csv`` and ``<a>_<b>_clusters.csv`` from
    ``res/<region>/todos_los_anios``, saves the scatter plot as a PNG under
    ``analisis/todos_los_anios`` and shows it.

    Args:
        region: sub-directory of ``res`` holding the clustering outputs.
        a, b: weights of the clustering run; they select the cluster file and
            are embedded in the plot title and output file name.
        margin: padding, in projection units, added around the data on both
            axes (previously a hard-coded 2).

    Any exception is caught and reported with a printed message rather than
    raised.
    """
    input_dir = Path("res") / region / "todos_los_anios"
    output_dir = Path("analisis") / "todos_los_anios"
    output_dir.mkdir(parents=True, exist_ok=True)

    try:
        print("📊 Generando proyección UMAP…")
        database_num = pd.read_csv(input_dir / "base_numerica.csv")
        df_clusters = pd.read_csv(input_dir / f"{a:.1f}_{b:.1f}_clusters.csv")

        # Fixed seed so the 2-D embedding is repeatable.
        reducer = umap.UMAP(random_state=42)
        projection = reducer.fit_transform(database_num)

        df_plot = pd.DataFrame(projection, columns=["x", "y"])
        # Cluster labels as strings so seaborn treats the hue as categorical.
        df_plot["cluster"] = df_clusters["cluster"].astype(str)

        plt.figure(figsize=(12, 8))  # larger canvas than the first version
        sns.scatterplot(data=df_plot, x="x", y="y", hue="cluster", palette="tab10", s=10, alpha=0.7)

        # Widen the axis limits so points are not drawn against the frame.
        plt.xlim(df_plot["x"].min() - margin, df_plot["x"].max() + margin)
        plt.ylim(df_plot["y"].min() - margin, df_plot["y"].max() + margin)

        plt.title(f"Visualización de clusters con UMAP (a={a}, b={b})")
        plt.tight_layout()
        plt.savefig(output_dir / f"umap_clusters_{a:.1f}_{b:.1f}.png")
        plt.show()

        print("✅ Gráfico de clusters generado y guardado.")

    except Exception as e:
        print(f"⚠️ Error generando gráfico UMAP: {e}")


In [None]:
# Draw the UMAP cluster plot using the function's default arguments.
visualizar_clusters_umap()


In [None]:
# Reimportar librerías tras reinicio del entorno
import pandas as pd
import pickle as pick
from pathlib import Path

# Run parameters: they select the results directory and the cluster file name.
region = "america"
a, b = 0.8, 0.2
base_path = Path("res") / region / "todos_los_anios"

# Reload the saved trips, cluster labels and raw reviews.
# NOTE: pickle.load can execute arbitrary code; only load files produced by this notebook.
with open(base_path / "list_trips.pkl", "rb") as f:
    list_trips = pick.load(f)

df_clusters = pd.read_csv(base_path / f"{a:.1f}_{b:.1f}_clusters.csv")
df_original = pd.read_excel("/content/drive/MyDrive/proyecto_ingeneria/df.xlsx", engine="openpyxl", dtype=str)
df_original["date_review"] = pd.to_datetime(df_original["date_review"], errors="coerce")

# Build one record per review that belongs to a trip, tagged with the trip's cluster.
records = []
for trip_id, (locations, season, duration, user) in list_trips.items():
    # NOTE(review): the filter matches on member and location id only, not on
    # dates, so a member's reviews of these locations outside this trip's
    # dates are included too. Confirm whether that is intended.
    user_reviews = df_original[(df_original["memberID"] == user) & (df_original["id"].isin(locations))]
    for _, row in user_reviews.iterrows():
        records.append({
            "trip_id": trip_id,
            "memberID": row["memberID"],
            "country": row["country"],
            "reviewed_location": row["reviewed_location"],
            "id": row["id"],
            # Label-based lookup: assumes row label i of the cluster CSV is trip i. TODO confirm
            "cluster": df_clusters.loc[trip_id, "cluster"]
        })

df_stays = pd.DataFrame(records)

# Save the result as an Excel file next to the other clustering outputs.
output_path = base_path / f"estancias_clusterizadas.xlsx"
df_stays.to_excel(output_path, index=False)


