In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.cloud import storage

def download_from_gcs(bucket_name, files, destination_folder):
    os.makedirs(destination_folder, exist_ok=True)
    client = storage.Client()
    bucket = client.bucket(bucket_name)
    
    for file in files:
        blob = bucket.blob(file)
        file_path = os.path.join(destination_folder, os.path.basename(file))
        blob.download_to_filename(file_path)
        print(f"✅ Descargado: {file} → {file_path}")

def load_dataframes(destination_folder):
    dataframes = {}
    
    for file in os.listdir(destination_folder):
        file_path = os.path.join(destination_folder, file)
        
        if file.endswith(".parquet"):
            try:
                print(f"📂 Convirtiendo Parquet a CSV: {file}")
                df = pd.read_parquet(file_path)
                csv_file = file.replace(".parquet", ".csv")
                csv_path = os.path.join(destination_folder, csv_file)
                df.to_csv(csv_path, index=False)
                print(f"✅ Archivo convertido: {csv_file}")
            except Exception as e:
                print(f"⚠️ Error al convertir {file}: {e}")
        
        elif file.endswith(".csv"):
            try:
                print(f"📂 Cargando: {file}")  
                df = pd.read_csv(file_path)
                dataframes[file.replace(".csv", "")] = df
            except pd.errors.EmptyDataError:
                print(f"⚠️ Archivo vacío o sin columnas: {file}")
            except Exception as e:
                print(f"⚠️ Error al cargar {file}: {e}")

    print("✅ Archivos cargados en DataFrames.")
    return dataframes


def convert_types(dataframes):
    for name, df in dataframes.items():
        for col in df.columns:
            if df[col].dtype == "object":
                try:
                    df[col] = pd.to_datetime(df[col], format="%Y-%m-%d", errors="coerce")
                except:
                    pass
            if df[col].dtype == "object" and df[col].nunique() < len(df) * 0.5:
                df[col] = df[col].astype("category")
    print("✅ Tipos de datos convertidos correctamente.")

def process_missing_values(dataframes):
    for name, df in dataframes.items():
        for col in df.select_dtypes(include=[np.number]).columns:
            df[col] = df[col].fillna(df[col].mean())
    print("✅ Valores nulos tratados correctamente.")

def transform_for_dw(dataframes):
    transformed = {
        "dim_category": dataframes["business_cleaned"][["category_id", "category"]].drop_duplicates(),
        "dim_city": dataframes["business_cleaned"][["city_id", "city"]].drop_duplicates(),
        "dim_business": dataframes["business_cleaned"][["business_id", "business_name", "address", "city_id", "category_id", "latitude", "longitude", "review_count"]],
        "fact_reviews": dataframes["review_cleaned"][["review_id", "business_id", "user_id", "category_id", "review_date", "stars", "text"]],
        "dim_user": dataframes["users_cleaned"][["user_id", "name", "review_count", "yelping_since"]],
        "fact_checkin": dataframes["checkins_expanded"][["checkin_id", "business_id", "checkin_date", "checkin_count"]]
    }
    print("✅ DataFrames transformados para proyecto_dw.")
    return transformed

def plot_and_export(dataframes, output_path, bucket):
    os.makedirs(output_path, exist_ok=True)
    for name, df in dataframes.items():
        if not df.empty:
            numeric_df = df.select_dtypes(include=[np.number])
            for col in numeric_df.columns:
                plt.figure(figsize=(10, 5))
                sns.histplot(numeric_df[col], bins=30, kde=True)
                plt.axvline(numeric_df[col].mean(), color='r', linestyle='dashed', linewidth=2, label='Media')
                plt.title(f'Distribución de {col} en {name}')
                plt.legend()
                plt.show()
            
            csv_path = os.path.join(output_path, f"{name}.csv")
            df.to_csv(csv_path, index=False)
            
            blob = bucket.blob(f"ETL/{name}.csv")
            blob.upload_from_filename(csv_path)
            print(f"☁️ Archivo subido a GCS: ETL/{name}.csv")






In [2]:
# Configurar la autenticación con la clave de servicio JSON
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "proyectofinalgogleyelp-41e96ec7a40a.json"

# Configuración
bucket_name = "dataset-pf-gyelp"
destination_folder = "./dataWorkingon"
output_path = "./output_data"

files = [
    "Yelp/processed/user_cleaned.csv",
    "Yelp/processed/reviews_cleaned.csv",
    "Yelp/processed/users_cleaned.csv",
    "Yelp/processed/tips_cleaned.csv",
    "Yelp/processed/review_cleaned.csv",
    "Yelp/processed/business_cleaned.csv",
    "Yelp/processed/business_cleaned.parquet"
]

# Inicializar cliente de almacenamiento
client = storage.Client()
bucket = client.bucket(bucket_name)

# Proceso ETL
download_from_gcs(bucket_name, files, destination_folder)





✅ Descargado: Yelp/processed/user_cleaned.csv → ./dataWorkingon\user_cleaned.csv
✅ Descargado: Yelp/processed/reviews_cleaned.csv → ./dataWorkingon\reviews_cleaned.csv
✅ Descargado: Yelp/processed/users_cleaned.csv → ./dataWorkingon\users_cleaned.csv
✅ Descargado: Yelp/processed/tips_cleaned.csv → ./dataWorkingon\tips_cleaned.csv
✅ Descargado: Yelp/processed/review_cleaned.csv → ./dataWorkingon\review_cleaned.csv
✅ Descargado: Yelp/processed/business_cleaned.csv → ./dataWorkingon\business_cleaned.csv
✅ Descargado: Yelp/processed/business_cleaned.parquet → ./dataWorkingon\business_cleaned.parquet


In [None]:
# ✅ Definir la función primero
def load_csv(file_path):
    if os.path.exists(file_path):  # Verifica si el archivo existe
        return pd.read_csv(file_path)
    else:
        print(f"⚠️ Error: Archivo no encontrado -> {file_path}")
        return None  # Retorna None si el archivo no existe

# ✅ Luego, usarla para leer archivos
base_path = "dataWorkingon"

files = ["reviews_cleaned.csv",  "users_cleaned.csv",
         "tips_cleaned.csv", "tips_cleaned.csv", "review_cleaned.csv",  "user_cleaned.csv", "business_cleaned.parquet"]

# ✅ Asegurarse de que load_csv esté definida antes de usarla
dataframes = {file: load_csv(os.path.join(base_path, file)) for file in files}

# ✅ Revisar qué archivos se cargaron correctamente
for file, df in dataframes.items():
    if df is not None:
        print(f"✔️ {file} cargado con {len(df)} filas")


  return pd.read_csv(file_path)
  return pd.read_csv(file_path)


⚠️ Error: Archivo no encontrado -> dataWorkingon\business_cleaned
✔️ reviews_cleaned.csv cargado con 6990282 filas
✔️ users_cleaned.csv cargado con 2105597 filas
✔️ tips_cleaned.csv cargado con 908915 filas
✔️ review_cleaned.csv cargado con 4559049 filas
✔️ user_cleaned.csv cargado con 1987897 filas


In [None]:
dataframes = load_dataframes(destination_folder)


📂 Cargando: business_cleaned.csv
📂 Convirtiendo Parquet a CSV: business_cleaned.parquet
✅ Archivo convertido: business_cleaned.csv
📂 Cargando: reviews_cleaned.csv
📂 Cargando: review_cleaned.csv
📂 Cargando: tips_cleaned.csv
📂 Cargando: users_cleaned.csv


  df = pd.read_csv(file_path)


📂 Cargando: user_cleaned.csv


  df = pd.read_csv(file_path)


In [None]:
convert_types(dataframes)


✅ Tipos de datos convertidos correctamente.


In [None]:
process_missing_values(dataframes)


✅ Valores nulos tratados correctamente.


In [None]:
transformed_dataframes = transform_for_dw(dataframes)

KeyError: "None of [Index(['category_id', 'category'], dtype='object')] are in the [columns]"

In [None]:
# Exportar
client = storage.Client()
bucket = client.bucket(bucket_name)
plot_and_export(transformed_dataframes, output_path, bucket)