In [10]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.cloud import storage

def download_from_gcs(bucket_name, files, destination_folder):
    """Download a list of objects from a GCS bucket into a local folder.

    Every object is written under ``destination_folder`` using only the
    base name of its bucket path, so the bucket's folder hierarchy is
    flattened locally.

    Args:
        bucket_name: Name of the Cloud Storage bucket to read from.
        files: Iterable of object paths (blob names) inside the bucket.
        destination_folder: Local directory that receives the downloads;
            it is created when it does not exist yet.
    """
    os.makedirs(destination_folder, exist_ok=True)
    source_bucket = storage.Client().bucket(bucket_name)

    for blob_name in files:
        # Keep only the last path component for the local file name.
        local_path = os.path.join(destination_folder, os.path.basename(blob_name))
        source_bucket.blob(blob_name).download_to_filename(local_path)
        print(f"✅ Descargado: {blob_name} → {local_path}")

def load_dataframes(destination_folder):
    """Load every CSV file of a folder into a dict of DataFrames.

    The work is done in two passes so the result does not depend on the
    order in which the operating system lists the directory:

    1. Every ``*.parquet`` file is converted to a CSV with the same stem
       (an existing CSV with that name is overwritten).
    2. The folder is listed again and every ``*.csv`` file is read, which
       includes the CSVs produced in the first pass.

    Files that cannot be converted or read are reported on stdout and
    skipped; they never abort the whole load.

    Args:
        destination_folder: Directory containing the CSV/Parquet files.

    Returns:
        dict mapping each CSV file name without its ``.csv`` extension to
        the DataFrame read from it.
    """
    # Pass 1: convert Parquet files, in a stable (sorted) order.
    for file in sorted(os.listdir(destination_folder)):
        if not file.endswith(".parquet"):
            continue
        file_path = os.path.join(destination_folder, file)
        try:
            print(f"📂 Convirtiendo Parquet a CSV: {file}")
            df = pd.read_parquet(file_path)
            # splitext only touches the real extension, unlike str.replace,
            # which would also rewrite ".parquet" in the middle of a name.
            csv_file = os.path.splitext(file)[0] + ".csv"
            csv_path = os.path.join(destination_folder, csv_file)
            df.to_csv(csv_path, index=False)
            print(f"✅ Archivo convertido: {csv_file}")
        except Exception as e:
            print(f"⚠️ Error al convertir {file}: {e}")

    # Pass 2: list the folder again so freshly converted CSVs are included.
    dataframes = {}
    for file in sorted(os.listdir(destination_folder)):
        if not file.endswith(".csv"):
            continue
        file_path = os.path.join(destination_folder, file)
        try:
            print(f"📂 Cargando: {file}")
            df = pd.read_csv(file_path)
            dataframes[os.path.splitext(file)[0]] = df
        except pd.errors.EmptyDataError:
            print(f"⚠️ Archivo vacío o sin columnas: {file}")
        except Exception as e:
            print(f"⚠️ Error al cargar {file}: {e}")

    print("✅ Archivos cargados en DataFrames.")
    return dataframes


def convert_types(dataframes, category_threshold=0.5):
    """Normalize the dtypes of object columns, in place.

    For every column whose dtype is ``object``:

    * if its name contains ``"date"`` or ``"since"`` (case-insensitive) it
      is parsed with ``pd.to_datetime``; unparseable values become ``NaT``;
    * otherwise its values are cast to ``str`` while missing entries are
      kept missing (a plain ``astype(str)`` would turn them into the
      literal strings ``"nan"`` / ``"None"``).

    Afterwards, any column that is still ``object`` and whose number of
    distinct values is below ``len(df) * category_threshold`` is converted
    to ``category``.

    Args:
        dataframes: dict of DataFrames; each frame is modified in place.
        category_threshold: Maximum ratio of distinct values to rows
            (exclusive) for a text column to become categorical.

    Returns:
        None.
    """
    for df in dataframes.values():
        for col in df.columns:
            if df[col].dtype == "object":
                # str() so non-string column labels do not break the check.
                lowered = str(col).lower()
                if "date" in lowered or "since" in lowered:
                    df[col] = pd.to_datetime(df[col], errors="coerce")
                else:
                    # Remember which entries were missing before the cast and
                    # blank them out again afterwards.
                    missing = df[col].isna()
                    df[col] = df[col].astype(str).where(~missing)

            # Low-cardinality text columns are cheaper as categoricals.
            if df[col].dtype == "object" and df[col].nunique() < len(df) * category_threshold:
                df[col] = df[col].astype("category")

    print("✅ Tipos de datos corregidos correctamente.")


def process_missing_values(dataframes):
    """Fill missing values of numeric columns with the column mean, in place.

    Only columns selected by ``select_dtypes(include=[np.number])`` are
    touched; every other column is left as it is.

    Args:
        dataframes: dict of DataFrames; each frame is modified in place.

    Returns:
        None.
    """
    for frame in dataframes.values():
        numeric_columns = frame.select_dtypes(include=[np.number]).columns
        for column in numeric_columns:
            column_mean = frame[column].mean()
            frame[column] = frame[column].fillna(column_mean)
    print("✅ Valores nulos tratados correctamente.")

def transform_for_dw(dataframes):
    """Build the dimension and fact tables from the cleaned DataFrames.

    Reads the keys ``business_cleaned``, ``review_cleaned``,
    ``users_cleaned`` and ``checkins_expanded`` from ``dataframes``. The
    business table is copied before a ``category`` column is added, so the
    input frame is not modified.

    Args:
        dataframes: dict of DataFrames keyed by table name.

    Returns:
        dict with the keys ``dim_category``, ``dim_city``, ``dim_business``,
        ``fact_reviews``, ``dim_user`` and ``fact_checkin``, in that order.

    Raises:
        KeyError: if a required table or column is missing.
    """
    businesses = dataframes["business_cleaned"].copy()

    # Only the first entry of the comma-separated list is kept per business.
    first_category = businesses["categories"].str.split(", ").str[0]
    businesses["category"] = first_category

    # One row per distinct category, numbered from 1 in order of appearance.
    dim_category = businesses[["category"]].drop_duplicates().reset_index(drop=True)
    dim_category["category_id"] = range(1, len(dim_category) + 1)

    # Attach the surrogate key to every business.
    businesses = businesses.merge(dim_category, on="category", how="left")

    dim_city = businesses[["city_id", "city"]].drop_duplicates()
    dim_business = businesses[
        ["business_id", "name", "address", "city_id", "category_id", "latitude", "longitude", "review_count"]
    ]
    fact_reviews = dataframes["review_cleaned"][
        ["review_id", "business_id", "user_id", "date", "stars", "text"]
    ]
    dim_user = dataframes["users_cleaned"][
        ["user_id", "name", "review_count", "yelping_since"]
    ]
    fact_checkin = dataframes["checkins_expanded"][
        ["checkin_id", "business_id", "checkin_date", "checkin_count"]
    ]

    transformed = {
        "dim_category": dim_category,
        "dim_city": dim_city,
        "dim_business": dim_business,
        "fact_reviews": fact_reviews,
        "dim_user": dim_user,
        "fact_checkin": fact_checkin,
    }

    print("✅ DataFrames transformados para proyecto_dw.")
    return transformed


def plot_and_export(dataframes, output_path, bucket, gcs_prefix="ETL"):
    """Plot numeric columns, write each table to CSV and upload it to GCS.

    For every non-empty DataFrame a histogram (with KDE and a dashed line
    at the mean) is shown for each numeric column. The frame is then
    saved as ``<output_path>/<name>.csv`` and uploaded to the bucket as
    ``<gcs_prefix>/<name>.csv``. Empty DataFrames are skipped entirely.

    Args:
        dataframes: dict mapping table name to DataFrame.
        output_path: Local directory for the CSV files; created if missing.
        bucket: Bucket-like object exposing ``blob(name)``, whose result
            exposes ``upload_from_filename(path)``.
        gcs_prefix: Folder prefix of the uploaded objects inside the bucket.
    """
    os.makedirs(output_path, exist_ok=True)
    for name, df in dataframes.items():
        if df.empty:
            continue

        numeric_df = df.select_dtypes(include=[np.number])
        for col in numeric_df.columns:
            # Explicit figure/axes so the figure can be released afterwards;
            # otherwise figures pile up on backends where show() does not
            # close them.
            fig, ax = plt.subplots(figsize=(10, 5))
            try:
                sns.histplot(numeric_df[col], bins=30, kde=True, ax=ax)
                ax.axvline(numeric_df[col].mean(), color='r', linestyle='dashed', linewidth=2, label='Media')
                ax.set_title(f'Distribución de {col} en {name}')
                ax.legend()
                plt.show()
            finally:
                plt.close(fig)

        csv_path = os.path.join(output_path, f"{name}.csv")
        df.to_csv(csv_path, index=False)

        blob_name = f"{gcs_prefix}/{name}.csv"
        bucket.blob(blob_name).upload_from_filename(csv_path)
        print(f"☁️ Archivo subido a GCS: {blob_name}")






In [None]:
# Point the Google Cloud client at the service-account JSON key file.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "proyectofinalgogleyelp-41e96ec7a40a.json"

# Settings: source bucket, local download folder, local export folder.
bucket_name, destination_folder, output_path = (
    "dataset-pf-gyelp",
    "./dataWorkingon",
    "./output_data",
)

# Objects to download from the bucket.
# NOTE(review): the list holds both singular and plural variants
# (user_cleaned / users_cleaned, review_cleaned / reviews_cleaned) -- confirm
# that every one of them really exists in the bucket.
files = [
    "Yelp/processed/user_cleaned.csv",
    "Yelp/processed/reviews_cleaned.csv",
    "Yelp/processed/users_cleaned.csv",
    "Yelp/processed/tips_cleaned.csv",
    "Yelp/processed/review_cleaned.csv",
    "Yelp/processed/business_cleaned.csv",
    "Yelp/processed/business_cleaned.parquet",
]

# Storage client and bucket handle.
client = storage.Client()
bucket = client.bucket(bucket_name)

# ETL step: fetch the raw files into the working folder.
download_from_gcs(
    bucket_name,
    files,
    destination_folder,
)



✅ Descargado: Yelp/processed/user_cleaned.csv → ./dataWorkingon\user_cleaned.csv
✅ Descargado: Yelp/processed/reviews_cleaned.csv → ./dataWorkingon\reviews_cleaned.csv


In [None]:
def load_file(file_path):
    """Read a single CSV or Parquet file into a DataFrame.

    Every failure is reported on stdout and signalled by returning
    ``None`` instead of raising.

    Args:
        file_path: Path of the file; the reader is chosen from the
            ``.csv`` / ``.parquet`` suffix.

    Returns:
        The loaded DataFrame, or ``None`` when the file does not exist,
        has an unsupported extension, or cannot be read.
    """
    # Guard clause: nothing to do for a path that is not there.
    if not os.path.exists(file_path):
        print(f"⚠️ Archivo no encontrado: {file_path}")
        return None

    is_csv = file_path.endswith('.csv')
    is_parquet = file_path.endswith('.parquet')
    if not (is_csv or is_parquet):
        print(f"⚠️ Formato no soportado: {file_path}")
        return None

    try:
        if is_csv:
            print(f"📂 Cargando CSV: {file_path}")
            # If decoding fails, 'latin1' is the encoding to try next.
            return pd.read_csv(file_path, encoding='utf-8')
        print(f"📂 Cargando Parquet: {file_path}")
        return pd.read_parquet(file_path)
    except Exception as e:
        print(f"❌ Error al cargar {file_path}: {e}")
        return None

# Load the downloaded files. download_from_gcs saves each object under
# destination_folder using only its base name, so the bucket prefix has to be
# stripped before looking the file up locally; joining the full object path
# would point at a file that was never written.
dataframes = {
    file: load_file(os.path.join(destination_folder, os.path.basename(file)))
    for file in files
}
# Report which files were loaded successfully
for file, df in dataframes.items():
    if df is not None:
        print(f"✔️ {file} cargado con {len(df)} filas")


In [None]:
# Load every CSV found in destination_folder into a dict keyed by the file
# name without its extension. This rebinds `dataframes`, replacing the dict
# built in the previous cell.
dataframes = load_dataframes(destination_folder)


In [None]:
# Normalize the dtypes of every loaded DataFrame in place (nothing is returned).
convert_types(dataframes)


In [None]:
# Fill missing values of numeric columns with each column's mean, in place.
process_missing_values(dataframes)


In [None]:
def check_columns(dataframes):
    """Print the column dtypes of every DataFrame.

    Args:
        dataframes: dict mapping a table name to its DataFrame.

    Returns:
        None; the report is written to stdout only.
    """
    separator = "-" * 50
    for table_name in dataframes:
        print(f"\n🔍 {table_name} - Columnas y tipos de datos:")
        print(dataframes[table_name].dtypes)
        print(separator)

In [None]:
# Print the column dtypes of every loaded DataFrame for inspection.
check_columns(dataframes)


In [None]:
# Build the dimension and fact tables. transform_for_dw reads the keys
# business_cleaned, review_cleaned, users_cleaned and checkins_expanded.
# NOTE(review): checkins_expanded is not in the `files` list downloaded
# earlier -- confirm it is present in the working folder before running this.
transformed_dataframes = transform_for_dw(dataframes)

In [None]:
# Export: re-create the storage client and bucket handle, then plot the
# numeric columns, write every table to CSV and upload it to the bucket.
client = storage.Client()
bucket = client.bucket(bucket_name)
plot_and_export(transformed_dataframes, output_path, bucket)