# In [None]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from google.cloud import storage
import hashlib

def generate_md5(value):
    """Return the hexadecimal MD5 digest of a string.

    Args:
        value: Text to hash. It is encoded with ``value.encode()`` (the
            default encoding) before being fed to the hash.

    Returns:
        The digest as a hexadecimal string.
    """
    hasher = hashlib.md5()
    hasher.update(value.encode())
    return hasher.hexdigest()

def download_from_gcs(bucket_name, files, destination_folder):
    """Download the given blobs from a GCS bucket into a local folder.

    Each blob is saved under its base name only, so two blobs that share a
    base name but live under different prefixes end up at the same local
    path.

    Args:
        bucket_name: Name of the bucket to read from.
        files: Iterable of blob names (paths inside the bucket).
        destination_folder: Local directory to write into; created if it
            does not exist.
    """
    gcs_bucket = storage.Client().bucket(bucket_name)
    os.makedirs(destination_folder, exist_ok=True)

    for remote_name in files:
        local_path = os.path.join(
            destination_folder,
            os.path.basename(remote_name),
        )
        gcs_bucket.blob(remote_name).download_to_filename(local_path)
        print(f"📥 Descargado: {remote_name}")

def load_dataframes(destination_folder):
    """Load every ``.csv`` file in a folder into a dict of DataFrames.

    Args:
        destination_folder: Directory to scan (non-recursively).

    Returns:
        A dict mapping ``"<file name without .csv>_cleaned"`` to the
        DataFrame read from that file. Entries that do not end in ``.csv``
        are ignored.
    """
    suffix = '.csv'
    dataframes = {}
    # Sorted so the resulting key order does not depend on the order in
    # which the operating system lists the directory.
    for file_name in sorted(os.listdir(destination_folder)):
        if not file_name.endswith(suffix):
            continue
        # Strip only the trailing extension: str.replace would also rewrite
        # any ".csv" occurring earlier in the name (e.g. "x.csv.csv").
        df_name = file_name[:-len(suffix)] + '_cleaned'
        dataframes[df_name] = pd.read_csv(os.path.join(destination_folder, file_name))
    return dataframes

def convert_types(dataframes):
    """Cast every object-dtype column of every DataFrame to ``str``.

    The frames are modified in place and the same dict is returned.

    Args:
        dataframes: Dict of DataFrames; only its values are touched.

    Returns:
        The ``dataframes`` dict that was passed in.
    """
    for frame in dataframes.values():
        object_columns = frame.select_dtypes(include=['object']).columns
        for column in object_columns:
            frame[column] = frame[column].astype(str)
    return dataframes

def transform_data(dataframes):
    """Project, rename and normalize the users, reviews and business tables.

    Args:
        dataframes: Dict holding the ``'users_cleaned'``,
            ``'reviews_cleaned'`` and ``'business_cleaned'`` DataFrames with
            the columns selected below. The input frames are not modified.

    Returns:
        A dict with five DataFrames:
        ``'users_cleaned'`` (``yelping_since`` reduced to a date),
        ``'reviews_cleaned'`` (``date`` renamed to ``review_date`` and
        reduced to a date, ``stars`` cast to int),
        ``'business_cleaned'`` (``name`` renamed to ``business_name``,
        ``city`` replaced by ``city_id``, ``category_id`` added),
        ``'cities'`` (``city_id``, ``city``) and
        ``'categories'`` (``category_id``, ``category``).
    """
    # Each projection is turned into an independent frame (.copy() or the
    # new frame returned by a non-inplace rename) before any column is
    # assigned. Assigning into / renaming in place on a bare column
    # selection is what triggers pandas' SettingWithCopyWarning.
    users_cleaned = dataframes['users_cleaned'][
        ['user_id', 'name', 'review_count', 'yelping_since']
    ].copy()
    users_cleaned['yelping_since'] = pd.to_datetime(users_cleaned['yelping_since']).dt.date

    reviews_cleaned = dataframes['reviews_cleaned'][
        ['review_id', 'business_id', 'user_id', 'stars', 'text', 'date']
    ].rename(columns={'date': 'review_date'})
    reviews_cleaned['review_date'] = pd.to_datetime(reviews_cleaned['review_date']).dt.date
    reviews_cleaned['stars'] = reviews_cleaned['stars'].astype(int)

    business_cleaned = dataframes['business_cleaned'][
        ['business_id', 'name', 'address', 'city', 'categories', 'latitude', 'longitude', 'review_count']
    ].rename(columns={'name': 'business_name'})

    # City dimension: one row per distinct city, keyed by the MD5 of its name.
    cities = business_cleaned[['city']].drop_duplicates().copy()
    cities['city_id'] = cities['city'].apply(generate_md5)
    business_cleaned = business_cleaned.merge(cities, on='city', how='left').drop(columns=['city'])
    cities = cities[['city_id', 'city']]

    # Category dimension: the id is the MD5 of the whole 'categories' value
    # of a row, not of individual entries inside it.
    categories = business_cleaned[['categories']].drop_duplicates().copy()
    categories['category_id'] = categories['categories'].apply(generate_md5)
    # NOTE(review): unlike 'city', the 'categories' column is kept in
    # business_cleaned after the merge — confirm this is intended.
    business_cleaned = business_cleaned.merge(categories, on='categories', how='left')
    categories = categories[['category_id', 'categories']].rename(
        columns={'categories': 'category'}
    )

    return {
        'users_cleaned': users_cleaned,
        'reviews_cleaned': reviews_cleaned,
        'business_cleaned': business_cleaned,
        'cities': cities,
        'categories': categories
    }

def plot_and_export(dataframes, output_path, bucket):
    """Plot numeric columns of each table, then write and upload it as CSV.

    Tables that are ``None`` or empty are skipped entirely (no plots, no
    CSV, no upload). For every other table, one histogram per numeric
    column is shown with a dashed line at the column mean; the table is then
    written to ``<output_path>/<name>.csv`` and uploaded to the blob
    ``ETL/<name>.csv``.

    Args:
        dataframes: Dict mapping table name to DataFrame (or ``None``).
        output_path: Local directory for the CSV files; created if missing.
        bucket: Bucket object whose ``blob(...).upload_from_filename`` is
            used for the upload.
    """
    os.makedirs(output_path, exist_ok=True)

    for name, df in dataframes.items():
        # Guard clause: nothing to plot or export for missing/empty tables.
        if df is None or df.empty:
            continue

        numeric_part = df.select_dtypes(include=[np.number])
        for col in numeric_part.columns:
            plt.figure(figsize=(10, 5))
            series = numeric_part[col]
            sns.histplot(series, bins=30, kde=True)
            plt.axvline(
                series.mean(),
                color='r',
                linestyle='dashed',
                linewidth=2,
                label='Media',
            )
            plt.title(f'Distribuci\u00f3n de {col} en {name}')
            plt.legend()
            plt.show()

        local_csv = os.path.join(output_path, f"{name}.csv")
        df.to_csv(local_csv, index=False)

        bucket.blob(f"ETL/{name}.csv").upload_from_filename(local_csv)
        print(f"☁️ Archivo subido a GCS: ETL/{name}.csv")

def main():
    """Run the pipeline: download, load, convert, transform, plot and export.

    The bucket name, file list and local folders are fixed inside this
    function.
    """
    source_bucket = "tu_bucket"
    raw_files = ["users.csv", "reviews.csv", "business.csv"]
    local_dir = "data"
    export_dir = "output"

    download_from_gcs(source_bucket, raw_files, local_dir)
    tables = transform_data(convert_types(load_dataframes(local_dir)))

    # The export step needs its own bucket handle for the uploads.
    export_bucket = storage.Client().bucket(source_bucket)
    plot_and_export(tables, export_dir, export_bucket)
    
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
