In [3]:
import requests
import csv
import os

# Cargar API Key desde una variable de entorno
GOOGLE_API_KEY = "AIzaSyBVFqawIp7wy2kTnY7ZVt9veOS7-yKfohU"
if not GOOGLE_API_KEY:
    print("❌ ERROR: No se encontró la clave de API. Configúrala en una variable de entorno.")
    exit()

# 🔹 Configurar URL y parámetros
API_URL = "https://maps.googleapis.com/maps/api/place/textsearch/json"
PARAMS = {"query": "Restaurants in California", "key": GOOGLE_API_KEY}

# 🔹 Realizar la petición
response = requests.get(API_URL, params=PARAMS)
data = response.json()

# 🔹 Verificar si hay resultados
if "results" not in data or not data["results"]:
    print(f"⚠️ No se encontraron resultados. Mensaje de error: {data.get('error_message', 'Ninguno')}")
    exit()

# 🔹 Extraer datos relevantes
restaurants = []
for r in data["results"]:
    restaurants.append({
        "business_id": r.get("place_id"),
        "business_name": r.get("name"),
        "address": r.get("formatted_address"),
        "city": r.get("plus_code", {}).get("compound_code", "").split(" ")[-1] if r.get("plus_code") else "Unknown",
        "category": r["types"][0] if r.get("types") else "Unknown",
        "latitude": r["geometry"]["location"]["lat"],
        "longitude": r["geometry"]["location"]["lng"],
        "review_count": r.get("user_ratings_total", 0)
    })

# 🔹 Guardar en CSV
csv_filename = "google_restaurants.csv"
csv_columns = ["business_id", "business_name", "address", "city", "category", "latitude", "longitude", "review_count"]

with open(csv_filename, "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
    writer.writeheader()
    writer.writerows(restaurants)

print(f"✅ CSV generado: {csv_filename} con {len(restaurants)} registros.")

✅ CSV generado: google_restaurants.csv con 20 registros.


In [None]:
import os
import requests
import csv
import time

def fetch_google_restaurants(location="California", pages=3):
    base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
    restaurants = []
    next_page_token = None
    
    for _ in range(pages):
        params = {
            "query": "restaurants in " + location,
            "key": "81MeibfW8d_OHhY9oRupbSWAy3cBuNrvOZJPNC3hQWkPa-ZFMIdDKeN3_pu6G8cVOvbR-1r8hIPxWqQV_u56wDQPPHr4kDve44nvtlxE7e3RMrq2M72jXM9ZXwTPZ3Yx",
            "type": "restaurant",
            "language": "es",
            "pagetoken": next_page_token if next_page_token else ""
        }
        
        response = requests.get(base_url, params=params)
        data = response.json()
        
        for result in data.get("results", []):
            restaurants.append({
                "Gmap_id": result.get("place_id"),
                "Nombre": result.get("name"),
                "Address": result.get("formatted_address"),
                "Avg_rating": result.get("rating"),
                "Reseñas": result.get("user_ratings_total"),
                "Latitud": result.get("geometry", {}).get("location", {}).get("lat"),
                "Longitud": result.get("geometry", {}).get("location", {}).get("lng"),
                "Fuente": "Google Maps"
            })
        
        next_page_token = data.get("next_page_token")
        if not next_page_token:
            break
        time.sleep(2)
    
    return restaurants

def fetch_yelp_restaurants(location="California"):
    url = "https://api.yelp.com/v3/businesses/search"
    headers = {"Authorization": f"Bearer AIzaSyBVFqawIp7wy2kTnY7ZVt9veOS7-yKfohU"}
    params = {
        "location": location,
        "term": "restaurants",
        "limit": 50
    }
    
    response = requests.get(url, headers=headers, params=params)
    data = response.json()
    
    return [{
        "Nombre": business.get("name"),
        "Address": ", ".join(business.get("location", {}).get("display_address", [])),
        "Avg_rating": business.get("rating"),
        "Reseñas": business.get("review_count"),
        "Latitud": business.get("coordinates", {}).get("latitude"),
        "Longitud": business.get("coordinates", {}).get("longitude"),
        "Fuente": "Yelp"
    } for business in data.get("businesses", [])]

def save_to_csv(data, filename):
    if not data:
        return
    
    keys = data[0].keys()
    with open(filename, 'w', newline='', encoding='utf-8') as output_file:
        dict_writer = csv.DictWriter(output_file, fieldnames=keys)
        dict_writer.writeheader()
        dict_writer.writerows(data)

if __name__ == "__main__":
    try:
        print("Obteniendo datos de Google Maps...")
        google_data = fetch_google_restaurants(pages=3)
        print("Obteniendo datos de Yelp...")
        yelp_data = fetch_yelp_restaurants()

        google_filename = "google_restaurants.csv"
        yelp_filename = "yelp_restaurants.csv"
        save_to_csv(google_data, google_filename)
        save_to_csv(yelp_data, yelp_filename)

        print("Archivos CSV guardados localmente.")
    except Exception as e:
        print(f"Error: {str(e)}")


In [5]:
import requests
import json

# Reemplaza esto con tu API Key de Yelp
YELP_API_KEY = "81MeibfW8d_OHhY9oRupbSWAy3cBuNrvOZJPNC3hQWkPa-ZFMIdDKeN3_pu6G8cVOvbR-1r8hIPxWqQV_u56wDQPPHr4kDve44nvtlxE7e3RMrq2M72jXM9ZXwTPZ3Yx"

def fetch_yelp_restaurants():
    """Obtiene datos de Yelp API y los guarda en un archivo local."""
    if not YELP_API_KEY:
        print("❌ La API Key de Yelp no está configurada.")
        return

    url = "https://api.yelp.com/v3/businesses/search"
    headers = {"Authorization": f"Bearer {YELP_API_KEY}"}
    params = {"location": "California", "term": "restaurants", "limit": 10}  # Limitamos a 10 para pruebas

    response = requests.get(url, headers=headers, params=params)

    if response.status_code != 200:
        print(f"❌ Error en la API de Yelp: {response.status_code} - {response.text}")
        return

    data = response.json()
    print(f"📊 Yelp API Response: {json.dumps(data, indent=2)}")

    restaurants = [
        {
            "Business_ID": r.get("id"),
            "Name": r.get("name"),
            "Address": ", ".join(r["location"]["display_address"]) if r.get("location") else None,
            "City": r["location"].get("city") if r.get("location") else None,
            "Category": ", ".join([c["title"] for c in r.get("categories", [])]),
            "Latitude": r["coordinates"].get("latitude") if r.get("coordinates") else None,
            "Longitude": r["coordinates"].get("longitude") if r.get("coordinates") else None,
            "Review_Count": r.get("review_count")
        }
        for r in data.get("businesses", [])
    ]

    if not restaurants:
        print("⚠️ Yelp API devolvió una lista vacía")
        return

    # Guardamos los datos en un archivo JSON local
    with open("yelp_restaurants.json", "w", encoding="utf-8") as f:
        json.dump(restaurants, f, indent=2, ensure_ascii=False)

    print("✅ Datos guardados en 'yelp_restaurants.json'")

# Ejecutamos la función
fetch_yelp_restaurants()

📊 Yelp API Response: {
  "businesses": [
    {
      "id": "2g3Af4y33ZMBLJzulZvbzQ",
      "alias": "mountain-oaks-cafe-oakhurst",
      "name": "Mountain Oaks Cafe",
      "image_url": "https://s3-media2.fl.yelpcdn.com/bphoto/MGjbVrY8G-nNtp07bHLlYA/o.jpg",
      "is_closed": false,
      "url": "https://www.yelp.com/biz/mountain-oaks-cafe-oakhurst?adjust_creative=IAymn76xD1Osju1LpSBcow&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=IAymn76xD1Osju1LpSBcow",
      "review_count": 185,
      "categories": [
        {
          "alias": "tradamerican",
          "title": "American"
        },
        {
          "alias": "breakfast_brunch",
          "title": "Breakfast & Brunch"
        },
        {
          "alias": "burgers",
          "title": "Burgers"
        }
      ],
      "rating": 4.7,
      "coordinates": {
        "latitude": 37.3332671629224,
        "longitude": -119.6508896
      },
      "transactions": [],
      "price": "$$",
      "location": {


In [None]:
import os
import requests
import csv
import time
import json
from google.cloud import storage

# Configurar la autenticación con la clave de servicio JSON
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "proyectofinalgogleyelp-41e96ec7a40a.json"

# Configuración de APIs (REEMPLAZAR CON TUS KEYS)
GOOGLE_API_KEY = "AIzaSyBVFqawIp7wy2kTnY7ZVt9veOS7-yKfohU"
YELP_API_KEY = "81MeibfW8d_OHhY9oRupbSWAy3cBuNrvOZJPNC3hQWkPa-ZFMIdDKeN3_pu6G8cVOvbR-1r8hIPxWqQV_u56wDQPPHr4kDve44nvtlxE7e3RMrq2M72jXM9ZXwTPZ3Yx"


# Configuración de buckets
temp_bucket = "dataset-pf-gyelp-temporal"
final_bucket = "dataset-pf-gyelp"
storage_client = storage.Client()

def clear_bucket(bucket_name):
    """Elimina todos los archivos dentro de un bucket de GCP."""
    bucket = storage_client.bucket(bucket_name)
    blobs = bucket.list_blobs()
    for blob in blobs:
        blob.delete()
    print(f"Se han eliminado todos los archivos en {bucket_name}")

def fetch_google_restaurants(location="California", pages=3):
    base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
    restaurants = []
    next_page_token = None
    
    for _ in range(pages):
        params = {
            "query": "restaurants in " + location,
            "key": GOOGLE_API_KEY,
            "type": "restaurant",
            "language": "es",
            "pagetoken": next_page_token if next_page_token else ""
        }
        
        response = requests.get(base_url, params=params)
        data = response.json()
        
        for result in data.get("results", []):
            restaurants.append({
                "Gmap_id": result.get("place_id"),
                "Nombre": result.get("name"),
                "Address": result.get("formatted_address"),
                "Avg_rating": result.get("rating"),
                "Reseñas": result.get("user_ratings_total"),
                "Latitud": result.get("geometry", {}).get("location", {}).get("lat"),
                "Longitud": result.get("geometry", {}).get("location", {}).get("lng"),
                "Fuente": "Google Maps"
            })
        
        next_page_token = data.get("next_page_token")
        if not next_page_token:
            break
        time.sleep(2)
    
    return restaurants

def fetch_yelp_restaurants(location="California"):
    url = "https://api.yelp.com/v3/businesses/search"
    headers = {"Authorization": f"Bearer {YELP_API_KEY}"}
    params = {
        "location": location,
        "term": "restaurants",
        "limit": 50
    }
    
    response = requests.get(url, headers=headers, params=params)
    data = response.json()
    
    return [{
        "Nombre": business.get("name"),
        "Address": ", ".join(business.get("location", {}).get("display_address", [])),
        "Avg_rating": business.get("rating"),
        "Reseñas": business.get("review_count"),
        "Latitud": business.get("coordinates", {}).get("latitude"),
        "Longitud": business.get("coordinates", {}).get("longitude"),
        "Fuente": "Yelp"
    } for business in data.get("businesses", [])]

def save_to_csv(data, filename):
    if not data:
        return
    
    keys = data[0].keys()
    with open(filename, 'w', newline='', encoding='utf-8') as output_file:
        dict_writer = csv.DictWriter(output_file, fieldnames=keys)
        dict_writer.writeheader()
        dict_writer.writerows(data)

def upload_to_gcs(bucket_name, source_file_name, destination_blob_name):
    """Sube un archivo a Google Cloud Storage."""
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)
    blob.upload_from_filename(source_file_name)
    print(f"Archivo {source_file_name} subido a {bucket_name}/{destination_blob_name}")

# Ejecución principal
if __name__ == "__main__":
    try:
        print("Limpiando buckets...")
        clear_bucket(temp_bucket)
        clear_bucket(final_bucket)

        print("Obteniendo datos de Google Maps...")
        google_data = fetch_google_restaurants(pages=3)
        print("Obteniendo datos de Yelp...")
        yelp_data = fetch_yelp_restaurants()

        google_filename = "google_restaurants.csv"
        yelp_filename = "yelp_restaurants.csv"
        save_to_csv(google_data, google_filename)
        save_to_csv(yelp_data, yelp_filename)

        print("Subiendo archivos a GCS...")
        upload_to_gcs(temp_bucket, google_filename, f"Yelp/processed/{google_filename}")
        upload_to_gcs(temp_bucket, yelp_filename, f"Yelp/processed/{yelp_filename}")

        print("Proceso completado.")
    except Exception as e:
        print(f"Error: {str(e)}")

In [None]:
from google.cloud import storage
import os
import requests
import csv
import pandas as pd

gcs_client = storage.Client()
BUCKET_NAME = "dataset-pf-gyelp"
RAW_FOLDER = "Yelp/airFlow/raw/"
PROCESSED_FOLDER = "Yelp/airFlow/processed/"

def clear_bucket_folder(folder):
    """Elimina todos los archivos en una carpeta del bucket."""
    bucket = gcs_client.bucket(BUCKET_NAME)
    blobs = list(bucket.list_blobs(prefix=folder))
    
    if not blobs:
        print(f"No hay archivos en {BUCKET_NAME}/{folder} para eliminar.")
        return
    
    for blob in blobs:
        blob.delete()
    print(f"Se eliminaron {len(blobs)} archivos en {BUCKET_NAME}/{folder}")

def fetch_google_restaurants():
    """Obtiene datos de restaurantes de Google Places API y los guarda en un CSV."""
    GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
    url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
    params = {"query": "restaurants in California", "key": GOOGLE_API_KEY}
    response = requests.get(url, params=params)
    data = response.json()
    
    restaurants = [
        {"Business_ID": r.get("place_id"), "Name": r.get("name"), "Address": r.get("formatted_address"), "Rating": r.get("rating")} 
        for r in data.get("results", [])
    ]
    save_to_csv(restaurants, "google_restaurants.csv", RAW_FOLDER)

def fetch_yelp_restaurants():
    """Obtiene datos de restaurantes de Yelp API y los guarda en un CSV."""
    YELP_API_KEY = os.getenv("YELP_API_KEY")
    url = "https://api.yelp.com/v3/businesses/search"
    headers = {"Authorization": f"Bearer {YELP_API_KEY}"}
    params = {"location": "California", "term": "restaurants", "limit": 50}
    response = requests.get(url, headers=headers, params=params)
    data = response.json()
    
    restaurants = [
        {"Business_ID": b.get("id"), "Name": b.get("name"), "Address": ", ".join(b.get("location", {}).get("display_address", [])), "Rating": b.get("rating")} 
        for b in data.get("businesses", [])
    ]
    save_to_csv(restaurants, "yelp_restaurants.csv", RAW_FOLDER)

if __name__ == "__main__":
    # Limpiar los buckets antes de iniciar
    clear_bucket_folder(RAW_FOLDER)
    clear_bucket_folder(PROCESSED_FOLDER)

    # Ejecutar las funciones de recolección y procesamiento
    fetch_google_restaurants()
    fetch_yelp_restaurants()



In [None]:

from google.cloud import storage
import os
import requests
import csv
import pandas as pd

gcs_client = storage.Client()
BUCKET_NAME = "dataset-pf-gyelp"
RAW_FOLDER = "Yelp/airFlow/raw/"
PROCESSED_FOLDER = "Yelp/airFlow/processed/"

def clear_bucket_folder(folder):
    """Elimina todos los archivos en una carpeta del bucket."""
    bucket = gcs_client.bucket(BUCKET_NAME)
    blobs = list(bucket.list_blobs(prefix=folder))
    
    if not blobs:
        print(f"No hay archivos en {BUCKET_NAME}/{folder} para eliminar.")
        return
    
    for blob in blobs:
        blob.delete()
    print(f"Se eliminaron {len(blobs)} archivos en {BUCKET_NAME}/{folder}")

def fetch_google_restaurants():
    """Obtiene datos de restaurantes de Google Places API y los guarda en un CSV."""
    GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
    url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
    params = {"query": "restaurants in California", "key": GOOGLE_API_KEY}
    response = requests.get(url, params=params)
    data = response.json()
    
    restaurants = [
        {"Business_ID": r.get("place_id"), "Name": r.get("name"), "Address": r.get("formatted_address"), "Rating": r.get("rating")} 
        for r in data.get("results", [])
    ]
    save_to_csv(restaurants, "google_restaurants.csv", RAW_FOLDER)

def fetch_yelp_restaurants():
    """Obtiene datos de restaurantes de Yelp API y los guarda en un CSV."""
    YELP_API_KEY = os.getenv("YELP_API_KEY")
    url = "https://api.yelp.com/v3/businesses/search"
    headers = {"Authorization": f"Bearer {YELP_API_KEY}"}
    params = {"location": "California", "term": "restaurants", "limit": 50}
    response = requests.get(url, headers=headers, params=params)
    data = response.json()
    
    restaurants = [
        {"Business_ID": b.get("id"), "Name": b.get("name"), "Address": ", ".join(b.get("location", {}).get("display_address", [])), "Rating": b.get("rating")} 
        for b in data.get("businesses", [])
    ]
    save_to_csv(restaurants, "yelp_restaurants.csv", RAW_FOLDER)

if __name__ == "__main__":
    # Limpiar los buckets antes de iniciar
    clear_bucket_folder(RAW_FOLDER)
    clear_bucket_folder(PROCESSED_FOLDER)

    # Ejecutar las funciones de recolección y procesamiento
    fetch_google_restaurants()
    fetch_yelp_restaurants()


