In [57]:
import sys
from pyspark.context import SparkContext
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import SparkSession
import datetime
from pyspark.sql.functions import expr
from pyspark.sql.functions import col
from pyspark.sql.functions import lower


# Definimos el mapping que aplicará la transformación a la columna 'time'
schema = StructType([
            StructField("name", StringType(), True),
            StructField("address", StringType(), True),
            StructField("gmap_id", StringType(), True),
            StructField("description", StringType(), True),
            StructField("latitude", FloatType(), True),
            StructField("longitude", FloatType(), True),
            StructField("category", StringType(), True),
            StructField("num_of_reviews", IntegerType(), True),
            StructField("avg_rating", FloatType(), True),
            StructField("price", FloatType(), True),
            StructField("hours", StringType(), True),
            StructField("MISC", StringType(), True)
            ])


spark = SparkSession.builder.appName("MiApp").getOrCreate()

df = spark.read.csv("/home/ezequiell/Descargas/metada_newyork.csv", header=True, schema=schema)
df.printSchema()
df.show()

root
 |-- name: string (nullable = true)
 |-- address: string (nullable = true)
 |-- gmap_id: string (nullable = true)
 |-- description: string (nullable = true)
 |-- latitude: float (nullable = true)
 |-- longitude: float (nullable = true)
 |-- category: string (nullable = true)
 |-- num_of_reviews: integer (nullable = true)
 |-- avg_rating: float (nullable = true)
 |-- price: float (nullable = true)
 |-- hours: string (nullable = true)
 |-- MISC: string (nullable = true)

23/02/21 16:46:56 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 15, schema size: 12
CSV file: file:///home/ezequiell/Descargas/metada_newyork.csv
+--------------------+--------------------+--------------------+--------------------+---------+----------+--------------------+--------------+----------+-----+--------------------+--------------------+
|                name|             address|             gmap_id|         description| latitude| longi

In [58]:
# filtramos solo el rubro de restaurantes que es en lo que se enfocara el sistema de recomendacion
df = df.filter(lower(df["category"]).like("%restaurant%"))


df2 = df.select("*", from_json("MISC", "map<string,string>").alias("mapa_misc"))

# Usar explode para crear una fila por cada clave del diccionario
df3 = df2.select("*", explode("mapa_misc").alias("clave", "valor"))

df3 = df3.withColumn("clave", regexp_replace("clave", " ", "_"))

# Usar pivot para convertir las claves en columnas
df4 = df3.groupBy("gmap_id").pivot("clave").agg({"valor": "first"})

#df4.printSchema()

# Unir el resultado de pivot con las columnas originales
df_final = df3.join(df4, on="gmap_id")

#df_final.printSchema()

df = df_final.select('gmap_id', 'name', 'address', 'latitude', 'longitude', 'num_of_reviews', 'avg_rating', 'category', 'Dining_options', 'Service_options', 'Payments')

df = df \
    .select(col("gmap_id").alias("gmap_id"), \
            col("name").alias("name"), \
            col("address").alias("address"), \
            col("latitude").alias("latitude"), \
            col("longitude").alias("longitude"), \
            col("num_of_reviews").alias("num_reviews"), \
            col("avg_rating").alias("avg_rating"), \
            col("category").alias("category"), \
            col("Dining_options").alias("dining_options"), \
            col("Service_options").alias("service_options"), \
            col("Payments").alias("payments")) \
    .dropDuplicates()

df.printSchema()
df.show()

                                                                                

root
 |-- gmap_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- address: string (nullable = true)
 |-- latitude: float (nullable = true)
 |-- longitude: float (nullable = true)
 |-- num_reviews: integer (nullable = true)
 |-- avg_rating: float (nullable = true)
 |-- category: string (nullable = true)
 |-- dining_options: string (nullable = true)
 |-- service_options: string (nullable = true)
 |-- payments: string (nullable = true)

23/02/21 16:46:59 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: name, address, gmap_id, latitude, longitude, category, avg_rating, num_of_reviews, MISC
 Schema: name, address, gmap_id, latitude, longitude, category, num_of_reviews, avg_rating, MISC
Expected: num_of_reviews but found: avg_rating
CSV file: file:///home/ezequiell/Descargas/metada_newyork.csv


                                                                                

+--------------------+--------------------+--------------------+---------+----------+-----------+----------+--------------------+--------------------+--------------------+--------------------+
|             gmap_id|                name|             address| latitude| longitude|num_reviews|avg_rating|            category|      dining_options|     service_options|            payments|
+--------------------+--------------------+--------------------+---------+----------+-----------+----------+--------------------+--------------------+--------------------+--------------------+
|0x4cca3876b359eea...|Homestead Restaurant|Homestead Restaur...|44.705822| -73.51141|       null|      78.0|['Breakfast resta...|                null|        ["Delivery"]|                null|
|0x4cca38bd56e9b69...|  Meron's Restaurant|Meron's Restauran...|44.704376| -73.46721|       null|      83.0|['Bar', 'American...|                null|["Takeout","Deliv...|                null|
|0x4cca477e599e607...|Little Caesar

In [1]:
import pandas as pd

parquet = pd.read_parquet('/home/ezequiell/Descargas/part-00000-0991daf0-ef5c-4869-8d8d-2074d75673e5-c000.snappy.parquet')

        
parquet

Unnamed: 0,gmap_id,name,address,latitude,longitude,num_reviews,avg_rating,category,dining_options,service_options,payments
0,0x405bccf4faffeddb:0x62d94b9cf01c7343,Mellow Mug,"Mellow Mug, 616 Pittsford Victor Rd, Pittsford...",43.062222,-77.479034,88,4.4,"['Coffee shop', 'Bar', 'Bistro', 'Cafe', 'Wine...",,"[""Takeout"",""Delivery""]",
1,0x4065fceb0cc40965:0xb22a3c3f2333007f,GNC,"GNC, 549 6th Ave, New York, NY 10011",40.738014,-73.996620,28,3.7,"['Vitamin & supplements store', 'Health food s...",,"[""In-store pickup"",""In-store shopping""]","[""Checks"",""Debit cards"",""NFC mobile payments""]"
2,0x4065fd2182da474b:0x4ce97af5b7bc6aaa,Hale and Hearty,"Hale and Hearty, 369 Lexington Ave, New York, ...",40.750408,-73.976479,25,4.2,"['Soup shop', 'Caterer', 'Delivery Restaurant'...","[""Dessert"",""Seating""]","[""No-contact delivery"",""Delivery"",""Takeout""]","[""Debit cards"",""NFC mobile payments"",""Credit c..."
3,0x4065fd21ad7ef4b9:0x85921e7c5ca9a854,ManhattanTechSupport.com LLC - IT Support & Se...,ManhattanTechSupport.com LLC - IT Support & Se...,40.752842,-73.984795,115,4.9,"['Computer support and services', 'Business to...",,,
4,0x4cb55591479ba9b1:0xb7414620dd33a3c8,Tonys Ticonderoga Sports Inc,"Tonys Ticonderoga Sports Inc, 1186 NY-9N, Tico...",43.860607,-73.437401,77,4.2,"['Car inspection station', 'ATV dealer', 'Auto...",,,
...,...,...,...,...,...,...,...,...,...,...,...
59507,0x89ef53545fe220fd:0xfb291378fd84cb23,Puff & Putt Family Fun Center,"Puff & Putt Family Fun Center, 659 Montauk Hwy...",41.032173,-71.948837,68,4.5,"['Miniature golf course', 'Boat rental service...",,,
59508,0x89ef53566c137d75:0xae02dd8a1886d1ae,M&R DELI,"M&R DELI, 728 Montauk Hwy, Montauk, NY 11954",41.033882,-71.945023,65,4.5,['Deli'],,"[""Delivery"",""In-store pickup"",""In-store shoppi...",
59509,0x89ef53567024873f:0xa1bcae42b612aa21,Montauk Surf & Sports,"Montauk Surf & Sports, 716 Main Street, Montau...",41.033737,-71.945236,38,3.9,['Sporting goods store'],,"[""Curbside pickup"",""In-store pickup"",""In-store...",
59510,0x89ef54a82b96b90d:0xd3a4f7249d42450d,Herb's Market,"Herb's Market, 778 Montauk Hwy, Montauk, NY 11954",41.035301,-71.942902,24,4.2,['Butcher shop'],,"[""In-store pickup"",""In-store shopping""]","[""NFC mobile payments""]"
