In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit
from datetime import datetime
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DecimalType

In [2]:
# Use the public IPython.display API rather than the internal
# IPython.core.display module, and import `display` explicitly so the cell
# does not depend on the kernel injecting it into the namespace.
from IPython.display import HTML, display

# Inject a CSS rule so <pre> output is never wrapped: wide text tables such as
# DataFrame.show() output keep one row per line.
display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [3]:
# Directory passed to Spark as "spark.sql.warehouse.dir" when the session is built.
warehouse_location = "/files"

In [4]:
# Build (or reuse) the Hive-enabled Spark session used by the rest of the
# notebook, requesting `warehouse_location` as the SQL warehouse directory.
# NOTE(review): the recorded run logged "Using an existing Spark session; only
# runtime SQL configurations will take effect", so the warehouse setting may
# not have been applied in that run -- confirm.
spark = (
    SparkSession.builder
    .appName("App de Spark para QQP")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .enableHiveSupport()
    .getOrCreate()
)

24/05/29 20:52:17 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [5]:
# Echo the session object as the cell's output (bare last expression).
spark

In [6]:
# List the contents of /user/QQP on HDFS, keeping only the first 6 lines of output.
!hdfs dfs -ls /user/QQP | head -n 6

Found 452 items
-rw-r--r--   2 root hadoop  180751333 2024-05-28 16:20 /user/QQP/01-2023_01.csv
-rw-r--r--   2 root hadoop  234811063 2024-05-28 16:14 /user/QQP/01-2023_02.csv
-rw-r--r--   2 root hadoop  178013160 2024-05-28 15:47 /user/QQP/01-2024_01.csv
-rw-r--r--   2 root hadoop  226158794 2024-05-28 15:46 /user/QQP/01-2024_02.csv
-rw-r--r--   2 root hadoop   16997440 2024-05-28 15:45 /user/QQP/012015.csv


In [7]:
%%time
# Print the directory count, file count and total content size (bytes) under /user/QQP.
!hdfs dfs -count /user/QQP

           1          452        42911913094 /user/QQP
CPU times: user 47.7 ms, sys: 16.8 ms, total: 64.5 ms
Wall time: 3.62 s


In [8]:
# Explicit schema for the header-less QQP CSV files: fifteen nullable columns,
# in file order. Text columns are strings, PRECIO is a 2-decimal value and the
# coordinates are 6-decimal values.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("PRODUCTO", StringType()),
        ("PRESENTACIÓN", StringType()),
        ("MARCA", StringType()),
        ("CATEGORÍA", StringType()),
        ("CATÁLOGO", StringType()),
        ("PRECIO", DecimalType(18, 2)),
        ("FECHAREGISTRO", StringType()),
        ("CADENACOMERCIAL", StringType()),
        ("GIRO", StringType()),
        ("NOMBRECOMERCIAL", StringType()),
        ("DIRECCIÓN", StringType()),
        ("ESTADO", StringType()),
        ("MUNICIPIO", StringType()),
        ("LATITUD", DecimalType(18, 6)),
        ("LONGITUD", DecimalType(18, 6)),
    )
])

In [9]:
# Read every CSV under /user/QQP/ as comma-separated, header-less data using
# the explicit schema, so no schema inference is requested.
df = (
    spark.read
    .schema(schema)
    .option("sep", ",")
    .option("header", False)
    .csv('/user/QQP/')
)

In [10]:
# Get the list of input files
file_list = df.inputFiles()
print("Number of input files: ", len(file_list))

Number of input files:  452


In [11]:
%%time
df.count()

[Stage 2:>                                                          (0 + 1) / 1]

CPU times: user 292 ms, sys: 34.5 ms, total: 327 ms
Wall time: 3min 40s


                                                                                

136472358

In [12]:
# Preview the first 10 rows as a text table.
df.show(n=10)

                                                                                

+---------------+--------------------+--------------+--------------------+------------+------+-------------+---------------+--------------------+--------------------+--------------------+--------------+--------------+---------+-----------+
|       PRODUCTO|        PRESENTACIÓN|         MARCA|           CATEGORÍA|    CATÁLOGO|PRECIO|FECHAREGISTRO|CADENACOMERCIAL|                GIRO|     NOMBRECOMERCIAL|           DIRECCIÓN|        ESTADO|     MUNICIPIO|  LATITUD|   LONGITUD|
+---------------+--------------------+--------------+--------------------+------------+------+-------------+---------------+--------------------+--------------------+--------------------+--------------+--------------+---------+-----------+
|        A.S.COR|FRASCO GOTERO 24 ...|           S/M|        MEDICAMENTOS|MEDICAMENTOS|389.00|   2023-01-02|   MEGA SORIANA|SUPERMERCADO / TI...|MEGA SORIANA SUCU...|AGUASCALIENTES NT...|AGUASCALIENTES|AGUASCALIENTES|21.916001|-102.290779|
|         ACEITE|BOTELLA 1 LT. VEG...|  

In [13]:
# Persist the DataFrame as the table 'QQP', replacing any existing table of
# that name (overwrite mode).
df.write.saveAsTable('QQP', mode='overwrite')

ivysettings.xml file not found in HIVE_HOME or HIVE_CONF_DIR,/etc/hive/conf.dist/ivysettings.xml will be used
24/05/29 21:08:25 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.


In [14]:
# Shut down the Spark session now that the table has been written.
spark.stop()