In [0]:
from pyspark.sql.functions import col

In [0]:
def read_hdfs(ruta_hdfs):
    csv = spark.read.format('csv')\
             .option("header", "true")\
             .option("inferSchema", "true")\
             .csv(ruta_hdfs)
    return csv

## Leer los datos almacenados

In [0]:
feature = read_hdfs('/FileStore/tables/features.csv')
sales = read_hdfs('/FileStore/tables/sales.csv')
stores = read_hdfs('/FileStore/tables/stores.csv')

### Crear una tabla por cada uno de los ficheros de datos copiados en HDFS

In [0]:
feature.createOrReplaceTempView('feature')
sales.createOrReplaceTempView('sales')
stores.createOrReplaceTempView('stores')

### Mostrar las cinco primeras filas de cada tabla cargada

In [0]:
# Mostrar las primeras 5 filas de las tablas cargadas
for element in ['feature', 'sales', 'stores']:
    query = f'''SELECT * FROM {element} LIMIT 5''' 
    print(element) 
    spark.sql(query).show()

feature
+-----+----------+-----------+----------+---------+---------+---------+---------+---------+-----------+------------+---------+
|Store|      Date|Temperature|Fuel_Price|MarkDown1|MarkDown2|MarkDown3|MarkDown4|MarkDown5|        CPI|Unemployment|IsHoliday|
+-----+----------+-----------+----------+---------+---------+---------+---------+---------+-----------+------------+---------+
|    1|05/02/2010|      42.31|     2.572|       NA|       NA|       NA|       NA|       NA|211.0963582|       8.106|    false|
|    1|12/02/2010|      38.51|     2.548|       NA|       NA|       NA|       NA|       NA|211.2421698|       8.106|     true|
|    1|19/02/2010|      39.93|     2.514|       NA|       NA|       NA|       NA|       NA|211.2891429|       8.106|    false|
|    1|26/02/2010|      46.63|     2.561|       NA|       NA|       NA|       NA|       NA|211.3196429|       8.106|    false|
|    1|05/03/2010|       46.5|     2.625|       NA|       NA|       NA|       NA|       NA|211.3501429|

### Contar el número de filas de cada tabla

In [0]:
for element in ['feature', 'sales', 'stores']:
    query = f'''SELECT COUNT(*) as {element} FROM {element}''' 
    print(f"El número de filas en {element} es : ")
    spark.sql(query).show()


El número de filas en feature es : 
+-------+
|feature|
+-------+
|   8190|
+-------+

El número de filas en sales es : 
+------+
| sales|
+------+
|421570|
+------+

El número de filas en stores es : 
+------+
|stores|
+------+
|    45|
+------+



### El rango (máximo y mínimo) de cada variable numérica

In [0]:
# Obtener el esquema de cada tabla
def get_number_column_name(table):
    schema = globals()[table].schema
    #Obtener el nombre de las columnas que son númericas
    columName = {table : [str(element).split('(')[1].split(',')[0] for element in schema if 'IntegerType' in str(element) or 'DoubleType' in str(element)]}
    return columName

In [0]:
columnFeature = get_number_column_name('feature')
columnSales = get_number_column_name('sales')
columnStores = get_number_column_name('stores')

In [0]:
for table in [columnFeature, columnSales, columnStores]:
    tableName = list(table.keys())[0]
    print(tableName)
    for column in list(table.values())[0]:
        query = f'''SELECT MIN({column}), MAX({column}) FROM {tableName}''' 
        spark.sql(query).show()

feature
+----------+----------+
|min(Store)|max(Store)|
+----------+----------+
|         1|        45|
+----------+----------+

+----------------+----------------+
|min(Temperature)|max(Temperature)|
+----------------+----------------+
|           -7.29|          101.95|
+----------------+----------------+

+---------------+---------------+
|min(Fuel_Price)|max(Fuel_Price)|
+---------------+---------------+
|          2.472|          4.468|
+---------------+---------------+

sales
+----------+----------+
|min(Store)|max(Store)|
+----------+----------+
|         1|        45|
+----------+----------+

+---------+---------+
|min(Dept)|max(Dept)|
+---------+---------+
|        1|       99|
+---------+---------+

+-----------------+-----------------+
|min(Weekly_Sales)|max(Weekly_Sales)|
+-----------------+-----------------+
|         -4988.94|        693099.36|
+-----------------+-----------------+

stores
+----------+----------+
|min(Store)|max(Store)|
+----------+----------+
|         1