In [0]:
from pyspark.sql.functions import split, col

Se realiza la función para  porder realizar la lectura de los archivos csv en spark

In [0]:
def read_hdfs(ruta_hdfs):
    csv = spark.read.format('csv')\
             .option("header", "true")\
             .option("inferSchema", "true")\
             .csv(ruta_hdfs)
    return csv

##### Se leen  los datos que se almacenaron en la carpeta de FileStore/tables de databricks

In [0]:
feature = read_hdfs('/FileStore/tables/features.csv')
sales = read_hdfs('/FileStore/tables/sales.csv')
stores = read_hdfs('/FileStore/tables/stores.csv')

##### Crear una tabla por cada uno de los ficheros de datos copiados en HDFS

In [0]:
feature.createOrReplaceTempView('feature')
sales.createOrReplaceTempView('sales')
stores.createOrReplaceTempView('stores')

In [0]:
# Cambiar las columnas que deben tener un valor numérico
feature = feature.withColumn("CPI",feature.CPI.cast('double'))
feature = feature.withColumn("Unemployment",feature.Unemployment.cast('double'))
sales = sales.withColumn("Date",sales.Date.cast('date'))

### Mostrar las cinco primeras filas de cada tabla cargada

In [0]:
# Mostrar las primeras 5 filas de las tablas
for element in ['feature', 'sales', 'stores']:
    query = f'''SELECT * FROM {element} LIMIT 5''' 
    print("Query utilizada para mostrar las 5 primeras filas de" , element, ":" , query)
    print("Tabla: ", element) 
    spark.sql(query).show()

Query utilizada para mostrar las 5 primeras filas de feature : SELECT * FROM feature LIMIT 5
Tabla:  feature
+-----+----------+-----------+----------+---------+---------+---------+---------+---------+-----------+------------+---------+
|Store|      Date|Temperature|Fuel_Price|MarkDown1|MarkDown2|MarkDown3|MarkDown4|MarkDown5|        CPI|Unemployment|IsHoliday|
+-----+----------+-----------+----------+---------+---------+---------+---------+---------+-----------+------------+---------+
|    1|05/02/2010|      42.31|     2.572|       NA|       NA|       NA|       NA|       NA|211.0963582|       8.106|    false|
|    1|12/02/2010|      38.51|     2.548|       NA|       NA|       NA|       NA|       NA|211.2421698|       8.106|     true|
|    1|19/02/2010|      39.93|     2.514|       NA|       NA|       NA|       NA|       NA|211.2891429|       8.106|    false|
|    1|26/02/2010|      46.63|     2.561|       NA|       NA|       NA|       NA|       NA|211.3196429|       8.106|    false|
| 

#### Se muestra el query utilizado para contar el numero de filas por tabla, así como el resultado de dicha consulta

In [0]:
for element in ['feature', 'sales', 'stores']:
    query = f'''SELECT COUNT(*) as {element} FROM {element}''' 
    print("Query utilizada para mostrar el numero de filas de la tabla" , element, ":" , query)
    print(f"El número de filas en {element} es : ")
    spark.sql(query).show()


Query utilizada para mostrar el numero de filas de la tabla feature : SELECT COUNT(*) as feature FROM feature
El número de filas en feature es : 
+-------+
|feature|
+-------+
|   8190|
+-------+

Query utilizada para mostrar el numero de filas de la tabla sales : SELECT COUNT(*) as sales FROM sales
El número de filas en sales es : 
+------+
| sales|
+------+
|421570|
+------+

Query utilizada para mostrar el numero de filas de la tabla stores : SELECT COUNT(*) as stores FROM stores
El número de filas en stores es : 
+------+
|stores|
+------+
|    45|
+------+



#### En los siguientes comandos se muestra el proceso que se siguio para obtener las variables numericas y despues obtener el minimo y máximo de cada una de estas

Primero mostraremos las variables por cada tabla

In [0]:
#Se imprime cada una de el tipo de variable de cada una de las tablas
feature.printSchema()
sales.printSchema()
stores.printSchema()

root
 |-- Store: integer (nullable = true)
 |-- Date: string (nullable = true)
 |-- Temperature: double (nullable = true)
 |-- Fuel_Price: double (nullable = true)
 |-- MarkDown1: string (nullable = true)
 |-- MarkDown2: string (nullable = true)
 |-- MarkDown3: string (nullable = true)
 |-- MarkDown4: string (nullable = true)
 |-- MarkDown5: string (nullable = true)
 |-- CPI: double (nullable = true)
 |-- Unemployment: double (nullable = true)
 |-- IsHoliday: boolean (nullable = true)

root
 |-- Store: integer (nullable = true)
 |-- Dept: integer (nullable = true)
 |-- Date: date (nullable = true)
 |-- Weekly_Sales: double (nullable = true)
 |-- IsHoliday: boolean (nullable = true)

root
 |-- Store: integer (nullable = true)
 |-- Type: string (nullable = true)
 |-- Size: integer (nullable = true)



In [0]:
# Se crea la función para obtener todas las variables numericas de cada tabla
def get_number_column_name(table):
    schema = globals()[table].schema
    #Obtener el nombre de las columnas que son númericas
    columName = {table : [str(element).split('(')[1].split(',')[0] for element in schema if 'IntegerType' in str(element) or 'DoubleType' in str(element)]}
    return columName

In [0]:
#Se obtienen las variables numericas
columnFeature = get_number_column_name('feature')
columnSales = get_number_column_name('sales')
columnStores = get_number_column_name('stores')

En el siguiente comando se muestra la query utilizada para obtener el minimo y maximo de cada variable numerica

In [0]:
for table in [columnFeature, columnSales, columnStores]:
    tableName = list(table.keys())[0]
    print(tableName)
    for column in list(table.values())[0]:
        query = f'''SELECT MIN({column}) as Min_{column}, MAX(case when {column}='NA' then null else cast( {column} as double) end ) as Max_{column} FROM {tableName} ''' 
        #En caso de que existan NA es las variables numericas se modificaran a null para poder obtener el mayor numero
        print("La query utilizada para obtener el minimo y maximo de la varible", column, "de la tabla", tableName, "es:", query)
        spark.sql(query).show()

feature
La query utilizada para obtener el minimo y maximo de la varible Store de la tabla feature es: SELECT MIN(Store) as Min_Store, MAX(case when Store='NA' then null else cast( Store as double) end ) as Max_Store FROM feature 
+---------+---------+
|Min_Store|Max_Store|
+---------+---------+
|        1|     45.0|
+---------+---------+

La query utilizada para obtener el minimo y maximo de la varible Temperature de la tabla feature es: SELECT MIN(Temperature) as Min_Temperature, MAX(case when Temperature='NA' then null else cast( Temperature as double) end ) as Max_Temperature FROM feature 
+---------------+---------------+
|Min_Temperature|Max_Temperature|
+---------------+---------------+
|          -7.29|         101.95|
+---------------+---------------+

La query utilizada para obtener el minimo y maximo de la varible Fuel_Price de la tabla feature es: SELECT MIN(Fuel_Price) as Min_Fuel_Price, MAX(case when Fuel_Price='NA' then null else cast( Fuel_Price as double) end ) as Max_

### Estudiar las diferentes categorías de las principales variables categóricas y el número de filas correspondientes a cada categoría.

Las variables categoricas según (suport, minilab, sf) son valores de una variable categórica son categorías o grupos mutuamente excluyentes.
Partiendo de esa premisa se extrae las variables categoricas de las tablas dejando la siguiente estructura:
  - **sales:** IsHoliday
  - **feature:** IsHoliday
  - **stores:** Type

In [0]:
categories = { 'feature': ['IsHoliday'],'sales': ['IsHoliday'], 'stores':['Type']}

In [0]:
for element in categories:
    print(element)
    for category in categories[element]:
        query = f'''SELECT {category}, COUNT(*) as CountValues  FROM {element} GROUP BY {category}''' 
        print("La query utilizada para obtener el numero de filas de la varible categórica", category , "de la tabla", element, "es:", query)
        print(f"El número de filas en {element} es : ")
        spark.sql(query).show()


feature
La query utilizada para obtener el numero de filas de la varible categórica IsHoliday de la tabla feature es: SELECT IsHoliday, COUNT(*) as CountValues  FROM feature GROUP BY IsHoliday
El número de filas en feature es : 
+---------+-----------+
|IsHoliday|CountValues|
+---------+-----------+
|     true|        585|
|    false|       7605|
+---------+-----------+

sales
La query utilizada para obtener el numero de filas de la varible categórica IsHoliday de la tabla sales es: SELECT IsHoliday, COUNT(*) as CountValues  FROM sales GROUP BY IsHoliday
El número de filas en sales es : 
+---------+-----------+
|IsHoliday|CountValues|
+---------+-----------+
|     true|      29661|
|    false|     391909|
+---------+-----------+

stores
La query utilizada para obtener el numero de filas de la varible categórica Type de la tabla stores es: SELECT Type, COUNT(*) as CountValues  FROM stores GROUP BY Type
El número de filas en stores es : 
+----+-----------+
|Type|CountValues|
+----+------

### Buscar valores inexistentes o anómalos

#### Inexistentes

In [0]:
for element in ['feature', 'sales', 'stores']:
    print(element)
    columns = [element[0] for element in spark.sql(f'''SHOW COLUMNS FROM {element}''').collect()]
    for columName in columns:
        print(columName)
        query = f''' SELECT COUNT({columName}) as Filas_Totales, SUM(CASE WHEN {columName} IS NULL then 1 else 0 end) as NullCount, SUM(CASE WHEN {columName} = 'NA' then 1 else 0 end) as NACount FROM {element};''' 
        print(query)
        spark.sql(query).show()

feature
Store
 SELECT COUNT(Store) as Filas_Totales, SUM(CASE WHEN Store IS NULL then 1 else 0 end) as NullCount, SUM(CASE WHEN Store = 'NA' then 1 else 0 end) as NACount FROM feature;
+-------------+---------+-------+
|Filas_Totales|NullCount|NACount|
+-------------+---------+-------+
|         8190|        0|      0|
+-------------+---------+-------+

Date
 SELECT COUNT(Date) as Filas_Totales, SUM(CASE WHEN Date IS NULL then 1 else 0 end) as NullCount, SUM(CASE WHEN Date = 'NA' then 1 else 0 end) as NACount FROM feature;
+-------------+---------+-------+
|Filas_Totales|NullCount|NACount|
+-------------+---------+-------+
|         8190|        0|      0|
+-------------+---------+-------+

Temperature
 SELECT COUNT(Temperature) as Filas_Totales, SUM(CASE WHEN Temperature IS NULL then 1 else 0 end) as NullCount, SUM(CASE WHEN Temperature = 'NA' then 1 else 0 end) as NACount FROM feature;
+-------------+---------+-------+
|Filas_Totales|NullCount|NACount|
+-------------+---------+------

#### Anomalos
En base al documento anexos los valores anomalos son: 

**Feature**: Temperature (Al no especificar la procedencia de los datos, asumiremos que corresponden a una temperatura con grados Fahrenheit por lo cual los grados anomalos podrían ser los menores a **10 grados**

In [0]:
query = f'''SELECT COUNT(*) as Cantidad_Datos_Anomalos FROM feature WHERE Temperature < 10''' 
spark.sql(query).show()

+-----------------------+
|Cantidad_Datos_Anomalos|
+-----------------------+
|                     17|
+-----------------------+



**sales**: Weekly_Sales (No es posible que exista ventas en negativo)

In [0]:
query = f'''SELECT COUNT(*) as Cantidad_Datos_Anomalos FROM sales WHERE Weekly_Sales < 0;''' 
spark.sql(query).show()

+-----------------------+
|Cantidad_Datos_Anomalos|
+-----------------------+
|                   1285|
+-----------------------+



#### Se realiza una operación de join con los comandos de sql uniendo la tabla stores con features por la variable store y uniendo la variable feature con sales por la variable store

In [0]:
#Feature sales
query = f'''SELECT 
            stores.Type, 
            feature.IsHoliday, 
            sales.Date, 
            sales.Dept, 
            sales.Weekly_Sales 
            FROM stores 
            INNER JOIN feature 
             ON stores.Store = feature.Store 
            INNER JOIN sales 
             ON feature.Store = sales.Store 
             and feature.Date =sales.Date
            WHERE Weekly_Sales > 0 ''' 
#Join y creación de vista
data_report = spark.sql(query)
data_report = data_report.withColumn("Month", split(col("Date"), "/").getItem(1)).withColumn("Year", split(col("Date"), "/").getItem(2))
data_report.createOrReplaceTempView('data_report')

Se muestra la información de la tabla creada denominada "data_report"

In [0]:
#Probar vista creada
spark.sql(f'''SELECT * FROM data_report''').show()

+----+---------+----------+----+------------+-----+----+
|Type|IsHoliday|      Date|Dept|Weekly_Sales|Month|Year|
+----+---------+----------+----+------------+-----+----+
|   A|    false|05/02/2010|   1|     24924.5|   02|2010|
|   A|    false|02/04/2010|   1|    57258.43|   04|2010|
|   A|     true|02/04/2010|   1|    57258.43|   04|2010|
|   A|    false|19/03/2010|   1|    22136.64|   03|2010|
|   A|    false|26/03/2010|   1|    26229.21|   03|2010|
|   A|    false|09/04/2010|   1|    42960.91|   04|2010|
|   A|    false|16/04/2010|   1|    17596.96|   04|2010|
|   A|     true|12/02/2010|   1|    46039.49|   02|2010|
|   A|    false|12/03/2010|   1|    21043.39|   03|2010|
|   A|     true|19/02/2010|   1|    41595.55|   02|2010|
|   A|     true|19/03/2010|   1|    22136.64|   03|2010|
|   A|     true|12/03/2010|   1|    21043.39|   03|2010|
|   A|     true|26/02/2010|   1|    19403.54|   02|2010|
|   A|     true|09/04/2010|   1|    42960.91|   04|2010|
|   A|    false|26/02/2010|   1

## Ejercicio 

A la compañía en cuestión le gustaría, tras un primer análisis exploratorio sencillo, deducir alguna información interesante y que les pueda dar pistas sobre qué tal funcionan las ventas en cada tienda y en los departamentos de estas, cómo evolucionan las ventas a lo largo del año.

In [0]:
# Librerias para reporte 
import pandas as pd
import plotly.express as px


### Ventas en cada tipo de tienda por años

In [0]:
query = f'''SELECT Year, Type,  SUM(Weekly_Sales)  AS Tota_Sale_Year, MAX(Weekly_Sales) AS Higher_sales, MIN(Weekly_Sales) AS Lower_Sales, AVG(Weekly_Sales) AS Average_Sales
 FROM data_report GROUP BY Type, Year ORDER BY Year, Type  ASC;'''
ventas_anio = spark.sql(query)
ventas_anio.show()
df_ventas_anio = ventas_anio.toPandas()

+----+----+--------------------+------------+-----------+------------------+
|Year|Type|      Tota_Sale_Year|Higher_sales|Lower_Sales|     Average_Sales|
+----+----+--------------------+------------+-----------+------------------+
|2010|   A|2.667120366929903E11|    474330.1|       0.01|20375.839477487607|
|2010|   B|1.255779099866005...|   693099.36|       0.02|12675.224417846517|
|2010|   C|2.429329003009659...|   100712.42|       0.01| 9598.016506074731|
|2011|   A|2.872364877786299E11|   392023.02|       0.01|20159.959894259977|
|2011|   B|1.317927103730288...|   649770.18|       0.02|12233.883044388138|
|2011|   C|2.654913939313075...|   112152.35|       0.01| 9433.160389934636|
|2012|   A|2.343039366213181E11|   224917.94|       0.01| 19880.84730368465|
|2012|   B|1.067643596382634E11|   233140.32|       0.01|11932.818564886064|
|2012|   C|2.296003054435597E10|    110668.4|       0.02| 9635.225997095993|
+----+----+--------------------+------------+-----------+------------------+

In [0]:
df_ventas_line = df_ventas_anio[['Year', 'Type', 'Tota_Sale_Year']]
fig = px.line(df_ventas_line, x="Year", y="Tota_Sale_Year", title='Ventas por tipo de tienda cada año' , color='Type')
fig.show()

In [0]:
fig = px.bar(df_ventas_line.groupby(['Type']).sum().reset_index(), x='Type', y='Tota_Sale_Year', title='Ventas por tipo de tienda')
fig.show()

In [0]:
fig = px.bar(df_ventas_line.groupby(['Year']).sum().reset_index(), x='Year', y='Tota_Sale_Year', title='Ventas por año')
fig.show()

### Ventas por mes

In [0]:
query = f'''SELECT Year, Month, SUM(Weekly_Sales) AS Tota_Sale_Year, MAX(Weekly_Sales) AS Higher_sales, MIN(Weekly_Sales) AS Lower_Sales, AVG(Weekly_Sales) AS Average_Sales
 FROM data_report GROUP BY Month, Year ORDER BY Year, Month  ASC;'''
ventas_month = spark.sql(query)
ventas_month.show()
df_ventas_month = ventas_month.toPandas()

+----+-----+--------------------+------------+-----------+------------------+
|Year|Month|      Tota_Sale_Year|Higher_sales|Lower_Sales|     Average_Sales|
+----+-----+--------------------+------------+-----------+------------------+
|2010|   02| 3.806656513399999E8|   293966.05|       0.01|16169.639424857698|
|2010|   03| 3.638351836600001E8|   214383.07|       0.01|15507.424075526387|
|2010|   04|4.6281418883999974E8|   203457.42|       0.01|15824.871395746417|
|2010|   05| 3.734159322599999E8|   206160.36|       0.01|16116.354435045312|
|2010|   06|      3.8448945698E8|   194723.71|       0.03|16592.847271707233|
|2010|   07|4.6515327009999996E8|   198349.17|       0.01|16049.729835760125|
|2010|   08|3.7527143748000014E8|   204695.13|        0.1|16242.704184556793|
|2010|   09|      3.5452881574E8|   205314.67|       0.28| 15214.52303407433|
|2010|   10|4.3432655021999985E8|   210596.66|       0.01|14866.051143893752|
|2010|   11| 4.057066961200001E8|   693099.36|       0.25| 17403

In [0]:
df_ventas_month = df_ventas_month[['Year', 'Month', 'Tota_Sale_Year']]
fig = px.line(df_ventas_month.sort_values(by=['Month']), x="Month", y="Tota_Sale_Year", title='Ventas por mes cada año' , color='Year')
fig.show()

In [0]:
fig = px.bar(df_ventas_month.groupby(['Month']).sum().reset_index(), x='Month', y='Tota_Sale_Year', title='Ventas por mes')
fig.show()

### Ventas por tipo, mes y año

In [0]:
query = f'''SELECT Year, Type, Month, SUM(Weekly_Sales) AS Tota_Sale_Year, MAX(Weekly_Sales) AS Higher_sales, MIN(Weekly_Sales) AS Lower_Sales, AVG(Weekly_Sales) AS Average_Sales
 FROM data_report GROUP BY Month, Year, Type ORDER BY Year, Month, Type  ASC;'''
ventas_month_year = spark.sql(query)
ventas_month_year.show()
df_ventas_month_year = ventas_month_year.toPandas()

+----+----+-----+--------------------+------------+-----------+------------------+
|Year|Type|Month|      Tota_Sale_Year|Higher_sales|Lower_Sales|     Average_Sales|
+----+----+-----+--------------------+------------+-----------+------------------+
|2010|   A|   02|2.4427103737999994E8|   293966.05|       0.01|  20284.9225527321|
|2010|   B|   02|1.1402183407999995E8|   232558.51|       0.94| 12412.56630524711|
|2010|   C|   02|2.2372779880000003E7|    95351.96|       0.02| 9668.444200518583|
|2010|   A|   03|2.3203036768000022E8|   214383.07|       0.01| 19335.86397333335|
|2010|   B|   03|1.0968618406000002E8|   191989.54|       0.94|12026.993866228073|
|2010|   C|   03|       2.211863192E7|    95079.18|       0.01| 9444.334722459436|
|2010|   A|   04| 2.959861897200003E8|   203457.42|       0.01| 19785.17310962569|
|2010|   B|   04|1.3899400318000013E8|   145589.34|        0.1|12220.327341304741|
|2010|   C|   04|2.7833995940000005E7|    92534.76|        0.1| 9558.377726648354|
|201

In [0]:
fig = px.line(df_ventas_month_year.sort_values(by=['Month']), x="Month", y="Tota_Sale_Year", title='Ventas por mes cada mes por tipo de tienda' , color='Type')
fig.show()