# 1.- Realizar todos los ejercicios propuestos del libro

## Código página 85

In [2]:
# Entry point for the DataFrame and SQL APIs.
from pyspark.sql import SparkSession

# Reuse the active session if there is one; otherwise start a new session
# registered under the application name "SparkSQLExampleApp".
spark = SparkSession.builder.appName("SparkSQLExampleApp").getOrCreate()

In [3]:
# Location of the departure-delays CSV file.
csv_file = "C:/Users/alejandro.perez/Documents/LearningSparkV2-master/databricks-datasets/learning-spark-v2/flights/departuredelays.csv"
# Load the CSV treating its first line as a header and letting Spark infer
# the column types (for larger files an explicit schema may be preferable).
df = (
    spark.read
    .format("csv")
    .options(inferSchema="true", header="true")
    .load(csv_file)
)
# Expose the DataFrame to SQL queries under the name us_delay_flights_tbl.
df.createOrReplaceTempView("us_delay_flights_tbl")

In [4]:
# DDL-style schema string: one backtick-quoted column name and type per entry.
# NOTE(review): the CSV reader in the visible cells uses inferSchema and does
# not consume this string.
schema = ", ".join([
    "`date` STRING",
    "`delay` INT",
    "`distance` INT",
    "`origin` STRING",
    "`destination` STRING",
])


In [5]:
# Flights with distance above 1000, longest first; display the first 10 rows.
long_distance_query = (
    "SELECT distance, origin, destination \n"
    "FROM us_delay_flights_tbl WHERE distance > 1000 \n"
    "ORDER BY distance DESC"
)
spark.sql(long_distance_query).show(10)


+--------+------+-----------+
|distance|origin|destination|
+--------+------+-----------+
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
+--------+------+-----------+
only showing top 10 rows



### Como resultado vemos que los vuelos de mayor distancia (4330) son los que van de Honolulu (HNL) a Nueva York (JFK)

In [6]:
# SFO -> ORD flights with a delay above 120, worst delay first; display the
# first 10 rows.
sfo_ord_delay_query = (
    "SELECT date, delay, origin, destination \n"
    "FROM us_delay_flights_tbl \n"
    "WHERE delay > 120 AND ORIGIN = 'SFO' AND DESTINATION = 'ORD' \n"
    "ORDER by delay DESC"
)
spark.sql(sfo_ord_delay_query).show(10)


+-------+-----+------+-----------+
|   date|delay|origin|destination|
+-------+-----+------+-----------+
|2190925| 1638|   SFO|        ORD|
|1031755|  396|   SFO|        ORD|
|1022330|  326|   SFO|        ORD|
|1051205|  320|   SFO|        ORD|
|1190925|  297|   SFO|        ORD|
|2171115|  296|   SFO|        ORD|
|1071040|  279|   SFO|        ORD|
|1051550|  274|   SFO|        ORD|
|3120730|  266|   SFO|        ORD|
|1261104|  258|   SFO|        ORD|
+-------+-----+------+-----------+
only showing top 10 rows



### Podemos apreciar que hay retrasos significativos entre estas dos ciudades (San Francisco, SFO, y Chicago, ORD) en días diferentes

In [7]:
# Label every flight with a human-readable delay category and display the
# first 10 rows ordered by origin and then by descending delay.
#
# CASE branches are evaluated top to bottom and the first match wins, so each
# branch only needs its lower bound. The earlier bounded ranges
# (e.g. delay > 120 AND delay < 360) left delays of exactly 60, 120 and 360
# unmatched, so those flights fell through to ELSE and were labelled 'Early'.
# NOTE(review): a NULL delay would also reach ELSE and be labelled 'Early';
# confirm the data has no NULL delays.
spark.sql("""SELECT delay, origin, destination,
 CASE
 WHEN delay > 360 THEN 'Very Long Delays'
 WHEN delay > 120 THEN 'Long Delays'
 WHEN delay > 60 THEN 'Short Delays'
 WHEN delay > 0 THEN 'Tolerable Delays'
 WHEN delay = 0 THEN 'No Delays'
 ELSE 'Early'
 END AS Flight_Delays
 FROM us_delay_flights_tbl
 ORDER BY origin, delay DESC""").show(10)


+-----+------+-----------+-------------+
|delay|origin|destination|Flight_Delays|
+-----+------+-----------+-------------+
|  333|   ABE|        ATL|  Long Delays|
|  305|   ABE|        ATL|  Long Delays|
|  275|   ABE|        ATL|  Long Delays|
|  257|   ABE|        ATL|  Long Delays|
|  247|   ABE|        ATL|  Long Delays|
|  247|   ABE|        DTW|  Long Delays|
|  219|   ABE|        ORD|  Long Delays|
|  211|   ABE|        ATL|  Long Delays|
|  197|   ABE|        DTW|  Long Delays|
|  192|   ABE|        ORD|  Long Delays|
+-----+------+-----------+-------------+
only showing top 10 rows



# 2.-GlobalTempView vs TempView

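Las vistas temporales (TempView) están ligadas a la sesión de Spark (SparkSession) que las crea: solo son visibles desde esa sesión y, si la cerramos, perdemos la vista temporal. Las vistas temporales globales (GlobalTempView) están ligadas a la aplicación Spark: son visibles desde todas las sesiones de la misma aplicación, se **registran en la base de datos 'global_temp'** (por lo que se consultan como `global_temp.nombre_vista`) y se eliminan cuando termina la aplicación.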

# 3.-Leer los ficheros AVRO, Parquet, JSON y CSV escritos en el capítulo 3

In [8]:
# Load the mnm.avro file into a DataFrame.
mnmAvro = (
    spark.read
    .format("avro")
    .load("C:/Users/alejandro.perez/Documents/Ejercicios Spark/Tema3 Learning Spark/mnm.avro")
)

In [9]:
# Print the column names and types of the DataFrame read from the Avro file.
mnmAvro.printSchema()

root
 |-- State: string (nullable = true)
 |-- Color: string (nullable = true)
 |-- Count: integer (nullable = true)



In [10]:
# Display the first rows of the Avro-backed DataFrame (show() with no
# arguments prints the top 20 rows).
mnmAvro.show()

+-----+------+-----+
|State| Color|Count|
+-----+------+-----+
|   TX|   Red|   20|
|   NV|  Blue|   66|
|   CO|  Blue|   79|
|   OR|  Blue|   71|
|   WA|Yellow|   93|
|   WY|  Blue|   16|
|   CA|Yellow|   53|
|   WA| Green|   60|
|   OR| Green|   71|
|   TX| Green|   68|
|   NV| Green|   59|
|   AZ| Brown|   95|
|   WA|Yellow|   20|
|   AZ|  Blue|   75|
|   OR| Brown|   72|
|   NV|   Red|   98|
|   WY|Orange|   45|
|   CO|  Blue|   52|
|   TX| Brown|   94|
|   CO|   Red|   82|
+-----+------+-----+
only showing top 20 rows



In [11]:
# Load the mnm.json file into a DataFrame.
mnmJSON = (
    spark.read
    .format("json")
    .load("C:/Users/alejandro.perez/Documents/Ejercicios Spark/Tema3 Learning Spark/mnm.json")
)

In [12]:
# Print the column names and types of the DataFrame read from the JSON file.
mnmJSON.printSchema()

root
 |-- Color: string (nullable = true)
 |-- Count: long (nullable = true)
 |-- State: string (nullable = true)



In [13]:
# Display the first rows of the JSON-backed DataFrame (show() with no
# arguments prints the top 20 rows).
mnmJSON.show()

+------+-----+-----+
| Color|Count|State|
+------+-----+-----+
|   Red|   20|   TX|
|  Blue|   66|   NV|
|  Blue|   79|   CO|
|  Blue|   71|   OR|
|Yellow|   93|   WA|
|  Blue|   16|   WY|
|Yellow|   53|   CA|
| Green|   60|   WA|
| Green|   71|   OR|
| Green|   68|   TX|
| Green|   59|   NV|
| Brown|   95|   AZ|
|Yellow|   20|   WA|
|  Blue|   75|   AZ|
| Brown|   72|   OR|
|   Red|   98|   NV|
|Orange|   45|   WY|
|  Blue|   52|   CO|
| Brown|   94|   TX|
|   Red|   82|   CO|
+------+-----+-----+
only showing top 20 rows



In [17]:
# Load the mnm.csv file into a DataFrame. The first line is treated as data
# rather than as a header, and column types are inferred from the values.
mnmCSV = (
    spark.read
    .format("csv")
    .option("inferSchema", "true")
    .option("header", "false")
    .load("C:/Users/alejandro.perez/Documents/Ejercicios Spark/Tema3 Learning Spark/mnm.csv")
)

In [18]:
# Print the column names and types of the DataFrame read from the CSV file.
mnmCSV.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: integer (nullable = true)



In [19]:
# Display the first rows of the CSV-backed DataFrame (show() with no
# arguments prints the top 20 rows).
mnmCSV.show()

+---+------+---+
|_c0|   _c1|_c2|
+---+------+---+
| TX|   Red| 20|
| NV|  Blue| 66|
| CO|  Blue| 79|
| OR|  Blue| 71|
| WA|Yellow| 93|
| WY|  Blue| 16|
| CA|Yellow| 53|
| WA| Green| 60|
| OR| Green| 71|
| TX| Green| 68|
| NV| Green| 59|
| AZ| Brown| 95|
| WA|Yellow| 20|
| AZ|  Blue| 75|
| OR| Brown| 72|
| NV|   Red| 98|
| WY|Orange| 45|
| CO|  Blue| 52|
| TX| Brown| 94|
| CO|   Red| 82|
+---+------+---+
only showing top 20 rows



In [20]:
# Load the fire.parquet data into a DataFrame.
fireparquet = (
    spark.read
    .format("parquet")
    .load("C:/Users/alejandro.perez/Documents/Ejercicios Spark/Tema3 Learning Spark/Guardar7/fire.parquet")
)

In [21]:
# Print the column names and types of the DataFrame read from the Parquet data.
fireparquet.printSchema()

root
 |-- CallNumber: integer (nullable = true)
 |-- UnitID: string (nullable = true)
 |-- IncidentNumber: integer (nullable = true)
 |-- CallType: string (nullable = true)
 |-- CallFinalDisposition: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Zipcode: integer (nullable = true)
 |-- Battalion: string (nullable = true)
 |-- StationArea: string (nullable = true)
 |-- Box: string (nullable = true)
 |-- OriginalPriority: string (nullable = true)
 |-- Priority: string (nullable = true)
 |-- FinalPriority: integer (nullable = true)
 |-- ALSUnit: boolean (nullable = true)
 |-- CallTypeGroup: string (nullable = true)
 |-- NumAlarms: integer (nullable = true)
 |-- UnitType: string (nullable = true)
 |-- UnitSequenceInCallDispatch: integer (nullable = true)
 |-- FirePreventionDistrict: string (nullable = true)
 |-- SupervisorDistrict: string (nullable = true)
 |-- Neighborhood: string (nullable = true)
 |-- Location: string (nullable =

In [22]:
# Display only the first 5 rows of the Parquet-backed DataFrame.
fireparquet.show(5)

+----------+------+--------------+----------------+--------------------+--------------------+----+-------+---------+-----------+----+----------------+--------+-------------+-------+-------------+---------+--------+--------------------------+----------------------+------------------+------------------+--------------------+-------------+---------------------+-------------------+-------------------+-------------------+
|CallNumber|UnitID|IncidentNumber|        CallType|CallFinalDisposition|             Address|City|Zipcode|Battalion|StationArea| Box|OriginalPriority|Priority|FinalPriority|ALSUnit|CallTypeGroup|NumAlarms|UnitType|UnitSequenceInCallDispatch|FirePreventionDistrict|SupervisorDistrict|      Neighborhood|            Location|        RowID|ResponseDelayedinMins|       IncidentDate|        OnWatchDate|      AvailableDtTS|
+----------+------+--------------+----------------+--------------------+--------------------+----+-------+---------+-----------+----+----------------+--------+-