# Ejercicio 1

In [1]:
#Para convertir path linux en uri's
from pathToUri import toUri
import os
import findspark
# Pass SPARK_HOME when it is set; when it is not, hand findspark a None so it
# runs its own Spark lookup (and reports a descriptive error if that fails)
# instead of this cell dying with a bare KeyError on the environment lookup.
findspark.init(os.environ.get('SPARK_HOME'))

In [2]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql import Window

# Get or create the Spark session for this notebook, under the application name 'cars'.
spark = (
    SparkSession.builder
    .appName('cars')
    .getOrCreate()
)

# Base location of the input data, passed through toUri (the helper imported
# above for turning Linux paths into URIs).
ruta = toUri("../../../pec2/")

In [3]:
# Load cars.csv as a DataFrame: semicolon-separated, first row holds the
# column names, column types inferred from the data.
cars = (
    spark.read
    .format('csv')
    .options(sep=";", header='true', inferSchema='true')
    .load(ruta + "/cars.csv")
)

In [4]:
# Preview the first 5 rows
cars.show(n=5)

+--------------------+----+---------+------------+----------+------+------------+-----+------+
|                 Car| MPG|Cylinders|Displacement|Horsepower|Weight|Acceleration|Model|Origin|
+--------------------+----+---------+------------+----------+------+------------+-----+------+
|Chevrolet Chevell...|18.0|        8|       307.0|     130.0|  3504|        12.0|   70|    US|
|   Buick Skylark 320|15.0|        8|       350.0|     165.0|  3693|        11.5|   70|    US|
|  Plymouth Satellite|18.0|        8|       318.0|     150.0|  3436|        11.0|   70|    US|
|       AMC Rebel SST|16.0|        8|       304.0|     150.0|  3433|        12.0|   70|    US|
|         Ford Torino|17.0|        8|       302.0|     140.0|  3449|        10.5|   70|    US|
+--------------------+----+---------+------------+----------+------+------------+-----+------+
only showing top 5 rows



In [5]:
# Print the DataFrame schema (column names, types and nullability)
cars.printSchema()

root
 |-- Car: string (nullable = true)
 |-- MPG: double (nullable = true)
 |-- Cylinders: integer (nullable = true)
 |-- Displacement: double (nullable = true)
 |-- Horsepower: double (nullable = true)
 |-- Weight: decimal(4,0) (nullable = true)
 |-- Acceleration: double (nullable = true)
 |-- Model: integer (nullable = true)
 |-- Origin: string (nullable = true)



In [6]:
# Show the "Car" and "Cylinders" columns of the vehicles whose Origin is "Europe".
# The filter is applied before the projection so that it never refers to a
# column ("Origin") that the select has already dropped.
(
    cars
    .filter(F.col("Origin") == "Europe")
    .select("Car", "Cylinders")
    .show()
)

+--------------------+---------+
|                 Car|Cylinders|
+--------------------+---------+
|Citroen DS-21 Pallas|        4|
|Volkswagen 1131 D...|        4|
|         Peugeot 504|        4|
|         Audi 100 LS|        4|
|            Saab 99e|        4|
|            BMW 2002|        4|
|Volkswagen Super ...|        4|
|           Opel 1900|        4|
|         Peugeot 304|        4|
|           Fiat 124B|        4|
|Volkswagen Model 111|        4|
|   Volkswagen Type 3|        4|
|     Volvo 145e (sw)|        4|
| Volkswagen 411 (sw)|        4|
|    Peugeot 504 (sw)|        4|
|     Renault 12 (sw)|        4|
|Volkswagen Super ...|        4|
|Fiat 124 Sport Coupe|        4|
|            Fiat 128|        4|
|          Opel Manta|        4|
+--------------------+---------+
only showing top 20 rows



In [7]:
# Average "Horsepower", "Weight" and "Acceleration" for each "Origin"
cars.groupBy("Origin").agg(
    F.mean("Horsepower").alias("Media Horsepower"),
    F.mean("Weight").alias("Media Weight"),
    F.mean("Acceleration").alias("Media Acceleration"),
).show()

+------+------------------+------------+------------------+
|Origin|  Media Horsepower|Media Weight|Media Acceleration|
+------+------------------+------------+------------------+
|Europe| 78.78082191780823|   2431.4932| 16.82191780821918|
|    US|118.01181102362204|   3372.7008|14.942519685039361|
| Japan| 79.83544303797468|   2221.2278|16.172151898734175|
+------+------------------+------------+------------------+



In [8]:
# Add the power-to-weight ratio (Horsepower / Weight) as the column "RatioPotPeso";
# its mean per cylinder count is computed in the following cell.
cars = cars.withColumn(
    "RatioPotPeso",
    F.col("Horsepower") / F.col("Weight"),
)

In [9]:
# Mean power-to-weight ratio for each cylinder count
(
    cars
    .groupBy("Cylinders")
    .agg(F.mean("RatioPotPeso").alias("Media Ratio Potencia Peso por Cilindros"))
    .show()
)

+---------+---------------------------------------+
|Cylinders|Media Ratio Potencia Peso por Cilindros|
+---------+---------------------------------------+
|        6|                    0.03163238614784404|
|        3|                   0.041441818290915214|
|        5|                   0.026973551761855744|
|        4|                   0.033217183371144564|
|        8|                    0.03872856292574305|
+---------+---------------------------------------+



In [10]:
# Queremos tener información de la potencia (Horsepower) de los vehículos según su cilindrada; para ello hay que elaborar el ranking de potencia de los vehículos por cilindros. Se necesita presentar los dos primeros vehículos de cada cilindrada.

In [40]:
# Window over the vehicles that share a cylinder count, ordered from highest to
# lowest Horsepower, with a row frame from the start of the partition up to the
# current row.
windowsSpec = (
    Window
    .partitionBy("Cylinders")
    .orderBy(F.desc("Horsepower"))
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

In [41]:
# NOTE: this import rebinds the name `max` to pyspark's aggregate function, which
# hides Python's built-in max() for every cell executed after this one. The same
# functions are also reachable through the `F` alias imported near the top.
from pyspark.sql.functions import max, col, rank

In [42]:
# Running maximum of Horsepower over windowsSpec. Because that window is ordered
# by Horsepower descending, this is the top Horsepower of each Cylinders group.
# F.max is named explicitly so the expression never resolves to Python's
# built-in max(), which would fail on a Column if the shadowing import had not
# been executed first.
maxHorsepower = F.max(cars["Horsepower"]).over(windowsSpec)

In [43]:
# Cylinder count and Horsepower of each vehicle next to its group's maximum
cars.select(
    "Cylinders",
    "Horsepower",
    maxHorsepower.alias('MaxHorsepower'),
).show(20)

+---------+----------+-------------+
|Cylinders|Horsepower|MaxHorsepower|
+---------+----------+-------------+
|        6|     165.0|        165.0|
|        6|     133.0|        165.0|
|        6|     132.0|        165.0|
|        6|     125.0|        165.0|
|        6|     122.0|        165.0|
|        6|     120.0|        165.0|
|        6|     120.0|        165.0|
|        6|     120.0|        165.0|
|        6|     116.0|        165.0|
|        6|     115.0|        165.0|
|        6|     115.0|        165.0|
|        6|     115.0|        165.0|
|        6|     112.0|        165.0|
|        6|     110.0|        165.0|
|        6|     110.0|        165.0|
|        6|     110.0|        165.0|
|        6|     110.0|        165.0|
|        6|     110.0|        165.0|
|        6|     110.0|        165.0|
|        6|     110.0|        165.0|
+---------+----------+-------------+
only showing top 20 rows



In [44]:
# Rank of each vehicle by Horsepower (highest first) within its Cylinders group
rankHorsepower = F.rank().over(windowsSpec)

In [51]:
# Each vehicle with its Horsepower, its group's maximum and its rank in the group
cars.select(
    "Cylinders",
    "Horsepower",
    "Car",
    maxHorsepower.alias('MaxHorsepower'),
    rankHorsepower.alias('rankingHorsepower'),
).show()

+---------+----------+--------------------+-------------+-----------------+
|Cylinders|Horsepower|                 Car|MaxHorsepower|rankingHorsepower|
+---------+----------+--------------------+-------------+-----------------+
|        6|     165.0|Buick Regal Sport...|        165.0|                1|
|        6|     133.0|       Peugeot 604sl|        165.0|                2|
|        6|     132.0|       Datsun 280-ZX|        165.0|                3|
|        6|     125.0|         Volvo 264gl|        165.0|                4|
|        6|     122.0|      Toyota Mark II|        165.0|                5|
|        6|     120.0|  Mercedes-Benz 280s|        165.0|                6|
|        6|     120.0|     AMC Concord d/l|        165.0|                6|
|        6|     120.0|   Datsun 810 Maxima|        165.0|                6|
|        6|     116.0|     Toyota Cressida|        165.0|                9|
|        6|     115.0|   Pontiac Lemans V6|        165.0|               10|
|        6| 

In [56]:
# Vehicles holding Horsepower rank 1 or 2 within their cylinder count.
# rank() gives tied vehicles the same rank, so a group can yield more than two
# rows (and rank 2 is skipped when several vehicles tie for rank 1).
# Everything goes through the F alias, so the cell does not depend on the bare
# `col` name from the later import.
(
    cars
    .select("Cylinders", "Horsepower", "Car", rankHorsepower.alias('rankHorsepower'))
    .where(F.col('rankHorsepower').isin(1, 2))
    .show()
)

+---------+----------+--------------------+--------------+
|Cylinders|Horsepower|                 Car|rankHorsepower|
+---------+----------+--------------------+--------------+
|        6|     165.0|Buick Regal Sport...|             1|
|        6|     133.0|       Peugeot 604sl|             2|
|        3|     110.0|          Mazda RX-4|             1|
|        3|     100.0|       Mazda RX-7 GS|             2|
|        5|     103.0|           Audi 5000|             1|
|        5|      77.0|  Mercedes Benz 300d|             2|
|        4|     115.0|Citroen DS-21 Pallas|             1|
|        4|     115.0|           Saab 99LE|             1|
|        4|     115.0|          Saab 99gle|             1|
|        8|     230.0|  Pontiac Grand Prix|             1|
|        8|     225.0|    Pontiac Catalina|             2|
|        8|     225.0|Buick Estate Wago...|             2|
|        8|     225.0|Buick Electra 225...|             2|
+---------+----------+--------------------+-------------