In [1]:
! sudo apt-get update
! sudo mkdir -p /usr/share/man/man1
! sudo apt-get install -y openjdk-11-jdk
! pip install pyspark

Get:1 http://deb.debian.org/debian buster InRelease [122 kB]
Get:2 http://deb.debian.org/debian-security buster/updates InRelease [34.8 kB]
Get:3 http://deb.debian.org/debian buster-updates InRelease [56.6 kB]
Get:4 http://deb.debian.org/debian buster/main amd64 Packages [7,909 kB]
Get:5 http://deb.debian.org/debian-security buster/updates/main amd64 Packages [480 kB]
Get:6 http://deb.debian.org/debian buster-updates/main amd64 Packages [8,788 B]
Fetched 8,611 kB in 2s (5,096 kB/s)




The following additional packages will be installed:
  at-spi2-core ca-certificates-java dbus dbus-user-session
  dconf-gsettings-backend dconf-service dmsetup fonts-dejavu-extra
  glib-networking glib-networking-common glib-networking-services
  gsettings-desktop-schemas java-common libapparmor1 libargon2-1 libasound2
  libasound2-data libatk-bridge2.0-0 libatk-wrapper-java
  libatk-wrapper-java-jni libatspi2.0-0 libcap2 libcolord2 libcryptsetup12
  libdconf1 libdevmapper1.02.1 libdrm-amdgpu1 libdrm-com

# Creación de DataFrames

In [2]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

from pyspark.sql.types import StructType, StructField # Estructura del dataframe
from pyspark.sql.types import StringType, IntegerType, FloatType
from pyspark.sql.types import Row # formatos para las columnas

from pyspark.sql import SQLContext

In [3]:
spark = SparkContext(master='local', appName='DataFrames')
sqlContext = SQLContext(spark)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/04/28 15:52:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
!head -5 '/data/juegos.csv'

,nombre_juego,annio,temporada,ciudad
1,1896 Verano,1896,Verano,Athina
2,1900 Verano,1900,Verano,Paris
3,1904 Verano,1904,Verano,St. Louis
4,1906 Verano,1906,Verano,Athina


In [5]:
play_schema = StructType([
    StructField('play_id', IntegerType(), False), # false: campo obligatorio
    StructField('Age', StringType(), False), 
    StructField('Season', StringType(), False), 
    StructField('City', StringType(), False), 
])

# 
play_df = sqlContext.read.schema(play_schema).option('header', 'true', ).csv('/data/juegos.csv')

In [6]:
play_df.show(4)

23/04/28 15:52:45 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 5, schema size: 4
CSV file: file:///work/juegos.csv
+-------+-----------+------+------+
|play_id|        Age|Season|  City|
+-------+-----------+------+------+
|      1|1896 Verano|  1896|Verano|
|      2|1900 Verano|  1900|Verano|
|      3|1904 Verano|  1904|Verano|
|      4|1906 Verano|  1906|Verano|
+-------+-----------+------+------+
only showing top 4 rows



In [7]:
spark

# Inferencia de tipos de datos

In [8]:
olympics_athletes_rdd1 = spark.textFile('/data/deportista.csv').map(
    lambda line: line.split(',')
)

olympics_athletes_rdd2 = spark.textFile('/data/deportista2.csv').map(
    lambda line: line.split(',')
)

olympics_athletes_rdd = olympics_athletes_rdd1.union(olympics_athletes_rdd2)

In [9]:
olympics_athletes_rdd.take(5)



[['deportista_id', 'nombre', 'genero', 'edad', 'altura', 'peso', 'equipo_id'],
 ['1', 'A Dijiang', '1', '24', '180', '80', '199'],
 ['2', 'A Lamusi', '1', '23', '170', '60', '199'],
 ['3', 'Gunnar Nielsen Aaby', '1', '24', '0', '0', '273'],
 ['4', 'Edgar Lindenau Aabye', '1', '34', '0', '0', '278']]

In [10]:
def remove_header(index, iterator):
    return iter(list(iterator)[1:])

In [11]:
olympics_athletes_rdd = olympics_athletes_rdd.mapPartitionsWithIndex(remove_header)

In [12]:
olympics_athletes_rdd.take(5)

[['1', 'A Dijiang', '1', '24', '180', '80', '199'],
 ['2', 'A Lamusi', '1', '23', '170', '60', '199'],
 ['3', 'Gunnar Nielsen Aaby', '1', '24', '0', '0', '273'],
 ['4', 'Edgar Lindenau Aabye', '1', '34', '0', '0', '278'],
 ['5', 'Christine Jacoba Aaftink', '2', '21', '185', '82', '705']]

In [13]:

olympics_athletes_rdd = olympics_athletes_rdd.map(lambda l: (
    int(l[0]),
    l[1],
    int(l[2]),
    int(l[3]),
    int(l[4]),
    float(l[5]),
    int(l[0])
))

In [14]:
athletes_schema = StructType([
    StructField('athletes_id', IntegerType(), False),
    StructField('name', StringType(), False), 
    StructField('gender', IntegerType(), False),
    StructField('age', IntegerType(), False), 
    StructField('height', IntegerType(), False), 
    StructField('weight', FloatType(), False), 
    StructField('team_id', IntegerType(), False)
])

In [15]:
athletes_df = sqlContext.createDataFrame(olympics_athletes_rdd, athletes_schema)
athletes_df.show(5)

+-----------+--------------------+------+---+------+------+-------+
|athletes_id|                name|gender|age|height|weight|team_id|
+-----------+--------------------+------+---+------+------+-------+
|          1|           A Dijiang|     1| 24|   180|  80.0|      1|
|          2|            A Lamusi|     1| 23|   170|  60.0|      2|
|          3| Gunnar Nielsen Aaby|     1| 24|     0|   0.0|      3|
|          4|Edgar Lindenau Aabye|     1| 34|     0|   0.0|      4|
|          5|Christine Jacoba ...|     2| 21|   185|  82.0|      5|
+-----------+--------------------+------+---+------+------+-------+
only showing top 5 rows



In [16]:
!head -5 '/data/paises.csv'

id,equipo,sigla
1,30. Februar,AUT
2,A North American Team,MEX
3,Acipactli,MEX
4,Acturus,ARG


In [17]:
country_rdd = spark.textFile('/data/paises.csv').map(
    lambda line: line.split(',')
)
country_rdd = country_rdd.mapPartitionsWithIndex(remove_header)
country_rdd = country_rdd.map(lambda l: (
    int(l[0]),
    l[1],
    l[2]
))
country_schema = StructType([
    StructField('country_id', IntegerType(), False),
    StructField('team', StringType(), False), 
    StructField('code_country', StringType(), False),
])
country_df = sqlContext.createDataFrame(country_rdd, country_schema)
country_df.show(5)

+----------+--------------------+------------+
|country_id|                team|code_country|
+----------+--------------------+------------+
|         1|         30. Februar|         AUT|
|         2|A North American ...|         MEX|
|         3|           Acipactli|         MEX|
|         4|             Acturus|         ARG|
|         5|         Afghanistan|         AFG|
+----------+--------------------+------------+
only showing top 5 rows



In [18]:
!head -5 '/data/juegos.csv'

,nombre_juego,annio,temporada,ciudad
1,1896 Verano,1896,Verano,Athina
2,1900 Verano,1900,Verano,Paris
3,1904 Verano,1904,Verano,St. Louis
4,1906 Verano,1906,Verano,Athina


In [19]:
play_rdd = spark.textFile('/data/juegos.csv').map(
    lambda line: line.split(',')
)
play_rdd = play_rdd.mapPartitionsWithIndex(remove_header)
play_rdd = play_rdd.map(lambda l: (
    int(l[0]),
    l[1],
    int(l[2]),
    l[3],
    l[4]
))
play_schema = StructType([
    StructField('play_id', IntegerType(), False),
    StructField('name', StringType(), False),
    StructField('age', IntegerType(), False), 
    StructField('season', StringType(), False), 
    StructField('city', StringType(), False), 
])
play_df = sqlContext.createDataFrame(play_rdd, play_schema)
play_df.show(5)

+-------+-----------+----+------+---------+
|play_id|       name| age|season|     city|
+-------+-----------+----+------+---------+
|      1|1896 Verano|1896|Verano|   Athina|
|      2|1900 Verano|1900|Verano|    Paris|
|      3|1904 Verano|1904|Verano|St. Louis|
|      4|1906 Verano|1906|Verano|   Athina|
|      5|1908 Verano|1908|Verano|   London|
+-------+-----------+----+------+---------+
only showing top 5 rows



In [20]:
!head -5 '/data/deporte.csv'

deporte_id,deporte
1,Basketball
2,Judo
3,Football
4,Tug-Of-War


In [21]:
sport_rdd = spark.textFile('/data/deporte.csv').map(
    lambda line: line.split(',')
)
sport_rdd = sport_rdd.mapPartitionsWithIndex(remove_header)
sport_rdd = sport_rdd.map(lambda l: (
    int(l[0]),
    l[1]
))
sport_schema = StructType([
    StructField('sport_id', IntegerType(), False),
    StructField('name', StringType(), False) 
])
sport_df = sqlContext.createDataFrame(sport_rdd, sport_schema)
sport_df.show(5)

+--------+-------------+
|sport_id|         name|
+--------+-------------+
|       1|   Basketball|
|       2|         Judo|
|       3|     Football|
|       4|   Tug-Of-War|
|       5|Speed Skating|
+--------+-------------+
only showing top 5 rows



In [22]:
!head -5 '/data/evento.csv'

evento_id,evento,deporte_id
1,Basketball Men's Basketball,1
2,Judo Men's Extra-Lightweight,2
3,Football Men's Football,3
4,Tug-Of-War Men's Tug-Of-War,4


In [23]:
# event_rdd = spark.textFile('/data/evento.csv').map(
#     lambda line: line.split(',')
# )
# event_rdd = event_rdd.mapPartitionsWithIndex(remove_header)
# event_rdd = event_rdd.map(lambda l: (
#     int(l[0]),
#     l[1],
#     int(l[2])
# ))
event_schema = StructType([
    StructField('event_id', IntegerType(), False),
    StructField('name', StringType(), False),
    StructField('sport_id', IntegerType(), False)
])
# event_df = sqlContext.createDataFrame(event_rdd, event_schema)
event_df = sqlContext.read.schema(event_schema).option('header', 'true').csv('/data/evento.csv')
event_df.show(5)

+--------+--------------------+--------+
|event_id|                name|sport_id|
+--------+--------------------+--------+
|       1|Basketball Men's ...|       1|
|       2|Judo Men's Extra-...|       2|
|       3|Football Men's Fo...|       3|
|       4|Tug-Of-War Men's ...|       4|
|       5|Speed Skating Wom...|       5|
+--------+--------------------+--------+
only showing top 5 rows

23/04/28 15:52:52 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: evento_id, evento, deporte_id
 Schema: event_id, name, sport_id
Expected: event_id but found: evento_id
CSV file: file:///work/evento.csv


In [24]:
!head -5 '/data/resultados.csv'

resultado_id,medalla,deportista_id,juego_id,evento_id
1,NA,1,39,1
2,NA,2,49,2
3,NA,3,7,3
4,Gold,4,2,4


In [25]:
# result_rdd = spark.textFile('/data/resultados.csv').map(
#     lambda line: line.split(',')
# )
# result_rdd = result_rdd.mapPartitionsWithIndex(remove_header)
# result_rdd = result_rdd.map(lambda l: (
#     int(l[0]),
#     l[1],
#     int(l[2]),
#     int(l[3]),
#     int(l[4])
# ))
result_schema = StructType([
    StructField('result_id', IntegerType(), False),
    StructField('medals', StringType(), False),
    StructField('athletes_id', IntegerType(), False),
    StructField('play_id', IntegerType(), False),
    StructField('event_id', IntegerType(), False)
])
# result_df = sqlContext.createDataFrame(result_rdd, result_schema)
result_df = sqlContext.read.schema(result_schema).option('header', 'true').csv('/data/resultados.csv')
result_df.show(5)

+---------+------+-----------+-------+--------+
|result_id|medals|athletes_id|play_id|event_id|
+---------+------+-----------+-------+--------+
|        1|    NA|          1|     39|       1|
|        2|    NA|          2|     49|       2|
|        3|    NA|          3|      7|       3|
|        4|  Gold|          4|      2|       4|
|        5|    NA|          5|     36|       5|
+---------+------+-----------+-------+--------+
only showing top 5 rows

23/04/28 15:52:53 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: resultado_id, medalla, deportista_id, juego_id, evento_id
 Schema: result_id, medals, athletes_id, play_id, event_id
Expected: result_id but found: resultado_id
CSV file: file:///work/resultados.csv


# Operaciones sobre DF

In [26]:
# ver el esquema de un dataframe
sport_df.printSchema()

root
 |-- sport_id: integer (nullable = false)
 |-- name: string (nullable = false)



In [27]:
athletes_df.printSchema()

root
 |-- athletes_id: integer (nullable = false)
 |-- name: string (nullable = false)
 |-- gender: integer (nullable = false)
 |-- age: integer (nullable = false)
 |-- height: integer (nullable = false)
 |-- weight: float (nullable = false)
 |-- team_id: integer (nullable = false)



In [28]:
# renombrar una columna y eliminar una columna de un dataframe
athletes_df2 = athletes_df.withColumnRenamed("gender", "gen").drop("height")

In [29]:
athletes_df2.printSchema()

root
 |-- athletes_id: integer (nullable = false)
 |-- name: string (nullable = false)
 |-- gen: integer (nullable = false)
 |-- age: integer (nullable = false)
 |-- weight: float (nullable = false)
 |-- team_id: integer (nullable = false)



In [30]:
from pyspark.sql.functions import *

In [31]:
athletes_df2 = athletes_df2.select("athletes_id", "name",
                    col("age").alias('current_age'), # col: para no recorrer el df cuando se haga la operacion de renombrar
                    "team_id" ) 

In [32]:
athletes_df2.show(5)

+-----------+--------------------+-----------+-------+
|athletes_id|                name|current_age|team_id|
+-----------+--------------------+-----------+-------+
|          1|           A Dijiang|         24|      1|
|          2|            A Lamusi|         23|      2|
|          3| Gunnar Nielsen Aaby|         24|      3|
|          4|Edgar Lindenau Aabye|         34|      4|
|          5|Christine Jacoba ...|         21|      5|
+-----------+--------------------+-----------+-------+
only showing top 5 rows



In [33]:
athletes_df2.filter((athletes_df2.current_age != 0)).sort("current_age").show()

+-----------+--------------------+-----------+-------+
|athletes_id|                name|current_age|team_id|
+-----------+--------------------+-----------+-------+
|      71691|  Dimitrios Loundras|         10|  71691|
|      22411|Magdalena Cecilia...|         11|  22411|
|      70616|          Liu Luyang|         11|  70616|
|      37333|Carlos Bienvenido...|         11|  37333|
|      76675|   Marcelle Matthews|         11|  76675|
|      40129|    Luigina Giavotti|         11|  40129|
|     118925|Megan Olwen Deven...|         11| 118925|
|      47618|Sonja Henie Toppi...|         11|  47618|
|     126307|        Liana Vicens|         11| 126307|
|      51268|      Beatrice Hutiu|         11|  51268|
|      52070|        Etsuko Inada|         11|  52070|
|      72854|      Licia Macchini|         12|  72854|
|       5291|Marcia Arriaga La...|         12|   5291|
|      74712|     Carla Marangoni|         12|  74712|
|      24191| Philippe Cuelenaere|         12|  24191|
|      747

# Agrupaciones y operaciones join sobre DF

In [34]:
result_df.printSchema()

root
 |-- result_id: integer (nullable = true)
 |-- medals: string (nullable = true)
 |-- athletes_id: integer (nullable = true)
 |-- play_id: integer (nullable = true)
 |-- event_id: integer (nullable = true)



In [35]:
play_df.printSchema()

root
 |-- play_id: integer (nullable = false)
 |-- name: string (nullable = false)
 |-- age: integer (nullable = false)
 |-- season: string (nullable = false)
 |-- city: string (nullable = false)



In [36]:
event_df.printSchema()

root
 |-- event_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- sport_id: integer (nullable = true)



## Join

In [37]:
athletes_df.join(
    result_df, 
    athletes_df.athletes_id == result_df.athletes_id,
    "left"
).join(
    play_df,
    play_df.play_id == result_df.play_id,
    "left"
).join(
    event_df,
    event_df.event_id == result_df.event_id,
    "left"
).select(
    athletes_df.name.alias('player_name'),
    athletes_df.age.alias('age_athletes'),
    result_df.medals,
    play_df.age.alias('age_play'),
    athletes_df.age.alias('age_athletes'),
    event_df.name.alias('sport_name')
).show(5)

23/04/28 15:52:59 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: evento_id, evento
 Schema: event_id, name
Expected: event_id but found: evento_id
CSV file: file:///work/evento.csv
23/04/28 15:52:59 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: medalla, deportista_id, juego_id, evento_id
 Schema: medals, athletes_id, play_id, event_id
Expected: medals but found: medalla
CSV file: file:///work/resultados.csv
+--------------------+------------+------+--------+------------+--------------------+
|         player_name|age_athletes|medals|age_play|age_athletes|          sport_name|
+--------------------+------------+------+--------+------------+--------------------+
|           A Dijiang|          24|    NA|    1992|          24|Basketball Men's ...|
|            A Lamusi|          23|    NA|    2012|          23|Judo Men's Extra-...|
| Gunnar Nielsen Aaby|          24|    NA|    1920|          24|Football Men's Fo...|
|Edgar Lindenau 

In [38]:
# medallas ganadoras, junto al pais y equipo

result_with_medal_df = result_df.filter(result_df.medals != "NA")
result_with_medal_df.show(5)

+---------+------+-----------+-------+--------+
|result_id|medals|athletes_id|play_id|event_id|
+---------+------+-----------+-------+--------+
|        4|  Gold|          4|      2|       4|
|       38|Bronze|         15|      7|      19|
|       39|Bronze|         15|      7|      20|
|       41|Bronze|         16|     50|      14|
|       42|Bronze|         17|     17|      21|
+---------+------+-----------+-------+--------+
only showing top 5 rows

23/04/28 15:53:02 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: resultado_id, medalla, deportista_id, juego_id, evento_id
 Schema: result_id, medals, athletes_id, play_id, event_id
Expected: result_id but found: resultado_id
CSV file: file:///work/resultados.csv


In [39]:
result_with_medal_df.join(
    athletes_df,
    athletes_df.athletes_id == result_with_medal_df.athletes_id,
    "left",

).join(
    country_df,
    country_df.country_id == athletes_df.team_id,
    "left"
).select("medals", "team", "code_country").sort(col("code_country").desc()).show(5)

23/04/28 15:53:03 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: medalla, deportista_id
 Schema: medals, athletes_id
Expected: medals but found: medalla
CSV file: file:///work/resultados.csv
+------+-------------+------------+
|medals|         team|code_country|
+------+-------------+------------+
|  Gold|       Zambia|         ZAM|
|  Gold|       Zambia|         ZAM|
|  Gold|       Zambia|         ZAM|
|Silver|     Konstanz|         YUG|
|Bronze|South Vietnam|         VNM|
+------+-------------+------------+
only showing top 5 rows



## Funciones de agrupación

In [40]:
winner_by_year_df = athletes_df \
    .join(
        result_df, 
        athletes_df.athletes_id == result_df.athletes_id, 
        "left"
    ) \
    .join(
        play_df,
        play_df.play_id == result_df.play_id,
        "left"
    ) \
    .join(
        country_df,
        country_df.country_id == athletes_df.team_id,
        "left"
    ) \
    .join(
        event_df,
        event_df.event_id == result_df.event_id,
        "left"
    ) \
    .join(
        sport_df, 
        event_df.sport_id == sport_df.sport_id, 
        "left"
    ) \
    .select(
        "code_country",
        play_df.name.alias("anio"),
        "medals",
        event_df.name.alias("Nombre subdisciplina"),
        sport_df.name.alias("Nombre disciplina"),
        athletes_df.name.alias("nombre")    
    )

winner_by_year_df.show()

23/04/28 15:53:08 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: medalla, deportista_id, juego_id, evento_id
 Schema: medals, athletes_id, play_id, event_id
Expected: medals but found: medalla
CSV file: file:///work/resultados.csv
23/04/28 15:53:09 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: evento_id, evento, deporte_id
 Schema: event_id, name, sport_id
Expected: event_id but found: evento_id
CSV file: file:///work/evento.csv
+------------+-------------+------+--------------------+--------------------+--------------------+
|code_country|         anio|medals|Nombre subdisciplina|   Nombre disciplina|              nombre|
+------------+-------------+------+--------------------+--------------------+--------------------+
|         AUT|  1992 Verano|    NA|Basketball Men's ...|          Basketball|           A Dijiang|
|         MEX|  2012 Verano|    NA|Judo Men's Extra-...|                Judo|            A Lamusi|
|         MEX| 

In [41]:
winner_by_year_df_group = winner_by_year_df.filter(winner_by_year_df.medals != "NA").sort("anio").groupBy(
    "code_country", 'anio', 'Nombre subdisciplina'
    ).count()

In [42]:
winner_by_year_df_group.printSchema()

root
 |-- code_country: string (nullable = true)
 |-- anio: string (nullable = true)
 |-- Nombre subdisciplina: string (nullable = true)
 |-- count: long (nullable = false)



In [43]:
winner_by_year_df_group.groupBy("code_country", "anio") \
    .agg(sum("count").alias("Total de medallas"),
    avg("count").alias("Promedio medellas")).show()

23/04/28 15:53:11 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: evento_id, evento, deporte_id
 Schema: event_id, name, sport_id
Expected: event_id but found: evento_id
CSV file: file:///work/evento.csv
23/04/28 15:53:11 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: medalla, deportista_id, juego_id, evento_id
 Schema: medals, athletes_id, play_id, event_id
Expected: medals but found: medalla
CSV file: file:///work/resultados.csv
+------------+-------------+-----------------+-----------------+
|code_country|         anio|Total de medallas|Promedio medellas|
+------------+-------------+-----------------+-----------------+
|         NED|1992 Invierno|                2|              1.0|
|         SOM|  1952 Verano|                1|              1.0|
|         DEN|  1956 Verano|                1|              1.0|
|         GRE|  1948 Verano|                1|              1.0|
|         SWE|  1932 Verano|                1|         

# SQL

In [44]:
# registrar el DF como tabla adicional
result_df.createOrReplaceTempView("resultado")
athletes_df.createOrReplaceTempView("deportista")
country_df.createOrReplaceTempView('paises')

In [48]:
sqlContext.sql("SELECT * FROM deportista").show(5)

+-----------+--------------------+------+---+------+------+-------+
|athletes_id|                name|gender|age|height|weight|team_id|
+-----------+--------------------+------+---+------+------+-------+
|          1|           A Dijiang|     1| 24|   180|  80.0|      1|
|          2|            A Lamusi|     1| 23|   170|  60.0|      2|
|          3| Gunnar Nielsen Aaby|     1| 24|     0|   0.0|      3|
|          4|Edgar Lindenau Aabye|     1| 34|     0|   0.0|      4|
|          5|Christine Jacoba ...|     2| 21|   185|  82.0|      5|
+-----------+--------------------+------+---+------+------+-------+
only showing top 5 rows



In [45]:
sqlContext.sql("SELECT * FROM paises").show(5)

+----------+--------------------+------------+
|country_id|                team|code_country|
+----------+--------------------+------------+
|         1|         30. Februar|         AUT|
|         2|A North American ...|         MEX|
|         3|           Acipactli|         MEX|
|         4|             Acturus|         ARG|
|         5|         Afghanistan|         AFG|
+----------+--------------------+------------+
only showing top 5 rows



In [46]:
sqlContext.sql("SELECT * FROM resultado").show(5)

+---------+------+-----------+-------+--------+
|result_id|medals|athletes_id|play_id|event_id|
+---------+------+-----------+-------+--------+
|        1|    NA|          1|     39|       1|
|        2|    NA|          2|     49|       2|
|        3|    NA|          3|      7|       3|
|        4|  Gold|          4|      2|       4|
|        5|    NA|          5|     36|       5|
+---------+------+-----------+-------+--------+
only showing top 5 rows

23/04/28 15:53:16 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: resultado_id, medalla, deportista_id, juego_id, evento_id
 Schema: result_id, medals, athletes_id, play_id, event_id
Expected: result_id but found: resultado_id
CSV file: file:///work/resultados.csv


In [50]:
sqlContext.sql("""SELECT medals, team, code_country
                    FROM resultado r
                    JOIN deportista d
                      ON r.athletes_id = d.athletes_id
                    JOIN paises p
                      ON p.country_id = d.team_id
                   WHERE medals <> "NA"
                   ORDER BY code_country DESC
                    """).show()

23/04/28 15:56:20 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: medalla, deportista_id
 Schema: medals, athletes_id
Expected: medals but found: medalla
CSV file: file:///work/resultados.csv
+------+--------------------+------------+
|medals|                team|code_country|
+------+--------------------+------------+
|  Gold|              Zambia|         ZAM|
|  Gold|              Zambia|         ZAM|
|  Gold|              Zambia|         ZAM|
|Silver|            Konstanz|         YUG|
|Bronze|       South Vietnam|         VNM|
|Bronze|"South St Louis T...|         USA|
|  Gold|Foxhunters Hurlin...|         USA|
|Silver|             Jupiter|         USA|
|  Gold|            Margaret|         USA|
|Bronze|Missouri Athletic...|         USA|
|Silver|              Rhythm|         USA|
|Silver|    United States-12|         USA|
|Bronze|     United States-2|         USA|
|Bronze|     United States-4|         USA|
|  Gold|     United States-4|         USA|
|Silver