In [1]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType
from pyspark.sql.types import Row
from pyspark.sql import SQLContext
from pyspark.sql.functions import *

# Project
## Start sessions

In [2]:
spark = SparkContext(master='local', appName='dataframes')
sql = SQLContext(spark)

23/04/11 06:52:22 WARN Utils: Your hostname, PRRS-DCARVAJAL resolves to a loopback address: 127.0.1.1; using 172.30.122.149 instead (on interface eth0)
23/04/11 06:52:22 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/04/11 06:52:23 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable




## Load data

In [3]:
ls ./files/

deporte.csv      deportistaError.csv  [0m[01;35mmodelo_relacional.jpg[0m
deportista.csv   evento.csv           paises.csv
deportista2.csv  juegos.csv           resultados.csv


In [4]:
path = './files/'

In [5]:
def remove_header(index, iterator):
    return iter(list(iterator)[1:])

In [6]:
# deporte.csv
deporte_rdd = spark.textFile(path+'deporte.csv').map(lambda l:l.split(','))
deporte_rdd = deporte_rdd.mapPartitionsWithIndex(remove_header)
deporte_rdd = deporte_rdd.map(lambda l:(int(l[0]), l[1]))
schema = StructType([StructField('deporte_id', IntegerType(),False), StructField('deporte', StringType(),False)])
deporte_df = sql.createDataFrame(deporte_rdd, schema)
deporte_df.show(5)

[Stage 0:>                                                          (0 + 1) / 1]

+----------+-------------+
|deporte_id|      deporte|
+----------+-------------+
|         1|   Basketball|
|         2|         Judo|
|         3|     Football|
|         4|   Tug-Of-War|
|         5|Speed Skating|
+----------+-------------+
only showing top 5 rows



                                                                                

In [7]:
# evento.csv
schema = StructType([StructField('evento_id', IntegerType(),False), StructField('evento', StringType(),False),  StructField('deporte_id', IntegerType(),False)])
evento_df = sql.read.schema(schema).option('header','true').csv(path+'/evento.csv')
evento_df.show(5)

+---------+--------------------+----------+
|evento_id|              evento|deporte_id|
+---------+--------------------+----------+
|        1|Basketball Men's ...|         1|
|        2|Judo Men's Extra-...|         2|
|        3|Football Men's Fo...|         3|
|        4|Tug-Of-War Men's ...|         4|
|        5|Speed Skating Wom...|         5|
+---------+--------------------+----------+
only showing top 5 rows



In [8]:
# juegos.csv
schema = StructType([StructField('juego_id', IntegerType(),False),
                          StructField('annio', StringType(),False),
                          StructField('temporada', StringType(),False),
                          StructField('ciudad', StringType(),False)])
juegos_df = sql.read.schema(schema).option('header','true').csv(path+'/juegos.csv')
juegos_df.show(5)

23/04/11 06:52:35 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 5, schema size: 4
CSV file: file:///home/daielchom/projects/spark_notebooks/files/juegos.csv
+--------+-----------+---------+------+
|juego_id|      annio|temporada|ciudad|
+--------+-----------+---------+------+
|       1|1896 Verano|     1896|Verano|
|       2|1900 Verano|     1900|Verano|
|       3|1904 Verano|     1904|Verano|
|       4|1906 Verano|     1906|Verano|
|       5|1908 Verano|     1908|Verano|
+--------+-----------+---------+------+
only showing top 5 rows



In [9]:
# paises.csv
schema = StructType([StructField("paises_id", IntegerType(), False), StructField("equipo", StringType(), False), StructField("sigla", StringType(), False)])
paises_df = sql.read.schema(schema).option('header','true').csv(path+'/paises.csv')
paises_df.show(5)

23/04/11 06:52:36 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: id, equipo, sigla
 Schema: paises_id, equipo, sigla
Expected: paises_id but found: id
CSV file: file:///home/daielchom/projects/spark_notebooks/files/paises.csv
+---------+--------------------+-----+
|paises_id|              equipo|sigla|
+---------+--------------------+-----+
|        1|         30. Februar|  AUT|
|        2|A North American ...|  MEX|
|        3|           Acipactli|  MEX|
|        4|             Acturus|  ARG|
|        5|         Afghanistan|  AFG|
+---------+--------------------+-----+
only showing top 5 rows



In [10]:
# resultados.csv
schema = StructType([StructField("resultado_id", IntegerType(), False), StructField("medalla", StringType(), False), StructField("deportista_id", IntegerType(), False), StructField("juego_id", IntegerType(), False), StructField("evento_id", IntegerType(), False)])
resultados_df = sql.read.schema(schema).option('header','true').csv(path+'/resultados.csv')
resultados_df.show(5)

+------------+-------+-------------+--------+---------+
|resultado_id|medalla|deportista_id|juego_id|evento_id|
+------------+-------+-------------+--------+---------+
|           1|     NA|            1|      39|        1|
|           2|     NA|            2|      49|        2|
|           3|     NA|            3|       7|        3|
|           4|   Gold|            4|       2|        4|
|           5|     NA|            5|      36|        5|
+------------+-------+-------------+--------+---------+
only showing top 5 rows



In [11]:
# deportista.csv
schema = StructType([StructField("deportista_id", IntegerType(), False), StructField("nombre", StringType(), False), StructField("genero", IntegerType(), False), StructField("edad", IntegerType(), False), StructField("altura", IntegerType(), False), StructField("peso", IntegerType(), False), StructField("equipo_id", IntegerType(), False)])
df1 = sql.read.schema(schema).option('header','true').csv(path+'/deportista.csv')
df2 = sql.read.schema(schema).option('header','true').csv(path+'/deportista2.csv')
deportista_df = df1.union(df2)
deportista_df.show(5)

+-------------+--------------------+------+----+------+----+---------+
|deportista_id|              nombre|genero|edad|altura|peso|equipo_id|
+-------------+--------------------+------+----+------+----+---------+
|            1|           A Dijiang|     1|  24|   180|  80|      199|
|            2|            A Lamusi|     1|  23|   170|  60|      199|
|            3| Gunnar Nielsen Aaby|     1|  24|     0|   0|      273|
|            4|Edgar Lindenau Aabye|     1|  34|     0|   0|      278|
|            5|Christine Jacoba ...|     2|  21|   185|  82|      705|
+-------------+--------------------+------+----+------+----+---------+
only showing top 5 rows



## Explore

In [12]:
deporte_df.printSchema()

root
 |-- deporte_id: integer (nullable = false)
 |-- deporte: string (nullable = false)



In [13]:
deportista_df.printSchema()

root
 |-- deportista_id: integer (nullable = true)
 |-- nombre: string (nullable = true)
 |-- genero: integer (nullable = true)
 |-- edad: integer (nullable = true)
 |-- altura: integer (nullable = true)
 |-- peso: integer (nullable = true)
 |-- equipo_id: integer (nullable = true)



In [14]:
deportista_df = deportista_df.withColumnRenamed('genero', 'sexo').drop('altura')

In [15]:
deportista_df.printSchema()

root
 |-- deportista_id: integer (nullable = true)
 |-- nombre: string (nullable = true)
 |-- sexo: integer (nullable = true)
 |-- edad: integer (nullable = true)
 |-- peso: integer (nullable = true)
 |-- equipo_id: integer (nullable = true)



In [16]:
deportista_df = deportista_df.select('deportista_id','nombre', col('edad').alias('edad_al_jugar'), 'equipo_id')

In [17]:
deportista_df.show(5)

+-------------+--------------------+-------------+---------+
|deportista_id|              nombre|edad_al_jugar|equipo_id|
+-------------+--------------------+-------------+---------+
|            1|           A Dijiang|           24|      199|
|            2|            A Lamusi|           23|      199|
|            3| Gunnar Nielsen Aaby|           24|      273|
|            4|Edgar Lindenau Aabye|           34|      278|
|            5|Christine Jacoba ...|           21|      705|
+-------------+--------------------+-------------+---------+
only showing top 5 rows



In [18]:
deportista_df.sort('edad_al_jugar').show(5)



23/04/10 05:53:38 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 67787, Lee BongJu, 27, 970
 Schema: deportista_id, nombre, edad, equipo_id
Expected: deportista_id but found: 67787
CSV file: file:///home/daielchom/projects/spark_notebooks/files/deportista2.csv
+-------------+--------------------+-------------+---------+
|deportista_id|              nombre|edad_al_jugar|equipo_id|
+-------------+--------------------+-------------+---------+
|          133|           Franz Abb|            0|      399|
|          167|Ould Lamine Abdallah|            0|      362|
|           66|     Mohamed Abakkar|            0|     1003|
|          163|     Ismail Abdallah|            0|     1095|
|          139|George Ioannis Abbot|            0|     1043|
+-------------+--------------------+-------------+---------+
only showing top 5 rows



                                                                                

In [19]:
deportista_df = deportista_df.filter(deportista_df.edad_al_jugar!=0)
deportista_df.sort('edad_al_jugar').show(5)

23/04/10 05:53:39 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 67787, Lee BongJu, 27, 970
 Schema: deportista_id, nombre, edad, equipo_id
Expected: deportista_id but found: 67787
CSV file: file:///home/daielchom/projects/spark_notebooks/files/deportista2.csv




+-------------+--------------------+-------------+---------+
|deportista_id|              nombre|edad_al_jugar|equipo_id|
+-------------+--------------------+-------------+---------+
|        71691|  Dimitrios Loundras|           10|      333|
|        52070|        Etsuko Inada|           11|      514|
|        40129|    Luigina Giavotti|           11|      507|
|        37333|Carlos Bienvenido...|           11|      982|
|        47618|Sonja Henie Toppi...|           11|      742|
+-------------+--------------------+-------------+---------+
only showing top 5 rows



### Class 16

In [20]:
deportista_df.printSchema()

root
 |-- deportista_id: integer (nullable = true)
 |-- nombre: string (nullable = true)
 |-- edad_al_jugar: integer (nullable = true)
 |-- equipo_id: integer (nullable = true)



In [21]:
resultados_df.printSchema()

root
 |-- resultado_id: integer (nullable = true)
 |-- medalla: string (nullable = true)
 |-- deportista_id: integer (nullable = true)
 |-- juego_id: integer (nullable = true)
 |-- evento_id: integer (nullable = true)



In [22]:
juegos_df.printSchema()

root
 |-- juego_id: integer (nullable = true)
 |-- annio: string (nullable = true)
 |-- temporada: string (nullable = true)
 |-- ciudad: string (nullable = true)



In [24]:
deporte_df.printSchema()

root
 |-- deporte_id: integer (nullable = false)
 |-- deporte: string (nullable = false)



In [31]:
deportista_df.join(
    resultados_df,
    deportista_df.deportista_id == resultados_df.deportista_id,
    'left').join(
    juegos_df,
    juegos_df.juego_id == resultados_df.juego_id,
    'left').join(
    evento_df,
    evento_df.evento_id == resultados_df.evento_id,
    'left').select('nombre', 'edad', 'medalla', 'annio', evento_df.evento.alias('disiplina')).show()

[Stage 10:>   (0 + 1) / 1][Stage 11:>   (0 + 0) / 1][Stage 12:>   (0 + 0) / 1]                                                                                

23/04/10 06:03:44 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , nombre_juego
 Schema: juego_id, annio
Expected: juego_id but found: 
CSV file: file:///home/daielchom/projects/spark_notebooks/files/juegos.csv
+--------------------+----+-------+-------------+--------------------+
|              nombre|edad|medalla|        annio|           disiplina|
+--------------------+----+-------+-------------+--------------------+
|           A Dijiang|  24|     NA|  1992 Verano|Basketball Men's ...|
|            A Lamusi|  23|     NA|  2012 Verano|Judo Men's Extra-...|
| Gunnar Nielsen Aaby|  24|     NA|  1920 Verano|Football Men's Fo...|
|Edgar Lindenau Aabye|  34|   Gold|  1900 Verano|Tug-Of-War Men's ...|
|Christine Jacoba ...|  21|     NA|1994 Invierno|Speed Skating Wom...|
|Christine Jacoba ...|  21|     NA|1994 Invierno|Speed Skating Wom...|
|Christine Jacoba ...|  21|     NA|1992 Invierno|Speed Skating Wom...|
|Christine Jacoba ...|  21|     NA|1992 Invierno|Sp

### Class 17

In [34]:
resultados_df.filter(resultados_df.medalla!='NA').join(
    deportista_df,
    deportista_df.deportista_id == resultados_df.deportista_id,
    'left').join(
    paises_df,
    paises_df.paises_id == deportista_df.equipo_id,
    'left').select('medalla', 'equipo', 'sigla').sort(col('sigla').desc()).show()

23/04/10 06:08:29 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: id, equipo, sigla
 Schema: paises_id, equipo, sigla
Expected: paises_id but found: id
CSV file: file:///home/daielchom/projects/spark_notebooks/files/paises.csv
23/04/10 06:08:29 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 67787, 970
 Schema: deportista_id, equipo_id
Expected: deportista_id but found: 67787
CSV file: file:///home/daielchom/projects/spark_notebooks/files/deportista2.csv




+-------+--------+-----+
|medalla|  equipo|sigla|
+-------+--------+-----+
|   Gold|Zimbabwe|  ZIM|
|   Gold|Zimbabwe|  ZIM|
|   Gold|Zimbabwe|  ZIM|
| Silver|Zimbabwe|  ZIM|
|   Gold|Zimbabwe|  ZIM|
| Bronze|Zimbabwe|  ZIM|
| Silver|Zimbabwe|  ZIM|
|   Gold|Zimbabwe|  ZIM|
| Silver|Zimbabwe|  ZIM|
| Silver|Zimbabwe|  ZIM|
|   Gold|Zimbabwe|  ZIM|
|   Gold|Zimbabwe|  ZIM|
|   Gold|Zimbabwe|  ZIM|
|   Gold|Zimbabwe|  ZIM|
|   Gold|Zimbabwe|  ZIM|
|   Gold|Zimbabwe|  ZIM|
|   Gold|Zimbabwe|  ZIM|
|   Gold|Zimbabwe|  ZIM|
|   Gold|Zimbabwe|  ZIM|
|   Gold|Zimbabwe|  ZIM|
+-------+--------+-----+
only showing top 20 rows



[Stage 19:>                                                         (0 + 1) / 1]                                                                                

### Class 18

In [12]:
medallistaXAnio = deportista_df \
    .join(
        resultados_df, 
        deportista_df.deportista_id == resultados_df.deportista_id, 
        "left"
    ) \
    .join(
        juegos_df, 
        juegos_df.juego_id == resultados_df.juego_id, 
        "left"
    ) \
    .join(
        paises_df, 
        deportista_df.equipo_id == paises_df.paises_id, 
        "left"
    ) \
    .join(
        evento_df, 
        evento_df.evento_id == resultados_df.evento_id, 
        "left"
    ) \
    .join(
        deporte_df, 
        evento_df.deporte_id == deporte_df.deporte_id, 
        "left"
    ) \
    .select(
        "sigla",
        "annio",
        "medalla",
        'evento',
        deporte_df.deporte.alias("Nombre disciplina"),
        deportista_df.nombre    
    )

medallistaXAnio.show()

                                                                                

23/04/11 06:52:41 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , nombre_juego
 Schema: juego_id, annio
Expected: juego_id but found: 
CSV file: file:///home/daielchom/projects/spark_notebooks/files/juegos.csv
23/04/11 06:52:41 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: id, sigla
 Schema: paises_id, sigla
Expected: paises_id but found: id
CSV file: file:///home/daielchom/projects/spark_notebooks/files/paises.csv
+-----+-------------+-------+--------------------+--------------------+--------------------+
|sigla|        annio|medalla|              evento|   Nombre disciplina|              nombre|
+-----+-------------+-------+--------------------+--------------------+--------------------+
|  CHN|  1992 Verano|     NA|Basketball Men's ...|          Basketball|           A Dijiang|
|  CHN|  2012 Verano|     NA|Judo Men's Extra-...|                Judo|            A Lamusi|
|  DEN|  1920 Verano|     NA|Football Men's Fo...|        

In [49]:
medallista_annio = medallistaXAnio.filter(medallistaXAnio.medalla!='NA').sort('annio').groupBy('sigla','annio','evento').count()

In [50]:
medallista_annio.printSchema()

root
 |-- sigla: string (nullable = true)
 |-- annio: string (nullable = true)
 |-- evento: string (nullable = true)
 |-- count: long (nullable = false)



In [52]:
medallista_annio.groupBy('sigla', 'annio').agg(sum('count').alias('total medallas'), avg('count').alias('avg medallas')).sort('annio').show()

23/04/10 06:24:09 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , nombre_juego
 Schema: juego_id, annio
Expected: juego_id but found: 
CSV file: file:///home/daielchom/projects/spark_notebooks/files/juegos.csv
23/04/10 06:24:09 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: id, sigla
 Schema: paises_id, sigla
Expected: paises_id but found: id
CSV file: file:///home/daielchom/projects/spark_notebooks/files/paises.csv
23/04/10 06:24:09 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 67787, 970
 Schema: deportista_id, equipo_id
Expected: deportista_id but found: 67787
CSV file: file:///home/daielchom/projects/spark_notebooks/files/deportista2.csv




+-----+-----------+--------------+------------------+
|sigla|      annio|total medallas|      avg medallas|
+-----+-----------+--------------+------------------+
|  DEN|1896 Verano|             6|               1.0|
|  FRA|1896 Verano|            11|             1.375|
|  GER|1896 Verano|            32|2.6666666666666665|
|  SUI|1896 Verano|             3|               1.0|
|  HUN|1896 Verano|             6|               1.0|
|  AUS|1896 Verano|             3|               1.0|
|  GRE|1896 Verano|            48|1.6551724137931034|
|  AUT|1896 Verano|             5|               1.0|
|  USA|1896 Verano|            20|1.6666666666666667|
|  GBR|1896 Verano|             9|             1.125|
|  CAN|1900 Verano|             2|               1.0|
|  CUB|1900 Verano|             2|               1.0|
|  ITA|1900 Verano|             5|              1.25|
|  ESP|1900 Verano|             2|               2.0|
|  IND|1900 Verano|             2|               1.0|
|  NED|1900 Verano|         


### Class 19

In [16]:
resultados_df.registerTempTable("resultado")
deportista_df.registerTempTable("deportista")
paises_df.registerTempTable("paises")

In [18]:
sql.sql('SELECT * FROM deportista').show(5)

+-------------+--------------------+------+----+------+----+---------+
|deportista_id|              nombre|genero|edad|altura|peso|equipo_id|
+-------------+--------------------+------+----+------+----+---------+
|            1|           A Dijiang|     1|  24|   180|  80|      199|
|            2|            A Lamusi|     1|  23|   170|  60|      199|
|            3| Gunnar Nielsen Aaby|     1|  24|     0|   0|      273|
|            4|Edgar Lindenau Aabye|     1|  34|     0|   0|      278|
|            5|Christine Jacoba ...|     2|  21|   185|  82|      705|
+-------------+--------------------+------+----+------+----+---------+
only showing top 5 rows



In [20]:
sql.sql("""
    SELECT medalla, equipo, sigla from resultado r 
    JOIN deportista d ON r.deportista_id = r.deportista_id
    JOIN paises p ON p.paises_id = d.equipo_id
    WHERE medalla <> "NA"
    ORDER BY sigla DESC
""").show()

23/04/11 06:19:10 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: id, equipo, sigla
 Schema: paises_id, equipo, sigla
Expected: paises_id but found: id
CSV file: file:///home/daielchom/projects/spark_notebooks/files/paises.csv


[Stage 8:>                                                          (0 + 1) / 2]

23/04/11 06:19:12 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 970
 Schema: equipo_id
Expected: equipo_id but found: 970
CSV file: file:///home/daielchom/projects/spark_notebooks/files/deportista2.csv


ERROR:root:KeyboardInterrupt while sending command.                 (0 + 1) / 1]
Traceback (most recent call last):
  File "/home/daielchom/venvs/spark/lib/python3.8/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/home/daielchom/venvs/spark/lib/python3.8/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.8/socket.py", line 669, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

## Class 21

In [11]:
!head -n 5 files/deportistaError.csv

deportista_id,nombre,genero,edad,altura,peso,equipo_id
1,A Dijiang,1,24,180,80,199
2,A Lamusi,1,23,170,60,199
3,Gunnar Nielsen Aaby,1,24,,,273
4,Edgar Lindenau Aabye,1,34,,,278


In [9]:
deportisita_error = spark.textFile(path+'deportistaError.csv').map(lambda l:l.split(','))
deportisita_error = deportisita_error.mapPartitionsWithIndex(remove_header)

In [10]:
deportisita_error.take(3)

                                                                                

[['1', 'A Dijiang', '1', '24', '180', '80', '199'],
 ['2', 'A Lamusi', '1', '23', '170', '60', '199'],
 ['3', 'Gunnar Nielsen Aaby', '1', '24', '', '', '273']]

In [29]:
deportisita_error.map(lambda l: (l[0],l[1],l[2],l[3],l[4],l[5],l[6]))
schema_error = StructType([
    StructField('deportista_id', StringType(), False),
    StructField('nombre', StringType(), False),
    StructField('genero', StringType(), False),
    StructField('edad', StringType(), False),
    StructField('altura', StringType(), False),
    StructField('peso', StringType(), False),
    StructField('equipo_id', StringType(), False),
])
#deportisita_error_df = sql.createDataFrame(deportisita_error, schema_error)

In [34]:
deportisita_error_df = sql.read.schema(schema_error).option('header','true').csv(path+'/deportistaError.csv')

In [35]:
deportisita_error_df.show(5)

+-------------+--------------------+------+----+------+----+---------+
|deportista_id|              nombre|genero|edad|altura|peso|equipo_id|
+-------------+--------------------+------+----+------+----+---------+
|            1|           A Dijiang|     1|  24|   180|  80|      199|
|            2|            A Lamusi|     1|  23|   170|  60|      199|
|            3| Gunnar Nielsen Aaby|     1|  24|  null|null|      273|
|            4|Edgar Lindenau Aabye|     1|  34|  null|null|      278|
|            5|Christine Jacoba ...|     2|  21|   185|  82|      705|
+-------------+--------------------+------+----+------+----+---------+
only showing top 5 rows



In [36]:
from pyspark.sql.functions import udf

def conversion_enteros(valor):
    return int(valor) if valor is not None else 0

conversion_enteros_udf = udf(lambda z: conversion_enteros(z), IntegerType())
sql.udf.register('conversion_enteros_udf', conversion_enteros_udf)

23/04/11 06:43:56 WARN SimpleFunctionRegistry: The function conversion_enteros_udf replaced a previously registered function.


<function __main__.<lambda>(z)>

In [38]:
deportisita_error_df.select(conversion_enteros_udf('altura').alias('alturaUDF')).show(5)

+---------+
|alturaUDF|
+---------+
|      180|
|      170|
|        0|
|        0|
|      185|
+---------+
only showing top 5 rows



### Class 23

In [18]:
from pyspark.storagelevel import StorageLevel

In [13]:
medallistaXAnio.is_cached

False

In [14]:
medallistaXAnio.rdd.cache()

MapPartitionsRDD[89] at javaToPython at NativeMethodAccessorImpl.java:0

In [16]:
medallistaXAnio.rdd.getStorageLevel()

StorageLevel(False, True, False, False, 1)

In [20]:
medallistaXAnio.rdd.unpersist()

MapPartitionsRDD[89] at javaToPython at NativeMethodAccessorImpl.java:0

In [21]:
medallistaXAnio.rdd.persist(StorageLevel.MEMORY_AND_DISK_2)

MapPartitionsRDD[89] at javaToPython at NativeMethodAccessorImpl.java:0

In [22]:
medallistaXAnio.is_cached

False

In [24]:
medallistaXAnio.rdd.getStorageLevel()

StorageLevel(True, True, False, False, 2)

In [25]:
StorageLevel.MEMORY_AND_DISK_3 = StorageLevel(True, True, False, False, 3)

In [26]:
medallistaXAnio.rdd.unpersist()

MapPartitionsRDD[89] at javaToPython at NativeMethodAccessorImpl.java:0

In [27]:
medallistaXAnio.rdd.persist(StorageLevel.MEMORY_AND_DISK_3)

MapPartitionsRDD[89] at javaToPython at NativeMethodAccessorImpl.java:0

In [28]:
medallistaXAnio.rdd.getStorageLevel()

StorageLevel(True, True, False, False, 3)