In [1]:
! sudo apt-get update
! sudo mkdir -p /usr/share/man/man1
! sudo apt-get install -y openjdk-11-jdk
! pip install pyspark

Hit:1 http://deb.debian.org/debian buster InRelease
Hit:2 http://deb.debian.org/debian-security buster/updates InRelease
Hit:3 http://deb.debian.org/debian buster-updates InRelease




openjdk-11-jdk is already the newest version (11.0.18+10-1~deb10u1).
0 upgraded, 0 newly installed, 0 to remove and 12 not upgraded.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.[0m[33m
[0m

In [2]:
from pyspark import SparkContext
from pyspark.sql import SQLContext

from pyspark.sql.types import StructType, StructField # Estructura del dataframe
from pyspark.sql.types import StringType, IntegerType, FloatType
from pyspark.sql.types import Row 

In [3]:
spark = SparkContext(master='local', appName='replicacion')
sql_context = SQLContext(spark)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/04/29 03:57:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
!head -5 '/data/deportista.csv'

deportista_id,nombre,genero,edad,altura,peso,equipo_id
1,A Dijiang,1,24,180,80,199
2,A Lamusi,1,23,170,60,199
3,Gunnar Nielsen Aaby,1,24,0,0,273
4,Edgar Lindenau Aabye,1,34,0,0,278


In [5]:
athletes_rdd = spark.textFile('/data/deportista.csv').map(
    lambda line: line.split(',')
)

In [6]:
def remove_header(index, iterator):
    return iter(list(iterator)[1:])

In [7]:
athletes_rdd = athletes_rdd.mapPartitionsWithIndex(remove_header)

In [8]:
athletes_rdd = athletes_rdd.map(lambda l: (
    l[0],
    l[1],
    l[2],
    l[3],
    l[4],
    l[5],
    l[6]
    )
)

athletes_schema = StructType([
    StructField("deportista_id", StringType(), False),
    StructField("nombre", StringType(), False),
    StructField("genero", StringType(), False),
    StructField("edad", StringType(), False),
    StructField("altura", StringType(), False), 
    StructField("peso", StringType(), False),
    StructField("equipo_id", StringType(), False)
])
athletes_df = sql_context.createDataFrame(athletes_rdd, athletes_schema)

In [9]:
athletes_df.show()

+-------------+--------------------+------+----+------+----+---------+
|deportista_id|              nombre|genero|edad|altura|peso|equipo_id|
+-------------+--------------------+------+----+------+----+---------+
|            1|           A Dijiang|     1|  24|   180|  80|      199|
|            2|            A Lamusi|     1|  23|   170|  60|      199|
|            3| Gunnar Nielsen Aaby|     1|  24|     0|   0|      273|
|            4|Edgar Lindenau Aabye|     1|  34|     0|   0|      278|
|            5|Christine Jacoba ...|     2|  21|   185|  82|      705|
|            6|     Per Knut Aaland|     1|  31|   188|  75|     1096|
|            7|        John Aalberg|     1|  31|   183|  72|     1096|
|            8|Cornelia Cor Aalt...|     2|  18|   168|   0|      705|
|            9|    Antti Sami Aalto|     1|  26|   186|  96|      350|
|           10|Einar Ferdinand E...|     1|  26|     0|   0|      350|
|           11|  Jorma Ilmari Aalto|     1|  22|   182|76.5|      350|
|     

In [10]:
athletes_by_team = athletes_df.groupBy("equipo_id").count()

In [11]:
from pyspark.storagelevel import StorageLevel

In [12]:
# preguntar si esta almacenado en el sistema, si es false como es el caso Spark lo tiene que volver a "revivir"
athletes_by_team.is_cached

False

In [13]:
# guardar el rdd en cache
athletes_by_team.cache()

DataFrame[equipo_id: string, count: bigint]

In [14]:
# de que manera el rdd persiste (useDisk, useMemory, useOffHeap, deserialized, replication=1)
athletes_by_team.rdd.getStorageLevel() 

StorageLevel(False, False, False, False, 1)

In [15]:
athletes_by_team.rdd.unpersist() # bajarlo del disco

MapPartitionsRDD[19] at javaToPython at NativeMethodAccessorImpl.java:0

In [16]:
# persistir en memoria y en disco, replicado dos veces
athletes_by_team.rdd.persist(StorageLevel.MEMORY_AND_DISK_2)

MapPartitionsRDD[19] at javaToPython at NativeMethodAccessorImpl.java:0

In [17]:
# crear una persistencia nueva con tres particiones
StorageLevel.MEMORY_AND_DISK_3 = StorageLevel(True, True, False, False, 3)

In [18]:
athletes_by_team.rdd.unpersist() # bajarlo del disco

MapPartitionsRDD[19] at javaToPython at NativeMethodAccessorImpl.java:0

In [19]:
# aplicar la persistencia nueva
athletes_by_team.rdd.persist(StorageLevel.MEMORY_AND_DISK_3)

MapPartitionsRDD[19] at javaToPython at NativeMethodAccessorImpl.java:0