In [4]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

from pyspark.sql.types import StructType, StructField, IntegerType, FloatType, StringType, Row
from pyspark.sql import SQLContext

import databricks.koalas as ks
import pandas as pd

In [9]:
spark = SparkContext(master="local", appName="DataFrame")
sqlContext = SQLContext(spark)

21/09/25 15:01:12 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [10]:
spark

In [11]:
!head -n 5 ./files/juegos.csv

,nombre_juego,annio,temporada,ciudad
1,1896 Verano,1896,Verano,Athina
2,1900 Verano,1900,Verano,Paris
3,1904 Verano,1904,Verano,St. Louis
4,1906 Verano,1906,Verano,Athina


In [12]:
juegoSchema = StructType([
    StructField("juego_id",IntegerType(),False),
    StructField("anio",StringType(),False),
    StructField("temporada", StringType(),False),
    StructField("ciudad",StringType(),False)
])

juegoDF = sqlContext.read.schema(juegoSchema)\
    .option("header","true").csv("./files/juegos.csv")

In [13]:
juegoDF.show(5)

21/09/25 15:03:59 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 5, schema size: 4
CSV file: file:///home/dsandovalflavio/Documents/Estudio/Cursos/Platzi/EscuelaDataScience/PySparkCource/files/juegos.csv


+--------+-----------+---------+------+
|juego_id|       anio|temporada|ciudad|
+--------+-----------+---------+------+
|       1|1896 Verano|     1896|Verano|
|       2|1900 Verano|     1900|Verano|
|       3|1904 Verano|     1904|Verano|
|       4|1906 Verano|     1906|Verano|
|       5|1908 Verano|     1908|Verano|
+--------+-----------+---------+------+
only showing top 5 rows



In [14]:
!head -n 5 ./files/deportista.csv

deportista_id,nombre,genero,edad,altura,peso,equipo_id
1,A Dijiang,1,24,180,80,199
2,A Lamusi,1,23,170,60,199
3,Gunnar Nielsen Aaby,1,24,0,0,273
4,Edgar Lindenau Aabye,1,34,0,0,278


In [35]:
deportistasOlimpicosRDD = spark.textFile("./files/deportista.csv").map(lambda line: line.split(",")) \
    .union(spark.textFile("./files/deportista2.csv").map(lambda line: line.split(",")))

In [36]:
deportistasOlimpicosRDD.take(3)

[['deportista_id', 'nombre', 'genero', 'edad', 'altura', 'peso', 'equipo_id'],
 ['1', 'A Dijiang', '1', '24', '180', '80', '199'],
 ['2', 'A Lamusi', '1', '23', '170', '60', '199']]

In [37]:
def eliminarEncabezado(indice,iterador):
    return iter(list(iterador)[1:])

In [38]:
deportistasOlimpicosRDD = deportistasOlimpicosRDD.mapPartitionsWithIndex(eliminarEncabezado)

In [39]:
deportistasOlimpicosRDD.take(2)

[['1', 'A Dijiang', '1', '24', '180', '80', '199'],
 ['2', 'A Lamusi', '1', '23', '170', '60', '199']]

In [40]:
deportistasOlimpicosRDD = deportistasOlimpicosRDD.map(lambda l: (
                                                                int(l[0]),
                                                                l[1],
                                                                int(l[2]),
                                                                int(l[3]),
                                                                int(l[4]),
                                                                float(l[5]),
                                                                int(l[6]) ))

In [41]:
shema = StructType([
    StructField("deportista_id",IntegerType(),False),
    StructField("nombre",StringType(),False),
    StructField("genero",IntegerType(),False),
    StructField("edad",IntegerType(),False),
    StructField("altura",IntegerType(),False),
    StructField("peso",FloatType(),False),
    StructField("equipo_id",IntegerType(),False),
])

In [42]:
deportistasOlimpicosDF = sqlContext.createDataFrame(deportistasOlimpicosRDD,shema)

In [43]:
deportistasOlimpicosDF.show(5)



+-------------+--------------------+------+----+------+----+---------+
|deportista_id|              nombre|genero|edad|altura|peso|equipo_id|
+-------------+--------------------+------+----+------+----+---------+
|            1|           A Dijiang|     1|  24|   180|80.0|      199|
|            2|            A Lamusi|     1|  23|   170|60.0|      199|
|            3| Gunnar Nielsen Aaby|     1|  24|     0| 0.0|      273|
|            4|Edgar Lindenau Aabye|     1|  34|     0| 0.0|      278|
|            5|Christine Jacoba ...|     2|  21|   185|82.0|      705|
+-------------+--------------------+------+----+------+----+---------+
only showing top 5 rows



In [44]:
deportistasOlimpicosDF.printSchema()

root
 |-- deportista_id: integer (nullable = false)
 |-- nombre: string (nullable = false)
 |-- genero: integer (nullable = false)
 |-- edad: integer (nullable = false)
 |-- altura: integer (nullable = false)
 |-- peso: float (nullable = false)
 |-- equipo_id: integer (nullable = false)



In [45]:
deportistasOlimpicosDF = deportistasOlimpicosDF.withColumnRenamed("genero","sexo").drop("altura")

In [46]:
deportistasOlimpicosDF.printSchema()

root
 |-- deportista_id: integer (nullable = false)
 |-- nombre: string (nullable = false)
 |-- sexo: integer (nullable = false)
 |-- edad: integer (nullable = false)
 |-- peso: float (nullable = false)
 |-- equipo_id: integer (nullable = false)



In [47]:
from pyspark.sql.functions import col, when, lit

deportistasOlimpicosDF = deportistasOlimpicosDF.select("deportista_id","nombre",
                                                      col("edad").alias("edad_jugar"),
                                                      "equipo_id")

In [49]:
deportistasOlimpicosDF = deportistasOlimpicosDF.filter(col("edad_jugar") != 0)

In [51]:
deportistasOlimpicosDF.sort(col("edad_jugar")).show(5)



+-------------+--------------------+----------+---------+
|deportista_id|              nombre|edad_jugar|equipo_id|
+-------------+--------------------+----------+---------+
|        71691|  Dimitrios Loundras|        10|      333|
|        52070|        Etsuko Inada|        11|      514|
|        40129|    Luigina Giavotti|        11|      507|
|        37333|Carlos Bienvenido...|        11|      982|
|        47618|Sonja Henie Toppi...|        11|      742|
+-------------+--------------------+----------+---------+
only showing top 5 rows





In [26]:
deportistasOlimpicosKs = deportistasOlimpicosDF.to_koalas()

In [29]:
deportistasOlimpicosKs.head(5)

21/09/25 15:06:29 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


Unnamed: 0,deportista_id,nombre,genero,edad,altura,peso,equipo_id
0,1,A Dijiang,1,24,180,80.0,199
1,2,A Lamusi,1,23,170,60.0,199
2,3,Gunnar Nielsen Aaby,1,24,0,0.0,273
3,4,Edgar Lindenau Aabye,1,34,0,0.0,278
4,5,Christine Jacoba Aaftink,2,21,185,82.0,705


In [None]:
spark.stop()