In [1]:
# findspark.init() is expected to make the local Spark installation importable,
# which is why it runs before any pyspark import (confirm against findspark docs).
import findspark
findspark.init()

import pandas as pd
import pyspark

In [2]:
from pyspark.sql import SparkSession

# Build (or reuse) a Spark session that runs locally on all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("myDataFrame")
    .getOrCreate()
)

#### 1. Importar el csv de "data/WorldCupPlayers.csv"
#### Visualizar los datos

In [11]:
# Load the World Cup players CSV: the first line supplies the column names and
# Spark infers each column's type from the data.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
cup_df = (
    spark.read
    .option("inferSchema", True)
    .option("header", True)
    .csv("C:/data/WorldCupPlayers.csv")
)

# Preview the first 10 rows.
cup_df.show(10)

+-------+-------+-------------+-------------------+-------+------------+----------------+--------+---------+
|RoundID|MatchID|Team Initials|         Coach Name|Line-up|Shirt Number|     Player Name|Position|    Event|
+-------+-------+-------------+-------------------+-------+------------+----------------+--------+---------+
|    201|   1096|          FRA|CAUDRON Raoul (FRA)|      S|           0|     Alex THEPOT|      GK|     NULL|
|    201|   1096|          MEX|   LUQUE Juan (MEX)|      S|           0| Oscar BONFIGLIO|      GK|     NULL|
|    201|   1096|          FRA|CAUDRON Raoul (FRA)|      S|           0|Marcel LANGILLER|    NULL|     G40'|
|    201|   1096|          MEX|   LUQUE Juan (MEX)|      S|           0|    Juan CARRENO|    NULL|     G70'|
|    201|   1096|          FRA|CAUDRON Raoul (FRA)|      S|           0| Ernest LIBERATI|    NULL|     NULL|
|    201|   1096|          MEX|   LUQUE Juan (MEX)|      S|           0|    Rafael GARZA|       C|     NULL|
|    201|   1096|  

In [None]:
## ¿Qué tipo de datos contiene cada variable?

In [12]:
# Print the schema as a tree: each column's name, type and nullability.
cup_df.printSchema()

root
 |-- RoundID: integer (nullable = true)
 |-- MatchID: integer (nullable = true)
 |-- Team Initials: string (nullable = true)
 |-- Coach Name: string (nullable = true)
 |-- Line-up: string (nullable = true)
 |-- Shirt Number: integer (nullable = true)
 |-- Player Name: string (nullable = true)
 |-- Position: string (nullable = true)
 |-- Event: string (nullable = true)



In [None]:
## ¿Cuántos registros hay?

In [13]:
# Total number of rows in the dataset; the bare name displays it as the cell output.
total_records = cup_df.count()
total_records

37784

In [None]:
## Obtén los principales estadísticos de Position

In [14]:
# Summary statistics for Position (count, mean, stddev, min, max).
# Position is a string column, so mean and stddev come back as NULL.
position_summary = cup_df.describe('Position')
position_summary.show()

+-------+--------+
|summary|Position|
+-------+--------+
|  count|    4143|
|   mean|    NULL|
| stddev|    NULL|
|    min|       C|
|    max|     GKC|
+-------+--------+



In [None]:
## Selecciona y muestra los registros distintos de 'Player Name', 'Coach Name'

In [15]:
# Unique (player, coach) combinations; show() prints the first 20 rows by default.
player_coach_pairs = cup_df.select('Player Name', 'Coach Name').distinct()
player_coach_pairs.show()

+--------------------+--------------------+
|         Player Name|          Coach Name|
+--------------------+--------------------+
|    Arturo FERNANDEZ| BRU Francisco (ESP)|
|Cayetano CARRERAS...|DURAND LAGUNA Jos...|
|  Ernesto MASCHERONI|SUPPICI Alberto (...|
|          Aziz FAHMY|   McREA James (SCO)|
|        Gyula POLGAR|    NADAS Odon (HUN)|
|  Ernesto ALBARRACIN|PASCUCCI Felipe (...|
| Armando CASTELLAZZI|POZZO Vittorio (ITA)|
|     Jaroslav BOUCEK|   PETRU Karel (TCH)|
|           Erwin NYC|  KALUZA Jozef (POL)|
|     Stanislaw BARAN|  KALUZA Jozef (POL)|
|     Fernando ROLDAN|BUCCIARDI Arturo ...|
|            Joe MACA|  JEFFREY Bill (SCO)|
|               INDIO|  MOREIRA Zeze (BRA)|
|      Rene DEREUDDRE|PIBAROT Pierre (FRA)|
|    Anton MALATINSKY|    CEJP Josef (TCH)|
|    Alberto MARIOTTI|LORENZO Juan Carl...|
|  Alfredo DI STEFANO|HERRERA Helenio (...|
|             FIDELIS| FEOLA Vicente (BRA)|
|     Stoyan YORDANOV|BOZHKOV Stefan (BUL)|
|      Wim RIJSBERGEN| MICHELS R

In [None]:
## ¿Cuántos registros hay del partido con MatchID 1096?

In [16]:
# Count the rows that belong to match 1096.
# MatchID is inferred as an integer column, so compare it with an integer
# literal instead of the string '1096'; this avoids relying on Spark's implicit
# string-to-int coercion in the comparison.
cup_df.filter(cup_df["MatchID"] == 1096).count()

33

In [None]:
## Muestra los datos donde la posición haya sido C y el evento sea G40'

In [18]:
# Rows where Position is 'C' and Event is exactly "G40'"
# (the trailing apostrophe is part of the stored event value).
position_is_c = cup_df['Position'] == 'C'
event_is_g40 = cup_df['Event'] == "G40'"
cup_df.where(position_is_c & event_is_g40).show()

+-------+-------+-------------+--------------------+-------+------------+----------------+--------+-----+
|RoundID|MatchID|Team Initials|          Coach Name|Line-up|Shirt Number|     Player Name|Position|Event|
+-------+-------+-------------+--------------------+-------+------------+----------------+--------+-----+
|    201|   1089|          PAR|DURAND LAGUNA Jos...|      S|           0|Luis VARGAS PENA|       C| G40'|
|    429|   1175|          HUN|  DIETZ Karoly (HUN)|      S|           0|   Gyorgy SAROSI|       C| G40'|
+-------+-------+-------------+--------------------+-------+------------+----------------+--------+-----+



In [None]:
## Utiliza Spark SQL para mostrar los registros donde el MatchID sea mayor o igual a 20

In [19]:
# Expose the DataFrame to Spark SQL under a temporary view name.
view_name = "temp_cuptable"
cup_df.createOrReplaceTempView(view_name)

# Query the view with plain SQL and print the first rows of the result.
match_id_query = "select * from temp_cuptable where MatchID >= 20"
spark.sql(match_id_query).show()

+-------+-------+-------------+-------------------+-------+------------+-----------------+--------+---------+
|RoundID|MatchID|Team Initials|         Coach Name|Line-up|Shirt Number|      Player Name|Position|    Event|
+-------+-------+-------------+-------------------+-------+------------+-----------------+--------+---------+
|    201|   1096|          FRA|CAUDRON Raoul (FRA)|      S|           0|      Alex THEPOT|      GK|     NULL|
|    201|   1096|          MEX|   LUQUE Juan (MEX)|      S|           0|  Oscar BONFIGLIO|      GK|     NULL|
|    201|   1096|          FRA|CAUDRON Raoul (FRA)|      S|           0| Marcel LANGILLER|    NULL|     G40'|
|    201|   1096|          MEX|   LUQUE Juan (MEX)|      S|           0|     Juan CARRENO|    NULL|     G70'|
|    201|   1096|          FRA|CAUDRON Raoul (FRA)|      S|           0|  Ernest LIBERATI|    NULL|     NULL|
|    201|   1096|          MEX|   LUQUE Juan (MEX)|      S|           0|     Rafael GARZA|       C|     NULL|
|    201| 