# Ejercicio aplicado de DataFrames y Spark SQL

In [1]:
import findspark
findspark.init()

import pandas as pd
import pyspark

In [None]:
from pyspark.sql import SparkSession

# Start (or reuse) a Spark session running locally with master "local[*]".
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("PySpark_Df")
    .getOrCreate()
)

In [None]:
## Load the World Cup players CSV and display the data.
## NOTE(review): the path below is absolute and machine-specific, while the
## exercise refers to "data/WorldCupPlayers.csv"; adjust it when running elsewhere.
df = spark.read.csv(
    path="C:/Users/Manuel/Desktop/Pyspark/data/WorldCupPlayers.csv",
    header=True,       # the first line holds the column names
    inferSchema=True,  # let Spark infer each column's type from the data
)
df.show()

In [None]:
## What data type does each column hold?
df.printSchema()

In [None]:
## How many records are there?
df.count()

In [None]:
## Get the main summary statistics for the Position column
df.describe('Position').show()

In [None]:
## Show the distinct combinations of 'Player Name' and 'Coach Name'
df.select(df["Player Name"], df["Coach Name"]).distinct().show()

In [None]:
## How many rows carry the MatchID 1096?
## NOTE(review): this counts the rows of df with that MatchID, not distinct matches.
df.where(df["MatchID"] == '1096').count()

In [None]:
## Show the rows where Position is C and Event is G40'
df.where((df["Position"] == 'C') & (df["Event"] == "G40'")).show()

In [None]:
## Use Spark SQL to show the records whose MatchID is greater than or equal to 20
# createOrReplaceTempView registers df under the given view name and returns
# None, so its result is not bound to a variable.
df.createOrReplaceTempView("temp_table")
spark.sql("select * from temp_table where MatchID >= 20").show()

In [None]:
import findspark
findspark.init()

import pandas as pd
import pyspark
from pyspark.sql import SparkSession

# Start (or reuse) a Spark session running locally with master "local[*]".
spark_int = (
    SparkSession.builder
    .master("local[*]")
    .appName("PySpark_Df")
    .getOrCreate()
)

# Sample people rows. NOTE: "null" here is an ordinary string, not None.
data = [
    ("James", "null", "Smith", "36636", "M", 60000),
    ("Michael", "Rose", "null", "40288", "M", 70000),
    ("Robert", "null", "Williams", "42114", "null", 400000),
    ("Maria", "Anne", "Jones", "39192", "F", 500000),
    ("Jen", "Mary", "Brown", "null", "F", 0),
]

# Column names, in the same order as the fields of each row tuple.
columns = [
    "first_name",
    "middle_name",
    "last_name",
    "dob",
    "gender",
    "salary",
]

# Build the DataFrame from the rows, then inspect its schema and contents.
df = spark_int.createDataFrame(data, columns)
df.printSchema()
df.show()

In [None]:
# findspark.init() has to run before anything is imported from pyspark: it is
# the step that makes the pyspark package importable in a fresh kernel.
import findspark

findspark.init()

import pyspark
from pyspark.sql import SparkSession
# The star import is kept because later cells call col, count, sum, max, min,
# avg and desc unqualified. Be aware that it shadows the built-in sum/max/min.
from pyspark.sql.functions import *

In [None]:
# Get the active SparkSession, creating one with default settings if none exists.
spark_core = SparkSession.builder.getOrCreate()

In [None]:
# Employee rows: (id, name, dept, salary).
data = [
    (1, "AAA", "dept1", 1000),
    (2, "BBB", "dept1", 1100),
    (3, "CCC", "dept1", 3000),
    (4, "DDD", "dept1", 1500),
    (5, "EEE", "dept2", 8000),
    (6, "FFF", "dept2", 7200),
    (7, "GGG", "dept3", 7100),
    (8, "HHH", "dept3", 3700),
    (9, "III", "dept3", 4500),
    (10, "JJJ", "dept5", 3400),
]

# Department rows: (id, name). Note that "dept5" above has no entry here and
# "dept4" here has no employee above.
dept = [
    ("dept1", "Departament - 1"),
    ("dept2", "Departament - 2"),
    ("dept3", "Departament - 3"),
    ("dept4", "Departament - 4"),
]

# One DataFrame per list, with explicit column names.
df = spark_core.createDataFrame(data, schema=["id", "name", "dept", "salary"])
df2 = spark_core.createDataFrame(dept, schema=["id", "name"])

In [None]:
# Preview the employee DataFrame.
df.show()

In [None]:
# Preview the department DataFrame.
df2.show()

# Operaciones

##### Count
- Cuenta el número de filas.

In [None]:
# Number of rows in df.
df.count()

##### Columns

In [None]:
# Column names of df.
df.columns

##### dtypes
- Accede al datatype de columnas dentro del dataframe

In [None]:
# Data type of each column of df.
df.dtypes

##### schema
- Comprueba cómo Spark almacena el esquema del DataFrame.

In [None]:
# Schema object that Spark stores for df.
df.schema

##### printSchema

In [None]:
# Print the schema of df in a human-readable form.
df.printSchema()

##### select
- Selecciona columnas del DataFrame.

In [None]:
# Keep only the "id" and "name" columns.
df.select("id", "name").show()

##### filter
- Filtra las filas según alguna condición.
- Intentemos encontrar las filas con id = 1.
- Hay diferentes formas de especificar la condición.

In [None]:
# Variant 1: condition built by indexing the DataFrame with the column name.
df.filter(df["id"] == 1).show()

In [None]:
# Variant 2: condition built with attribute access to the column.
df.filter(df.id == 1).show()

In [None]:
# Variant 3: condition built with the col() function from pyspark.sql.functions.
df.filter(col("id") == 1).show()

In [None]:
# Variant 4: condition given as a SQL expression string.
df.filter("id = 1").show()

##### drop
- Elimina una columna en particular

In [None]:
# drop() returns a new DataFrame without the "id" column; df keeps all its
# columns and is still used as-is by the cells below.
newdf = df.drop("id")
newdf.show()

##### Aggregations
- Podemos usar la función groupBy para agrupar los datos y luego usar la función "agg" para realizar la agregación de los datos agrupados.

In [None]:
# Per-department salary statistics. count/sum/max/min/avg here are the Spark
# column functions brought in by `from pyspark.sql.functions import *`, not
# the Python built-ins of the same name.
# The aggregated DataFrame is assigned first and shown separately: show()
# returns None, so chaining it into the assignment would leave df_modify
# holding None instead of the result.
df_modify = df.groupBy("dept").agg(
    count("salary").alias("count"),
    sum("salary").alias("sum"),
    max("salary").alias("max"),
    min("salary").alias("min"),
    avg("salary").alias("avg"),
)
df_modify.show()

##### Sorting 
- Ordena los datos según el "salario".
- De forma predeterminada, la ordenación se realiza en orden ascendente.

In [None]:
# Sort by "salary"; with no direction given the order is ascending.
df.sort("salary").show()

In [None]:
# Sort the data in descending order of "salary".
df.sort(desc("salary")).show()

##### Columnas derivadas
- Podemos usar la función "withColumn" para derivar una columna en función de las columnas existentes.

In [None]:
# Show df with an extra "bonus" column computed as 10% of "salary".
df.withColumn("bonus", col("salary") * .1).show()