In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType

In [None]:
spark = (SparkSession.builder.appName("SalesAnalytics").getOrCreate())

# **Preparacion de los datos**

In [None]:
schema = StructType([
    StructField("Order ID", StringType(), True),
    StructField("Product", StringType(), True),
    StructField("Quantity Ordered", StringType(), True),
    StructField("Price Each", StringType(), True),
    StructField("Order Date", StringType(), True),
    StructField("Purchase Adress", StringType(), True),
])

In [None]:
sales_data_fpath = "./salesdata"
sales_raw_df = (
spark.read.format("csv")
       .option("header", True)
       .schema(schema)
       .load(sales_data_fpath)
)

In [None]:
sales_raw_df.show(10)

In [None]:
sales_raw_df.printSchema()

# **Trabajar con Operaciones Estrucuturadas**

#### Leer archivos JSON

In [None]:
from pyspark.sql.types import ArrayType, FloatType, DateType, BooleanType, IntegerType

In [None]:
persons_schema = StructType([
    StructField("id", IntegerType(),True),
    StructField("first_name", StringType(),True),
    StructField("last_name", StringType(),True),
    StructField("fav_movies", ArrayType(StringType()),True),
    StructField("salary", FloatType(),True),
    StructField("image_url", StringType(),True),
    StructField("date_of_birth", DateType(),True),
    StructField("active", BooleanType(),True)
])

In [None]:
json_file_path = './data/persons.json'
persons_df = (spark.read.json(json_file_path, persons_schema, multiLine="True"))

In [None]:
persons_df.printSchema()

In [None]:
persons_df.show(10)

In [None]:
persons_df.show(10, truncate=False)

#### **Columnas y expresiones**

In [None]:
from pyspark.sql.functions import col, expr

col()Es más eficiente y directo.

In [None]:
persons_df.select(col("first_name"), col("last_name"), col("date_of_birth")).show(5)

In [None]:
persons_df.select(expr("first_name"), expr("last_name"), expr("date_of_birth")).show(5)

- Parsea la cadena como una expresión SQL
- Menos eficiente quecol()
- Útil cuando necesitas hacer transformaciones más complejas

##### Concatenar, hacer cálculos, etc.
df.select(
    expr("first_name || ' ' || last_name AS full_name"),
    expr("age + 1 AS next_year_age")
)

In [None]:
from pyspark.sql.functions import concat_ws, round

In [33]:
(persons_df.select(
    col("first_name"), 
    col("last_name"),
    concat_ws('', col("first_name"), col("last_name")).alias("full_name"),
    col("salary"),
    (col("salary")* 0.10 + col("salary")).alias("salary_calculate")
).withColumn("salary_increase", round(col("salary_calculate"), 3))
                .select("full_name", "salary", "salary_increase"
                )).show(10) 

+---------------+-------+---------------+
|      full_name| salary|salary_increase|
+---------------+-------+---------------+
|     DrucyPoppy|1463.36|       1609.696|
|   EmelyneBlaza|3006.04|       3306.644|
|      MaxRettie|1422.88|       1565.168|
|     IlarioKean|3561.36|       3917.496|
|    ToddyDrexel|4934.87|       5428.357|
| OswaldPetrolli|1153.23|       1268.553|
|   AdrianClarey|1044.73|       1149.203|
|DominicaGoodnow|1147.76|       1262.536|
|   EmorySlocomb|1082.11|       1190.321|
|   JeremiasBode|3472.63|       3819.893|
+---------------+-------+---------------+
only showing top 10 rows



In [None]:
pip install autopep8

In [32]:
(persons_df.select(
    col("first_name"), 
    col("last_name"),
    concat_ws('', col("first_name"), col("last_name")).alias("full_name"),
    col("salary"),
    expr("salary * 0.10 + salary").alias("salary_calculate")
).withColumn("salary_increase", round(col("salary_calculate"), 3))
                .select("full_name", "salary", "salary_increase"
                )).show(10)

+---------------+-------+---------------+
|      full_name| salary|salary_increase|
+---------------+-------+---------------+
|     DrucyPoppy|1463.36|       1609.696|
|   EmelyneBlaza|3006.04|       3306.644|
|      MaxRettie|1422.88|       1565.168|
|     IlarioKean|3561.36|       3917.496|
|    ToddyDrexel|4934.87|       5428.357|
| OswaldPetrolli|1153.23|       1268.553|
|   AdrianClarey|1044.73|       1149.203|
|DominicaGoodnow|1147.76|       1262.536|
|   EmorySlocomb|1082.11|       1190.321|
|   JeremiasBode|3472.63|       3819.893|
+---------------+-------+---------------+
only showing top 10 rows



In [34]:
persons_df.filter('salary <= 3000').show(10)

+---+----------+-----------+--------------------+-------+--------------------+-------------+------+
| id|first_name|  last_name|          fav_movies| salary|           image_url|date_of_birth|active|
+---+----------+-----------+--------------------+-------+--------------------+-------------+------+
|  1|     Drucy|      Poppy|  [I giorni contati]|1463.36|http://dummyimage...|   1991-02-16|  true|
|  3|       Max|     Rettie|[The Forgotten Sp...|1422.88|http://dummyimage...|   1990-03-03| false|
|  6|    Oswald|   Petrolli|[Wing and the Thi...|1153.23|http://dummyimage...|   1986-09-02| false|
|  7|    Adrian|     Clarey|[Walking Tall, Pa...|1044.73|http://dummyimage...|   1971-08-24| false|
|  8|  Dominica|    Goodnow|    [Hearts Divided]|1147.76|http://dummyimage...|   1973-08-27| false|
|  9|     Emory|    Slocomb|[Snake and Crane ...|1082.11|http://dummyimage...|   1974-06-08|  true|
| 11|   Timothy|     Ervine|[Land of the Lost...|1147.61|http://dummyimage...|   1971-06-02| false|


In [35]:
persons_df.where('salary <= 3000').show(10)

+---+----------+-----------+--------------------+-------+--------------------+-------------+------+
| id|first_name|  last_name|          fav_movies| salary|           image_url|date_of_birth|active|
+---+----------+-----------+--------------------+-------+--------------------+-------------+------+
|  1|     Drucy|      Poppy|  [I giorni contati]|1463.36|http://dummyimage...|   1991-02-16|  true|
|  3|       Max|     Rettie|[The Forgotten Sp...|1422.88|http://dummyimage...|   1990-03-03| false|
|  6|    Oswald|   Petrolli|[Wing and the Thi...|1153.23|http://dummyimage...|   1986-09-02| false|
|  7|    Adrian|     Clarey|[Walking Tall, Pa...|1044.73|http://dummyimage...|   1971-08-24| false|
|  8|  Dominica|    Goodnow|    [Hearts Divided]|1147.76|http://dummyimage...|   1973-08-27| false|
|  9|     Emory|    Slocomb|[Snake and Crane ...|1082.11|http://dummyimage...|   1974-06-08|  true|
| 11|   Timothy|     Ervine|[Land of the Lost...|1147.61|http://dummyimage...|   1971-06-02| false|


In [36]:
persons_df.where((col("salary") <= 3000) & (col("active") == True)).show(10)

+---+----------+---------+--------------------+-------+--------------------+-------------+------+
| id|first_name|last_name|          fav_movies| salary|           image_url|date_of_birth|active|
+---+----------+---------+--------------------+-------+--------------------+-------------+------+
|  1|     Drucy|    Poppy|  [I giorni contati]|1463.36|http://dummyimage...|   1991-02-16|  true|
|  9|     Emory|  Slocomb|[Snake and Crane ...|1082.11|http://dummyimage...|   1974-06-08|  true|
| 16|   Margaux| Archbold|[And Now a Word f...|1013.75|http://dummyimage...|   1988-07-29|  true|
| 26|     Clive|      Lax|             [Rabid]|2126.87|http://dummyimage...|   1981-10-26|  true|
| 33|  Sherline|  Primett|   [Jungle Fighters]|2309.39|http://dummyimage...|   1972-07-23|  true|
| 34|     Davis|    Pinks|          [Hounddog]|1337.14|http://dummyimage...|   1989-07-27|  true|
| 37|    Carlen|  Sharply|[Dr. Jekyll and M...|2051.85|http://dummyimage...|   2002-06-01|  true|
| 40|    Jordan|   L

In [38]:
from pyspark.sql.functions import year

In [41]:
persons_df.filter((year("date_of_birth") == 2000) | (year("date_of_birth") == 1989)).show()

+---+----------+-----------+--------------------+-------+--------------------+-------------+------+
| id|first_name|  last_name|          fav_movies| salary|           image_url|date_of_birth|active|
+---+----------+-----------+--------------------+-------+--------------------+-------------+------+
| 14|   Ambrosi|   Vidineev|[Wall Street: Mon...|4550.88|http://dummyimage...|   1989-07-20|  true|
| 15|    Feodor|Nancekivell|   [Monsoon Wedding]|2218.46|http://dummyimage...|   2000-10-07| false|
| 18|     Alfie|   Hatliffe|     [Lord of Tears]| 3893.1|http://dummyimage...|   1989-06-21|  true|
| 25|     Kelcy|     Wogdon|    [Iron Mask, The]|4512.51|http://dummyimage...|   2000-10-20|  true|
| 32|      Redd|   Akenhead|[Century of the D...| 2470.9|http://dummyimage...|   2000-06-05| false|
| 34|     Davis|      Pinks|          [Hounddog]|1337.14|http://dummyimage...|   1989-07-27|  true|
| 61|    Shanna|    Samples|[Thomas in Love (...| 2703.0|http://dummyimage...|   1989-07-07| false|


In [42]:
from pyspark.sql.functions import array_contains

In [46]:
persons_df.where(array_contains(persons_df.fav_movies, "Lord of Tears")).show()

+---+----------+---------+---------------+------+--------------------+-------------+------+
| id|first_name|last_name|     fav_movies|salary|           image_url|date_of_birth|active|
+---+----------+---------+---------------+------+--------------------+-------------+------+
| 18|     Alfie| Hatliffe|[Lord of Tears]|3893.1|http://dummyimage...|   1989-06-21|  true|
+---+----------+---------+---------------+------+--------------------+-------------+------+



#### Distinct, Drop Duplicates, Order By

In [47]:
from pyspark.sql.functions import count, desc

In [48]:
persons_df.select("active").show(10)

+------+
|active|
+------+
|  true|
| false|
| false|
|  true|
|  true|
| false|
| false|
| false|
|  true|
|  true|
+------+
only showing top 10 rows



In [49]:
persons_df.select("active").distinct().show()

+------+
|active|
+------+
|  true|
| false|
+------+



In [50]:
(persons_df.select(col("first_name"),
                  year(col("date_of_birth")).alias("year"),
                  col("active")).orderBy("year", "first_name")).show(10)

+----------+----+------+
|first_name|year|active|
+----------+----+------+
|    Adrian|1971| false|
|   Feodora|1971|  true|
|       Sky|1971| false|
|   Timothy|1971| false|
|    Lucita|1972|  true|
|      Rodi|1972| false|
|  Sherline|1972|  true|
|     Toddy|1972|  true|
|  Dominica|1973| false|
|    Kelila|1973|  true|
+----------+----+------+
only showing top 10 rows



In [51]:
dropped_df = (persons_df.select(col("first_name"),
                               year(col("date_of_birth")).alias("year"),
                               col("active")).dropDuplicates(["year","active"])).orderBy("year","first_name")

In [52]:
dropped_df.show()

+----------+----+------+
|first_name|year|active|
+----------+----+------+
|    Adrian|1971| false|
|   Feodora|1971|  true|
|      Rodi|1972| false|
|  Sherline|1972|  true|
|  Dominica|1973| false|
|    Kelila|1973|  true|
|   Balduin|1974| false|
|     Emory|1974|  true|
|    Janean|1975|  true|
|       Bev|1976|  true|
| Franciska|1976| false|
|     Johny|1977| false|
|    Daveta|1978| false|
|   Guthrie|1978|  true|
|      Maxi|1979| false|
|   Melinda|1979|  true|
|    Carter|1980| false|
|   Loralyn|1980|  true|
|     Clive|1981|  true|
|   Leanora|1981| false|
+----------+----+------+
only showing top 20 rows



#### Uniones y Filas

In [53]:
from pyspark.sql import Row

In [73]:
person_row = Row(101, "Robert", "Ownes", ["Men in Black III", "Home Alone"], 4300.64, "http://someimage.com", "1964-08-18", True)

In [74]:
persons_row_list = [Row(102, "Kenny", "Bobien",["Men in Black III", "Home Alone"],4300.64, "http://someimage.com", "1964-08-18", True),
                      Row(103, "Sara", "Devine",["Men in Black III", "Home Alone"],4300.64, "http://someimage.com", "1964-08-18", True)]

In [75]:
persons_row_list.append(person_row)

In [76]:
print(persons_row_list)

[<Row(102, 'Kenny', 'Bobien', ['Men in Black III', 'Home Alone'], 4300.64, 'http://someimage.com', '1964-08-18', True)>, <Row(103, 'Sara', 'Devine', ['Men in Black III', 'Home Alone'], 4300.64, 'http://someimage.com', '1964-08-18', True)>, <Row(101, 'Robert', 'Ownes', ['Men in Black III', 'Home Alone'], 4300.64, 'http://someimage.com', '1964-08-18', True)>]


In [77]:
person_row[1]

'Robert'

In [78]:
new_persons_df = spark.createDataFrame(persons_row_list, ["id", "first_name", "last_name", "fav_movies", "salary", "image_url", "date_of_birth", "active"])

In [79]:
add_persons_df = persons_df.union(new_persons_df)
add_persons_df.sort(desc("id")).show()

+---+----------+---------+--------------------+------------------+--------------------+-------------+------+
| id|first_name|last_name|          fav_movies|            salary|           image_url|date_of_birth|active|
+---+----------+---------+--------------------+------------------+--------------------+-------------+------+
|103|      Sara|   Devine|[Men in Black III...|           4300.64|http://someimage.com|   1964-08-18|  true|
|102|     Kenny|   Bobien|[Men in Black III...|           4300.64|http://someimage.com|   1964-08-18|  true|
|101|    Robert|    Ownes|[Men in Black III...|           4300.64|http://someimage.com|   1964-08-18|  true|
|100|    Virgie| Domanski|[Horseman, The, S...| 2165.929931640625|http://dummyimage...|   2002-01-05|  true|
| 99|   Rozalie|   Wannop|[Suddenly, The No...|1259.6400146484375|http://dummyimage...|   1997-03-25| false|
| 98|     Davin|     Labb|[Viva Riva!, Kill...| 1452.739990234375|http://dummyimage...|   1988-01-27|  true|
| 97|      Rodi|   

In [80]:
from pyspark.sql.functions import round

In [83]:
aug_persons_df1 = persons_df.withColumn("salary_increase", expr("salary * 0.10 + salary"))
aug_persons_df1.show(10)

+---+----------+---------+--------------------+-------+--------------------+-------------+------+------------------+
| id|first_name|last_name|          fav_movies| salary|           image_url|date_of_birth|active|   salary_increase|
+---+----------+---------+--------------------+-------+--------------------+-------------+------+------------------+
|  1|     Drucy|    Poppy|  [I giorni contati]|1463.36|http://dummyimage...|   1991-02-16|  true|1609.6959838867188|
|  2|   Emelyne|    Blaza|[Musketeer, The, ...|3006.04|http://dummyimage...|   1991-11-02| false|  3306.64404296875|
|  3|       Max|   Rettie|[The Forgotten Sp...|1422.88|http://dummyimage...|   1990-03-03| false|1565.1680053710938|
|  4|    Ilario|     Kean|[Up Close and Per...|3561.36|http://dummyimage...|   1987-06-09|  true|3917.4961181640624|
|  5|     Toddy|   Drexel|[Walk in the Clou...|4934.87|http://dummyimage...|   1992-10-28|  true|  5428.35712890625|
|  6|    Oswald| Petrolli|[Wing and the Thi...|1153.23|http://du

In [85]:
aug_persons_df1.columns

['id',
 'first_name',
 'last_name',
 'fav_movies',
 'salary',
 'image_url',
 'date_of_birth',
 'active',
 'salary_increase']

In [93]:
aug_persons_df2 = (aug_persons_df1
                   .withColumn("birth_year", year("date_of_birth"))
                   .withColumnRenamed("fav_movies", "movies")
                   .withColumn("salary_x10", round(col("salary_increase"),3))
                   .drop("salary_increase"))

In [94]:
aug_persons_df2.show(10)

+---+----------+---------+--------------------+-------+--------------------+-------------+------+----------+----------+
| id|first_name|last_name|              movies| salary|           image_url|date_of_birth|active|birth_year|salary_x10|
+---+----------+---------+--------------------+-------+--------------------+-------------+------+----------+----------+
|  1|     Drucy|    Poppy|  [I giorni contati]|1463.36|http://dummyimage...|   1991-02-16|  true|      1991|  1609.696|
|  2|   Emelyne|    Blaza|[Musketeer, The, ...|3006.04|http://dummyimage...|   1991-11-02| false|      1991|  3306.644|
|  3|       Max|   Rettie|[The Forgotten Sp...|1422.88|http://dummyimage...|   1990-03-03| false|      1990|  1565.168|
|  4|    Ilario|     Kean|[Up Close and Per...|3561.36|http://dummyimage...|   1987-06-09|  true|      1987|  3917.496|
|  5|     Toddy|   Drexel|[Walk in the Clou...|4934.87|http://dummyimage...|   1992-10-28|  true|      1992|  5428.357|
|  6|    Oswald| Petrolli|[Wing and the 