In [89]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
from pyspark.sql import DataFrameWriter
import os

In [90]:
spark = SparkSession.builder\
.appName("Good Reads")\
.getOrCreate()

Crear el esquema del dataframe único formado por la unión de todos los dataframes.

In [91]:
schema = StructType([StructField("Id", IntegerType(), False),
                          StructField("Name", StringType(), True),
                          StructField("Authors", StringType(), True),
                          StructField("PagesNumber", IntegerType(), True),
                          StructField("RatingDist1", StringType(), True),
                          StructField("RatingDist2", StringType(), True),
                          StructField("RatingDist3", StringType(), True),
                          StructField("RatingDist4", StringType(), True),
                          StructField("RatingDist5", StringType(), True),
                          StructField("RatingDistTotal", StringType(), True),
                          StructField("PublishDay", StringType(), True),
                          StructField("PublishMonth", StringType(), True),
                          StructField("PublishYear", StringType(), True),
                          StructField("Publisher", StringType(), True),
                          StructField("CountsOfReview", IntegerType(), True),
                          StructField("Language", StringType(), True),
                          StructField("Rating", DoubleType(), True),
                          StructField("ISBN", StringType(), True)])

Cargar en una lista de dataframes todos los archivos .csv del directorio books

In [92]:
path = 'books'
booklist = list()
dataframelist = list()

for file in os.listdir(path):
    booklist.append(file)
    
for book in booklist:
    dataframelist.append(
     spark.read.format("csv")
            .option("header", "true")
            .option("quote", "\"")
            .option("escape", "\"")
            .option("ignoreLeadingWhiteSpace", "true")
            .option("ignoreTrailingWhiteSpace", "true")
            .option("multiLine", "true")
            .load("books/"+ book)
    )

Crear dataframe vacío con el esquema definido anteriormente

In [93]:
dataframe = spark.createDataFrame(spark.sparkContext.emptyRDD(), schema)

In [94]:
for dframe in dataframelist:
    dataframe =  dataframe.unionByName(dframe, allowMissingColumns = True)

EJERCICIO 1

In [95]:
dataframe.agg(mean("Rating").alias("Rating promedio")).show()

+-----------------+
|  Rating promedio|
+-----------------+
|2.910691632055565|
+-----------------+



EJERCICIO 2 

In [96]:
dataframe.groupBy("Authors").agg(mean("Rating").alias("Rating promedio")).show()

+--------------------+------------------+
|             Authors|   Rating promedio|
+--------------------+------------------+
|Michael Eliot Howard|3.8511538461538466|
|           Neal Ford|              3.37|
|       Rachel Andrew|              3.13|
|          James Frey|3.7236363636363636|
|         Bill Bright|3.4657894736842105|
|     Eric Klinenberg|              3.63|
|     Karen Armstrong|3.9357627118644074|
|          Dava Sobel|            3.7875|
|        Richard King|3.3862499999999995|
|        Joseph Monti|               4.0|
|        Philip Hofer|              1.75|
|         Ann Beattie| 3.473421052631579|
|          Jack Ahern|               2.5|
|        Steve Kokker|               3.9|
|        Diane Conway|              3.04|
|       Julian Harvey|               0.0|
|     Thomas S. Mowle|1.3333333333333333|
|        Helena Grice|            4.1325|
|        Sarah Leslie| 4.066666666666666|
|     Richard T. Ryan|              4.07|
+--------------------+------------

EJERCICIO 3

In [97]:
dataframe.groupBy("Publisher").agg(mean("Rating").alias("Rating promedio")).show()

+--------------------+------------------+
|           Publisher|   Rating promedio|
+--------------------+------------------+
|       The New Press| 3.766389891696751|
|        Chosen Books| 3.778565573770492|
|            Capstone|2.6408407079646015|
|      Celestial Arts|3.3293503937007873|
|        Lorenz Books| 3.000905923344948|
|Gerald Duckworth ...|2.7852499999999996|
|                 DAW|3.7503158933859826|
|       Digireads.com|3.8898404255319154|
|John Benjamins Pu...|1.5163268062120192|
|           IVP Books|3.7881268011527385|
|               Ember| 3.751428571428572|
|Triumph Entertain...|2.5866666666666664|
| Orange Frazer Press|2.8805479452054796|
|       No Exit Press|              3.75|
|Chicago Review Press|3.3994816053511703|
|     Dance Books Ltd|         2.5921875|
| Harvard Art Museums|2.6578947368421053|
|                 HQN|3.8506956521739126|
|Arcadia Publishin...| 3.159514348785872|
|Hachette Littérature| 3.545315315315315|
+--------------------+------------

EJERCICIO 4

In [98]:
dataframe.agg(mean("PagesNumber").alias("Numero de páginas promedio de todos los libros")).show()

+----------------------------------------------+
|Numero de páginas promedio de todos los libros|
+----------------------------------------------+
|                             277.0461801593246|
+----------------------------------------------+



EJERCICIO 5

In [66]:
dataframe.groupBy("Authors").agg(mean("PagesNumber").alias("Numero de páginas promedio de todos los libros")).show()

+--------------------+----------------------------------------------+
|             Authors|Numero de páginas promedio de todos los libros|
+--------------------+----------------------------------------------+
|Michael Eliot Howard|                            224.57692307692307|
|           Neal Ford|                                         339.0|
|       Rachel Andrew|                            407.72727272727275|
|          James Frey|                            441.09090909090907|
|         Bill Bright|                            278.39622641509436|
|     Eric Klinenberg|                                         309.0|
|     Karen Armstrong|                             330.6896551724138|
|          Dava Sobel|                            254.40740740740742|
|        Richard King|                                       309.125|
|        Joseph Monti|                                         365.0|
|        Philip Hofer|                                         115.0|
|         Ann Beatti

EJERCICIO 6

In [99]:
dataframe.groupBy("Publisher").agg(mean("PagesNumber").alias("Numero de páginas promedio de todos los libros")).show()

+--------------------+----------------------------------------------+
|           Publisher|Numero de páginas promedio de todos los libros|
+--------------------+----------------------------------------------+
|       The New Press|                             290.4241877256318|
|        Chosen Books|                            219.05327868852459|
|            Capstone|                            155.35398230088495|
|      Celestial Arts|                            194.23818897637796|
|        Lorenz Books|                             205.3472706155633|
|Gerald Duckworth ...|                            249.83333333333334|
|                 DAW|                            357.09674234945703|
|       Digireads.com|                            144.15425531914894|
|John Benjamins Pu...|                            325.10195813639433|
|           IVP Books|                            193.17867435158502|
|               Ember|                                         305.2|
|Triumph Entertain..

EJERCICIO 7

In [70]:
dataframe.groupBy("Authors").agg(count("Id").alias("Número de libros publicados")).show()

+--------------------+---------------------------+
|             Authors|Número de libros publicados|
+--------------------+---------------------------+
|Michael Eliot Howard|                         26|
|           Neal Ford|                          4|
|       Rachel Andrew|                         11|
|          James Frey|                         11|
|         Bill Bright|                         53|
|     Eric Klinenberg|                          4|
|     Karen Armstrong|                         58|
|          Dava Sobel|                         27|
|        Richard King|                          8|
|        Joseph Monti|                          1|
|        Philip Hofer|                          2|
|         Ann Beattie|                         38|
|          Jack Ahern|                          1|
|        Steve Kokker|                          3|
|        Diane Conway|                          3|
|       Julian Harvey|                          1|
|     Thomas S. Mowle|         

EJERCICIO 8

In [88]:
dataframe.select("Name","Rating")\
 .orderBy(desc(col("Rating")))\
 .show(15, True)

+--------------------+---------------+
|                Name|         Rating|
+--------------------+---------------+
|The Da Vinci Code...|really liked it|
| Memoirs of a Geisha|really liked it|
|The Da Vinci Code...|really liked it|
|The Da Vinci Code...|really liked it|
|The Da Vinci Code...|really liked it|
|     Of Mice and Men|really liked it|
|The Da Vinci Code...|really liked it|
|     Of Mice and Men|really liked it|
|The Da Vinci Code...|really liked it|
|     Of Mice and Men|really liked it|
|The Da Vinci Code...|really liked it|
|The Da Vinci Code...|really liked it|
| Memoirs of a Geisha|really liked it|
|The Da Vinci Code...|really liked it|
| Memoirs of a Geisha|really liked it|
+--------------------+---------------+
only showing top 15 rows

