# Library Checkouts - ETL Spark


https://spark.apache.org/docs/latest/api/python/user_guide/index.html

In [1]:
import os

os.environ['SPARK_HOME'] = 'C:\spark\spark-3.1.2-bin-hadoop2.7'
os.environ['JAVA_HOME'] = 'C:\Program Files\Java\jre1.8.0_361'
os.environ['HADOOP_HOME'] = 'C:\spark\spark-3.2.1-bin-hadoop2.7\hadoop'

import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.types import DoubleType, StringType
from pyspark.sql.functions import *



In [2]:
spark = SparkSession.builder\
    .master('local[*]')\
    .appName("Iniciando com Spark")\
    .config('spark.ui.port', '4050')\
    .getOrCreate()

spark

# Importando dados

In [3]:
seattle_library = spark.read.option("header","true").csv('Datasets\Checkouts_by_Title.csv')
seattle_library.show()

+----------+------------+------------+------------+-------------+---------+--------------------+----+--------------------+--------------------+--------------------+---------------+
|UsageClass|CheckoutType|MaterialType|CheckoutYear|CheckoutMonth|Checkouts|               Title|ISBN|             Creator|            Subjects|           Publisher|PublicationYear|
+----------+------------+------------+------------+-------------+---------+--------------------+----+--------------------+--------------------+--------------------+---------------+
|  Physical|     Horizon|        BOOK|        2007|           12|        1|Opposite the Cros...|null|       Haymon, S. T.|Haymon S T Childh...| St. Martin's Press,|         c1988.|
|  Physical|     Horizon|        BOOK|        2007|           12|        1| Power of persuasion|null|                null|Love stories, Pol...|                null|           null|
|  Physical|     Horizon|        BOOK|        2007|           12|        1|Cinema nirvana en...

### Adicionando ID_Material

In [4]:
seattle_library = seattle_library\
                    .withColumn("ID_Material_String",
                                concat_ws("|","UsageClass","MaterialType", "Title", "ISBN", "Creator", "Subjects", "Publisher", "PublicationYear"))

seattle_library.show(10,False)

+----------+------------+------------+------------+-------------+---------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----+---------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------+---------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

### Adicionando ID_Checkout

In [9]:
seattle_library = seattle_library.withColumn("ID_Checkout", monotonically_increasing_id())

seattle_library.show(10)

+----------+------------+------------+------------+-------------+---------+--------------------+----+--------------------+--------------------+--------------------+---------------+--------------------+-----------+
|UsageClass|CheckoutType|MaterialType|CheckoutYear|CheckoutMonth|Checkouts|               Title|ISBN|             Creator|            Subjects|           Publisher|PublicationYear|  ID_Material_String|ID_Checkout|
+----------+------------+------------+------------+-------------+---------+--------------------+----+--------------------+--------------------+--------------------+---------------+--------------------+-----------+
|  Physical|     Horizon|        BOOK|        2007|           12|        1|Opposite the Cros...|null|       Haymon, S. T.|Haymon S T Childh...| St. Martin's Press,|         c1988.|Physical|BOOK|Opp...|          0|
|  Physical|     Horizon|        BOOK|        2007|           12|        1| Power of persuasion|null|                null|Love stories, Pol...| 

In [10]:
seattle_library.printSchema()


root
 |-- UsageClass: string (nullable = true)
 |-- CheckoutType: string (nullable = true)
 |-- MaterialType: string (nullable = true)
 |-- CheckoutYear: string (nullable = true)
 |-- CheckoutMonth: string (nullable = true)
 |-- Checkouts: string (nullable = true)
 |-- Title: string (nullable = true)
 |-- ISBN: string (nullable = true)
 |-- Creator: string (nullable = true)
 |-- Subjects: string (nullable = true)
 |-- Publisher: string (nullable = true)
 |-- PublicationYear: string (nullable = true)
 |-- ID_Material_String: string (nullable = false)
 |-- ID_Checkout: long (nullable = false)



### Contando quantidade de valores nulos por coluna

In [25]:
seattle_library.select([count(when(isnull(c),1)).alias(c) for c in seattle_library.columns]).show()

+----------+------------+------------+------------+-------------+---------+-----+--------+--------+--------+---------+---------------+------------------+-----------+
|UsageClass|CheckoutType|MaterialType|CheckoutYear|CheckoutMonth|Checkouts|Title|    ISBN| Creator|Subjects|Publisher|PublicationYear|ID_Material_String|ID_Checkout|
+----------+------------+------------+------------+-------------+---------+-----+--------+--------+--------+---------+---------------+------------------+-----------+
|         0|           0|           0|           0|            0|        0|    0|40244242|12769239| 1776845|  9517288|        9836061|                 0|          0|
+----------+------------+------------+------------+-------------+---------+-----+--------+--------+--------+---------+---------------+------------------+-----------+



In [27]:
seattle_library.select('*')\
    .orderBy('Title',asceding=False)\
    .show(10)


+----------+------------+------------+------------+-------------+---------+--------------------+----+-----------------+--------+--------------------+---------------+--------------------+------------+
|UsageClass|CheckoutType|MaterialType|CheckoutYear|CheckoutMonth|Checkouts|               Title|ISBN|          Creator|Subjects|           Publisher|PublicationYear|  ID_Material_String| ID_Checkout|
+----------+------------+------------+------------+-------------+---------+--------------------+----+-----------------+--------+--------------------+---------------+--------------------+------------+
|   Digital|     Freegal|        SONG|        2016|           10|        1|        ! (Foreword)|null|Pain Of Salvation|    null|                null|           null|Digital|SONG|! (F...|300647727668|
|   Digital|      Hoopla|       MUSIC|        2014|           11|        1|  !! Going Places !!|null|             null|     Pop|Shout Factory Rec...|           null|Digital|MUSIC|!! ...|214748801695|


In [32]:
seattle_library.select('CheckoutYear','Checkouts','Title')\
    .groupBy('CheckoutYear','Title')\
    .count()\
    .orderBy('count',asceding=False)\
    .show()

+------------+--------------------+-----+
|CheckoutYear|               Title|count|
+------------+--------------------+-----+
|        2007|          next place|   11|
|        2007|Mazel tov Jewish ...|   11|
|        2007|X 1999 Vol 14 Con...|    9|
|        2007|pocket of tunes s...|    8|
|        2007|Iron orchid / Stu...|   12|
|        2007|If on a winter's ...|   12|
|        2007|The virgin and th...|    1|
|        2008|Things seen and u...|    6|
|        2008|ILLM Zhizn kak pe...|    2|
|        2008|Sex with the ligh...|   10|
|        2008|Katie John. Pictu...|    5|
|        2008|fascinating world...|    9|
|        2008|Telling stories [...|   12|
|        2008|Let me hear your ...|    8|
|        2008|Baths your guide ...|    8|
|        2008|David Burkes new ...|    4|
|        2008| Sixth grade secrets|    4|
|        2008|Heroes in Greek m...|   10|
|        2008|From head to toe ...|   11|
|        2008|Opportunities in ...|    4|
+------------+--------------------

## Selecionando Dados

In [5]:
seattle_library\
    .select('*')

DataFrame[UsageClass: string, CheckoutType: string, MaterialType: string, CheckoutYear: string, CheckoutMonth: string, Checkouts: string, Title: string, ISBN: string, Creator: string, Subjects: string, Publisher: string, PublicationYear: string]

In [15]:
#Material
seattle_library\
    .select('ID_Material_String','UsageClass','Title','ISBN','Publisher','PublicationYear')\
    .show(5)

+--------------------+----------+--------------------+----+-------------------+---------------+
|  ID_Material_String|UsageClass|               Title|ISBN|          Publisher|PublicationYear|
+--------------------+----------+--------------------+----+-------------------+---------------+
|Physical|BOOK|Opp...|  Physical|Opposite the Cros...|null|St. Martin's Press,|         c1988.|
|Physical|BOOK|Pow...|  Physical| Power of persuasion|null|               null|           null|
|Physical|BOOK|Cin...|  Physical|Cinema nirvana en...|null|               null|           null|
|Physical|BOOK|Nep...|  Physical| Nepodvedennye itogi|null|               null|           null|
|Physical|VIDEODIS...|  Physical|Xu Zhian He Yunsh...|null|               null|           null|
+--------------------+----------+--------------------+----+-------------------+---------------+
only showing top 5 rows



In [23]:
#Checkouts_Month
seattle_library\
    .select('ID_Checkout','CheckoutType','CheckoutYear','CheckoutMonth','Checkouts','ID_Material_String')\
    .show(5)

+-----------+------------+------------+-------------+---------+--------------------+
|ID_Checkout|CheckoutType|CheckoutYear|CheckoutMonth|Checkouts|  ID_Material_String|
+-----------+------------+------------+-------------+---------+--------------------+
|          0|     Horizon|        2007|           12|        1|Physical|BOOK|Opp...|
|          1|     Horizon|        2007|           12|        1|Physical|BOOK|Pow...|
|          2|     Horizon|        2007|           12|        1|Physical|BOOK|Cin...|
|          3|     Horizon|        2007|           12|        2|Physical|BOOK|Nep...|
|          4|     Horizon|        2007|           12|        4|Physical|VIDEODIS...|
+-----------+------------+------------+-------------+---------+--------------------+
only showing top 5 rows



In [19]:
#Material_Type
seattle_library\
    .select('MaterialType','ID_Material_String')\
    .show(5)

+------------+--------------------+
|MaterialType|  ID_Material_String|
+------------+--------------------+
|        BOOK|Physical|BOOK|Opp...|
|        BOOK|Physical|BOOK|Pow...|
|        BOOK|Physical|BOOK|Cin...|
|        BOOK|Physical|BOOK|Nep...|
|   VIDEODISC|Physical|VIDEODIS...|
+------------+--------------------+
only showing top 5 rows



In [20]:
#Creator
seattle_library\
    .select('Creator','ID_Material_String')\
    .show(5)

+-------------+--------------------+
|      Creator|  ID_Material_String|
+-------------+--------------------+
|Haymon, S. T.|Physical|BOOK|Opp...|
|         null|Physical|BOOK|Pow...|
|         null|Physical|BOOK|Cin...|
|         null|Physical|BOOK|Nep...|
|         null|Physical|VIDEODIS...|
+-------------+--------------------+
only showing top 5 rows



In [22]:
#Subjects
seattle_library\
    .select('Subjects','ID_Material_String')\
    .show(5,False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Subjects                                                                                                                                                                                                                                     |ID_Material_String                                                                                                                                                                                                                                       