# Hello World - Spark


https://spark.apache.org/docs/latest/api/python/user_guide/index.html

In [1]:
import os

os.environ['SPARK_HOME'] = 'C:\spark\spark-3.1.2-bin-hadoop2.7'
os.environ['JAVA_HOME'] = 'C:\Program Files\Java\jre1.8.0_361'
os.environ['HADOOP_HOME'] = 'C:\spark\spark-3.2.1-bin-hadoop2.7\hadoop'

import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.types import DoubleType, StringType
from pyspark.sql.functions import *



In [2]:
spark = SparkSession.builder\
    .master('local[*]')\
    .appName("Iniciando com Spark")\
    .config('spark.ui.port', '4050')\
    .getOrCreate()

spark

# Importando dados

In [7]:
seattle_library = spark.read.option("header","true").csv('Checkouts_by_Title.csv')
seattle_library.show()

+----------+------------+------------+------------+-------------+---------+--------------------+----+--------------------+--------------------+--------------------+---------------+
|UsageClass|CheckoutType|MaterialType|CheckoutYear|CheckoutMonth|Checkouts|               Title|ISBN|             Creator|            Subjects|           Publisher|PublicationYear|
+----------+------------+------------+------------+-------------+---------+--------------------+----+--------------------+--------------------+--------------------+---------------+
|  Physical|     Horizon|        BOOK|        2007|           12|        1|Opposite the Cros...|null|       Haymon, S. T.|Haymon S T Childh...| St. Martin's Press,|         c1988.|
|  Physical|     Horizon|        BOOK|        2007|           12|        1| Power of persuasion|null|                null|Love stories, Pol...|                null|           null|
|  Physical|     Horizon|        BOOK|        2007|           12|        1|Cinema nirvana en...

### Adicionando ID_Material

In [18]:
seattle_library = seattle_library\
                    .withColumn("ID_Material",
                                sha2(concat_ws("|","UsageClass","MaterialType", "Title", "ISBN", "Creator", "Subjects", "Publisher", "PublicationYear"),256))

seattle_library.show(10,False)

+----------+------------+------------+------------+-------------+---------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----+---------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------+---------------+----------------------------------------------------------------+
|UsageClass|CheckoutType|MaterialType|CheckoutYear|CheckoutMonth|Checkouts|Title                                                                                                                                                                                             

In [14]:
seattle_library.ID_Material = sha2(seattle_library.ID_Material,256)

In [15]:
seattle_library.show(10)

+----------+------------+------------+------------+-------------+---------+--------------------+----+--------------------+--------------------+--------------------+---------------+--------------------+
|UsageClass|CheckoutType|MaterialType|CheckoutYear|CheckoutMonth|Checkouts|               Title|ISBN|             Creator|            Subjects|           Publisher|PublicationYear|         ID_Material|
+----------+------------+------------+------------+-------------+---------+--------------------+----+--------------------+--------------------+--------------------+---------------+--------------------+
|  Physical|     Horizon|        BOOK|        2007|           12|        1|Opposite the Cros...|null|       Haymon, S. T.|Haymon S T Childh...| St. Martin's Press,|         c1988.|Physical|BOOK|Opp...|
|  Physical|     Horizon|        BOOK|        2007|           12|        1| Power of persuasion|null|                null|Love stories, Pol...|                null|           null|Physical|BOO

### Adicionando ID_Checkout

In [19]:
seattle_library = seattle_library.withColumn("ID_Checkout", monotonically_increasing_id())

seattle_library.show(10)

+----------+------------+------------+------------+-------------+---------+--------------------+----+--------------------+--------------------+--------------------+---------------+--------------------+-----------+
|UsageClass|CheckoutType|MaterialType|CheckoutYear|CheckoutMonth|Checkouts|               Title|ISBN|             Creator|            Subjects|           Publisher|PublicationYear|         ID_Material|ID_Checkout|
+----------+------------+------------+------------+-------------+---------+--------------------+----+--------------------+--------------------+--------------------+---------------+--------------------+-----------+
|  Physical|     Horizon|        BOOK|        2007|           12|        1|Opposite the Cros...|null|       Haymon, S. T.|Haymon S T Childh...| St. Martin's Press,|         c1988.|435cb8e8e17d73133...|          0|
|  Physical|     Horizon|        BOOK|        2007|           12|        1| Power of persuasion|null|                null|Love stories, Pol...| 

In [20]:
seattle_library.printSchema()


root
 |-- UsageClass: string (nullable = true)
 |-- CheckoutType: string (nullable = true)
 |-- MaterialType: string (nullable = true)
 |-- CheckoutYear: string (nullable = true)
 |-- CheckoutMonth: string (nullable = true)
 |-- Checkouts: string (nullable = true)
 |-- Title: string (nullable = true)
 |-- ISBN: string (nullable = true)
 |-- Creator: string (nullable = true)
 |-- Subjects: string (nullable = true)
 |-- Publisher: string (nullable = true)
 |-- PublicationYear: string (nullable = true)
 |-- ID_Material: string (nullable = true)
 |-- ID_Checkout: long (nullable = false)



### Contando quantidade de valores nulos por coluna

In [7]:
seattle_library.select([count(when(isnull(c),1)).alias(c) for c in seattle_library.columns]).show()

+----------+------------+------------+------------+-------------+---------+-----+--------+--------+--------+---------+---------------+------------------+-----------+
|UsageClass|CheckoutType|MaterialType|CheckoutYear|CheckoutMonth|Checkouts|Title|    ISBN| Creator|Subjects|Publisher|PublicationYear|ID_Material_String|ID_Checkout|
+----------+------------+------------+------------+-------------+---------+-----+--------+--------+--------+---------+---------------+------------------+-----------+
|         0|           0|           0|           0|            0|        0|    0|40244242|12769239| 1776845|  9517288|        9836061|                 0|          0|
+----------+------------+------------+------------+-------------+---------+-----+--------+--------+--------+---------+---------------+------------------+-----------+



In [8]:
seattle_library.select('*')\
    .orderBy('Title',asceding=False)\
    .show(10)


+----------+------------+------------+------------+-------------+---------+--------------------+----+-----------------+--------+--------------------+---------------+--------------------+------------+
|UsageClass|CheckoutType|MaterialType|CheckoutYear|CheckoutMonth|Checkouts|               Title|ISBN|          Creator|Subjects|           Publisher|PublicationYear|  ID_Material_String| ID_Checkout|
+----------+------------+------------+------------+-------------+---------+--------------------+----+-----------------+--------+--------------------+---------------+--------------------+------------+
|   Digital|     Freegal|        SONG|        2016|           10|        1|        ! (Foreword)|null|Pain Of Salvation|    null|                null|           null|Digital|SONG|! (F...|300647727668|
|   Digital|      Hoopla|       MUSIC|        2014|           11|        1|  !! Going Places !!|null|             null|     Pop|Shout Factory Rec...|           null|Digital|MUSIC|!! ...|214748801695|


In [9]:
seattle_library.select('CheckoutYear','Checkouts','Title')\
    .groupBy('CheckoutYear','Title')\
    .count()\
    .orderBy('count',asceding=False)\
    .show()

+------------+--------------------+-----+
|CheckoutYear|               Title|count|
+------------+--------------------+-----+
|        2008|    De mujer a mujer|    1|
|        2007|The virgin and th...|    1|
|        2008|Eternity Row a St...|    1|
|        2008|      Yoidore tenshi|    1|
|        2008|Bizarre beautiful...|    1|
|        2008|Frank O. Gehry : ...|    1|
|        2008|Contemporary art ...|    1|
|        2008|War and peace a n...|    1|
|        2008|Quite a year for ...|    1|
|        2008|Group homes for t...|    1|
|        2008|World encyclopedi...|    1|
|        2008|    Carnegies excuse|    1|
|        2008|The Civil War. To...|    1|
|        2008|   good day for soup|    1|
|        2008|       Tropical heat|    1|
|        2008|Guns in America a...|    1|
|        2008|Beckett in perfor...|    1|
|        2008|From prairie to p...|    1|
|        2008|The magic of apo ...|    1|
|        2008|       lepers return|    1|
+------------+--------------------

## Selecionando Dados

In [10]:
seattle_library\
    .select('*')

DataFrame[UsageClass: string, CheckoutType: string, MaterialType: string, CheckoutYear: string, CheckoutMonth: string, Checkouts: string, Title: string, ISBN: string, Creator: string, Subjects: string, Publisher: string, PublicationYear: string, ID_Material_String: string, ID_Checkout: bigint]

In [26]:
#Material
seattle_library_material = seattle_library\
            .select('ID_Material','UsageClass','Title','ISBN','Publisher','PublicationYear')

seattle_library_material.show(5)

+--------------------+----------+--------------------+----+-------------------+---------------+
|         ID_Material|UsageClass|               Title|ISBN|          Publisher|PublicationYear|
+--------------------+----------+--------------------+----+-------------------+---------------+
|435cb8e8e17d73133...|  Physical|Opposite the Cros...|null|St. Martin's Press,|         c1988.|
|8dd007d2173f57e16...|  Physical| Power of persuasion|null|               null|           null|
|6bf3ac5e1d619e457...|  Physical|Cinema nirvana en...|null|               null|           null|
|401b0417fcbc904b5...|  Physical| Nepodvedennye itogi|null|               null|           null|
|95686048521296ba2...|  Physical|Xu Zhian He Yunsh...|null|               null|           null|
+--------------------+----------+--------------------+----+-------------------+---------------+
only showing top 5 rows



In [45]:
seattle_library_material.select('Publisher').distinct().collect()

[Row(Publisher='Hyperion Books for Children,'),
 Row(Publisher='Martingale,'),
 Row(Publisher='University Press of Idaho,'),
 Row(Publisher='Crown Pub.,'),
 Row(Publisher='Lions Gate Entertainment,'),
 Row(Publisher="Putnam's,"),
 Row(Publisher='Dodd, Mead,'),
 Row(Publisher='Criterion Collection ; Distributed by Home Vision Entertainment,'),
 Row(Publisher='Syren Book Co.,'),
 Row(Publisher='Hammond, Hammond & Co.'),
 Row(Publisher='"""Vagrius""'),
 Row(Publisher='Jewish Publication Society of America,'),
 Row(Publisher='Morning Glory Publishers : Distributed by China International Book Trading Corp.,'),
 Row(Publisher='Nations Books,'),
 Row(Publisher='Hutchinson,'),
 Row(Publisher='Mango,'),
 Row(Publisher='Nhà xuất bản Mỹ thuật ; CTY Văn Hóa Minh Trí - NS. Văn Lang,'),
 Row(Publisher='Para Pub.,'),
 Row(Publisher='Fawcett Columbia,'),
 Row(Publisher='Goethe Johann Wolfgang von 1749 1832 M„rchen'),
 Row(Publisher='Twenty-first Century Books,'),
 Row(Publisher='Chess/MCA,'),
 Row(Pub

In [27]:
#Checkouts_Month
seattle_library_checkouts_month = seattle_library\
            .select('ID_Checkout','CheckoutType','CheckoutYear','CheckoutMonth','Checkouts','ID_Material')

seattle_library_checkouts_month.show(5)

+-----------+------------+------------+-------------+---------+--------------------+
|ID_Checkout|CheckoutType|CheckoutYear|CheckoutMonth|Checkouts|         ID_Material|
+-----------+------------+------------+-------------+---------+--------------------+
|          0|     Horizon|        2007|           12|        1|435cb8e8e17d73133...|
|          1|     Horizon|        2007|           12|        1|8dd007d2173f57e16...|
|          2|     Horizon|        2007|           12|        1|6bf3ac5e1d619e457...|
|          3|     Horizon|        2007|           12|        2|401b0417fcbc904b5...|
|          4|     Horizon|        2007|           12|        4|95686048521296ba2...|
+-----------+------------+------------+-------------+---------+--------------------+
only showing top 5 rows



In [28]:
#Material_Type
seattle_library_material_type = seattle_library\
        .select('MaterialType','ID_Material')\
        .withColumn("ID_MaterialType", monotonically_increasing_id())

seattle_library_material_type.show(5)

+------------+--------------------+---------------+
|MaterialType|         ID_Material|ID_MaterialType|
+------------+--------------------+---------------+
|        BOOK|435cb8e8e17d73133...|              0|
|        BOOK|8dd007d2173f57e16...|              1|
|        BOOK|6bf3ac5e1d619e457...|              2|
|        BOOK|401b0417fcbc904b5...|              3|
|   VIDEODISC|95686048521296ba2...|              4|
+------------+--------------------+---------------+
only showing top 5 rows



In [43]:
seattle_library_material_type.select('MaterialType').distinct().collect()

[Row(MaterialType='MICROFORM'),
 Row(MaterialType='GLOBE'),
 Row(MaterialType='REGPRINT, SOUNDDISC'),
 Row(MaterialType='BOOK'),
 Row(MaterialType='ER, VIDEOCASS'),
 Row(MaterialType='VIDEOREC'),
 Row(MaterialType='UNSPECIFIED'),
 Row(MaterialType='PICTURE, VIDEODISC'),
 Row(MaterialType='MAP, VIEW'),
 Row(MaterialType='SECTION'),
 Row(MaterialType='SONG'),
 Row(MaterialType='SOUNDCASS'),
 Row(MaterialType='COMPFILE'),
 Row(MaterialType='NONPROJGRAPH'),
 Row(MaterialType='KIT'),
 Row(MaterialType='SOUNDCASS, VIDEOCASS'),
 Row(MaterialType='VISUAL'),
 Row(MaterialType='CHART'),
 Row(MaterialType='ER, VIDEOREC'),
 Row(MaterialType='VIDEODISC'),
 Row(MaterialType='SLIDE, SOUNDCASS'),
 Row(MaterialType='LARGEPRINT'),
 Row(MaterialType='ER, MAP'),
 Row(MaterialType='SLIDE, VIDEOCASS'),
 Row(MaterialType='MUSIC'),
 Row(MaterialType='CR'),
 Row(MaterialType='MIXED'),
 Row(MaterialType='SOUNDDISC, SOUNDREC'),
 Row(MaterialType='EBOOK'),
 Row(MaterialType='PICTURE'),
 Row(MaterialType='ER, SOUN

In [31]:
#Creator
seattle_library_creator = seattle_library\
        .select('Creator','ID_Material')\
        .withColumn("ID_Creator", monotonically_increasing_id())

seattle_library_creator.show(10)

+--------------------+--------------------+----------+
|             Creator|         ID_Material|ID_Creator|
+--------------------+--------------------+----------+
|       Haymon, S. T.|435cb8e8e17d73133...|         0|
|                null|8dd007d2173f57e16...|         1|
|                null|6bf3ac5e1d619e457...|         2|
|                null|401b0417fcbc904b5...|         3|
|                null|95686048521296ba2...|         4|
|     Handler, Daniel|e2294ea01cff8690a...|         5|
|                null|6012d180ef2cabce9...|         6|
|Haywood, Carolyn,...|32d67b645caf07ed7...|         7|
|                null|148f940fa46e290b3...|         8|
|                null|6922005b309323114...|         9|
+--------------------+--------------------+----------+
only showing top 10 rows



In [42]:
seattle_library_creator.select('Creator').distinct().collect()

[Row(Creator='Fermor, Patrick Leigh'),
 Row(Creator='Dietz, Steven'),
 Row(Creator='Kling, Andrew A., 1961-'),
 Row(Creator='Resta, Monica'),
 Row(Creator='An, Dong.'),
 Row(Creator='Yeats, W. B. (William Butler), 1865-1939'),
 Row(Creator='Cathy Maxwell'),
 Row(Creator='Argueta, Jorge'),
 Row(Creator='Donald Calthrop'),
 Row(Creator='Koike, Mariko, 1952-'),
 Row(Creator='Hoffman, Mary, 1945-'),
 Row(Creator='Sortun, Ana, 1967-'),
 Row(Creator='McCain, John, 1936-'),
 Row(Creator='Ferrell, Miralee, 1953-'),
 Row(Creator='Speke, John Hanning, 1827-1864'),
 Row(Creator='Auden, W. H. (Wystan Hugh), 1907-1973'),
 Row(Creator='Kandel, Susan, 1961-'),
 Row(Creator='Justice, Daniel Heath'),
 Row(Creator='Banks, Iain, 1954-2013'),
 Row(Creator='Roters, Eberhard'),
 Row(Creator='Press, Petra'),
 Row(Creator='Anthony, Ted'),
 Row(Creator='MacGregor, Greg'),
 Row(Creator='Xuan, YongSheng'),
 Row(Creator='Robinson, Randall, 1941-'),
 Row(Creator='Aksenov, Vasiliĭ, 1932-2009'),
 Row(Creator='Kinerk

In [39]:
#Subjects
seattle_library_subjects = seattle_library\
    .select('Subjects','ID_Material')\
    .withColumn('Subjects',split(seattle_library.Subjects,","))

seattle_library_subjects.show(5,False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------+
|Subjects                                                                                                                                                                                                                                           |ID_Material                                                     |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------+
|[Haymon S T Childhood and youth,  Authors English 20th century Bio

https://spark.apache.org/docs/3.1.2/api/python/reference/api/pyspark.sql.functions.split.html

https://sparkbyexamples.com/pyspark/pyspark-split-dataframe-column-into-multiple-columns/

https://www.projectpro.io/recipes/define-split-function-pyspark

https://www.datasciencemadesimple.com/string-split-of-the-columns-in-pyspark/