# Aula 02 - Framework de Big Data


### Instalando a biblioteca PySpark


In [1]:
# também foi instalado o findspark / pip install findspark - através de 'New Terminal'


In [2]:
#Importando o findspark
 
import findspark
findspark.init
from pyspark.sql import SparkSession
spark = SparkSession.builder.master('local[*]').getOrCreate()


In [3]:
#  Testando o uso no SQL no Python!

df = spark.sql('''select 'Sucesso total, estamos online' as hello''')
df.show()


+--------------------+
|               hello|
+--------------------+
|Sucesso total, es...|
+--------------------+



### Instalação das Bibliotecas

In [5]:
# Fazendo referência do PySpark.sql para algumas funções específicas que existem pra tratamento de dados

# Import spark libraries

from pyspark.sql import Row, DataFrame
from pyspark.sql.types import StringType, StructType, StructField, IntegerType
from pyspark.sql.functions import col, expr, lit, substring, concat, concat_ws, when, coalesce
from pyspark.sql import functions as F # for more sql functions
from functools import reduce


# Data Manipulation using Spark

In [6]:
df = spark.read.csv('Framework_A01_banklist.csv', sep = ',', inferSchema= True, header= True)

print('df.count:' , df.count())
print('df.col ct: ', len(df.columns))
print('df.columns: ', df.columns)

# Aqui teremos o resultado da quantidade de linhas, quantidade de colunas e quais são as colunas.


df.count: 561
df.col ct:  6
df.columns:  ['Bank Name', 'City', 'ST', 'CERT', 'Acquiring Institution', 'Closing Date']


## Using SQL in PySpark

In [8]:
df.createOrReplaceTempView('banklist')

df_check = spark.sql('''select 'BankName', City, 'Closing Date' from banklist''')
df_check.show(4, truncate=False)


+--------+-------------+------------+
|BankName|City         |Closing Date|
+--------+-------------+------------+
|BankName|Barboursville|Closing Date|
|BankName|Ericson      |Closing Date|
|BankName|Newark       |Closing Date|
|BankName|Maumee       |Closing Date|
+--------+-------------+------------+
only showing top 4 rows



## DataFrame Basic Operations

In [9]:
df.describe().show()


+-------+--------------------+-------+----+-----------------+---------------------+------------+
|summary|           Bank Name|   City|  ST|             CERT|Acquiring Institution|Closing Date|
+-------+--------------------+-------+----+-----------------+---------------------+------------+
|  count|                 561|    561| 561|              561|                  561|         561|
|   mean|                NULL|   NULL|NULL|31685.68449197861|                 NULL|        NULL|
| stddev|                NULL|   NULL|NULL|16446.65659309965|                 NULL|        NULL|
|    min|1st American Stat...|Acworth|  AL|               91|      1st United Bank|    1-Aug-08|
|    max|               ebank|Wyoming|  WY|            58701|  Your Community Bank|    9-Sep-11|
+-------+--------------------+-------+----+-----------------+---------------------+------------+



In [12]:
df.describe('City', 'ST').show()

# selecionando apenas 2 colunas do DataSet / para saber a estatística (describe)


+-------+-------+----+
|summary|   City|  ST|
+-------+-------+----+
|  count|    561| 561|
|   mean|   NULL|NULL|
| stddev|   NULL|NULL|
|    min|Acworth|  AL|
|    max|Wyoming|  WY|
+-------+-------+----+



## Count, Columns and Schema

In [17]:
print('Total de Linhas:' , df.count())
print('Total de Colunas: ', len(df.columns))
print('Colunas :', df.columns)
print('Tipo de Dados :', df.dtypes)
print('Schema : ', df.schema)


Total de Linhas: 561
Total de Colunas:  6
Colunas : ['Bank Name', 'City', 'ST', 'CERT', 'Acquiring Institution', 'Closing Date']
Tipo de Dados : [('Bank Name', 'string'), ('City', 'string'), ('ST', 'string'), ('CERT', 'int'), ('Acquiring Institution', 'string'), ('Closing Date', 'string')]
Schema :  StructType([StructField('Bank Name', StringType(), True), StructField('City', StringType(), True), StructField('ST', StringType(), True), StructField('CERT', IntegerType(), True), StructField('Acquiring Institution', StringType(), True), StructField('Closing Date', StringType(), True)])


In [18]:
df.printSchema()


root
 |-- Bank Name: string (nullable = true)
 |-- City: string (nullable = true)
 |-- ST: string (nullable = true)
 |-- CERT: integer (nullable = true)
 |-- Acquiring Institution: string (nullable = true)
 |-- Closing Date: string (nullable = true)



## Remove Duplicates

In [19]:
df = df.dropDuplicates()
print('df.count :', df.count())
print('df.columns :', df.columns)


df.count : 561
df.columns : ['Bank Name', 'City', 'ST', 'CERT', 'Acquiring Institution', 'Closing Date']


Após resultado acima, conseguimos saber que o df não possui linhas duplicadas, pois no início da explaração dos dados verificamos que o df possui 561 linhas.

## Select Specific Columns

In [21]:
df2 = df.select(*['Bank Name', 'City'])
df2.show()


+--------------------+----------------+
|           Bank Name|            City|
+--------------------+----------------+
| First Bank of Idaho|         Ketchum|
|Amcore Bank, Nati...|        Rockford|
|        Venture Bank|           Lacey|
|First State Bank ...|           Altus|
|Valley Capital Ba...|            Mesa|
|Michigan Heritage...|Farmington Hills|
|Columbia Savings ...|      Cincinnati|
|       Fidelity Bank|        Dearborn|
|The Park Avenue Bank|        Valdosta|
|Western Commercia...|  Woodland Hills|
|        Syringa Bank|           Boise|
|Republic Federal ...|           Miami|
|Westside Communit...|University Place|
|   First United Bank|           Crete|
|HarVest Bank of M...|    Gaithersburg|
|            BankEast|       Knoxville|
|    Polk County Bank|        Johnston|
|Colorado Capital ...|     Castle Rock|
|         Access Bank|        Champlin|
|Pacific National ...|   San Francisco|
+--------------------+----------------+
only showing top 20 rows



## Select Multiple Columns

Selecionando todas as colunas, exceto a 'CERT' e 'ST'

In [22]:
col_1 = list(set(df.columns) - {'CERT', 'ST'})
df2 = df.select(*col_1)
df2.show(5)


+---------------------+--------+--------------------+------------+
|Acquiring Institution|    City|           Bank Name|Closing Date|
+---------------------+--------+--------------------+------------+
|      U.S. Bank, N.A.| Ketchum| First Bank of Idaho|   24-Apr-09|
|          Harris N.A.|Rockford|Amcore Bank, Nati...|   23-Apr-10|
| First-Citizens Ba...|   Lacey|        Venture Bank|   11-Sep-09|
|         Herring Bank|   Altus|First State Bank ...|   31-Jul-09|
| Enterprise Bank &...|    Mesa|Valley Capital Ba...|   11-Dec-09|
+---------------------+--------+--------------------+------------+
only showing top 5 rows



## Rename Columns

In [24]:
df2 = df \
    .withColumnRenamed('Bank Name', 'bank_name')\
    .withColumnRenamed('City', 'city')\
    .withColumnRenamed('Acquiring Institution', 'acq_institution')\
    .withColumnRenamed('Closing Date', 'closing_date')\
    .withColumnRenamed('ST', 'state')\
    .withColumnRenamed('CERT','cert') #\

df2.show(5)


+--------------------+--------+-----+-----+--------------------+------------+
|           bank_name|    city|state| cert|     acq_institution|closing_date|
+--------------------+--------+-----+-----+--------------------+------------+
| First Bank of Idaho| Ketchum|   ID|34396|     U.S. Bank, N.A.|   24-Apr-09|
|Amcore Bank, Nati...|Rockford|   IL| 3735|         Harris N.A.|   23-Apr-10|
|        Venture Bank|   Lacey|   WA|22868|First-Citizens Ba...|   11-Sep-09|
|First State Bank ...|   Altus|   OK| 9873|        Herring Bank|   31-Jul-09|
|Valley Capital Ba...|    Mesa|   AZ|58399|Enterprise Bank &...|   11-Dec-09|
+--------------------+--------+-----+-----+--------------------+------------+
only showing top 5 rows

