# Estudos relacionados ao DataFrame utilizando o Apache Spark.

### Instalação dos componentes relacionados ao Spark

In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz
!tar xf spark-2.4.4-bin-hadoop2.7.tgz
!pip install -q findspark

In [2]:
import os
import findspark

# Tell Java/Spark tooling where the JDK and the unpacked Spark distribution
# from the install cell live.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.4-bin-hadoop2.7"
# Hand findspark the same absolute path instead of a relative directory name:
# a relative path only resolves while the working directory is /content, and
# it would disagree with the SPARK_HOME value set on the line above.
findspark.init(os.environ["SPARK_HOME"])

## Importação das Bibliotecas

In [3]:
import pyspark
from pyspark import SparkContext as sc
from pyspark.sql import SparkSession

## Introdução ao DataFrame

### Criando uma sessão com SparkSession

In [4]:
# Get (or start) the Spark session used by the DataFrame introduction,
# registered under the application name 'Basics'.
spark1 = (
    SparkSession.builder
    .appName('Basics')
    .getOrCreate()
)

### Leitura de um arquivo JSON 

In [5]:
# Location of the sample JSON file on the mounted Google Drive.
# NOTE(review): assumes Drive is already mounted at /content/drive; no mount
# step is visible in this notebook — confirm.
PATH = '/content/drive/MyDrive/Colab Notebooks/Estudos/Spark/base/people.json'

# Read the file through the generic reader API with the JSON format,
# then print the resulting rows as a text table.
df = spark1.read.format('json').load(PATH)
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



### Esquema dos dados

In [6]:
# The schema is analogous to a SQL table definition: every column has a type
# and a flag saying whether it may contain null values.

df.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



### Listagem das colunas do arquivo JSON

In [7]:
# List the column names of the DataFrame as a plain Python list of strings.
df.columns

['age', 'name']

### Descrição Estatística do conteúdo

In [8]:
# Basic statistics per column (count, mean, stddev, min, max), shown as a table.
df.describe().show()

+-------+------------------+-------+
|summary|               age|   name|
+-------+------------------+-------+
|  count|                 2|      3|
|   mean|              24.5|   null|
| stddev|7.7781745930520225|   null|
|    min|                19|   Andy|
|    max|                30|Michael|
+-------+------------------+-------+



In [9]:
# Same statistics as describe(), plus the 25%, 50% and 75% percentiles.
df.summary().show()

+-------+------------------+-------+
|summary|               age|   name|
+-------+------------------+-------+
|  count|                 2|      3|
|   mean|              24.5|   null|
| stddev|7.7781745930520225|   null|
|    min|                19|   Andy|
|    25%|                19|   null|
|    50%|                19|   null|
|    75%|                30|   null|
|    max|                30|Michael|
+-------+------------------+-------+



### Métodos Take e Collect

In [10]:
# Fetch the first 2 rows as a Python list of Row objects.
df.take(2)

[Row(age=None, name='Michael'), Row(age=30, name='Andy')]

In [11]:
# Fetch every row of the DataFrame as a Python list of Row objects
# (fine for this 3-row sample; avoid on large data).
df.collect()

[Row(age=None, name='Michael'),
 Row(age=30, name='Andy'),
 Row(age=19, name='Justin')]

## Operações Básicas

### Criando uma nova sessão para a leitura de um arquivo CSV

In [12]:
# NOTE(review): getOrCreate() is documented to return an existing session when
# one is available. spark1 was created earlier and is not stopped in the
# visible cells, so spark2 is expected to be that same session rather than a
# new one named 'Ops' — confirm with `spark2 is spark1`.
spark2 = SparkSession.builder.appName('Ops').getOrCreate()

In [13]:
# Location of the Apple stock CSV on the mounted Google Drive.
# NOTE(review): PATH and df are rebound here, replacing the people.json values
# from the earlier section.
PATH = '/content/drive/MyDrive/Colab Notebooks/Estudos/Spark/base/appl_stock.csv'

# Read the CSV through the generic reader API: the first line is the header
# and column types are inferred from the data instead of defaulting to string.
df = (
    spark2.read
    .format('csv')
    .options(inferSchema=True, header=True)
    .load(PATH)
)
df.printSchema()

root
 |-- Date: timestamp (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Adj Close: double (nullable = true)



In [14]:
# Print the first rows of the stock DataFrame as a text table.
df.show()

+-------------------+------------------+------------------+------------------+------------------+---------+------------------+
|               Date|              Open|              High|               Low|             Close|   Volume|         Adj Close|
+-------------------+------------------+------------------+------------------+------------------+---------+------------------+
|2010-01-04 00:00:00|        213.429998|        214.499996|212.38000099999996|        214.009998|123432400|         27.727039|
|2010-01-05 00:00:00|        214.599998|        215.589994|        213.249994|        214.379993|150476200|27.774976000000002|
|2010-01-06 00:00:00|        214.379993|            215.23|        210.750004|        210.969995|138040000|27.333178000000004|
|2010-01-07 00:00:00|            211.75|        212.000006|        209.050005|            210.58|119282800|          27.28265|
|2010-01-08 00:00:00|        210.299994|        212.000006|209.06000500000002|211.98000499999998|111902700|    

### Listando as Colunas

In [15]:
# Column names of the stock DataFrame as a Python list of strings.
df.columns

['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close']

### Tipos de Colunas

In [16]:
# Indexing a DataFrame by column name yields a Column object, not the data.
type(df['High'])

pyspark.sql.column.Column

In [17]:
# head(2) returns a list of rows; each element is a Row object.
type(df.head(2)[0])

pyspark.sql.types.Row

### Selecionando as Colunas

In [18]:
# select() returns a new DataFrame with only the requested column; without
# .show() the cell displays just its schema summary, not the rows.
df.select('High')

DataFrame[High: double]

In [19]:
# Select the 'High' column and print its first 20 rows.
df.select('High').show()

+------------------+
|              High|
+------------------+
|        214.499996|
|        215.589994|
|            215.23|
|        212.000006|
|        212.000006|
|        213.000002|
|209.76999500000002|
|210.92999500000002|
|210.45999700000002|
|211.59999700000003|
|215.18999900000003|
|        215.549994|
|213.30999599999998|
|        207.499996|
|        204.699999|
|        213.710005|
|            210.58|
|        205.500004|
|        202.199995|
|             196.0|
+------------------+
only showing top 20 rows



In [20]:
# Project the 'High' and 'Close' columns (passed as separate arguments)
# and print the first 20 rows.
df.select('High', 'Close').show()

+------------------+------------------+
|              High|             Close|
+------------------+------------------+
|        214.499996|        214.009998|
|        215.589994|        214.379993|
|            215.23|        210.969995|
|        212.000006|            210.58|
|        212.000006|211.98000499999998|
|        213.000002|210.11000299999998|
|209.76999500000002|        207.720001|
|210.92999500000002|        210.650002|
|210.45999700000002|            209.43|
|211.59999700000003|            205.93|
|215.18999900000003|        215.039995|
|        215.549994|            211.73|
|213.30999599999998|        208.069996|
|        207.499996|            197.75|
|        204.699999|        203.070002|
|        213.710005|        205.940001|
|            210.58|        207.880005|
|        205.500004|        199.289995|
|        202.199995|        192.060003|
|             196.0|        194.729998|
+------------------+------------------+
only showing top 20 rows



### Métodos head e asDict

In [21]:
# Fetch the first 2 rows as a Python list of Row objects.
df.head(2)

[Row(Date=datetime.datetime(2010, 1, 4, 0, 0), Open=213.429998, High=214.499996, Low=212.38000099999996, Close=214.009998, Volume=123432400, Adj Close=27.727039),
 Row(Date=datetime.datetime(2010, 1, 5, 0, 0), Open=214.599998, High=215.589994, Low=213.249994, Close=214.379993, Volume=150476200, Adj Close=27.774976000000002)]

In [28]:
# Convert the first Row into a plain dict keyed by column name.
dict1 = df.head(2)[0].asDict()
# Bare last expression so the notebook displays the dict.
dict1

{'Adj Close': 27.727039,
 'Close': 214.009998,
 'Date': datetime.datetime(2010, 1, 4, 0, 0),
 'High': 214.499996,
 'Low': 212.38000099999996,
 'Open': 213.429998,
 'Volume': 123432400}

### Método Count

In [29]:
# Total number of rows in the stock DataFrame.
df.count()

1762