# Estudos relacionados ao DataFrame utilizando o Apache Spark.

### Instalação dos componentes relacionados ao Spark

In [4]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz
!tar xf spark-2.4.4-bin-hadoop2.7.tgz
!pip install -q findspark

In [5]:
import os
import findspark

# Point PySpark at the JDK and at the unpacked Spark distribution.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.4-bin-hadoop2.7"
# Initialise findspark from the absolute SPARK_HOME set above instead of a
# path relative to the current working directory: the cell keeps working
# after a change of directory and the two locations cannot drift apart.
findspark.init(os.environ["SPARK_HOME"])

## Importação das Bibliotecas

In [6]:
import pyspark
from pyspark import SparkContext as sc
from pyspark.sql import SparkSession

## Introdução ao DataFrame

### Criando uma sessão com SparkSession

In [7]:
# Entry point to the DataFrame API; the application name is set to 'Basics'.
spark1 = (
    SparkSession.builder
    .appName('Basics')
    .getOrCreate()
)

### Leitura de um arquivo JSON 

In [8]:
# Load the sample people file into a DataFrame and print its rows.
# NOTE(review): the path assumes Google Drive is already mounted at
# /content/drive; no mount step is visible in this notebook -- confirm.
PATH = '/content/drive/MyDrive/Colab Notebooks/Estudos/Spark/base/people.json'
df = spark1.read.json(path=PATH)
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



### Esquema dos dados

In [9]:
# The schema is analogous to a SQL table definition: each column has a type
# and may be nullable.

df.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



### Listagem das colunas do arquivo JSON

In [10]:
# Column names of the DataFrame, returned as a plain Python list of strings.
df.columns

['age', 'name']

### Descrição estatística do conteúdo

In [11]:
# Basic statistics (count, mean, stddev, min, max) for every column.
df.describe().show()

+-------+------------------+-------+
|summary|               age|   name|
+-------+------------------+-------+
|  count|                 2|      3|
|   mean|              24.5|   null|
| stddev|7.7781745930520225|   null|
|    min|                19|   Andy|
|    max|                30|Michael|
+-------+------------------+-------+



In [12]:
# Same statistics as describe(), plus the 25% / 50% / 75% percentiles.
df.summary().show()

+-------+------------------+-------+
|summary|               age|   name|
+-------+------------------+-------+
|  count|                 2|      3|
|   mean|              24.5|   null|
| stddev|7.7781745930520225|   null|
|    min|                19|   Andy|
|    25%|                19|   null|
|    50%|                19|   null|
|    75%|                30|   null|
|    max|                30|Michael|
+-------+------------------+-------+



### Métodos Take e Collect

In [13]:
# First two rows, returned as a list of Row objects.
df.take(2)

[Row(age=None, name='Michael'), Row(age=30, name='Andy')]

In [14]:
# Every row of the DataFrame as a list of Row objects; fine here because the
# file holds only three records.
df.collect()

[Row(age=None, name='Michael'),
 Row(age=30, name='Andy'),
 Row(age=19, name='Justin')]

## Operações Básicas

### Obtendo a sessão Spark para a leitura de um arquivo CSV

In [15]:
# Session handle used for the CSV examples below.
# NOTE(review): getOrCreate() returns the already-active session when one
# exists, so after the 'Basics' session has been created this most likely
# refers to that same session rather than a brand-new one -- confirm whether
# an isolated session was intended.
spark2 = (
    SparkSession.builder
    .appName('Ops')
    .getOrCreate()
)

In [16]:
# Read the stock-price CSV: the first line supplies the column names and the
# column types are inferred from the data. `df` is rebound to this DataFrame.
PATH = '/content/drive/MyDrive/Colab Notebooks/Estudos/Spark/base/appl_stock.csv'
df = spark2.read.csv(path=PATH, header=True, inferSchema=True)
df.printSchema()

root
 |-- Date: timestamp (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Adj Close: double (nullable = true)



In [17]:
# Print the leading rows of the stock DataFrame as a text table.
df.show()

+-------------------+------------------+------------------+------------------+------------------+---------+------------------+
|               Date|              Open|              High|               Low|             Close|   Volume|         Adj Close|
+-------------------+------------------+------------------+------------------+------------------+---------+------------------+
|2010-01-04 00:00:00|        213.429998|        214.499996|212.38000099999996|        214.009998|123432400|         27.727039|
|2010-01-05 00:00:00|        214.599998|        215.589994|        213.249994|        214.379993|150476200|27.774976000000002|
|2010-01-06 00:00:00|        214.379993|            215.23|        210.750004|        210.969995|138040000|27.333178000000004|
|2010-01-07 00:00:00|            211.75|        212.000006|        209.050005|            210.58|119282800|          27.28265|
|2010-01-08 00:00:00|        210.299994|        212.000006|209.06000500000002|211.98000499999998|111902700|    

### Listando as Colunas

In [18]:
# Column names of the stock DataFrame as a Python list.
df.columns

['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close']

### Tipos de Colunas

In [19]:
# Indexing a DataFrame by column name yields a Column object, not the values.
type(df['High'])

pyspark.sql.column.Column

In [20]:
# head(2) returns a list of rows; each element of that list is a Row.
type(df.head(2)[0])

pyspark.sql.types.Row

### Selecionando as Colunas

In [21]:
# select() returns a new DataFrame restricted to the requested column; the
# cell displays only its description because show() is not called.
df.select('High')

DataFrame[High: double]

In [22]:
# Same selection, this time printing the column values with show().
df.select('High').show()

+------------------+
|              High|
+------------------+
|        214.499996|
|        215.589994|
|            215.23|
|        212.000006|
|        212.000006|
|        213.000002|
|209.76999500000002|
|210.92999500000002|
|210.45999700000002|
|211.59999700000003|
|215.18999900000003|
|        215.549994|
|213.30999599999998|
|        207.499996|
|        204.699999|
|        213.710005|
|            210.58|
|        205.500004|
|        202.199995|
|             196.0|
+------------------+
only showing top 20 rows



In [23]:
# Project two columns at once and print them.
df.select('High', 'Close').show()

+------------------+------------------+
|              High|             Close|
+------------------+------------------+
|        214.499996|        214.009998|
|        215.589994|        214.379993|
|            215.23|        210.969995|
|        212.000006|            210.58|
|        212.000006|211.98000499999998|
|        213.000002|210.11000299999998|
|209.76999500000002|        207.720001|
|210.92999500000002|        210.650002|
|210.45999700000002|            209.43|
|211.59999700000003|            205.93|
|215.18999900000003|        215.039995|
|        215.549994|            211.73|
|213.30999599999998|        208.069996|
|        207.499996|            197.75|
|        204.699999|        203.070002|
|        213.710005|        205.940001|
|            210.58|        207.880005|
|        205.500004|        199.289995|
|        202.199995|        192.060003|
|             196.0|        194.729998|
+------------------+------------------+
only showing top 20 rows



### Métodos asDict e head

In [24]:
# First two rows of the stock data as a list of Row objects.
df.head(2)

[Row(Date=datetime.datetime(2010, 1, 4, 0, 0), Open=213.429998, High=214.499996, Low=212.38000099999996, Close=214.009998, Volume=123432400, Adj Close=27.727039),
 Row(Date=datetime.datetime(2010, 1, 5, 0, 0), Open=214.599998, High=215.589994, Low=213.249994, Close=214.379993, Volume=150476200, Adj Close=27.774976000000002)]

In [25]:
# Convert the first Row into a plain dict keyed by column name, then display it.
dict1 = df.head(2)[0].asDict()
dict1

{'Adj Close': 27.727039,
 'Close': 214.009998,
 'Date': datetime.datetime(2010, 1, 4, 0, 0),
 'High': 214.499996,
 'Low': 212.38000099999996,
 'Open': 213.429998,
 'Volume': 123432400}

### Método Count

In [26]:
# Number of rows in the stock DataFrame.
df.count()

1762

### Nova importação de Base


In [27]:
# Read the sales CSV (header row supplies column names, types are inferred)
# into `df2` and print its rows.
PATH = '/content/drive/MyDrive/Colab Notebooks/Estudos/Spark/base/sales_info.csv'
df2 = spark2.read.csv(path=PATH, header=True, inferSchema=True)
df2.show()

+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
|   GOOG|    Sam|200.0|
|   GOOG|Charlie|120.0|
|   GOOG|  Frank|340.0|
|   MSFT|   Tina|600.0|
|   MSFT|    Amy|124.0|
|   MSFT|Vanessa|243.0|
|     FB|   Carl|870.0|
|     FB|  Sarah|350.0|
|   APPL|   John|250.0|
|   APPL|  Linda|130.0|
|   APPL|   Mike|750.0|
|   APPL|  Chris|350.0|
+-------+-------+-----+



### Selecionando apenas as 'Company' distintas e contando

In [29]:
# Print each distinct value of the Company column once.
(
    df2
    .select('Company')
    .distinct()
    .show()
)

+-------+
|Company|
+-------+
|   APPL|
|   GOOG|
|     FB|
|   MSFT|
+-------+



In [30]:
# Number of distinct companies in the sales data.
(
    df2
    .select('Company')
    .distinct()
    .count()
)

4

### Utilizando o método sample para capturar amostras aleatórias

In [31]:
# Draw a random sample of roughly 0.5% of the stock rows, without
# replacement; the seed is fixed so the sample is repeatable.
df.sample(
    withReplacement=False,
    fraction=0.005,
    seed=101,
).show()

+-------------------+------------------+------------------+------------------+----------+---------+------------------+
|               Date|              Open|              High|               Low|     Close|   Volume|         Adj Close|
+-------------------+------------------+------------------+------------------+----------+---------+------------------+
|2010-03-19 00:00:00|224.79000499999998|        225.240002|221.23000299999998|    222.25|139861400|          28.79461|
|2010-04-09 00:00:00|        241.430012|        241.889996|240.46000299999997|241.789993| 83545700|31.326203000000003|
|2010-06-07 00:00:00|        258.289997|         259.14999|        250.550007|250.940002|221735500|         32.511674|
|2011-07-19 00:00:00|             378.0|378.65000200000003|            373.32|376.849987|204786400|48.824515000000005|
|2012-07-24 00:00:00|         607.37999|        609.680016|        598.509987|600.919975|141283100|         77.854922|
|2013-09-04 00:00:00|        499.560005|        

### Adicionando uma nova coluna ao DataFrame 
Adiciona a coluna `Range` com a amplitude diária da ação: o valor em alta (`High`) menos o valor em baixa (`Low`).

In [32]:
# Derive a 'Range' column (High minus Low) and print it beside its inputs for
# the first five rows. withColumn() returns a new DataFrame, so `df` itself
# is left unchanged.
(
    df
    .withColumn('Range', df['High'] - df['Low'])
    .limit(5)
    .select('High', 'Low', 'Range')
    .show()
)

+----------+------------------+------------------+
|      High|               Low|             Range|
+----------+------------------+------------------+
|214.499996|212.38000099999996|2.1199950000000456|
|215.589994|        213.249994|2.3400000000000034|
|    215.23|        210.750004|          4.479996|
|212.000006|        209.050005|2.9500010000000145|
|212.000006|209.06000500000002| 2.940000999999995|
+----------+------------------+------------------+

