# Dataframe executando consulta SQL

## Instalação dos componentes relacionados ao Spark

In [1]:
# Install the headless OpenJDK 8 package quietly (-qq, stdout discarded).
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Download the Spark 2.4.4 / Hadoop 2.7 tarball from the Apache archive and
# unpack it into the current working directory.
!wget -q https://archive.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz
!tar xf spark-2.4.4-bin-hadoop2.7.tgz
# findspark is the helper package imported by the setup cell that follows.
!pip install -q findspark

In [2]:
import os
import findspark

# Absolute locations of the JDK and of the unpacked Spark distribution.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.4-bin-hadoop2.7"
# Pass findspark the same absolute path stored in SPARK_HOME instead of a bare
# directory name, so initialisation does not depend on the kernel's current
# working directory.
findspark.init(os.environ["SPARK_HOME"])

## Importação das Bibliotecas

In [3]:
import pyspark
from pyspark import SparkContext as sc
from pyspark.sql import SparkSession

## Desenvolvimento

### Criando uma SparkSession e lendo o arquivo CSV

In [4]:
# Obtain the session used by every later cell: reuse a running one if it
# exists, otherwise start a new one registered under the application name 'SQL'.
spark1 = (
    SparkSession.builder
    .appName('SQL')
    .getOrCreate()
)

In [5]:
# Location of the stock-price CSV on the mounted Drive folder.
PATH = '/content/drive/MyDrive/Colab Notebooks/Estudos/Spark/base/appl_stock.csv'

# Treat the first line as column names and let Spark infer each column's type.
stock_reader = spark1.read.options(header=True, inferSchema=True)
df = stock_reader.csv(PATH)

# Print the inferred schema to check the column types.
df.printSchema()

root
 |-- Date: timestamp (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Adj Close: double (nullable = true)



### Criando uma nova visualização

In [6]:
# Register df as the temporary view 'stock' (replacing any view with the same
# name) so it can be referenced from SQL statements.
df.createOrReplaceTempView('stock')

### Executando um SQL simples sobre essa nova visualização

In [7]:
# Fetch the first five rows of the 'stock' view through plain SQL.
first_rows_query = "SELECT * FROM stock LIMIT 5"
result = spark1.sql(first_rows_query)

# A bare DataFrame displays only its column names and types, not its rows.
result

DataFrame[Date: timestamp, Open: double, High: double, Low: double, Close: double, Volume: int, Adj Close: double]

In [8]:
# List the column names of the query result.
result.columns

['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close']

In [9]:
# Print the rows of the query result as a text table.
result.show()

+-------------------+----------+----------+------------------+------------------+---------+------------------+
|               Date|      Open|      High|               Low|             Close|   Volume|         Adj Close|
+-------------------+----------+----------+------------------+------------------+---------+------------------+
|2010-01-04 00:00:00|213.429998|214.499996|212.38000099999996|        214.009998|123432400|         27.727039|
|2010-01-05 00:00:00|214.599998|215.589994|        213.249994|        214.379993|150476200|27.774976000000002|
|2010-01-06 00:00:00|214.379993|    215.23|        210.750004|        210.969995|138040000|27.333178000000004|
|2010-01-07 00:00:00|    211.75|212.000006|        209.050005|            210.58|119282800|          27.28265|
|2010-01-08 00:00:00|210.299994|212.000006|209.06000500000002|211.98000499999998|111902700|         27.464034|
+-------------------+----------+----------+------------------+------------------+---------+------------------+



### Resolvendo queries mais complexas

In [10]:
# Count how many rows of the 'stock' view have a 'Close' value above 500.
close_above_500_query = (
    'SELECT COUNT(Close) FROM stock '
    'WHERE Close > 500'
)

count_greater_500 = spark1.sql(close_above_500_query)
count_greater_500.show()

+------------+
|count(Close)|
+------------+
|         403|
+------------+



In [11]:
# Average of the 'Open' column over the rows whose 'Volume' is either above
# 120 million or below 110 million.
open_avg_query = (
    'SELECT AVG(Open) FROM stock '
    'WHERE Volume > 120000000 OR Volume < 110000000'
)

avg_1 = spark1.sql(open_avg_query)
avg_1.show()

+------------------+
|         avg(Open)|
+------------------+
|309.12406365290224|
+------------------+



### Lendo um arquivo utilizando SQL

In [12]:
# Query the CSV file in place with the csv.`<path>` form instead of loading it
# through spark1.read first. Read this way, the header line comes back as an
# ordinary data row and the columns get the default names _c0, _c1, _c2.
try:
  df_sales = spark1.sql("SELECT * FROM csv.`/content/drive/MyDrive/Colab Notebooks/Estudos/Spark/base/sales_info.csv`")
  df_sales.show()
except Exception as err:
  # Keep the cell non-fatal, but report what actually went wrong instead of
  # hiding it; catching Exception (not a bare except) lets KeyboardInterrupt
  # and SystemExit propagate normally.
  print('Error ao executar a leitura do arquivo.')
  print(f'{type(err).__name__}: {err}')

+-------+-------+-----+
|    _c0|    _c1|  _c2|
+-------+-------+-----+
|Company| Person|Sales|
|   GOOG|    Sam|  200|
|   GOOG|Charlie|  120|
|   GOOG|  Frank|  340|
|   MSFT|   Tina|  600|
|   MSFT|    Amy|  124|
|   MSFT|Vanessa|  243|
|     FB|   Carl|  870|
|     FB|  Sarah|  350|
|   APPL|   John|  250|
|   APPL|  Linda|  130|
|   APPL|   Mike|  750|
|   APPL|  Chris|  350|
+-------+-------+-----+

