## Instalação do PySpark

Instalando o Java 8

In [None]:
# Install the headless OpenJDK 8 package; -qq keeps apt quiet and stdout is discarded.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

KeyboardInterrupt: ignored

Baixando o Spark

In [None]:
!wget -q https://archive.apache.org/dist/spark/spark-3.1.2/spark-3.1.2-bin-hadoop2.7.tgz

Descompactando o Spark

In [None]:
# Extract the downloaded Spark tarball into the current working directory.
!tar xf spark-3.1.2-bin-hadoop2.7.tgz

Instalando o findspark

In [None]:
# Install findspark quietly (-q); it is imported further down to initialise Spark.
!pip install -q findspark

Instalando o pyspark


In [None]:
!pip install -q pyspark

Definindo as variáveis de ambiente

In [None]:
import os

# Point the process environment at the Java 8 install and the extracted Spark folder.
os.environ.update({
    'JAVA_HOME': '/usr/lib/jvm/java-8-openjdk-amd64',
    'SPARK_HOME': '/content/spark-3.1.2-bin-hadoop2.7',
})

Iniciando o spark

In [None]:
import findspark

# Called with no arguments; presumably it locates Spark through the SPARK_HOME
# variable set earlier in the notebook — TODO confirm against the findspark docs.
findspark.init()

## Data Frame

Criando uma sessão


In [None]:
from pyspark.sql import SparkSession

# Build (or reuse) a local Spark session named 'Dataframes com spark';
# 'local[*]' runs Spark on this machine using all available cores.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('Dataframes com spark')
    .getOrCreate()
)

Criando data frame

In [None]:
# Sample rows as (name, age) tuples; note that the ages are stored as strings.
dados = [
    ('Bruna', '19'),
    ('Sara', '20'),
    ('Julie', '21'),
    ('Raquel', '20'),
]

# Column names, in the same order as the tuple fields.
colunas = ['Nome', 'Idade']

df1 = spark.createDataFrame(data=dados, schema=colunas)

Exibindo os dados em colunas

In [None]:
# Print df1 as a plain-text table.
df1.show()

+------+-----+
|  Nome|Idade|
+------+-----+
| Bruna|   19|
|  Sara|   20|
| Julie|   21|
|Raquel|   20|
+------+-----+



In [None]:
# Keep only 2 rows, then convert them to a pandas DataFrame so the notebook renders it.
df1.limit(2).toPandas()

Unnamed: 0,Nome,Idade
0,Bruna,19
1,Sara,20


In [None]:
from pyspark.sql import Row

# One-row DataFrame: the Row's keyword arguments become the columns.
df2 = spark.createDataFrame(data=[
    Row(descricao="Monitor", preco=800, fabricante='Samsung'),
])

In [None]:
# Print df2 as a plain-text table.
df2.show()

+---------+-----+----------+
|descricao|preco|fabricante|
+---------+-----+----------+
|  Monitor|  800|   Samsung|
+---------+-----+----------+



In [None]:
from pyspark.sql import Row

# One-row DataFrame with the same three fields (descricao, preco, fabricante) as df2.
df3 = spark.createDataFrame(data=[
    Row(descricao="Ryzen 7", preco=1800, fabricante='AMD'),
])

In [None]:
# Print df3 as a plain-text table.
df3.show()

+---------+-----+----------+
|descricao|preco|fabricante|
+---------+-----+----------+
|  Ryzen 7| 1800|       AMD|
+---------+-----+----------+



In [None]:
# Append df3's rows to df2's. Columns are matched by position (not by name)
# and duplicate rows are kept, as with SQL UNION ALL.
df_final = df2.union(df3)

In [None]:
# Print the combined DataFrame as a plain-text table.
df_final.show()

+---------+-----+----------+
|descricao|preco|fabricante|
+---------+-----+----------+
|  Monitor|  800|   Samsung|
|  Ryzen 7| 1800|       AMD|
+---------+-----+----------+



In [None]:
# Print the column names, types and nullability of df_final as a tree.
df_final.printSchema()

root
 |-- descricao: string (nullable = true)
 |-- preco: long (nullable = true)
 |-- fabricante: string (nullable = true)



Outro exemplo: criar `df_alunos` com as colunas `matricula`, `nome`, `AV1`, `AV2` e `AV3`.

In [None]:
from pyspark.sql import Row

# One single-row DataFrame per student: registration number, name and three grades.
df_aluno1 = spark.createDataFrame(data=[
    Row(matricula=1234, nome='Bruna', AV1=10.0, AV2=10.0, AV3=10.0),
])

df_aluno2 = spark.createDataFrame(data=[
    Row(matricula=4567, nome='Sara', AV1=9.0, AV2=9.0, AV3=9.0),
])

df_aluno3 = spark.createDataFrame(data=[
    Row(matricula=8910, nome='Julie', AV1=8.0, AV2=8.0, AV3=8.0),
])

In [None]:
# Combine the three single-student frames into one; columns are matched by position.
df_alunos = df_aluno1.union(df_aluno2).union(df_aluno3)

In [None]:
# Print the combined students DataFrame as a plain-text table.
df_alunos.show()

+---------+-----+----+----+----+
|matricula| nome| AV1| AV2| AV3|
+---------+-----+----+----+----+
|     1234|Bruna|10.0|10.0|10.0|
|     4567| Sara| 9.0| 9.0| 9.0|
|     8910|Julie| 8.0| 8.0| 8.0|
+---------+-----+----+----+----+



In [None]:
# Print the column names, types and nullability of df_alunos as a tree.
df_alunos.printSchema()

root
 |-- matricula: long (nullable = true)
 |-- nome: string (nullable = true)
 |-- AV1: double (nullable = true)
 |-- AV2: double (nullable = true)
 |-- AV3: double (nullable = true)



## Lendo dados de arquivos


In [None]:
# Location of the CSV file (paste the copied file path here)
path = '/content/sample_data/california_housing_test.csv'

# Comma-separated file whose first line is a header; column types are inferred from the data.
house_df = (
    spark.read
    .option('sep', ',')
    .option('inferSchema', True)
    .option('header', True)
    .csv(path)
)

In [None]:
# Print house_df as a plain-text table.
house_df.show()

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+
|  -122.05|   37.37|              27.0|     3885.0|         661.0|    1537.0|     606.0|       6.6085|          344700.0|
|   -118.3|   34.26|              43.0|     1510.0|         310.0|     809.0|     277.0|        3.599|          176500.0|
|  -117.81|   33.78|              27.0|     3589.0|         507.0|    1484.0|     495.0|       5.7934|          270500.0|
|  -118.36|   33.82|              28.0|       67.0|          15.0|      49.0|      11.0|       6.1359|          330000.0|
|  -119.67|   36.33|              19.0|     1241.0|         244.0|     850.0|     237.0|       2.9375|           81700.0|
|  -119.56|   36.51|    

In [None]:
# Take up to 3000 rows and convert them to a pandas DataFrame for display.
# toPandas() collects those rows into the driver's memory, so keep the limit
# modest on larger datasets.
house_df.limit(3000).toPandas()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.05,37.37,27.0,3885.0,661.0,1537.0,606.0,6.6085,344700.0
1,-118.30,34.26,43.0,1510.0,310.0,809.0,277.0,3.5990,176500.0
2,-117.81,33.78,27.0,3589.0,507.0,1484.0,495.0,5.7934,270500.0
3,-118.36,33.82,28.0,67.0,15.0,49.0,11.0,6.1359,330000.0
4,-119.67,36.33,19.0,1241.0,244.0,850.0,237.0,2.9375,81700.0
...,...,...,...,...,...,...,...,...,...
2995,-119.86,34.42,23.0,1450.0,642.0,1258.0,607.0,1.1790,225000.0
2996,-118.14,34.06,27.0,5257.0,1082.0,3496.0,1036.0,3.3906,237200.0
2997,-119.70,36.30,10.0,956.0,201.0,693.0,220.0,2.2895,62000.0
2998,-117.12,34.10,40.0,96.0,14.0,46.0,14.0,3.2708,162500.0


In [None]:
# Rename the columns: toDF assigns the new names positionally, one per existing column.
house_df = house_df.toDF(
    'longitude',
    'latitude',
    'mediana_idades',
    'total_comodos',
    'total_quartos',
    'populacao',
    'familias',
    'mediana_renda_familias',
    'mediana_valores',
)

In [None]:
# Print house_df again, now with the renamed columns, as a plain-text table.
house_df.show()

+---------+--------+--------------+-------------+-------------+---------+--------+----------------------+---------------+
|longitude|latitude|mediana_idades|total_comodos|total_quartos|populacao|familias|mediana_renda_familias|mediana_valores|
+---------+--------+--------------+-------------+-------------+---------+--------+----------------------+---------------+
|  -122.05|   37.37|          27.0|       3885.0|        661.0|   1537.0|   606.0|                6.6085|       344700.0|
|   -118.3|   34.26|          43.0|       1510.0|        310.0|    809.0|   277.0|                 3.599|       176500.0|
|  -117.81|   33.78|          27.0|       3589.0|        507.0|   1484.0|   495.0|                5.7934|       270500.0|
|  -118.36|   33.82|          28.0|         67.0|         15.0|     49.0|    11.0|                6.1359|       330000.0|
|  -119.67|   36.33|          19.0|       1241.0|        244.0|    850.0|   237.0|                2.9375|        81700.0|
|  -119.56|   36.51|    

In [None]:
# Print the column names, types and nullability of house_df as a tree.
house_df.printSchema()

root
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- mediana_idades: double (nullable = true)
 |-- total_comodos: double (nullable = true)
 |-- total_quartos: double (nullable = true)
 |-- populacao: double (nullable = true)
 |-- familias: double (nullable = true)
 |-- mediana_renda_familias: double (nullable = true)
 |-- mediana_valores: double (nullable = true)



Consultas


In [None]:
# Total number of rows
house_df.count()

3000

In [None]:
# Show only the mediana_idades, total_quartos and mediana_valores columns
house_df.select(
    'mediana_idades',
    'total_quartos',
    'mediana_valores',
).show()

+--------------+-------------+---------------+
|mediana_idades|total_quartos|mediana_valores|
+--------------+-------------+---------------+
|          27.0|        661.0|       344700.0|
|          43.0|        310.0|       176500.0|
|          27.0|        507.0|       270500.0|
|          28.0|         15.0|       330000.0|
|          19.0|        244.0|        81700.0|
|          37.0|        213.0|        67000.0|
|          43.0|        225.0|        67000.0|
|          19.0|        471.0|       166900.0|
|          15.0|        617.0|       194400.0|
|          31.0|        632.0|       164200.0|
|          45.0|        249.0|       125000.0|
|          37.0|        166.0|        58300.0|
|          36.0|        182.0|       252600.0|
|          16.0|        694.0|       231200.0|
|          27.0|        325.0|       222500.0|
|          42.0|         40.0|       153100.0|
|          15.0|        123.0|       181300.0|
|          26.0|        607.0|       137500.0|
|          26

In [None]:
# Show all columns for the rows where total_quartos is greater than 5000
(
    house_df
    .select('*')
    .filter('total_quartos > 5000')
    .show()
)

+---------+--------+--------------+-------------+-------------+---------+--------+----------------------+---------------+
|longitude|latitude|mediana_idades|total_comodos|total_quartos|populacao|familias|mediana_renda_familias|mediana_valores|
+---------+--------+--------------+-------------+-------------+---------+--------+----------------------+---------------+
|  -121.53|   38.48|           5.0|      27870.0|       5027.0|  11935.0|  4855.0|                4.8811|       212200.0|
|  -118.44|   33.98|          21.0|      18132.0|       5419.0|   7431.0|  4930.0|                5.3359|       500001.0|
|   -117.2|   33.58|           2.0|      30450.0|       5033.0|   9419.0|  3197.0|                4.5936|       174300.0|
+---------+--------+--------------+-------------+-------------+---------+--------+----------------------+---------------+

