### Instalação do PySpark ###

In [None]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.1.2.tar.gz (212.4 MB)
[K     |████████████████████████████████| 212.4 MB 63 kB/s 
[?25hCollecting py4j==0.10.9
  Downloading py4j-0.10.9-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 61.6 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.1.2-py2.py3-none-any.whl size=212880768 sha256=d3754e5b019ac27080a5d50abc3a8a1ecab5ace8d4ba37dd60e15d59c544b323
  Stored in directory: /root/.cache/pip/wheels/a5/0a/c1/9561f6fecb759579a7d863dcd846daaa95f598744e71b02c77
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.1.2


### Importação dos pacotes de contexto e configuração do Spark ###

In [None]:
from pyspark.sql import SparkSession

### Montando o sistema de arquivos do seu Google Drive ###

In [None]:
# Mount the user's Google Drive into the Colab filesystem at /content/drive so the
# files stored there can be reached with ordinary paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
# Make the "Spark" folder on the mounted Drive the working directory, so the relative
# file names used by later cells (e.g. the CSV that is read) resolve inside it.
os.chdir("/content/drive/My Drive/Spark")

In [None]:
# List the working directory to confirm the expected data files are present.
!ls

 alunos.csv				       'Manipulação de carros.ipynb'
 auto-miles-per-gallon.csv		       'Manual PIG.pdf'
 drivers.csv				        Neo4j.pdf
'Ecosistema Hadoop.pdf'			        NoSQL.pdf
'Exemplo didático MapReduce.pdf'	       'O Ecossistema Spark.pdf'
'Exemplo regressao Spark.ipynb'		        pig_script.txt
'Hands on Hadoop.pdf'			        timesheet.csv
 hive.txt				        WordCount-Spark.ipynb
'Introdução ao processamento distribuído.pdf'   words.txt


### Configurando e criando a sessão do Spark (SparkSession) ###

In [None]:
# Create the Spark session used by every later cell (or reuse one that already exists).
# The application is named "Exemplo"; the "local[*]" master URL asks Spark to run
# locally, using as many worker threads as there are logical cores.
spark = (
    SparkSession.builder
    .appName("Exemplo")
    .master("local[*]")
    .getOrCreate()
)

### Carregando o CSV de carros em um dataframe do Spark ###

In [None]:
# Load the cars CSV from the working directory into a Spark DataFrame.
#   header=True      -> the first line of the file supplies the column names
#   inferSchema=True -> column types are inferred from the data instead of all being strings
# The other reader options (sep, encoding, comment) are intentionally not passed,
# so the reader's defaults apply.
df_carros = spark.read.csv(
    'auto-miles-per-gallon.csv',
    header=True,
    inferSchema=True,
)

### Imprimindo as primeiras linhas do dataframe ###

In [None]:
# Preview the loaded data: show() prints the first rows as a text table,
# shortening long string values (see the NAME column in the output).
df_carros.show()

+----+---------+------------+----------+------+------------+---------+--------------------+
| MPG|CYLINDERS|DISPLACEMENT|HORSEPOWER|WEIGHT|ACCELERATION|MODELYEAR|                NAME|
+----+---------+------------+----------+------+------------+---------+--------------------+
|18.0|        8|       307.0|       130|  3504|        12.0|       70|chevrolet chevell...|
|15.0|        8|       350.0|       165|  3693|        11.5|       70|   buick skylark 320|
|18.0|        8|       318.0|       150|  3436|        11.0|       70|  plymouth satellite|
|16.0|        8|       304.0|       150|  3433|        12.0|       70|       amc rebel sst|
|17.0|        8|       302.0|       140|  3449|        10.5|       70|         ford torino|
|15.0|        8|       429.0|       198|  4341|        10.0|       70|    ford galaxie 500|
|14.0|        8|       454.0|       220|  4354|         9.0|       70|    chevrolet impala|
|14.0|        8|       440.0|       215|  4312|         8.5|       70|   plymout

### Imprimindo o esquema do dataframe ###

In [None]:
# Print each column's name, type and nullability; the types shown are the ones
# inferred while reading the CSV (inferSchema=True).
df_carros.printSchema()

root
 |-- MPG: double (nullable = true)
 |-- CYLINDERS: integer (nullable = true)
 |-- DISPLACEMENT: double (nullable = true)
 |-- HORSEPOWER: integer (nullable = true)
 |-- WEIGHT: integer (nullable = true)
 |-- ACCELERATION: double (nullable = true)
 |-- MODELYEAR: integer (nullable = true)
 |-- NAME: string (nullable = true)



In [None]:
# Register the DataFrame as the temporary view "carros" so it can be queried with Spark SQL.
df_carros.createOrReplaceTempView("carros")

# Project the columns of interest, exposing MPG under the name "label";
# ACCELERATION and NAME are left out of the result.
df_query = spark.sql(
    "select mpg as label, cylinders, displacement, horsepower, weight, modelyear from carros"
)

# Preview the projected rows.
df_query.show()

+-----+---------+------------+----------+------+---------+
|label|cylinders|displacement|horsepower|weight|modelyear|
+-----+---------+------------+----------+------+---------+
| 18.0|        8|       307.0|       130|  3504|       70|
| 15.0|        8|       350.0|       165|  3693|       70|
| 18.0|        8|       318.0|       150|  3436|       70|
| 16.0|        8|       304.0|       150|  3433|       70|
| 17.0|        8|       302.0|       140|  3449|       70|
| 15.0|        8|       429.0|       198|  4341|       70|
| 14.0|        8|       454.0|       220|  4354|       70|
| 14.0|        8|       440.0|       215|  4312|       70|
| 14.0|        8|       455.0|       225|  4425|       70|
| 15.0|        8|       390.0|       190|  3850|       70|
| 15.0|        8|       383.0|       170|  3563|       70|
| 14.0|        8|       340.0|       160|  3609|       70|
| 15.0|        8|       400.0|       150|  3761|       70|
| 14.0|        8|       455.0|       225|  3086|       7

In [None]:
# Preview only the "cylinders" column. No trailing ';' is needed: show() prints the
# table itself and returns nothing, so the cell has no result to suppress.
df_query.select("cylinders").show()

+---------+
|cylinders|
+---------+
|        8|
|        8|
|        8|
|        8|
|        8|
|        8|
|        8|
|        8|
|        8|
|        8|
|        8|
|        8|
|        8|
|        8|
|        4|
|        6|
|        6|
|        6|
|        4|
|        4|
+---------+
only showing top 20 rows



In [None]:
# Keep only the cars with more than 6 cylinders and preview them. No trailing ';' is
# needed: show() prints the table itself and returns nothing.
df_query.filter(df_query['cylinders'] > 6).show()

+-----+---------+------------+----------+------+---------+
|label|cylinders|displacement|horsepower|weight|modelyear|
+-----+---------+------------+----------+------+---------+
| 18.0|        8|       307.0|       130|  3504|       70|
| 15.0|        8|       350.0|       165|  3693|       70|
| 18.0|        8|       318.0|       150|  3436|       70|
| 16.0|        8|       304.0|       150|  3433|       70|
| 17.0|        8|       302.0|       140|  3449|       70|
| 15.0|        8|       429.0|       198|  4341|       70|
| 14.0|        8|       454.0|       220|  4354|       70|
| 14.0|        8|       440.0|       215|  4312|       70|
| 14.0|        8|       455.0|       225|  4425|       70|
| 15.0|        8|       390.0|       190|  3850|       70|
| 15.0|        8|       383.0|       170|  3563|       70|
| 14.0|        8|       340.0|       160|  3609|       70|
| 15.0|        8|       400.0|       150|  3761|       70|
| 14.0|        8|       455.0|       225|  3086|       7

In [None]:
# Count how many cars exist for each distinct number of cylinders. No ordering is
# requested, so the groups are printed in whatever order Spark returns them.
# No trailing ';' is needed: show() prints the table itself and returns nothing.
df_query.groupBy("cylinders").count().show()

+---------+-----+
|cylinders|count|
+---------+-----+
|        6|   84|
|        3|    4|
|        5|    3|
|        4|  203|
|        8|  103|
+---------+-----+



In [None]:
# Preview the rows sorted by model year; no direction is given, and the output
# starts at the earliest year (70).
df_query.orderBy("modelyear").show()

+-----+---------+------------+----------+------+---------+
|label|cylinders|displacement|horsepower|weight|modelyear|
+-----+---------+------------+----------+------+---------+
| 18.0|        8|       307.0|       130|  3504|       70|
| 15.0|        8|       350.0|       165|  3693|       70|
| 18.0|        8|       318.0|       150|  3436|       70|
| 16.0|        8|       304.0|       150|  3433|       70|
| 17.0|        8|       302.0|       140|  3449|       70|
| 15.0|        8|       429.0|       198|  4341|       70|
| 14.0|        8|       454.0|       220|  4354|       70|
| 14.0|        8|       440.0|       215|  4312|       70|
| 14.0|        8|       455.0|       225|  4425|       70|
| 15.0|        8|       390.0|       190|  3850|       70|
| 15.0|        8|       383.0|       170|  3563|       70|
| 14.0|        8|       340.0|       160|  3609|       70|
| 15.0|        8|       400.0|       150|  3761|       70|
| 14.0|        8|       455.0|       225|  3086|       7