# Configuração da biblioteca PySpark no Google Colab

In [28]:
!pip install pyspark



# Criando o SparkContext e a SparkSession

In [29]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

In [30]:
sc = SparkContext.getOrCreate()

In [31]:
spark = SparkSession.builder.appName('PySpark DataFrame From RDD').getOrCreate()

# Create PySpark DataFrame From an Existing RDD

In [32]:
# Distribute four records (one label followed by four integer marks) across
# 4 partitions.
rdd = sc.parallelize(
    [
        ('C', 85, 76, 87, 91),
        ('B', 85, 76, 87, 91),
        ('A', 85, 78, 96, 92),
        ('A', 92, 76, 89, 96),
    ],
    numSlices=4,
)

In [33]:
# Confirm that parallelize() produced an RDD.
print(type(rdd))

<class 'pyspark.rdd.RDD'>


In [34]:
# Only column names are supplied as the schema, so Spark infers each column's
# type from the values in the RDD.
sub = [
    'Division',
    'English',
    'Mathematics',
    'Physics',
    'Chemistry',
]
marks_df = spark.createDataFrame(rdd, sub)

In [35]:
# Confirm that createDataFrame() returned a Spark DataFrame.
print(type(marks_df))

<class 'pyspark.sql.dataframe.DataFrame'>


In [36]:
# Show the column names together with the types Spark assigned to them.
marks_df.printSchema()

root
 |-- Division: string (nullable = true)
 |-- English: long (nullable = true)
 |-- Mathematics: long (nullable = true)
 |-- Physics: long (nullable = true)
 |-- Chemistry: long (nullable = true)



In [37]:
# Print the DataFrame's rows as a text table.
marks_df.show()

+--------+-------+-----------+-------+---------+
|Division|English|Mathematics|Physics|Chemistry|
+--------+-------+-----------+-------+---------+
|       C|     85|         76|     87|       91|
|       B|     85|         76|     87|       91|
|       A|     85|         78|     96|       92|
|       A|     92|         76|     89|       96|
+--------+-------+-----------+-------+---------+



# Create PySpark DataFrame From an External File

In [38]:
from pyspark.sql import SparkSession

In [39]:
# getOrCreate() hands back the session that is already active, if there is one,
# rather than starting a second one.
spark = (
    SparkSession.builder
    .appName('PySpark DataFrame From External Files')
    .getOrCreate()
)

In [40]:
# Read the comma-separated file, taking column names from the first row and
# letting Spark infer the column types from the data.
csv_file = spark.read.csv(
    'Fish.csv',
    sep=',',
    inferSchema=True,
    header=True,
)

In [41]:
# Load the text file into a DataFrame with a single string column named "value".
txt_file = spark.read.text("example.txt")

In [42]:
# multiLine=True lets a single JSON record span several lines of the file
# instead of requiring one complete JSON object per line.
json_file = spark.read.json("sample.json", multiLine=True)

In [43]:
# Confirm that the CSV reader returned a Spark DataFrame.
print(type(csv_file))

<class 'pyspark.sql.dataframe.DataFrame'>


In [44]:
# Confirm that the text reader returned a Spark DataFrame.
print(type(txt_file))

<class 'pyspark.sql.dataframe.DataFrame'>


In [45]:
# Confirm that the JSON reader returned a Spark DataFrame.
print(type(json_file))

<class 'pyspark.sql.dataframe.DataFrame'>


In [46]:
# Show the column names from the CSV header and the types inferred for them.
csv_file.printSchema()

root
 |-- Species: string (nullable = true)
 |-- Weight: double (nullable = true)
 |-- Length1: double (nullable = true)
 |-- Length2: double (nullable = true)
 |-- Length3: double (nullable = true)
 |-- Height: double (nullable = true)
 |-- Width: double (nullable = true)



In [47]:
# Show the schema of the DataFrame loaded from the text file.
txt_file.printSchema()

root
 |-- value: string (nullable = true)



In [48]:
# Show the nested schema Spark inferred from the JSON document.
json_file.printSchema()

root
 |-- employees: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- firstName: string (nullable = true)
 |    |    |-- lastName: string (nullable = true)



# PySpark DataFrame to Pandas DataFrame

In [49]:
# Convert the Spark DataFrame into a pandas DataFrame.
# NOTE(review): per the PySpark docs, toPandas() loads all rows into the
# driver's memory, so it is only suitable when the data is small.
df = csv_file.toPandas()

In [50]:
# Confirm that the converted object is a pandas DataFrame.
type(df)

pandas.core.frame.DataFrame