In [1]:
# Run findspark.init() before pyspark is imported (the pyspark import is in the next cell).
# NOTE(review): presumably this makes the local Spark installation importable — confirm for this environment.
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

# Initialise our Spark session: getOrCreate() is called with no extra builder
# settings. The resulting `spark` handle is what the loading cells below use.
spark = SparkSession.builder.getOrCreate()


In [6]:
# Plain CSV read with no reader options: as the printed schema shows, the
# columns get generic names (_c0, _c1, ...) and are all typed as strings.
df = (
    spark.read
    .format("csv")
    .load("../data/penguins.csv")
)
df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)



In [8]:
# Read the CSV again, this time asking Spark to take column names from the
# first line ("header") and to infer column types from the data ("inferSchema").
# The chain is parenthesised instead of continued with backslashes: every
# option key must be a properly closed string literal. With mismatched quotes
# and a trailing backslash, two .option() calls silently merge into one
# meaningless key and neither setting is applied.
df = (
    spark.read
    .format("csv")
    .option("inferSchema", True)
    .option("header", True)
    .load("../data/penguins.csv")
)
df.printSchema()

root
 |-- species: string (nullable = true)
 |-- island: string (nullable = true)
 |-- bill_length_mm: double (nullable = true)
 |-- bill_depth_mm: double (nullable = true)
 |-- flipper_length_mm: integer (nullable = true)
 |-- body_mass_g: integer (nullable = true)
 |-- sex: string (nullable = true)



In [9]:
# Render the first rows as an ASCII table. The arguments spell out what
# show() uses when called bare: 20 rows, long cell values truncated.
df.show(n=20, truncate=True)

+-------+---------+--------------+-------------+-----------------+-----------+------+
|species|   island|bill_length_mm|bill_depth_mm|flipper_length_mm|body_mass_g|   sex|
+-------+---------+--------------+-------------+-----------------+-----------+------+
| Adelie|Torgersen|          39.1|         18.7|              181|       3750|  MALE|
| Adelie|Torgersen|          39.5|         17.4|              186|       3800|FEMALE|
| Adelie|Torgersen|          40.3|         18.0|              195|       3250|FEMALE|
| Adelie|Torgersen|          null|         null|             null|       null|  null|
| Adelie|Torgersen|          36.7|         19.3|              193|       3450|FEMALE|
| Adelie|Torgersen|          39.3|         20.6|              190|       3650|  MALE|
| Adelie|Torgersen|          38.9|         17.8|              181|       3625|FEMALE|
| Adelie|Torgersen|          39.2|         19.6|              195|       4675|  MALE|
| Adelie|Torgersen|          34.1|         18.1|      

In [16]:
# Load data using Spark RDD API
# The SparkContext is obtained through the public `sparkContext` property of
# the session; `_sc` is a private attribute and should not be relied upon.
sc = spark.sparkContext
# Each element of the RDD is one raw text line of the CSV file.
rdd = sc.textFile("../data/penguins.csv")

In [17]:
# Peek at the raw lines: the column-name line comes first, then the data rows.
print(f"First two elements of the RDD are: {rdd.take(2)}\n")

First two elements of the RDD are: ['species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex', 'Adelie,Torgersen,39.1,18.7,181,3750,MALE']



In [19]:
# First row contains column names
header = rdd.take(1)[0]
names = header.split(",")

# Make it a DataFrame
# 1. remove the header line (exact match on the whole line, so a data row
#    that merely contains the text 'species' is not dropped by accident)
# 2. split row elements into columns
# 3. promote to DataFrame
# Note: no type conversion is done here, so every column stays a string
# (see the printed schema).
df = (
    rdd
    .filter(lambda line: line != header)
    .map(lambda line: line.split(","))
    .toDF(names)
)
df.printSchema()

root
 |-- species: string (nullable = true)
 |-- island: string (nullable = true)
 |-- bill_length_mm: string (nullable = true)
 |-- bill_depth_mm: string (nullable = true)
 |-- flipper_length_mm: string (nullable = true)
 |-- body_mass_g: string (nullable = true)
 |-- sex: string (nullable = true)



In [20]:
## Generating parquet data with pandas

import pandas as pd

# Read the CSV with pandas and write it back out as a Parquet file.
# Note: from here on `df` is bound to a pandas DataFrame, not a Spark one.
df = pd.read_csv(filepath_or_buffer="../data/penguins.csv")
df.to_parquet(path="../data/penguins.parquet")

In [22]:
# Same data read from Parquet: no reader options are passed, yet the schema
# comes back with column names and types. As the printed schema shows,
# flipper_length_mm and body_mass_g are double here (they were integer in
# the CSV read with inferSchema).
df_parquet = (
    spark.read
    .format("parquet")
    .load("../data/penguins.parquet")
)
df_parquet.printSchema()
df_parquet.show()

root
 |-- species: string (nullable = true)
 |-- island: string (nullable = true)
 |-- bill_length_mm: double (nullable = true)
 |-- bill_depth_mm: double (nullable = true)
 |-- flipper_length_mm: double (nullable = true)
 |-- body_mass_g: double (nullable = true)
 |-- sex: string (nullable = true)

+-------+---------+--------------+-------------+-----------------+-----------+------+
|species|   island|bill_length_mm|bill_depth_mm|flipper_length_mm|body_mass_g|   sex|
+-------+---------+--------------+-------------+-----------------+-----------+------+
| Adelie|Torgersen|          39.1|         18.7|            181.0|     3750.0|  MALE|
| Adelie|Torgersen|          39.5|         17.4|            186.0|     3800.0|FEMALE|
| Adelie|Torgersen|          40.3|         18.0|            195.0|     3250.0|FEMALE|
| Adelie|Torgersen|          null|         null|             null|       null|  null|
| Adelie|Torgersen|          36.7|         19.3|            193.0|     3450.0|FEMALE|
| Adelie|To

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE
...,...,...,...,...,...,...,...
339,Gentoo,Biscoe,,,,,
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,FEMALE
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,MALE
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,FEMALE


In [37]:
## Loading JSON data

# The "multiline" option is enabled for this read.
# NOTE(review): presumably the JSON file is not one-record-per-line — confirm.
df_json = (
    spark.read
    .format("json")
    .option("multiline", True)
    .load("../data/penguins.json")
)

In [38]:
# Inspect the schema of the JSON read: in the output below the fields are
# listed in alphabetical order and every numeric field is a double.
df_json.printSchema()

root
 |-- bill_depth_mm: double (nullable = true)
 |-- bill_length_mm: double (nullable = true)
 |-- body_mass_g: double (nullable = true)
 |-- flipper_length_mm: double (nullable = true)
 |-- island: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- species: string (nullable = true)

