## Session Initialization

In [1]:
from pathlib import Path
import pyspark
from pyspark import SparkContext

# Obtain the Spark entry points through getOrCreate() so this cell is safe to
# re-run: calling SparkContext() a second time in the same kernel raises
# because only one active context is allowed per process.
sc = SparkContext.getOrCreate()
# The builder reuses the context created above and returns the existing
# session if one is already active.
spark = pyspark.sql.SparkSession.builder.getOrCreate()

## Read CSV (without schema)

In [2]:
# Location of the sample CSV: ../Files/bookcontents.csv relative to the
# current working directory. The path variable is deliberately not called
# `dir`, which would shadow the builtin dir() for the rest of the session.
csv_path = Path.cwd().parent / "Files" / "bookcontents.csv"
# Later cells reuse `file` as the plain-string form of the path.
file = str(csv_path)
# Read with no options set: no header handling and no explicit schema.
bookDF = spark.read.csv(file)

In [3]:
# Inspect the schema of the option-less read: the columns come back with the
# generated names _c0.._c2 and are all typed as string.
bookDF.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)



In [4]:
# Display the rows; because no header option was set, the CSV's header line
# (Chapter, Name, Page) shows up as the first data row.
bookDF.show()

+-------+--------------------+----+
|    _c0|                 _c1| _c2|
+-------+--------------------+----+
|Chapter|                Name|Page|
|      1|        Introduction|  11|
|      2|Basic Engineering...|  19|
|      3|Advanced Engineer...|  28|
|      4|     Hands On Course|  60|
|      5|        Case Studies|  62|
|      6|Best Practices Cl...|  73|
|      7|130+ Data Sources...|  77|
|      8|1001 Interview Qu...|  82|
|      9|Recommended Books...|  87|
+-------+--------------------+----+



## Read CSV with Header

In [5]:
# Read the same file again, this time with the "header" option enabled so the
# first line supplies the column names.
bookHeaderDF = (
    spark.read
    .format("csv")
    .option("header", "True")
    .load(file)
)

In [6]:
# With the header option the columns are named Chapter / Name / Page, but
# every column is still typed as string.
bookHeaderDF.printSchema()

root
 |-- Chapter: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Page: string (nullable = true)



In [7]:
# Display the rows; the header line is no longer present as a data row.
bookHeaderDF.show()

+-------+--------------------+----+
|Chapter|                Name|Page|
+-------+--------------------+----+
|      1|        Introduction|  11|
|      2|Basic Engineering...|  19|
|      3|Advanced Engineer...|  28|
|      4|     Hands On Course|  60|
|      5|        Case Studies|  62|
|      6|Best Practices Cl...|  73|
|      7|130+ Data Sources...|  77|
|      8|1001 Interview Qu...|  82|
|      9|Recommended Books...|  87|
+-------+--------------------+----+



## Read CSV with Header & Inference

In [8]:
# Enable both schema inference and header handling in a single options() call
# before reading the CSV.
bookInfHeaderDF = (
    spark.read
    .options(InferSchema="True", Header="True")
    .csv(file)
)

In [9]:
# With inference enabled, Chapter and Page are typed as integer while Name
# stays string.
bookInfHeaderDF.printSchema()

root
 |-- Chapter: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Page: integer (nullable = true)



In [10]:
# Display the rows of the header-aware, type-inferred DataFrame.
bookInfHeaderDF.show()

+-------+--------------------+----+
|Chapter|                Name|Page|
+-------+--------------------+----+
|      1|        Introduction|  11|
|      2|Basic Engineering...|  19|
|      3|Advanced Engineer...|  28|
|      4|     Hands On Course|  60|
|      5|        Case Studies|  62|
|      6|Best Practices Cl...|  73|
|      7|130+ Data Sources...|  77|
|      8|1001 Interview Qu...|  82|
|      9|Recommended Books...|  87|
+-------+--------------------+----+



## Manual Schema

In [11]:
from pyspark.sql.types import *

In [12]:
# Column definitions for the hand-written schema, given as (name, Spark type)
# pairs in file order. Page is declared as a float here, unlike the integer
# type produced by schema inference.
col = [
    StructField(field_name, field_type)
    for field_name, field_type in (
        ("Chapter", IntegerType()),
        ("Name", StringType()),
        ("Page", FloatType()),
    )
]

# Wrap the field list into the StructType handed to the reader.
csvSchema = StructType(fields=col)

In [13]:
# Read the CSV using the hand-written schema instead of inference.
# NOTE(review): the header option is not set on this read, so the file's
# header line is parsed as a data row (its non-string fields become null).
manualShemaDF = spark.read.csv(file, schema=csvSchema)

In [14]:
# Confirm the DataFrame carries the manually supplied types: Chapter integer,
# Name string, Page float.
manualShemaDF.printSchema()

root
 |-- Chapter: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Page: float (nullable = true)



In [15]:
# Display the rows; the first row is the CSV header line, whose Chapter and
# Page values could not be parsed as numbers and therefore appear as null.
manualShemaDF.show()

+-------+--------------------+----+
|Chapter|                Name|Page|
+-------+--------------------+----+
|   null|                Name|null|
|      1|        Introduction|11.0|
|      2|Basic Engineering...|19.0|
|      3|Advanced Engineer...|28.0|
|      4|     Hands On Course|60.0|
|      5|        Case Studies|62.0|
|      6|Best Practices Cl...|73.0|
|      7|130+ Data Sources...|77.0|
|      8|1001 Interview Qu...|82.0|
|      9|Recommended Books...|87.0|
+-------+--------------------+----+



In [16]:
# Keep only rows whose Page value is present, which removes the row produced
# by the header line, then display the result.
rows_with_page = manualShemaDF.filter("Page is not null")
rows_with_page.show()

+-------+--------------------+----+
|Chapter|                Name|Page|
+-------+--------------------+----+
|      1|        Introduction|11.0|
|      2|Basic Engineering...|19.0|
|      3|Advanced Engineer...|28.0|
|      4|     Hands On Course|60.0|
|      5|        Case Studies|62.0|
|      6|Best Practices Cl...|73.0|
|      7|130+ Data Sources...|77.0|
|      8|1001 Interview Qu...|82.0|
|      9|Recommended Books...|87.0|
+-------+--------------------+----+

