### Read Data

- CSV / JSON / Parquet

- Schema inference vs explicit schema

- Handling bad records

In [0]:
# Explicit schema for the customers CSV, written as a DDL string
# ("<column> <type>" pairs separated by ", "). One field per line keeps
# additions and removals easy to review.
schema = ", ".join(
    [
        "id int",
        "customer_key string",
        "first_name string",
        "last_name string",
        "company_name string",
        "city string",
        "country string",
        "primary_phone string",
        "secondary_phone string",
        "email string",
        "registration_date string",
        "website_url string",
    ]
)



In [0]:
# Read the customers CSV files using the explicit `schema` string.
#
# The inferSchema option is intentionally not set: when a schema is supplied
# via .schema(), Spark does not infer column types, so setting
# inferSchema=true alongside it had no effect and only obscured which
# schema is actually applied.
df_csv = (
    spark.read
    .format("csv")
    .schema(schema)
    # header=false: the first line of each file is treated as data.
    .option("header", "false")
    .option("delimiter", ",")
    # mode = PERMISSIVE | DROPMALFORMED | FAILFAST
    .option("mode", "PERMISSIVE")
    .load("/Volumes/pyspark/bronze/raw_ingestion/csv/customers/")
)

# Other CSV reader options worth knowing: quote, escape


In [0]:
# Show the rows of the CSV DataFrame in the notebook output.
df_csv.display()

In [0]:
# Print the column names and types of df_csv.
df_csv.printSchema()

In [0]:
# Project three columns using SQL expression strings; customer_key is
# exposed under the alias ckey.
df_csv.selectExpr(
    "id",
    "customer_key as ckey",
    "first_name",
).display()

In [0]:
from pyspark.sql.functions import col, expr

# Same kind of projection, but built from Column objects instead of SQL
# strings: keep city, alias company_name to cname, and cast id to string.
# Only the resulting schema is printed.
df_csv.select(
    col("city"),
    col("company_name").alias("cname"),
    col("id").cast("string"),
).printSchema()

In [0]:
# expr() turns a SQL expression string into a Column; here it simply
# selects city. Preview two rows.
(
    df_csv
    .select(expr("city"))
    .limit(2)
    .display()
)

In [0]:
from pyspark.sql.functions import lit, when
# Add two derived columns and preview two rows:
#   number_1 - id multiplied by 2
#   new_col  - SQL CASE expression labelling id 1, id 2, and everything else
(
    df_csv
    .withColumn("number_1", 2 * col("id"))
    .withColumn("new_col", expr("""
                             case when id = 1
                             then 'one'
                             when id = 2
                             then 'two'
                             else 'other'
                             end"""))
    .limit(2)
    .display()
)

In [0]:
from pyspark.sql.functions import lit, when

# Keep every existing column and append "hi", a when/otherwise chain that
# labels id 1 and id 2 and falls back to "other".
df_csv.select(
    "*",
    when(col("id") == 1, lit("one"))
    .when(col("id") == 2, lit("two"))
    .otherwise(lit("other"))
    .alias("hi"),
).display()

In [0]:
import pyspark.sql.functions as F

# Derived columns to attach in a single withColumns call: first and last
# name concatenated (no separator), plus date parts taken from
# registration_date.
columns = dict(
    id2=F.concat(F.col("first_name"), F.col("last_name")),
    registration_month=F.month("registration_date"),
    registration_year=F.year("registration_date"),
    registration_day=F.dayofmonth("registration_date"),
    registration_hour=F.hour("registration_date"),
)

# Preview three rows with the extra columns added.
df_csv.withColumns(columns).limit(3).display()


In [0]:
from pyspark.sql.functions import concat
# full_name depends on id:
#   id == 1 -> first_name and last_name joined with no separator
#   id == 2 -> first_name and last_name joined with a single space
#   else    -> first_name only
# Preview two rows.
df_csv.withColumn(
    "full_name",
    when(col("id") == 1, concat(col("first_name"), col("last_name")))
    .when(col("id") == 2, concat(col("first_name"), lit(" "), col("last_name")))
    .otherwise(col("first_name")),
).limit(2).display()

In [0]:
# Read the Forbes billionaires JSON files. No schema is passed, and the
# multiline option is enabled on the reader.
df_json = (
    spark.read
    .format("json")
    .options(multiline="true")
    .load("/Volumes/pyspark/bronze/raw_ingestion/json/forbes_billionaires_list/")
)

# Show the loaded rows in the notebook output.
df_json.display()

In [0]:
# Read the Parquet files under the raw-ingestion folder using the reader's
# parquet() shortcut.
df_parquet = spark.read.parquet(
    "/Volumes/pyspark/bronze/raw_ingestion/parquet/"
)



In [0]:
# Print the schema of the JSON DataFrame; the reader was given no explicit
# schema, so this shows what Spark derived from the files.
df_json.printSchema()

In [0]:
# Print df_csv's schema again, for comparison with the JSON schema.
df_csv.printSchema()

In [0]:
# NOTE(review): this repeats the previous cell's df_csv.printSchema() call
# verbatim — likely a leftover duplicate.
df_csv.printSchema()

In [0]:
# NOTE(review): _parse_datatype_string is a private helper (leading
# underscore), so it may change between PySpark releases — consider a public
# alternative if the runtime version offers one.
from pyspark.sql.types import _parse_datatype_string as p

# Parse the DDL schema string into a data type object
# (presumably a StructType for this multi-field string — TODO confirm).
new_schema = p(schema)