# Schemas

A schema defines the column names and the data type of each column. For example, columns can have integer, string, or date types. The different data types can be imported from `pyspark.sql.types`.

## Import Libraries

In [None]:
import os

import findspark

# findspark.init() locates the Spark installation and adds its Python
# libraries to sys.path, so it has to run BEFORE anything is imported from
# pyspark. Keep this call above the pyspark imports even if an import sorter
# wants to move it to the end of the import block.
findspark.init()

# To display dataframes which can scroll horizontally
from IPython.core.display import HTML
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, to_timestamp
from pyspark.sql.types import (
    BooleanType,
    DoubleType,
    IntegerType,
    StringType,
    StructField,
    StructType,
    TimestampType,
)

## SparkSession

In [None]:
# Get (or create) a SparkSession that runs on the "local" master and is
# configured from a default SparkConf.
spark = (
    SparkSession.builder
    .master("local")
    .config(conf=SparkConf())
    .getOrCreate()
)

## Load the data

In [None]:
# Location of the CSV: a file:// URI pointing at ./data under the current
# working directory.
data_path = f"file:///{os.getcwd()}/data"
file_path = f"{data_path}/reported-crimes.csv"

# Read the CSV using its first line as the column names.
reported_crimes_df = spark.read.csv(file_path, header=True)

# Replace the "Date" column with a timestamp parsed from the
# "MM/dd/yyyy hh:mm:ss a" text format.
reported_crimes_df = reported_crimes_df.withColumn(
    "Date", to_timestamp(col("Date"), "MM/dd/yyyy hh:mm:ss a")
)

# Keep only the rows dated on or before the 2018-11-11 cutoff.
reported_crimes_df = reported_crimes_df.filter(col("Date") <= lit("2018-11-11"))

reported_crimes_df.show(5)

In [None]:
# Print the column names and types of the DataFrame that was read without an
# explicit schema.
reported_crimes_df.printSchema()

## Load with schema

In [None]:
# List the column names; these are the names an explicit schema has to cover.
reported_crimes_df.columns

In [None]:
# (column name, Spark data type) pairs describing the crimes CSV.
# The order matters: a user-supplied CSV schema is applied to the file's
# columns by position.
labels = [
    ("ID", StringType()),
    ("Case Number", StringType()),
    ("Date", TimestampType()),
    ("Block", StringType()),
    ("IUCR", StringType()),
    ("Primary Type", StringType()),
    ("Description", StringType()),
    ("Location Description", StringType()),
    # Flag column, typed the same way as "Domestic" below.
    # NOTE(review): assumes the column holds true/false values - confirm.
    ("Arrest", BooleanType()),
    ("Domestic", BooleanType()),
    ("Beat", StringType()),
    ("District", StringType()),
    ("Ward", StringType()),
    ("Community Area", StringType()),
    ("FBI Code", StringType()),
    ("X Coordinate", StringType()),
    ("Y Coordinate", StringType()),
    ("Year", IntegerType()),
    ("Updated On", StringType()),
    ("Latitude", DoubleType()),
    ("Longitude", DoubleType()),
    ("Location", StringType()),
]

# Build the schema; the third StructField argument (nullable) is True for
# every column.
schema = StructType([StructField(name, dtype, True) for name, dtype in labels])
schema

In [None]:
# Re-read the CSV, this time applying the explicit schema.
# - header=True: the first line holds the column names; without it that line
#   would be loaded as a data row.
# - timestampFormat: the "Date" column is declared as TimestampType, and its
#   text uses the same "MM/dd/yyyy hh:mm:ss a" layout parsed earlier with
#   to_timestamp; without the format the values cannot be parsed.
new_reported_crimes_df = spark.read.csv(
    file_path,
    schema=schema,
    header=True,
    timestampFormat="MM/dd/yyyy hh:mm:ss a",
)

new_reported_crimes_df.printSchema()

In [None]:
# Preview the first five rows of the DataFrame loaded with the explicit schema.
new_reported_crimes_df.show(5)