In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType

In [2]:
# Build the notebook's SparkSession (or reuse one that is already active),
# named after this notebook and pointed at a local master.
spark = (
    SparkSession.builder
    .appName("Schema Inference And Definition")
    .master('local[*]')
    .getOrCreate()
)

In [3]:
# Display the SparkSession object as the cell output to confirm it was created.
spark

### Auto Schema Inference

In [4]:
# Load the heart-disease CSV and let Spark work out the column types itself:
# the first row is treated as the header and schema inference is switched on.
heart_dataframe_inferred = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .option('inferSchema', 'true')
    .load('../Data/Heart Disease UCI.csv')
)

In [5]:
# Show the DataFrame's repr, which lists every column with its inferred type.
heart_dataframe_inferred

DataFrame[age: int, sex: int, cp: int, trestbps: int, chol: int, fbs: int, restecg: int, thalach: int, exang: int, oldpeak: double, slope: int, ca: int, thal: int, condition: int]

In [6]:
# List the column names of the inferred DataFrame (taken from the CSV header row).
heart_dataframe_inferred.columns

['age',
 'sex',
 'cp',
 'trestbps',
 'chol',
 'fbs',
 'restecg',
 'thalach',
 'exang',
 'oldpeak',
 'slope',
 'ca',
 'thal',
 'condition']

In [7]:
# Print the inferred schema as a tree: one line per column with its type and nullability.
heart_dataframe_inferred.printSchema()

root
 |-- age: integer (nullable = true)
 |-- sex: integer (nullable = true)
 |-- cp: integer (nullable = true)
 |-- trestbps: integer (nullable = true)
 |-- chol: integer (nullable = true)
 |-- fbs: integer (nullable = true)
 |-- restecg: integer (nullable = true)
 |-- thalach: integer (nullable = true)
 |-- exang: integer (nullable = true)
 |-- oldpeak: double (nullable = true)
 |-- slope: integer (nullable = true)
 |-- ca: integer (nullable = true)
 |-- thal: integer (nullable = true)
 |-- condition: integer (nullable = true)



### Manual Schema Definition

In [8]:
# Hand-written schema for the heart-disease CSV. Each entry is a
# (column name, Spark type) pair listed in the file's column order; every
# column is declared nullable. Continuous measurements keep numeric types,
# while the coded/categorical columns are read as strings.
#
# NOTE(review): the inferred schema above names the last column `condition`,
# whereas this schema calls it `target` -- confirm the rename is intentional.
# NOTE(review): inference reports `oldpeak` as double; FloatType is narrower --
# confirm single precision is acceptable here.
manual_schema = StructType([
    StructField(column_name, spark_type(), True)
    for column_name, spark_type in (
        ("age", IntegerType),
        ("sex", StringType),
        ("cp", StringType),
        ("trestbps", IntegerType),
        ("chol", IntegerType),
        ("fbs", StringType),
        ("restecg", StringType),
        ("thalach", IntegerType),
        ("exang", StringType),
        ("oldpeak", FloatType),
        ("slope", StringType),
        ("ca", StringType),
        ("thal", StringType),
        ("target", StringType),
    )
])

In [9]:
# Load the same CSV again, this time with schema inference turned off and the
# column names/types supplied by `manual_schema` instead.
heart_dataframe_manual = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .option('inferSchema', 'false')
    .schema(manual_schema)
    .load('../Data/Heart Disease UCI.csv')
)

In [10]:
# Show the DataFrame's repr to check that the manually declared column types were applied.
heart_dataframe_manual

DataFrame[age: int, sex: string, cp: string, trestbps: int, chol: int, fbs: string, restecg: string, thalach: int, exang: string, oldpeak: float, slope: string, ca: string, thal: string, target: string]

In [11]:
# Print the manually defined schema as a tree for comparison with the inferred one.
heart_dataframe_manual.printSchema()

root
 |-- age: integer (nullable = true)
 |-- sex: string (nullable = true)
 |-- cp: string (nullable = true)
 |-- trestbps: integer (nullable = true)
 |-- chol: integer (nullable = true)
 |-- fbs: string (nullable = true)
 |-- restecg: string (nullable = true)
 |-- thalach: integer (nullable = true)
 |-- exang: string (nullable = true)
 |-- oldpeak: float (nullable = true)
 |-- slope: string (nullable = true)
 |-- ca: string (nullable = true)
 |-- thal: string (nullable = true)
 |-- target: string (nullable = true)

