In [None]:
# Setup
from pyspark.sql import SparkSession, Row
from pyspark.sql.types import *
from pyspark.sql import functions as F

# Build the SparkSession for this notebook; getOrCreate() hands back an
# already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("Schema Specification")
    .getOrCreate()
)

# Keep a handle on the underlying SparkContext for the RDD examples below.
sc = spark.sparkContext

print("SparkSession and SparkContext created successfully!")
print(f"Spark Version: {spark.version}")


In [None]:
# Four "name,age,job" records standing in for the lines of a text file
text_data = [
    "Alice,25,Engineer", "Bob,30,Manager",
    "Charlie,35,Engineer", "Diana,28,Analyst",
]

# Distribute the raw strings as an RDD and show what it holds
lines_rdd = sc.parallelize(text_data)
print("Original RDD:")
print(lines_rdd.collect())

# Parse the RDD to create structured data
def parse_line(line: str) -> tuple:
    """Parse one "name,age,job" record into a ``(name, age, job)`` tuple.

    Surrounding whitespace (including a trailing newline) is stripped from
    every field, so records such as ``"Alice, 25, Engineer"`` parse cleanly.
    Fields after the third are ignored.

    Args:
        line: Comma-separated record, e.g. ``"Alice,25,Engineer"``.

    Returns:
        A 3-tuple ``(name: str, age: int, job: str)``.

    Raises:
        ValueError: If the record has fewer than three fields or the age
            field is not an integer. The message includes the offending
            record so it can be located in the input.
    """
    parts = [part.strip() for part in line.split(",")]
    if len(parts) < 3:
        raise ValueError(f"Expected 'name,age,job' but got {line!r}")

    name, age, job = parts[:3]
    try:
        return (name, int(age), job)
    except ValueError:
        # Re-raise with the record attached; int()'s own message only shows the bad token.
        raise ValueError(f"Invalid age {age!r} in record {line!r}") from None

# Turn every raw line into a (name, age, job) tuple
parsed_rdd = lines_rdd.map(parse_line)
print("\nParsed RDD:")
print(parsed_rdd.collect())

# Only column names are supplied here; Spark infers the column types from the data
df_inferred = spark.createDataFrame(
    parsed_rdd,
    schema=["name", "age", "job"],
)
print("\nDataFrame with inferred schema:")
df_inferred.show()
df_inferred.printSchema()


In [None]:
# Define the schema programmatically, one column at a time (all three nullable)
custom_schema = (
    StructType()
    .add("name", StringType(), nullable=True)
    .add("age", IntegerType(), nullable=True)
    .add("job", StringType(), nullable=True)
)

print("Custom schema definition:")
print(custom_schema)

# Build the DataFrame from the parsed tuples using the explicit schema
df_custom_schema = spark.createDataFrame(parsed_rdd, schema=custom_schema)
print("\nDataFrame with custom schema:")
df_custom_schema.show()
df_custom_schema.printSchema()

# More complex schema example: two nested structs plus an array column
complex_schema = (
    StructType()
    .add("employee_id", IntegerType(), nullable=False)  # Not nullable
    .add(
        "personal_info",
        StructType()
        .add("name", StringType(), nullable=True)
        .add("age", IntegerType(), nullable=True)
        .add("email", StringType(), nullable=True),
        nullable=True,
    )
    .add(
        "job_details",
        StructType()
        .add("title", StringType(), nullable=True)
        .add("salary", DoubleType(), nullable=True)
        .add("start_date", DateType(), nullable=True),
        nullable=True,
    )
    .add("skills", ArrayType(StringType()), nullable=True)
)

print("\nComplex nested schema:")
print(complex_schema.simpleString())


In [None]:
# Demonstrate various data types
from datetime import date, datetime
from decimal import Decimal

# Sample rows covering int, string, float, bool, date, datetime, Decimal,
# list and dict values; each field mapping is expanded into a Row.
sample_data = [
    Row(**fields)
    for fields in (
        dict(
            id=1,
            name="Alice",
            salary=75000.50,
            is_active=True,
            hire_date=date(2020, 1, 15),
            last_login=datetime(2024, 1, 10, 14, 30, 0),
            bonus=Decimal("5000.25"),
            skills=["Python", "SQL", "Spark"],
            metadata={"department": "Engineering", "level": "Senior"},
        ),
        dict(
            id=2,
            name="Bob",
            salary=85000.75,
            is_active=False,
            hire_date=date(2019, 3, 20),
            last_login=datetime(2024, 1, 9, 9, 15, 0),
            bonus=Decimal("7500.00"),
            skills=["Java", "Scala", "Kafka"],
            metadata={"department": "Engineering", "level": "Lead"},
        ),
    )
]

# No schema is passed, so Spark infers it from the Row objects
df_various_types = spark.createDataFrame(sample_data)
print("DataFrame with various data types:")
df_various_types.show(truncate=False)
df_various_types.printSchema()

# Explicit schema for the same columns, generated from a
# (column name, Spark type, nullable) table. Only "id" is non-nullable.
explicit_schema = StructType([
    StructField(column, spark_type, nullable)
    for column, spark_type, nullable in [
        ("id", IntegerType(), False),
        ("name", StringType(), True),
        ("salary", DoubleType(), True),
        ("is_active", BooleanType(), True),
        ("hire_date", DateType(), True),
        ("last_login", TimestampType(), True),
        ("bonus", DecimalType(10, 2), True),
        ("skills", ArrayType(StringType()), True),
        ("metadata", MapType(StringType(), StringType()), True),
    ]
])

print("\nExplicit schema for various data types:")
print(explicit_schema.simpleString())


In [None]:
# Schema validation examples
print("=== SCHEMA VALIDATION ===")

# A strict schema: id and name are required, salary may be missing
strict_schema = (
    StructType()
    .add("id", IntegerType(), nullable=False)    # Not nullable
    .add("name", StringType(), nullable=False)   # Not nullable
    .add("salary", DoubleType(), nullable=True)
)

# Rows that satisfy every constraint of the strict schema
valid_data = [(1, "Alice", 75000.0), (2, "Bob", 85000.0)]
df_valid = spark.createDataFrame(valid_data, schema=strict_schema)
print("Valid data with strict schema:")
df_valid.show()

# The second row carries None in the non-nullable "name" field.
# NOTE(review): with createDataFrame's default verifySchema=True, PySpark is
# expected to reject this row while building the DataFrame, so the except
# branch is what normally runs - confirm on the Spark version in use.
invalid_data = [(1, "Alice", 75000.0), (2, None, 85000.0)]
try:
    df_invalid = spark.createDataFrame(invalid_data, schema=strict_schema)
    print("DataFrame created with invalid data:")
    df_invalid.show()  # Reached only if the row was not rejected above
except Exception as e:
    # The failure is the point of the demo: show it rather than abort the cell
    print(f"Error: {e}")

# Schema comparison: the two schemas differ only in the type of "id"
print("\n=== SCHEMA COMPARISON ===")
schema1 = (
    StructType()
    .add("id", IntegerType(), nullable=True)
    .add("name", StringType(), nullable=True)
)

schema2 = (
    StructType()
    .add("id", LongType(), nullable=True)  # Different type
    .add("name", StringType(), nullable=True)
)

print("Schema 1:", schema1.simpleString())
print("Schema 2:", schema2.simpleString())
schemas_match = schema1 == schema2
print("Are schemas equal?", schemas_match)

# Nullable vs non-nullable fields: inspect the flag carried by each StructField
print("\n=== NULLABLE VS NON-NULLABLE ===")
nullable_schema = (
    StructType()
    .add("id", IntegerType(), nullable=True)     # Nullable
    .add("name", StringType(), nullable=False)   # Non-nullable
)

print("Nullable schema:", nullable_schema.simpleString())
# Report name, nullability and data type for every field in declaration order
for field in nullable_schema.fields:
    print(f"Field '{field.name}': nullable={field.nullable}, type={field.dataType}")


In [None]:
# Exercise data - one employee per line, as "id,name,title,salary,hire_date,skills";
# the skills field is itself a semicolon-separated list.
exercise_data = [
    "1,John Doe,Software Engineer,75000,2020-01-15,Python;Java;SQL",
    "2,Jane Smith,Data Scientist,85000,2019-06-20,Python;R;Machine Learning",
    "3,Bob Johnson,DevOps Engineer,80000,2021-03-10,Docker;Kubernetes;AWS",
    "4,Alice Brown,Product Manager,90000,2018-09-05,Agile;Scrum;Analytics",
]

# Distribute the raw lines as an RDD for the exercises
exercise_rdd = sc.parallelize(exercise_data)

print("=== EXERCISE 1: Parse RDD and Create DataFrame ===")
print("Raw data:")
# Echo every raw record, one per output line
print(*exercise_data, sep="\n")

# TODO: Complete this exercise
print(
    "\nTODO: Parse the RDD and create a DataFrame",
    "1. Create a function to parse each line",
    "2. Split by comma and handle the skills field (split by semicolon)",
    "3. Convert to appropriate data types",
    "4. Create DataFrame with inferred schema",
    sep="\n",
)

# Your code here:
# def parse_employee_line(line):
#     # Your parsing logic here
#     pass
#
# NOTE: use a new name here rather than `parsed_rdd`; that name already holds
# the name/age/job RDD that the earlier schema cells read.
# employees_rdd = exercise_rdd.map(parse_employee_line)
# df_exercise1 = spark.createDataFrame(employees_rdd, ["id", "name", "title", "salary", "hire_date", "skills"])

print(
    "\n=== EXERCISE 2: Define Custom Schema ===",
    "TODO: Define a custom schema for the employee data",
    "Requirements:",
    "- id: Integer, not nullable",
    "- name: String, not nullable",
    "- title: String, nullable",
    "- salary: Double, nullable",
    "- hire_date: Date, nullable",
    "- skills: Array of Strings, nullable",
    sep="\n",
)

# Your schema definition here:
# employee_schema = StructType([
#     # Your schema fields here
# ])

print(
    "\n=== EXERCISE 3: Create DataFrame with Custom Schema ===",
    "TODO: Create DataFrame using the custom schema you defined",
    "Handle the date conversion properly",
    sep="\n",
)

# Your code here (employees_rdd is the RDD parsed in Exercise 1):
# df_exercise3 = spark.createDataFrame(employees_rdd, employee_schema)

print(
    "\n=== EXERCISE 4: Schema Validation ===",
    "TODO: Create a function to validate if a DataFrame matches expected schema",
    sep="\n",
)

# def validate_schema(df, expected_schema):
#     # Your validation logic here
#     pass

print(
    "\n=== EXERCISE 5: Complex Nested Schema ===",
    "TODO: Create a nested schema for employee data with:",
    "- personal_info: struct with name, email",
    "- job_info: struct with title, salary, hire_date",
    "- skills: array of strings",
    "- certifications: array of structs with name and date",
    sep="\n",
)

# Your nested schema here:
# nested_employee_schema = StructType([
#     # Your nested schema definition here
# ])
