In [None]:
# Bring in SparkSession, the class every DataFrame / SQL call starts from
from pyspark.sql import SparkSession

# Build the session; getOrCreate() hands back an already-running session
# when one exists in this kernel, so re-running the cell is harmless.
spark = (
    SparkSession.builder
    .appName("Spark SQL Getting Started")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

# Report the Spark release backing this session
print(f"Spark Version: {spark.version}")

# Report the application name and master URL from the runtime configuration
app_name = spark.conf.get('spark.app.name')
print(f"Application Name: {app_name}")
master_url = spark.conf.get('spark.master')
print(f"Master: {master_url}")


In [None]:
# Approach 1: rows as tuples, with the column names supplied separately
data = list(zip(["Alice", "Bob", "Charlie"], [25, 30, 35]))
columns = ["name", "age"]

df_from_tuples = spark.createDataFrame(data, schema=columns)
print("DataFrame from tuples:")
df_from_tuples.show()

# Approach 2: rows as dictionaries, so each record carries its own column names
data_dict = [
    {"name": person, "age": years, "city": hometown}
    for person, years, hometown in (
        ("Alice", 25, "New York"),
        ("Bob", 30, "San Francisco"),
        ("Charlie", 35, "Chicago"),
    )
]

df_from_dict = spark.createDataFrame(data_dict)
print("\nDataFrame from dictionaries:")
df_from_dict.show()


In [None]:
# Show the schema Spark derived for each DataFrame, one heading per frame
for schema_heading, frame in (
    ("Schema of df_from_tuples:", df_from_tuples),
    ("\nSchema of df_from_dict:", df_from_dict),
):
    print(schema_heading)
    frame.printSchema()

# Summarise the dictionary-based DataFrame: row count, column count, column names
dict_columns = df_from_dict.columns
print(f"\nNumber of rows in df_from_dict: {df_from_dict.count()}")
print(f"Number of columns in df_from_dict: {len(dict_columns)}")
print(f"Column names: {dict_columns}")


In [None]:
# First, let's create sample data files for practice
import json
import os

import pandas as pd

# Create data directory if it doesn't exist (exist_ok makes the cell re-runnable)
os.makedirs("../data", exist_ok=True)

# Sample data; the first record has no age, so both files contain a missing value
sample_data = [
    {"name": "Michael", "age": None},
    {"name": "Andy", "age": 30},
    {"name": "Justin", "age": 19}
]

# Create JSON file in JSON Lines layout: one JSON object per line.
# The encoding is given explicitly so the file does not depend on the platform default.
with open("../data/people.json", "w", encoding="utf-8") as f:
    for record in sample_data:
        f.write(json.dumps(record) + "\n")

# Create CSV file.
# Because of the missing age, pandas would store the column as float64
# (None -> NaN) and write "30.0" / "19.0". Casting to the nullable Int64 dtype
# keeps the ages as integers ("30", "19") and writes the missing value as an
# empty field, so the CSV matches the values in sample_data.
people_pdf = pd.DataFrame(sample_data).astype({"age": "Int64"})
people_pdf.to_csv("../data/people.csv", index=False)

print("Sample data files created successfully!")


In [None]:
# Load the JSON file (one JSON object per line)
json_path = "../data/people.json"
df_json = spark.read.json(json_path)
print("DataFrame from JSON:")
df_json.show()

# Load the CSV file: the first row is the header, and column types are inferred
csv_path = "../data/people.csv"
df_csv = spark.read.csv(csv_path, header=True, inferSchema=True)
print("\nDataFrame from CSV:")
df_csv.show()

# Print both schemas so the two readers can be compared
for schema_heading, frame in (
    ("\nJSON DataFrame schema:", df_json),
    ("\nCSV DataFrame schema:", df_csv),
):
    print(schema_heading)
    frame.printSchema()


In [None]:
# All operations below run against the DataFrame loaded from JSON
df = df_json

# 1. Projection: keep a single column
print("1. Select only 'name' column:")
df.select(df["name"]).show()

# 2. Projection with a computed expression alongside a plain column
print("\n2. Select name and age + 1:")
age_plus_one = df["age"] + 1
df.select("name", age_plus_one).show()

# 3. Row filter (where() is an alias of filter())
print("\n3. Filter people older than 21:")
older_than_21 = df["age"] > 21
df.where(older_than_21).show()

# 4. Aggregation: number of rows per distinct age
print("\n4. Count people by age:")
people_per_age = df.groupBy(df["age"]).count()
people_per_age.show()


In [None]:
# 5. Derive a boolean column from an existing one
print("5. Add a new column 'is_adult':")
adult_flag = df["age"] >= 18
df_with_adult = df.withColumn("is_adult", adult_flag)
df_with_adult.show()

# 6. Give an existing column a different name
print("\n6. Rename 'name' to 'full_name':")
df_renamed = df.withColumnRenamed(existing="name", new="full_name")
df_renamed.show()

# 7. Order rows by age, smallest first (sort() and orderBy() are aliases)
print("\n7. Sort by age (ascending):")
df.sort("age").show()

# 8. Order rows by age, largest first
print("\n8. Sort by age (descending):")
df.orderBy("age", ascending=False).show()


In [None]:
# Exercise 1: Create your books DataFrame here
# TODO: Create a DataFrame with at least 5 books
# Include columns: title, author, year_published, rating (1-5)

# Add one entry per book. Keep the values in the same order as the column list
# passed to createDataFrame below, e.g. as a tuple:
#   (title, author, year_published, rating)
books_data = [
    # Add your data here
]

# Create DataFrame and show it
# Fill in books_data first, then uncomment the two lines below.
# NOTE(review): with books_data still empty, Spark presumably cannot infer
# column types and createDataFrame would fail — confirm before running.
# df_books = spark.createDataFrame(books_data, ["title", "author", "year_published", "rating"])
# df_books.show()


In [None]:
# Exercise 2: DataFrame Operations
# TODO: Complete the following operations
# These steps use the book columns from Exercise 1 (title, author,
# year_published, rating) — presumably on the df_books DataFrame created there.
# Replace each "your_code_here" placeholder with your own code.

# 1. Select only title and rating columns
# Hint: select(), as in the "Select specific columns" example earlier
# your_code_here

# 2. Filter books with rating >= 4
# Hint: filter() with a column comparison
# your_code_here

# 3. Add a new column 'age_of_book' (current year - year_published)
# Hint: withColumn() with an arithmetic expression on year_published
# your_code_here

# 4. Sort books by rating in descending order
# Hint: orderBy() with .desc() on the rating column
# your_code_here

# 5. Group by author and count the number of books
# Hint: groupBy(...).count()
# your_code_here


In [None]:
# Exercise 3: File Operations
# TODO: Complete the following
# Replace each "your_code_here" placeholder with your own code.

# 1. Create employee data as a list of dictionaries
# Include: employee_id, name, department, salary
# Each dictionary is one employee, with those four field names as its keys.
employees = [
    # Add your employee data here
]

# 2. Convert to pandas DataFrame and save as CSV
# Hint: pd.DataFrame(employees).to_csv(<path>, index=False), as in the
# sample-data cell earlier in this notebook
# your_code_here

# 3. Read the CSV file back using Spark
# Hint: spark.read with the "header" and "inferSchema" options, then .csv(<path>)
# your_code_here

# 4. Display the DataFrame and its schema
# Hint: show() and printSchema()
# your_code_here
