In [None]:
# Setup
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql.window import Window

# Create SparkSession (getOrCreate() reuses an already-running session if there is one)
spark = (
    SparkSession.builder
    .appName("DataFrame Operations")
    .getOrCreate()
)

# Sample employee records for practice. Only column names are supplied, so
# Spark infers each column's type from the Python values (hire_date stays a string).
columns = ["name", "age", "job_title", "salary", "hire_date"]
data = [
    ("Alice", 25, "Engineer", 75000, "2020-01-15"),
    ("Bob", 30, "Manager", 85000, "2019-03-20"),
    ("Charlie", 35, "Engineer", 80000, "2018-06-10"),
    ("Diana", 28, "Analyst", 65000, "2021-02-28"),
    ("Eve", 32, "Manager", 90000, "2017-11-05"),
    ("Frank", 29, "Engineer", 78000, "2020-09-12"),
]
df = spark.createDataFrame(data, schema=columns)

# Show the rows, then the inferred schema
print("Sample DataFrame:")
df.show()
df.printSchema()


In [None]:
# Different ways to select columns
# Every select() below returns a new DataFrame; show() prints its rows.
print("1. Select single column (string):")
# A column can be referenced by its name as a plain string.
df.select("name").show()

print("\n2. Select multiple columns (strings):")
# Several names can be passed as separate arguments.
df.select("name", "age", "salary").show()

print("\n3. Select using column objects:")
# Attribute access (df.name) yields a Column object. It only works when the
# column name is a valid Python identifier that does not clash with an
# existing DataFrame attribute or method.
df.select(df.name, df.age).show()

print("\n4. Select using column indexing (recommended):")
# Bracket indexing also yields a Column object and works for any column name.
df.select(df["name"], df["age"]).show()

print("\n5. Select all columns:")
# "*" expands to every column of df.
df.select("*").show()


In [None]:
# Column expressions and calculations
# Each derived column is built as a named Column expression first and then
# handed to select(); nothing is computed until show() runs.
name_col = df["name"]
age_col = df["age"]
salary_col = df["salary"]

print("6. Mathematical operations:")
bonus_col = (salary_col * 0.1).alias("bonus")
df.select(name_col, salary_col, bonus_col).show()

print("\n7. String operations:")
name_upper_col = F.upper(name_col).alias("name_upper")
df.select(name_col, name_upper_col).show()

print("\n8. Multiple expressions:")
age_in_5_years_col = (age_col + 5).alias("age_in_5_years")
monthly_salary_col = (salary_col / 12).alias("monthly_salary")
df.select(name_col, age_col, age_in_5_years_col, monthly_salary_col).show()

print("\n9. Conditional expressions:")
# when(...).otherwise(...) acts as an if/else evaluated per row
seniority_col = F.when(age_col >= 30, "Senior").otherwise("Junior").alias("seniority")
df.select(name_col, age_col, seniority_col).show()


In [None]:
# Basic filtering
# Each predicate is a boolean Column expression; filter() keeps the rows
# for which it evaluates to true.
print("1. Filter by age:")
older_than_30 = df["age"] > 30
df.filter(older_than_30).show()

print("\n2. Filter by job title:")
is_engineer = df["job_title"] == "Engineer"
df.filter(is_engineer).show()

print("\n3. Multiple conditions (AND):")
# Column predicates are combined with & (not the Python keyword `and`)
older_than_28 = df["age"] > 28
earns_over_75k = df["salary"] > 75000
df.filter(older_than_28 & earns_over_75k).show()

print("\n4. Multiple conditions (OR):")
# Likewise | is used instead of the Python keyword `or`
is_manager = df["job_title"] == "Manager"
earns_over_85k = df["salary"] > 85000
df.filter(is_manager | earns_over_85k).show()

print("\n5. String contains:")
# Substring match; it is case-sensitive, so a leading capital "A" does not count
name_has_a = df["name"].contains("a")
df.filter(name_has_a).show()

print("\n6. IN operator:")
wanted_titles = ["Engineer", "Manager"]
has_wanted_title = df["job_title"].isin(wanted_titles)
df.filter(has_wanted_title).show()


In [None]:
# Transformations (lazy - don't execute immediately)
# filter/select/orderBy only describe work to be done; each returns a new
# DataFrame and no Spark job runs until an action is called on it.
print("=== TRANSFORMATIONS (Lazy) ===")
print("These operations don't execute until an action is called")

# Create a series of transformations
filtered_df = df.filter(df["age"] > 25)  # keep rows with age above 25
selected_df = filtered_df.select("name", "age", "salary")  # keep three columns
sorted_df = selected_df.orderBy("salary", ascending=False)  # highest salary first

print("Transformations created, but not executed yet...")
print("Type of result:", type(sorted_df))

# Actions (eager - trigger execution)
# Nothing is cached here, so every action below re-runs the
# filter -> select -> orderBy pipeline from the start.
print("\n=== ACTIONS (Eager) ===")
print("These operations trigger execution of all transformations")

print("\n1. show() - Display data:")
sorted_df.show()

print("\n2. count() - Count rows:")
print(f"Number of rows: {sorted_df.count()}")

print("\n3. collect() - Collect all data to driver:")
# collect() returns every row to the driver as a Python list of Row objects;
# fine for this tiny sample, but it can exhaust driver memory on large data.
collected_data = sorted_df.collect()
print(f"Collected data type: {type(collected_data)}")
# Indexing [0] assumes the result is non-empty (true for the sample data).
print(f"First row: {collected_data[0]}")

print("\n4. first() - Get first row:")
# first() returns a single Row (or None when the DataFrame is empty).
first_row = sorted_df.first()
print(f"First row: {first_row}")

print("\n5. take(n) - Take first n rows:")
# take(n) returns a list with at most n Row objects.
first_two = sorted_df.take(2)
print(f"First two rows: {first_two}")


In [None]:
# String functions
# The output columns are collected in a list and unpacked into select().
print("=== STRING FUNCTIONS ===")
name_col = df["name"]
string_columns = [
    name_col,
    F.upper(name_col).alias("name_upper"),
    F.lower(name_col).alias("name_lower"),
    F.length(name_col).alias("name_length"),
    # substring positions are 1-based: start at character 1, take 3 characters
    F.substring(name_col, 1, 3).alias("first_3_chars"),
]
df.select(*string_columns).show()

# Mathematical functions
print("\n=== MATHEMATICAL FUNCTIONS ===")
age_col = df["age"]
salary_col = df["salary"]
math_columns = [
    name_col,
    salary_col,
    F.round(salary_col / 12, 2).alias("monthly_salary"),  # rounded to 2 decimals
    F.sqrt(age_col).alias("sqrt_age"),
    F.abs(age_col - 30).alias("age_diff_from_30"),
]
df.select(*math_columns).show()

# Date functions
# hire_date holds "yyyy-MM-dd" strings, so it is parsed into a real date
# column first; withColumn() with an existing name replaces that column.
print("\n=== DATE FUNCTIONS ===")
df_with_date = df.withColumn("hire_date", F.to_date(df["hire_date"], "yyyy-MM-dd"))

# Every column is referenced through df_with_date, the DataFrame being
# selected from, rather than mixing in references to its parent df.
df_with_date.select(
    df_with_date["name"],
    df_with_date["hire_date"],
    F.year(df_with_date["hire_date"]).alias("hire_year"),
    F.month(df_with_date["hire_date"]).alias("hire_month"),
    # Depends on today's date, so this column changes from run to run
    F.datediff(F.current_date(), df_with_date["hire_date"]).alias("days_since_hire")
).show()


In [None]:
# Grouping and aggregation
print("=== GROUPING AND AGGREGATION ===")
print("1. Group by job title and calculate statistics:")
# One output row per job_title, with one column per aggregate below
job_title_stats = [
    F.count("*").alias("count"),
    F.avg("salary").alias("avg_salary"),
    F.min("age").alias("min_age"),
    F.max("age").alias("max_age"),
]
df.groupBy("job_title").agg(*job_title_stats).show()

print("\n2. Multiple grouping columns:")
# Derive an age bucket first, then group on (job_title, age_group)
age_group_col = F.when(df["age"] < 30, "Young").otherwise("Experienced")
(
    df.withColumn("age_group", age_group_col)
    .groupBy("job_title", "age_group")
    .agg(F.count("*").alias("count"), F.avg("salary").alias("avg_salary"))
    .show()
)

# Window functions
# A window function computes a value for each row from a set of related rows
# (its window) without collapsing the rows the way groupBy does.
print("\n=== WINDOW FUNCTIONS ===")

# Define window specifications
# window_spec: rows are partitioned by job_title and ordered by ascending salary.
window_spec = Window.partitionBy("job_title").orderBy("salary")
# window_all: no partitionBy, so the whole DataFrame forms a single window
# ordered by ascending salary. Spark has to move every row into one partition
# to evaluate it - fine for this six-row sample, but avoid it on large data.
window_all = Window.orderBy("salary")

df.select(
    df["name"],
    df["job_title"],
    df["salary"],
    # Position within the job title; 1 is the lowest salary because the order is ascending
    F.row_number().over(window_spec).alias("rank_in_job"),
    # Rank across all rows; again 1 is the lowest salary
    F.rank().over(window_all).alias("overall_rank"),
    # Salary of the preceding row in window_all order (null for the first row)
    F.lag(df["salary"], 1).over(window_all).alias("prev_salary")
).show()


In [None]:
# Practice exercises: build a small sales DataFrame to work with
sales_columns = ["product_name", "category", "price", "quantity"]
sales_data = [
    ("Product A", "Electronics", 1500, 10),
    ("Product B", "Clothing", 800, 25),
    ("Product C", "Electronics", 2000, 5),
    ("Product D", "Books", 300, 50),
    ("Product E", "Clothing", 1200, 15),
    ("Product F", "Electronics", 1800, 8),
]
sales_df = spark.createDataFrame(sales_data, schema=sales_columns)
sales_df.show()

# TODO: Complete the following exercises using sales_df.
# Each prompt is printed; write your solution directly below it.

print("Exercise 1: Select product_name and calculate total_value (price * quantity)")
# Your code here

print("\nExercise 2: Filter products with price > 1000")
# Your code here

print("\nExercise 3: Add a price_category column: 'Expensive' if price > 1500, else 'Affordable'")
# Your code here

print("\nExercise 4: Group by category and calculate average price and total quantity")
# Your code here

print("\nExercise 5: Find the most expensive product in each category")
# Your code here
