# Lab 2 - Simple PySpark Programs #

In [14]:
import time
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
# Create (or reuse) the Spark session for this notebook under the app name "Lab2".
spark = (
    SparkSession.builder
    .appName("Lab2")
    .getOrCreate()
)



#### Q1) Implement a PySpark script that applies transformations such as `filter` and `withColumn` to a DataFrame.


In [15]:
# Q1: demonstrate the `filter` and `withColumn` DataFrame transformations.
spark = SparkSession.builder.appName("Transformations").getOrCreate()

# Sample rows: (id, name, age, city).
data = [
    (1, "John", 25, "New York"),
    (2, "Alice", 30, "San Francisco"),
    (3, "Bob", 20, "New York"),
    (4, "Mary", 35, "Chicago"),
]
columns = ["id", "name", "age", "city"]

# Passing the column names as the schema names the columns at creation time.
df = spark.createDataFrame(data, columns)
print("Original DataFrame:")
df.show()

# `col` and `when` are reached through the `F` alias: the notebook only imports
# `pyspark.sql.functions as F`, so the bare names raise NameError on a fresh kernel.
filtered_df = df.filter(F.col("age") > 25)  # keep rows with age strictly above 25
print("\nDataFrame after applying filter transformation:")
filtered_df.show()

# Derive a new "is_adult" column: "Yes" when age >= 18, otherwise "No".
df_with_column = df.withColumn(
    "is_adult",
    F.when(F.col("age") >= 18, "Yes").otherwise("No"),
)
print("\nDataFrame after applying withColumn transformation:")
df_with_column.show()

spark.stop()

Original DataFrame:
+---+-----+---+-------------+
| id| name|age|         city|
+---+-----+---+-------------+
|  1| John| 25|     New York|
|  2|Alice| 30|San Francisco|
|  3|  Bob| 20|     New York|
|  4| Mary| 35|      Chicago|
+---+-----+---+-------------+


DataFrame after applying filter transformation:
+---+-----+---+-------------+
| id| name|age|         city|
+---+-----+---+-------------+
|  2|Alice| 30|San Francisco|
|  4| Mary| 35|      Chicago|
+---+-----+---+-------------+


DataFrame after applying withColumn transformation:
+---+-----+---+-------------+--------+
| id| name|age|         city|is_adult|
+---+-----+---+-------------+--------+
|  1| John| 25|     New York|     Yes|
|  2|Alice| 30|San Francisco|     Yes|
|  3|  Bob| 20|     New York|     Yes|
|  4| Mary| 35|      Chicago|     Yes|
+---+-----+---+-------------+--------+



#### Q2) Write a PySpark script that performs actions like count and show on a DataFrame.

In [16]:
# Q2: run the `count` and `show` actions on a small DataFrame.
spark = (
    SparkSession.builder
    .appName("DataFrame Actions")
    .getOrCreate()
)

# Sample rows: (Name, Age, City).
data = [
    ("John", 25, "New York"),
    ("Mary", 31, "San Francisco"),
    ("David", 42, "New York"),
]
columns = ["Name", "Age", "City"]
df = spark.createDataFrame(data).toDF(*columns)

# `count` returns the number of rows in the DataFrame.
count = df.count()
print("Count: ", count)

# `show` prints the rows as a text table.
df.show()

spark.stop()



Count:  3
+-----+---+-------------+
| Name|Age|         City|
+-----+---+-------------+
| John| 25|     New York|
| Mary| 31|San Francisco|
|David| 42|     New York|
+-----+---+-------------+



#### Q3) Demonstrate how to perform basic aggregations (e.g., sum, average) on a PySpark DataFrame.

In [17]:
# Q3: whole-table and grouped aggregations (sum, average) over the Salary column.
spark = (
    SparkSession.builder
    .appName("DataFrame Aggregations")
    .getOrCreate()
)

# Sample rows: (Name, Age, Salary). John and Mary each appear twice so that
# the grouped aggregates below combine more than one row.
data = [
    ("John", 25, 10000.0),
    ("John", 25, 20000.0),
    ("Mary", 31, 30000.0),
    ("Mary", 31, 40000.0),
    ("David", 42, 50000.0),
]
columns = ["Name", "Age", "Salary"]
df = spark.createDataFrame(data).toDF(*columns)

# Whole-table aggregates: `agg` without a grouping yields one row with one
# column, so `collect()[0][0]` extracts the scalar.
sum_salary = df.agg(F.sum("Salary")).collect()[0][0]
print("Sum of Salaries: ", sum_salary)

avg_salary = df.agg(F.avg("Salary")).collect()[0][0]
print("Average of Salaries: ", avg_salary)

# Grouped aggregates: the same two expressions are applied per Name, then per Age.
total_salary = F.sum("Salary").alias("Total Salary")
average_salary = F.avg("Salary").alias("Average Salary")
for group_key in ("Name", "Age"):
    df.groupBy(group_key).agg(total_salary, average_salary).show()

spark.stop()



Sum of Salaries:  150000.0
Average of Salaries:  30000.0
+-----+------------+--------------+
| Name|Total Salary|Average Salary|
+-----+------------+--------------+
| John|     30000.0|       15000.0|
| Mary|     70000.0|       35000.0|
|David|     50000.0|       50000.0|
+-----+------------+--------------+

+---+------------+--------------+
|Age|Total Salary|Average Salary|
+---+------------+--------------+
| 25|     30000.0|       15000.0|
| 31|     70000.0|       35000.0|
| 42|     50000.0|       50000.0|
+---+------------+--------------+



#### Q4) Show how to write a PySpark DataFrame to a CSV file.

In [18]:
from pyspark.sql import SparkSession
# Q4: write a DataFrame out in CSV format.
spark = SparkSession.builder.appName("DataFrame to CSV").getOrCreate()

# Sample rows: (Name, Age, City).
data = [
    ("John", 25, "New York"),
    ("Mary", 31, "San Francisco"),
    ("David", 42, "New York"),
]
columns = ["Name", "Age", "City"]
df = spark.createDataFrame(data).toDF(*columns)

# Overwrite any previous output: the writer's default save mode raises an error
# when the target path already exists, which broke every re-run of this cell.
# Note that Spark creates "people.csv" as a directory of part files, not a single file.
df.write.mode("overwrite").csv("people.csv", header=True)

spark.stop()



#### Q5) Implement a word count program in PySpark.

In [20]:
from pyspark.sql import SparkSession
# Q5: RDD-based word count over the lines of input.txt.
spark = (
    SparkSession.builder
    .appName("Word Count")
    .getOrCreate()
)

# Pipeline: lines -> space-separated tokens -> lowercase -> drop empties ->
# (token, 1) pairs -> per-token sum.
# Tokens are split on single spaces only, so punctuation stays attached to a
# word (e.g. "hadoop." is counted separately from "hadoop").
word_counts = (
    spark.sparkContext.textFile("input.txt")
    .flatMap(lambda text_line: text_line.split(" "))
    .map(lambda token: token.lower())
    .filter(lambda token: token != "")  # consecutive spaces produce empty tokens
    .map(lambda token: (token, 1))
    .reduceByKey(lambda left, right: left + right)
)

# Bring the (word, count) pairs back to the driver and print one per line.
for word, count in word_counts.collect():
    print(f"{word}: {count}")

spark.stop()



name: 1
is: 2
arnav: 1
i: 2
am: 2
saviour: 1
protector: 1
of: 1
this: 1
hadoop: 2
are: 1
hadoop.: 1
my: 1
karnik: 1
and: 2
the: 1
world.: 1
you: 1
everyone: 1
