In [30]:
from pyspark.sql import SparkSession 
from functools import reduce
from pyspark.sql.functions import col, when, sum, avg, explode, split

In [15]:
# Obtain the notebook's single SparkSession: reuse the active one if it
# exists, otherwise build a new one with default settings. Every later cell
# reads and writes data through `spark_session`.
spark_session = SparkSession.builder.getOrCreate() 

# Q1
Implement a PySpark script that applies transformations like filter and withColumn on a
DataFrame.

In [17]:
# Load the CSV: the first row supplies the column names (header=True) and
# Spark infers each column's type from the data (inferSchema=True).
df = spark_session.read.csv(
    "data.csv",
    sep=",",
    inferSchema=True,
    header=True,
)
print("Original Dataframe:")
df.show()

# Transformation 1 (filter): keep only rows whose age is strictly above 30.
filtered_df = df.filter(col("age") > 30)
print("Filtered DataFrame (Age > 30):")
filtered_df.show()

# Transformation 2 (withColumn): add a label derived from age. Rows with
# age > 40 are tagged "Senior"; every other row falls through to "Adult".
# The column is referenced by its actual lowercase name `age`, matching the
# filter above, so the lookup does not depend on Spark's case-insensitive
# column resolution being enabled.
df_with_age_category = df.withColumn(
    "Age Category",
    when(col("age") > 40, "Senior").otherwise("Adult"),
)
print("DataFrame with Age Category:")
df_with_age_category.show()

Original Dataframe:
+---+-------------+---+-----------+------+
| id|         name|age| department|salary|
+---+-------------+---+-----------+------+
|  1|     John Doe| 29|Engineering| 75000|
|  2|   Jane Smith| 34|  Marketing| 68000|
|  3|Emily Johnson| 45|    Finance| 90000|
|  4|Michael Brown| 28|Engineering| 72000|
|  5|  Linda Davis| 39|         HR| 71000|
+---+-------------+---+-----------+------+

Filtered DataFrame (Age > 30):
+---+-------------+---+----------+------+
| id|         name|age|department|salary|
+---+-------------+---+----------+------+
|  2|   Jane Smith| 34| Marketing| 68000|
|  3|Emily Johnson| 45|   Finance| 90000|
|  5|  Linda Davis| 39|        HR| 71000|
+---+-------------+---+----------+------+

DataFrame with Age Category:
+---+-------------+---+-----------+------+------------+
| id|         name|age| department|salary|Age Category|
+---+-------------+---+-----------+------+------------+
|  1|     John Doe| 29|Engineering| 75000|       Adult|
|  2|   Jane 

# Q2
Write a PySpark script that performs actions like count and show on a DataFrame.

In [19]:
# Action 1: show() prints the DataFrame's rows to the cell output.
print("DataFrame:")
df.show()

# Action 2: count() returns the number of rows in the DataFrame.
row_count = df.count()
print(f"Number of rows in DataFrame: {row_count}")


DataFrame:
+---+-------------+---+-----------+------+
| id|         name|age| department|salary|
+---+-------------+---+-----------+------+
|  1|     John Doe| 29|Engineering| 75000|
|  2|   Jane Smith| 34|  Marketing| 68000|
|  3|Emily Johnson| 45|    Finance| 90000|
|  4|Michael Brown| 28|Engineering| 72000|
|  5|  Linda Davis| 39|         HR| 71000|
+---+-------------+---+-----------+------+

Number of rows in DataFrame: 5


# Q3
Demonstrate how to perform basic aggregations (e.g., sum, average) on a PySpark
DataFrame.

In [24]:
# Build the two aggregate expressions over the salary column first, then
# apply them together in one whole-DataFrame aggregation (no grouping key),
# which yields a single result row.
# NOTE: `sum` here is the pyspark.sql.functions import, not Python's builtin.
salary = col("salary")
total_salary = sum(salary).alias("Total_Salary")
average_salary = avg(salary).alias("Average_Salary")

aggregations = df.agg(total_salary, average_salary)

# Display the one-row result.
print("Aggregation Results:")
aggregations.show()

Aggregation Results:
+------------+--------------+
|Total_Salary|Average_Salary|
+------------+--------------+
|      376000|       75200.0|
+------------+--------------+



# Q4
Show how to write a PySpark DataFrame to a CSV file.

In [26]:
# Destination for the CSV export.
# NOTE(review): Spark's writer normally treats this path as an output
# directory of part files rather than a single .csv file -- confirm this is
# what downstream consumers expect.
output_path = "output_data.csv"

# Write with a header row, replacing any existing output at the same path.
df.write.csv(output_path, mode="overwrite", header=True)

print(f"DataFrame written to CSV file at: {output_path}")

DataFrame written to CSV file at: output_data.csv


# Q5
Implement a word-count program in PySpark.

In [40]:
input_path = 'file.txt'

# Read the text file into its own variable. The lines are exposed through the
# `value` column. Using a dedicated name (instead of reassigning `df`) keeps
# the CSV DataFrame used by the earlier cells intact, so those cells still
# behave the same if they are re-run after this one.
lines_df = spark_session.read.text(input_path)

# Tokenise each line into one row per word:
#   * split on runs of whitespace (regex \s+) so repeated spaces and tabs do
#     not produce empty tokens in the middle of a line;
#   * drop the empty token that remains for blank lines or leading whitespace.
# Tokens are otherwise counted exactly as written: case and attached
# punctuation are preserved.
words_df = (
    lines_df
    .withColumn("word", explode(split(col("value"), r"\s+")))
    .filter(col("word") != "")
)

# Number of occurrences of each distinct token.
word_count_df = words_df.groupBy("word").count()

print("Word Count:")
word_count_df.show()

Word Count:
+-------------+-----+
|         word|count|
+-------------+-----+
|        eros.|    1|
|    porttitor|    3|
|    eleifend.|    1|
|        curae|    1|
|         odio|    4|
|     volutpat|    2|
|     sociosqu|    1|
|     interdum|    2|
|      pretium|    2|
|    himenaeos|    1|
|         odor|    1|
|          Sem|    1|
|    hendrerit|    3|
|     sagittis|    4|
|pellentesque.|    1|
|       curae.|    1|
|        netus|    2|
|       morbi.|    1|
|        lorem|    1|
|          nam|    1|
+-------------+-----+
only showing top 20 rows



In [42]:
# Shut down the Spark session as the notebook's final step; cells that use
# `spark_session` need a fresh session before they can be run again.
spark_session.stop()