In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import functions as F
 
'''Q1. Implement a PySpark script that applies transformations like filter and withColumn on a DataFrame.'''

# Obtain a Spark session named for this exercise.
spark = (
    SparkSession.builder
    .appName("Transformations")
    .getOrCreate()
)

# Sample rows as (name, age, gender) tuples, plus the matching column labels.
data = [
    ("Alice", 34, "F"),
    ("Bob", 45, "M"),
    ("Catherine", 29, "F"),
    ("David", 50, "M"),
]
columns = ["Name", "Age", "Gender"]

df = spark.createDataFrame(data, schema=columns)

print("Original DataFrame:")
df.show()

# Transformation 1 (filter): keep only the rows whose Age is above 30.
filtered_df = df.where(col("Age") > 30)

# Transformation 2 (withColumn): add a "Senior" flag that is "Yes" when
# Age is above 40 and "No" for every other remaining row.
transformed_df = filtered_df.withColumn(
    "Senior",
    when(col("Age") > 40, "Yes").otherwise("No"),
)

print("Transformed DataFrame:")
transformed_df.show()

# Release the session now that the exercise is finished.
spark.stop()

24/08/31 14:06:14 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


Original DataFrame:
+---------+---+------+
|     Name|Age|Gender|
+---------+---+------+
|    Alice| 34|     F|
|      Bob| 45|     M|
|Catherine| 29|     F|
|    David| 50|     M|
+---------+---+------+

Transformed DataFrame:
+-----+---+------+------+
| Name|Age|Gender|Senior|
+-----+---+------+------+
|Alice| 34|     F|    No|
|  Bob| 45|     M|   Yes|
|David| 50|     M|   Yes|
+-----+---+------+------+



In [5]:
'''Q2.Write a PySpark script that performs actions like count and show on a DataFrame.'''

# Obtain a Spark session named for this exercise.
spark = SparkSession.builder \
    .appName("ActionsExample") \
    .getOrCreate()

# The sample rows are defined in this cell so it runs on a fresh kernel and
# cannot pick up a differently shaped `data` list bound by some other cell
# (which would silently put the wrong values under the "Gender" label).
data = [
    ("Alice", 34, "F"),
    ("Bob", 45, "M"),
    ("Catherine", 29, "F"),
    ("David", 50, "M")
]

columns = ["Name", "Age", "Gender"]
df = spark.createDataFrame(data, schema=columns)

# Action: show() prints the rows.
print("DataFrame:")
df.show()

# Action: count() returns the number of rows to the driver.
row_count = df.count()
print(f"Number of rows in the DataFrame: {row_count}")

# Action: show() again, this time with truncation of long values disabled.
print("DataFrame with truncate option:")
df.show(truncate=False)

# Release the session now that the exercise is finished.
spark.stop()

DataFrame:
+---------+---+------+
|     Name|Age|Gender|
+---------+---+------+
|    Alice| 34|     F|
|      Bob| 45|     M|
|Catherine| 29|     F|
|    David| 50|     M|
+---------+---+------+

Number of rows in the DataFrame: 4
DataFrame with truncate option:
+---------+---+------+
|Name     |Age|Gender|
+---------+---+------+
|Alice    |34 |F     |
|Bob      |45 |M     |
|Catherine|29 |F     |
|David    |50 |M     |
+---------+---+------+



In [8]:
#Q3: aggregate a numeric column (total and average salary).
spark = SparkSession.builder \
    .appName("AggregationsExample") \
    .getOrCreate()

# Sample rows as (name, age, salary) tuples.
data = [
    ("Alice", 34, 147),
    ("Bob", 45, 789),
    ("Catherine", 29, 456),
    ("David", 50, 123)
]

columns = ["Name", "Age", "Salary"]
df = spark.createDataFrame(data, schema=columns)


print("DataFrame:")
df.show()

# Compute both aggregates in a single Spark job instead of one job per metric.
# The functions are referenced through the `F` module alias so that this cell
# does not depend on the wildcard import shadowing Python's builtin `sum`.
salary_stats = df.agg(
    F.sum("Salary").alias("TotalSalary"),
    F.avg("Salary").alias("AverageSalary"),
).first()

total_salary = salary_stats["TotalSalary"]
print(f"Total Salary: {total_salary}")

average_salary = salary_stats["AverageSalary"]
print(f"Average Salary: {average_salary}")

# Release the session now that the exercise is finished.
spark.stop()

DataFrame:
+---------+---+------+
|     Name|Age|Salary|
+---------+---+------+
|    Alice| 34|   147|
|      Bob| 45|   789|
|Catherine| 29|   456|
|    David| 50|   123|
+---------+---+------+

Total Salary: 1515
Average Salary: 378.75


In [9]:
'''Q4. Show how to write a PySpark DataFrame to a CSV file.'''

# Obtain a Spark session named for this exercise.
spark = (
    SparkSession.builder
    .appName("WriteToCSVExample")
    .getOrCreate()
)

# Three (name, age) rows and their column labels.
data = [
    ("Alice", 34),
    ("Bob", 45),
    ("Cathy", 29),
]
columns = ["Name", "Age"]

df = spark.createDataFrame(data, schema=columns)

# Location the CSV output is written to.
output_path = "/home/chris/220962344/Week3/test"

df.show()

# "overwrite" replaces whatever already exists at output_path.
# No header option is set here, so the writer's default applies.
df.write.csv(output_path, mode="overwrite")

# Release the session now that the exercise is finished.
spark.stop()
 

+-----+---+
| Name|Age|
+-----+---+
|Alice| 34|
|  Bob| 45|
|Cathy| 29|
+-----+---+



In [10]:
#Q5
 
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark import SparkContext
from pyspark import SparkConf
 
#Using DataFrame
spark = SparkSession.builder.getOrCreate()

# Single definition of the input file, shared by both implementations below.
text_path = "/home/chris/220962344/Week3/text.txt"

lines_df = spark.read.text(text_path)

# Build the word-count DataFrame first and display it afterwards: show()
# returns None, so chaining it onto the assignment would leave `words` empty.
# Lines are split on runs of whitespace and empty tokens are dropped, so blank
# lines and repeated spaces are not counted as a word.
words = (
    lines_df
    .select(F.explode(F.split(F.col("value"), r"\s+")).alias("word"))
    .where(F.col("word") != "")
    .groupBy("word")
    .count()
    .sort("count", ascending=False)
)
words.show()

#Using RDDs
# Reuse the context owned by the session created above; calling
# SparkContext.getOrCreate with a new SparkConf at this point would hand back
# this same context and ignore the supplied configuration.
sc = spark.sparkContext
lines = sc.textFile(text_path)

# str.split() with no argument splits on any whitespace and yields no empty
# strings, matching the DataFrame version above.
counts = (
    lines.flatMap(lambda line: line.split())
    .map(lambda word: (word, 1))
    .reduceByKey(lambda x, y: x + y)
)

output = counts.collect()

# The loop variable is deliberately not called `count`, which would rebind the
# `count` function brought in by a wildcard import of pyspark.sql.functions.
for word, occurrences in output:
    print("%s,%i" % (word, occurrences))

+----+-----+
|word|count|
+----+-----+
|  is|    2|
|  it|    2|
|what|    1|
+----+-----+

is,2
it,2
what,1
