In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Start (or reuse) the Spark session backing this transformations demo.
spark = SparkSession.builder.appName("DataFrameTransformations").getOrCreate()

# Tiny in-memory sample: one (Name, Age) tuple per person.
data = [
    ("Alice", 28),
    ("Bob", 35),
    ("Charlie", 42),
    ("David", 22),
]
df = spark.createDataFrame(data, schema=["Name", "Age"])

# Transformation 1: keep only the rows whose Age is strictly greater than 30.
# `where` is an alias of `filter`.
over_thirty = df.where(col("Age") > 30)
over_thirty.show()

# Transformation 2: append a derived column holding Age + 10.
# `df` itself is left unchanged; withColumn returns a new DataFrame.
with_age_plus_ten = df.withColumn("AgePlusTen", col("Age") + 10)
with_age_plus_ten.show()

# Release the session's resources once the demo is done.
spark.stop()




+-------+---+
|   Name|Age|
+-------+---+
|    Bob| 35|
|Charlie| 42|
+-------+---+

+-------+---+----------+
|   Name|Age|AgePlusTen|
+-------+---+----------+
|  Alice| 28|        38|
|    Bob| 35|        45|
|Charlie| 42|        52|
|  David| 22|        32|
+-------+---+----------+



In [2]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session backing this actions demo.
spark = SparkSession.builder.appName("DataFrameActions").getOrCreate()

# Tiny in-memory sample: one (Name, Age) tuple per person.
data = [
    ("Alice", 28),
    ("Bob", 35),
    ("Charlie", 42),
    ("David", 22),
]
df = spark.createDataFrame(data, schema=["Name", "Age"])

# Action 1: print the DataFrame contents as a text table.
df.show()

# Action 2: count the rows and report the total.
row_count = df.count()
print(f"Number of Rows: {row_count}")

# Release the session's resources once the demo is done.
spark.stop()




+-------+---+
|   Name|Age|
+-------+---+
|  Alice| 28|
|    Bob| 35|
|Charlie| 42|
|  David| 22|
+-------+---+

Number of Rows: 4


In [3]:
from pyspark.sql import SparkSession
# Import Spark's `sum` under an alias: a bare `import sum` would rebind Python's
# builtin `sum` in the kernel namespace and break plain-Python summation in
# every cell executed afterwards.
from pyspark.sql.functions import avg, col, sum as spark_sum

# Start (or reuse) the Spark session backing this aggregations demo.
spark = SparkSession.builder.appName("DataFrameAggregations").getOrCreate()

# Tiny in-memory sample: one (Name, Age) tuple per person.
data = [("Alice", 28), ("Bob", 35), ("Charlie", 42), ("David", 22)]
df = spark.createDataFrame(data, ["Name", "Age"])

df.show()

# A global aggregation (no groupBy) yields a single row; unpack its two
# columns in the order the aggregate expressions were listed.
total_age, average_age = df.agg(spark_sum("Age"), avg("Age")).first()

print(f"Total Age: {total_age}")
print(f"Average Age: {average_age}")

# Release the session's resources once the demo is done.
spark.stop()




+-------+---+
|   Name|Age|
+-------+---+
|  Alice| 28|
|    Bob| 35|
|Charlie| 42|
|  David| 22|
+-------+---+

Total Age: 127
Average Age: 31.75


In [5]:

import os

from pyspark.sql import SparkSession

# Destination for the CSV output. The default is the path this notebook has
# always used; set SPARK_CSV_OUTPUT_DIR to redirect the write on machines where
# that user-specific absolute path does not exist.
OUTPUT_DIR = os.environ.get("SPARK_CSV_OUTPUT_DIR", "/home/lplab/temp")

# Start (or reuse) the Spark session backing this CSV-export demo.
spark = SparkSession.builder.appName("WriteToCSV").getOrCreate()

# Tiny in-memory sample: one (Name, Age) tuple per person.
data = [("Alice", 28), ("Bob", 35), ("Charlie", 42), ("David", 22)]
df = spark.createDataFrame(data, ["Name", "Age"])

# coalesce(1) collapses the data to a single partition before writing.
# Spark treats OUTPUT_DIR as a directory and writes part file(s) inside it.
# NOTE: mode="overwrite" replaces any data that already exists at OUTPUT_DIR,
# so do not point it at a directory holding files you want to keep.
df.coalesce(1).write.csv(OUTPUT_DIR, header=True, mode="overwrite")

# Release the session's resources once the export is done.
spark.stop()




In [8]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split, count

# Start (or reuse) the Spark session backing this word-count demo.
spark = SparkSession.builder.appName("WordCount").getOrCreate()

text_data = ["Hello world", "Hello PySpark", "World of PySpark"]

# Stage 1: one row per input string, in a single "text" column.
lines_df = spark.createDataFrame([(line,) for line in text_data], ["text"])

# Stage 2: split each line on a single space and emit one row per word.
words_df = lines_df.select(explode(split("text", " ")).alias("word"))

# Stage 3: count occurrences of each distinct word. Matching is case-sensitive,
# so "world" and "World" are tallied separately.
word_counts = words_df.groupBy("word").agg(count("*").alias("count"))

print("Word Count:")
word_counts.show()

# Release the session's resources once the demo is done.
spark.stop()



Word Count:
+-------+-----+
|   word|count|
+-------+-----+
|  Hello|    2|
|  world|    1|
|PySpark|    2|
|  World|    1|
|     of|    1|
+-------+-----+

