In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [None]:
# Reuse the active Spark session if one exists, otherwise start one named 'test'.
spark = (
    SparkSession.builder
    .appName('test')
    .getOrCreate()
)

In [None]:
# Sample rows: one (name, age, city) tuple per person.
data = [
    ("Deepesh", 25, "Kanpur"),
    ("Palak", 26, "Delhi"),
    ("Ananya", 24, "Noida"),
    ("Aditya", 23, "Bengaluru"),
]
print(type(data))

<class 'list'>


In [None]:
# Column names, in the same order as the fields of each tuple in `data`.
columns = [
    "Name",
    "Age",
    "City",
]
print(type(columns))

<class 'list'>


In [None]:
# Build the base DataFrame from the Python rows; the list of column names
# is used as the schema.
df = spark.createDataFrame(data, schema=columns)
df.show()

+-------+---+---------+
|   Name|Age|     City|
+-------+---+---------+
|Deepesh| 25|   Kanpur|
|  Palak| 26|    Delhi|
| Ananya| 24|    Noida|
| Aditya| 23|Bengaluru|
+-------+---+---------+



In [None]:
# Project only the Name and Age columns.
df.select(F.col("Name"), F.col("Age")).show()

+-------+---+
|   Name|Age|
+-------+---+
|Deepesh| 25|
|  Palak| 26|
| Ananya| 24|
| Aditya| 23|
+-------+---+



In [None]:
# Keep only the rows whose Age is strictly greater than 24.
df.where(F.col("Age") > 24).show()

+-------+---+------+
|   Name|Age|  City|
+-------+---+------+
|Deepesh| 25|Kanpur|
|  Palak| 26| Delhi|
+-------+---+------+



In [None]:
# Derive a new column holding each person's age five years from now.
df_new = df.withColumn(
    "Age_After_5years",
    F.col("Age") + 5,
)
df_new.show()

+-------+---+---------+----------------+
|   Name|Age|     City|Age_After_5years|
+-------+---+---------+----------------+
|Deepesh| 25|   Kanpur|              30|
|  Palak| 26|    Delhi|              31|
| Ananya| 24|    Noida|              29|
| Aditya| 23|Bengaluru|              28|
+-------+---+---------+----------------+



In [None]:
# Rename the City column to Location; the result is kept under a new name.
df_renamed = df.withColumnRenamed(existing="City", new="Location")
df_renamed.show()

+-------+---+---------+
|   Name|Age| Location|
+-------+---+---------+
|Deepesh| 25|   Kanpur|
|  Palak| 26|    Delhi|
| Ananya| 24|    Noida|
| Aditya| 23|Bengaluru|
+-------+---+---------+



In [None]:
# Remove the City column. Use the exact column name: drop() silently does
# nothing when the name does not match a column, so the lowercase "city"
# only worked because Spark resolves names case-insensitively by default.
df_dropped = df.drop("City")
df_dropped.show()

+-------+---+
|   Name|Age|
+-------+---+
|Deepesh| 25|
|  Palak| 26|
| Ananya| 24|
| Aditya| 23|
+-------+---+



In [None]:
# Show df again: the cells above stored their results in df_new, df_renamed
# and df_dropped, and df itself still has its original three columns.
df.show()

+-------+---+---------+
|   Name|Age|     City|
+-------+---+---------+
|Deepesh| 25|   Kanpur|
|  Palak| 26|    Delhi|
| Ananya| 24|    Noida|
| Aditya| 23|Bengaluru|
+-------+---+---------+



In [None]:
# Sort the rows from oldest to youngest.
df.sort(F.col("Age").desc()).show()

+-------+---+---------+
|   Name|Age|     City|
+-------+---+---------+
|  Palak| 26|    Delhi|
|Deepesh| 25|   Kanpur|
| Ananya| 24|    Noida|
| Aditya| 23|Bengaluru|
+-------+---+---------+



In [None]:
# Label each person "Senior" when Age > 24 and "Junior" otherwise.
# show() returns None, so it must not be chained onto the assignment:
# keep the DataFrame in df_cond and display it as a separate step.
df_cond = df.withColumn(
    'condition',
    F.when(df.Age > 24, "Senior").otherwise("Junior"),
)
df_cond.show()

+-------+---+---------+---------+
|   Name|Age|     City|condition|
+-------+---+---------+---------+
|Deepesh| 25|   Kanpur|   Senior|
|  Palak| 26|    Delhi|   Senior|
| Ananya| 24|    Noida|   Junior|
| Aditya| 23|Bengaluru|   Junior|
+-------+---+---------+---------+



In [None]:
# Show df again: the 'condition' column was added to a derived DataFrame,
# so df still has only Name, Age and City.
df.show()

+-------+---+---------+
|   Name|Age|     City|
+-------+---+---------+
|Deepesh| 25|   Kanpur|
|  Palak| 26|    Delhi|
| Ananya| 24|    Noida|
| Aditya| 23|Bengaluru|
+-------+---+---------+



In [None]:
# Basic statistics per column: count, mean, stddev, min and max.
# For the Name and City columns, mean and stddev come back as NULL.
df.describe().show()

+-------+------+------------------+---------+
|summary|  Name|               Age|     City|
+-------+------+------------------+---------+
|  count|     4|                 4|        4|
|   mean|  NULL|              24.5|     NULL|
| stddev|  NULL|1.2909944487358056|     NULL|
|    min|Aditya|                23|Bengaluru|
|    max| Palak|                26|    Noida|
+-------+------+------------------+---------+



In [None]:
# Same statistics as describe(), plus the 25%, 50% and 75% percentile rows.
df.summary().show()

+-------+------+------------------+---------+
|summary|  Name|               Age|     City|
+-------+------+------------------+---------+
|  count|     4|                 4|        4|
|   mean|  NULL|              24.5|     NULL|
| stddev|  NULL|1.2909944487358056|     NULL|
|    min|Aditya|                23|Bengaluru|
|    25%|  NULL|                23|     NULL|
|    50%|  NULL|                24|     NULL|
|    75%|  NULL|                25|     NULL|
|    max| Palak|                26|    Noida|
+-------+------+------------------+---------+



In [None]:
# Department/salary pairs used for the aggregation examples.
group_data = [
    ("IT", 40000),
    ("HR", 25000),
    ("IT", 45000),
    ("Finance", 30000),
]
df_group = spark.createDataFrame(group_data, schema=["Department", "Salary"])
df_group.show()

+----------+------+
|Department|Salary|
+----------+------+
|        IT| 40000|
|        HR| 25000|
|        IT| 45000|
|   Finance| 30000|
+----------+------+



In [None]:
# Per-department salary statistics: total, highest, lowest and mean.
(
    df_group
    .groupBy("Department")
    .agg(
        F.sum(F.col("Salary")).alias("Total_Salary"),
        F.max(F.col("Salary")).alias("Max_Salary"),
        F.min(F.col("Salary")).alias("Min_Salary"),
        F.avg(F.col("Salary")).alias("Average_Salary"),
    )
    .show()
)

+----------+------------+----------+----------+--------------+
|Department|Total_Salary|Max_Salary|Min_Salary|Average_Salary|
+----------+------------+----------+----------+--------------+
|        HR|       25000|     25000|     25000|       25000.0|
|        IT|       85000|     45000|     40000|       42500.0|
|   Finance|       30000|     30000|     30000|       30000.0|
+----------+------------+----------+----------+--------------+



In [None]:
# Remove rows that are identical across every column.
df_group.dropDuplicates().show()

+----------+------+
|Department|Salary|
+----------+------+
|        IT| 40000|
|        HR| 25000|
|        IT| 45000|
|   Finance| 30000|
+----------+------+



In [None]:
# Show the Name/Age/City DataFrame again before it is joined with the
# department data further down.
df.show()

+-------+---+---------+
|   Name|Age|     City|
+-------+---+---------+
|Deepesh| 25|   Kanpur|
|  Palak| 26|    Delhi|
| Ananya| 24|    Noida|
| Aditya| 23|Bengaluru|
+-------+---+---------+



In [36]:
# Employee -> department pairs.
# Stored under its own name instead of rebinding df_group: df_group is the
# Department/Salary DataFrame, and overwriting it here would break the
# aggregation cell if that cell were re-run afterwards.
dept_data = [
    ("Deepesh", "IT"),
    ("Palak", "HR"),
    ("Ananya", "IT"),
    ("Aditya", "Finance"),
]
df_emp_dept = spark.createDataFrame(dept_data, ["Employee", "Department"])
df_emp_dept.show()

+--------+----------+
|Employee|Department|
+--------+----------+
| Deepesh|        IT|
|   Palak|        HR|
|  Ananya|        IT|
|  Aditya|   Finance|
+--------+----------+



In [39]:
# Department lookup keyed by Name so it can be joined onto df.
dept_data = [
    ("Deepesh", "IT"),
    ("Palak", "HR"),
    ("Ananya", "IT"),
    ("Aditya", "Finance"),
]
df_dept = spark.createDataFrame(dept_data, schema=["Name", "Department"])

# Inner join on the shared Name column; only names present in both frames
# are kept.
joined = df.join(df_dept, on=["Name"], how="inner")
joined.show()

+-------+---+---------+----------+
|   Name|Age|     City|Department|
+-------+---+---------+----------+
| Aditya| 23|Bengaluru|   Finance|
| Ananya| 24|    Noida|        IT|
|Deepesh| 25|   Kanpur|        IT|
|  Palak| 26|    Delhi|        HR|
+-------+---+---------+----------+



In [40]:
# Small frame with missing ages to demonstrate null handling.
null_data = [
    ("Ravi", None),
    ("Priya", 30),
    ("Ankit", None),
]
df_null = spark.createDataFrame(null_data, schema=["Name", "Age"])

# Replace null ages with 25.
df_null.na.fill({"Age": 25}).show()
# Drop every row that contains a null.
df_null.dropna().show()

+-----+---+
| Name|Age|
+-----+---+
| Ravi| 25|
|Priya| 30|
|Ankit| 25|
+-----+---+

+-----+---+
| Name|Age|
+-----+---+
|Priya| 30|
+-----+---+



In [41]:
# repartition() redistributes the rows into the requested number of partitions.
df_repart = df.repartition(numPartitions=4)
print("Repartition to 4 partitions:")
print(df_repart.rdd.getNumPartitions())

# coalesce() reduces the number of partitions of the repartitioned frame.
df_coalesce = df_repart.coalesce(numPartitions=2)
print("Coalesce back to 2 partitions:")
print(df_coalesce.rdd.getNumPartitions())

Repartition to 4 partitions:
4
Coalesce back to 2 partitions:
2
