In [15]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField,StructType,IntegerType,StringType
from pyspark.sql.functions import avg,count,max

spark = SparkSession.builder.appName("ComplexDataFrameExample").getOrCreate()
data = [
    (1, 'Alice', ['HR', 'Recruitment']),
    (2, 'Bob', ['Engineering', 'Software']),
    (3, 'Catherine', ['HR', 'Training'])
]
df_complex = spark.createDataFrame(data)
df_complex.show()


+---+---------+--------------------+
| _1|       _2|                  _3|
+---+---------+--------------------+
|  1|    Alice|   [HR, Recruitment]|
|  2|      Bob|[Engineering, Sof...|
|  3|Catherine|      [HR, Training]|
+---+---------+--------------------+



In [13]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField,StructType,IntegerType,StringType

data = [
    (1, 'Alice', 'HR', 70000),
    (2, 'Bob', 'Engineering', 80000),
    (3, 'Catherine', 'HR', 75000),
    (4, 'David', 'Engineering', 95000),
    (5, 'Eva', 'Marketing', 60000)
]
schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("department", StringType(), True),
    StructField("salary", IntegerType(), True)
])
df_salaries = spark.createDataFrame(data, schema)
df_salaries.show(2)

+---+-----+-----------+------+
| id| name| department|salary|
+---+-----+-----------+------+
|  1|Alice|         HR| 70000|
|  2|  Bob|Engineering| 80000|
+---+-----+-----------+------+
only showing top 2 rows



In [16]:
df_salaries.groupBy("department").agg(
    avg("salary").alias("average_salary"),
    count("id").alias("employee_count"),
    max("salary").alias("max_salary")
).show()



+-----------+--------------+--------------+----------+
| department|average_salary|employee_count|max_salary|
+-----------+--------------+--------------+----------+
|         HR|       72500.0|             2|     75000|
|Engineering|       87500.0|             2|     95000|
|  Marketing|       60000.0|             1|     60000|
+-----------+--------------+--------------+----------+



In [21]:
data = [
    (1, 'Alice', 70000),
    (2, None, 80000),
    (3, 'Catherine', None),
    (4, 'David', 95000)
]
schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("salary", IntegerType(), True)
])
df_missing = spark.createDataFrame(data, schema)
df_filled = df_missing.fillna({'name': 'Unknown', 'salary': 0})
df_dropped = df_missing.dropna()
df_filled.show()
df_dropped.show()

+---+---------+------+
| id|     name|salary|
+---+---------+------+
|  1|    Alice| 70000|
|  2|  Unknown| 80000|
|  3|Catherine|     0|
|  4|    David| 95000|
+---+---------+------+

+---+-----+------+
| id| name|salary|
+---+-----+------+
|  1|Alice| 70000|
|  4|David| 95000|
+---+-----+------+



In [23]:
df = spark.createDataFrame([(1, "Alice"), (2, "Bob")], ["id", "name"])
result = df.collect()
print(result) 

[Row(id=1, name='Alice'), Row(id=2, name='Bob')]


In [25]:
df = spark.createDataFrame([(1, "Alice"), (2, "Bob")], ["id", "name"])
print(df.count()) 

2


In [27]:
df = spark.createDataFrame([(1, "Alice"), (2, "Bob")], ["id", "name"])
print(df.first())

Row(id=1, name='Alice')


In [31]:
df = spark.createDataFrame([(1, "Alice"), (2, "Bob")], ["id", "name"])
print(df.head(2))

[Row(id=1, name='Alice'), Row(id=2, name='Bob')]


In [32]:
df = spark.createDataFrame([(1, "Alice"), (2, "Bob")], ["id", "name"])
print(df.take(1)) 


[Row(id=1, name='Alice')]


In [33]:
df = spark.createDataFrame([(1, "Alice"), (2, "Bob")], ["id", "name"])
print(df.show(1))

+---+-----+
| id| name|
+---+-----+
|  1|Alice|
+---+-----+
only showing top 1 row

None


In [34]:
df = spark.createDataFrame([(1, 100), (2, 200)], ["id", "salary"])
df.describe()

DataFrame[summary: string, id: string, salary: string]

In [35]:
df = spark.createDataFrame([(1, "Alice", 100), (2, "Bob", 200)], ["id", "name", "salary"])
df.select("name", "salary").show()

+-----+------+
| name|salary|
+-----+------+
|Alice|   100|
|  Bob|   200|
+-----+------+



In [38]:
from pyspark.sql.functions import sum, avg
df = spark.createDataFrame([(1, 100), (2, 200)], ["id", "salary"])
df.agg(sum("salary").alias("total_salary"), avg("salary").alias("average_salary")).show()


+------------+--------------+
|total_salary|average_salary|
+------------+--------------+
|         300|         150.0|
+------------+--------------+



In [50]:
df = spark.createDataFrame([(1, "Alice"), (2, "Bob")], ["id", "name"])
df.rdd.foreach(lambda row : print(row))

In [53]:
df = spark.createDataFrame([(1, "Alice"), (2, "Bob")], ["id", "name"])
df.foreachPartition(lambda partition: print(list(partition)))


In [54]:
df = spark.createDataFrame([(1, "Alice"), (2, "Bob")], ["id", "name"])
pandas_df = df.toPandas()
print(pandas_df)

   id   name
0   1  Alice
1   2    Bob


In [55]:
df = spark.createDataFrame([(1, "Alice"), (2, "Bob")], ["id", "name"])
df.cache()
df.show()

+---+-----+
| id| name|
+---+-----+
|  1|Alice|
|  2|  Bob|
+---+-----+



In [56]:
df = spark.createDataFrame([(1, "Alice"), (2, "Alice"), (3, "Bob")], ["id", "name"])
df.dropDuplicates(["name"]).show()

+---+-----+
| id| name|
+---+-----+
|  1|Alice|
|  3|  Bob|
+---+-----+

