In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField,IntegerType,StringType

# Start (or reuse) the Spark session that every later cell relies on.
spark = SparkSession.builder.appName("DataFrameOps").getOrCreate()

# Sample rows in (id, name, age) order.
data = [(1, "Alice", 25), (2, "Bob", 30), (3, "Charlie", 35)]

# Explicit schema; the third argument (nullable=False) marks each column as required.
schema = (
    StructType()
    .add("id", IntegerType(), False)
    .add("name", StringType(), False)
    .add("age", IntegerType(), False)
)

# Build the base DataFrame used throughout the notebook and preview it.
df = spark.createDataFrame(data, schema=schema)
df.show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1|  Alice| 25|
|  2|    Bob| 30|
|  3|Charlie| 35|
+---+-------+---+



In [0]:
# Print df's column names, data types, and nullability as a tree.
df.printSchema()

root
 |-- id: integer (nullable = false)
 |-- name: string (nullable = false)
 |-- age: integer (nullable = false)



In [0]:
# List df's column names; as the cell's last expression, the list is echoed as the cell output.
df.columns

Out[3]: ['id', 'name', 'age']

In [0]:
# Show summary statistics (count, mean, stddev, min, max) for each column of df.
df.describe().show()

+-------+---+-------+----+
|summary| id|   name| age|
+-------+---+-------+----+
|  count|  3|      3|   3|
|   mean|2.0|   null|30.0|
| stddev|1.0|   null| 5.0|
|    min|  1|  Alice|  25|
|    max|  3|Charlie|  35|
+-------+---+-------+----+



In [0]:
# Selecting and filtering data

# Project only the name and age columns.
df.select(df["name"], df["age"]).show()

+-------+---+
|   name|age|
+-------+---+
|  Alice| 25|
|    Bob| 30|
|Charlie| 35|
+-------+---+



In [0]:
# Keep only the rows whose age is strictly greater than 25.
df.filter(df["age"] > 25).show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  2|    Bob| 30|
|  3|Charlie| 35|
+---+-------+---+



In [0]:
# where() is an alias of filter(); keep only the row named "Alice".
df.where(df["name"] == "Alice").show()

+---+-----+---+
| id| name|age|
+---+-----+---+
|  1|Alice| 25|
+---+-----+---+



In [0]:
# Remove fully duplicated rows (with no column subset this is the same as distinct()).
df.dropDuplicates().show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1|  Alice| 25|
|  2|    Bob| 30|
|  3|Charlie| 35|
+---+-------+---+



In [0]:
# Sorting and ordering

# Sort by age, ascending (the default direction).
df.sort("age").show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1|  Alice| 25|
|  2|    Bob| 30|
|  3|Charlie| 35|
+---+-------+---+



In [0]:
# Sort by age, oldest first.
df.orderBy("age", ascending=False).show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  3|Charlie| 35|
|  2|    Bob| 30|
|  1|  Alice| 25|
+---+-------+---+



In [0]:
# Adding and dropping columns

# Derive new_age = age + 5; the result is displayed but not assigned, so df keeps its columns.
df.withColumn(
    "new_age",
    df["age"] + 5,
).show()

+---+-------+---+-------+
| id|   name|age|new_age|
+---+-------+---+-------+
|  1|  Alice| 25|     30|
|  2|    Bob| 30|     35|
|  3|Charlie| 35|     40|
+---+-------+---+-------+



In [0]:
# Show df without the age column; the result is not assigned, so df itself still has it.
df.drop(df["age"]).show()

+---+-------+
| id|   name|
+---+-------+
|  1|  Alice|
|  2|    Bob|
|  3|Charlie|
+---+-------+



In [0]:
# df is still the original three-row DataFrame at this point: the withColumn()
# and drop() calls in the earlier cells returned new DataFrames that were only
# displayed, never assigned back to df. Rebuilding it with createDataFrame is
# therefore unnecessary; just confirm its contents before aggregating.
df.show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1|  Alice| 25|
|  2|    Bob| 30|
|  3|Charlie| 35|
+---+-------+---+



In [0]:
# Aggregation and grouping

# Count how many rows share each name.
df.groupby(df["name"]).count().show()

+-------+-----+
|   name|count|
+-------+-----+
|  Alice|    1|
|    Bob|    1|
|Charlie|    1|
+-------+-----+



In [0]:
# Average age over the whole DataFrame (an empty groupBy() aggregates all rows as one group).
df.groupBy().agg({"age": "avg"}).show()

+--------+
|avg(age)|
+--------+
|    30.0|
+--------+



In [0]:
# Joins

# Second dataset for the join example, in (id, country) order.
data2 = [
    (1, "usa"),
    (2, "uk"),
    (3, "india"),
]

# Both columns are declared nullable (third argument True).
schema2 = (
    StructType()
    .add("id", IntegerType(), True)
    .add("country", StringType(), True)
)


In [0]:
# Build the country lookup DataFrame from data2 using the explicit schema2.
df2 = spark.createDataFrame(data=data2, schema=schema2)

In [0]:
# Display both join inputs: df2 (id, country) first, then df (id, name, age).
df2.show()
df.show()

+---+-------+
| id|country|
+---+-------+
|  1|    usa|
|  2|     uk|
|  3|  india|
+---+-------+

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1|  Alice| 25|
|  2|    Bob| 30|
|  3|Charlie| 35|
+---+-------+---+



In [0]:
# Inner-join the people and country frames on the shared "id" column.
# Joining on a column name (rather than an expression) keeps a single "id" column in the result.
final_df = df.join(df2, on="id", how="inner")

In [0]:
# Display the joined result (id, name, age, country).
final_df.show()

+---+-------+---+-------+
| id|   name|age|country|
+---+-------+---+-------+
|  1|  Alice| 25|    usa|
|  2|    Bob| 30|     uk|
|  3|Charlie| 35|  india|
+---+-------+---+-------+

