In [25]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField,IntegerType,StringType
# Start (or reuse) the Spark session that every later cell relies on.
spark = SparkSession.builder.appName("DataFrameOps").getOrCreate()

# Sample rows in (id, name, age) order, matching the schema below.
data = [
    (1, "Alice", 25),
    (2, "Bob", 22),
    (3, "Charlie", 35),
]

# Explicit schema: the third StructField argument is `nullable`,
# so every column here is declared non-nullable.
schema = StructType([
    StructField("id", IntegerType(), False),
    StructField("name", StringType(), False),
    StructField("age", IntegerType(), False),
])

# Build the DataFrame used throughout the notebook and preview it.
df = spark.createDataFrame(data, schema)
df.show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1|  Alice| 25|
|  2|    Bob| 22|
|  3|Charlie| 35|
+---+-------+---+



In [26]:
 # Print the column names, data types and nullability as a tree.
 df.printSchema()

root
 |-- id: integer (nullable = false)
 |-- name: string (nullable = false)
 |-- age: integer (nullable = false)



In [27]:
# Column names as a plain Python list, in schema order.
df.columns

['id', 'name', 'age']

In [28]:
# Summary statistics (count, mean, stddev, min, max) for every column.
summary_stats = df.describe()
summary_stats.show()

+-------+---+-------+------------------+
|summary| id|   name|               age|
+-------+---+-------+------------------+
|  count|  3|      3|                 3|
|   mean|2.0|   NULL|27.333333333333332|
| stddev|1.0|   NULL|6.8068592855540455|
|    min|  1|  Alice|                22|
|    max|  3|Charlie|                35|
+-------+---+-------+------------------+



## Selecting and filtering data: use cases


In [29]:
# Project only the name and age columns.
name_and_age = df.select("name", "age")
name_and_age.show()

+-------+---+
|   name|age|
+-------+---+
|  Alice| 25|
|    Bob| 22|
|Charlie| 35|
+-------+---+



In [30]:
# Keep only the rows whose age is strictly greater than 25.
df.filter(df["age"] > 25).show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  3|Charlie| 35|
+---+-------+---+



In [31]:
# Keep only the rows whose name is exactly 'Alice'.
df.where(df["name"] == "Alice").show()

+---+-----+---+
| id| name|age|
+---+-----+---+
|  1|Alice| 25|
+---+-----+---+



In [32]:
# Remove duplicate rows; the row order of the result is not guaranteed.
unique_rows = df.distinct()
unique_rows.show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1|  Alice| 25|
|  3|Charlie| 35|
|  2|    Bob| 22|
+---+-------+---+



## Sorting and Ordering

In [33]:
# Sort by age; orderBy is ascending unless told otherwise.
by_age = df.orderBy("age")
by_age.show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  2|    Bob| 22|
|  1|  Alice| 25|
|  3|Charlie| 35|
+---+-------+---+



In [34]:
# Sort by age from highest to lowest.
df.orderBy(df["age"].desc()).show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  3|Charlie| 35|
|  1|  Alice| 25|
|  2|    Bob| 22|
+---+-------+---+



## Adding and Dropping Columns

In [35]:
# withColumn returns a new DataFrame and leaves df untouched;
# assign the result (df = ...) to keep the extra column.
df.withColumn("new_age", df["age"] + 5).show()

+---+-------+---+-------+
| id|   name|age|new_age|
+---+-------+---+-------+
|  1|  Alice| 25|     30|
|  2|    Bob| 22|     27|
|  3|Charlie| 35|     40|
+---+-------+---+-------+



In [36]:
# Show df without the age column; df itself is not modified.
without_age = df.drop("age")
without_age.show()

+---+-------+
| id|   name|
+---+-------+
|  1|  Alice|
|  2|    Bob|
|  3|Charlie|
+---+-------+



In [37]:
# Rebuild df from the original rows and schema so the following
# sections start from the three-column DataFrame.
df = spark.createDataFrame(data, schema=schema)
df.show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1|  Alice| 25|
|  2|    Bob| 22|
|  3|Charlie| 35|
+---+-------+---+



In [39]:
## Aggregation and grouping

In [40]:
# Count how many rows share each name.
rows_per_name = df.groupBy("name").count()
rows_per_name.show()

+-------+-----+
|   name|count|
+-------+-----+
|  Alice|    1|
|Charlie|    1|
|    Bob|    1|
+-------+-----+



In [41]:
# Average age over the whole DataFrame (no grouping);
# the dict form maps column name -> aggregate function name.
average_age = df.agg({"age": "avg"})
average_age.show()

+------------------+
|          avg(age)|
+------------------+
|27.333333333333332|
+------------------+



In [42]:
## Joins

In [43]:
# Second dataset of (id, country) rows, used as the right side of the join.
data2 = [
    (1, 'usa'),
    (2, 'uk'),
    (3, 'india'),
]

# Both columns are declared nullable (third StructField argument is True).
schema2 = StructType([
    StructField('id', IntegerType(), True),
    StructField('country', StringType(), True),
])

In [44]:
# Build the country DataFrame from the rows and schema defined for it.
df2 = spark.createDataFrame(data2, schema=schema2)

In [45]:
# Preview the country DataFrame.
df2.show()

+---+-------+
| id|country|
+---+-------+
|  1|    usa|
|  2|     uk|
|  3|  india|
+---+-------+



In [46]:
# Preview the people DataFrame that will be joined with df2.
df.show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1|  Alice| 25|
|  2|    Bob| 22|
|  3|Charlie| 35|
+---+-------+---+



In [49]:
# Keep the joined DataFrame itself: .show() only prints the rows and
# returns None, so chaining it onto the assignment would leave final_df
# set to None and make any later final_df.show() fail.
final_df = df.join(df2, 'id')  # join on the shared 'id' column (inner join by default)
final_df.show()

+---+-------+---+-------+
| id|   name|age|country|
+---+-------+---+-------+
|  1|  Alice| 25|    usa|
|  2|    Bob| 22|     uk|
|  3|Charlie| 35|  india|
+---+-------+---+-------+



In [1]:
# Stop the Spark session now that the walkthrough is finished.
spark.stop()