In [2]:
from pyspark.sql import SparkSession
# Import only the type classes this notebook uses instead of a wildcard,
# so it is clear where each name comes from.
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

# getOrCreate() reuses an already-running session if there is one;
# otherwise it starts a new session named "DfOps".
spark = SparkSession.builder.appName("DfOps").getOrCreate()

# Sample rows as (id, name, age) tuples.
data = [
    (1, 'Alice', 25),
    (2, 'Bob', 30),
    (3, 'Charlie', 35),
]

# Explicit schema: every column is declared non-nullable (third argument False).
schema = StructType([
    StructField('id', IntegerType(), False),
    StructField('name', StringType(), False),
    StructField('age', IntegerType(), False),
])

df = spark.createDataFrame(data, schema)

df.show()

25/12/20 20:39:57 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
                                                                                

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1|  Alice| 25|
|  2|    Bob| 30|
|  3|Charlie| 35|
+---+-------+---+



In [3]:
# Display only the first two rows of the DataFrame.
df.show(n=2)

+---+-----+---+
| id| name|age|
+---+-----+---+
|  1|Alice| 25|
|  2|  Bob| 30|
+---+-----+---+
only showing top 2 rows



In [4]:
# Print the schema as a tree: each column's name, type and nullability.
df.printSchema()

root
 |-- id: integer (nullable = false)
 |-- name: string (nullable = false)
 |-- age: integer (nullable = false)



In [6]:
# Column names as a Python list, in schema order.
df.columns

['id', 'name', 'age']

In [7]:
# Summary statistics (count, mean, stddev, min, max) for every column.
summary_stats = df.describe()
summary_stats.show()



+-------+---+-------+----+
|summary| id|   name| age|
+-------+---+-------+----+
|  count|  3|      3|   3|
|   mean|2.0|   NULL|30.0|
| stddev|1.0|   NULL| 5.0|
|    min|  1|  Alice|  25|
|    max|  3|Charlie|  35|
+-------+---+-------+----+



                                                                                

In [8]:
# Keep only the rows whose age is strictly greater than 25.
is_over_25 = df["age"] > 25
df.filter(is_over_25).show()

                                                                                

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  2|    Bob| 30|
|  3|Charlie| 35|
+---+-------+---+



In [9]:
# Keep only the rows whose name equals "Alice".
name_matches = df["name"] == "Alice"
df.filter(name_matches).show()

+---+-----+---+
| id| name|age|
+---+-----+---+
|  1|Alice| 25|
+---+-----+---+



In [10]:
# Same predicate as the filter() cell, expressed through where().
df.where(df["name"] == "Alice").show()

+---+-----+---+
| id| name|age|
+---+-----+---+
|  1|Alice| 25|
+---+-----+---+



In [11]:
# Sort by age, largest value first.
age_descending = df["age"].desc()
df.orderBy(age_descending).show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  3|Charlie| 35|
|  2|    Bob| 30|
|  1|  Alice| 25|
+---+-------+---+



In [22]:
# Recreate df from the original rows and schema, then display it.
df = spark.createDataFrame(data, schema=schema)
df.show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1|  Alice| 25|
|  2|    Bob| 30|
|  3|Charlie| 35|
+---+-------+---+



In [12]:
# Adding and Dropping Columns

In [23]:
# Add a derived column, newAge, holding each row's age plus 5.
age_plus_five = df["age"] + 5
df = df.withColumn("newAge", age_plus_five)

In [24]:
# drop() returns a new DataFrame without the age column; df is not reassigned here.
without_age = df.drop("age")
without_age.show()

+---+-------+------+
| id|   name|newAge|
+---+-------+------+
|  1|  Alice|    30|
|  2|    Bob|    35|
|  3|Charlie|    40|
+---+-------+------+



In [26]:
# Average of newAge, using the dict form of agg: {column name: aggregate function}.
avg_spec = {"newAge": "avg"}
df.agg(avg_spec).show()

+-----------+
|avg(newAge)|
+-----------+
|       35.0|
+-----------+



In [27]:
# Largest newAge value, using the dict form of agg.
max_spec = {"newAge": "max"}
df.agg(max_spec).show()

+-----------+
|max(newAge)|
+-----------+
|         40|
+-----------+



In [28]:
# Standard deviation of newAge, using the dict form of agg.
stddev_spec = {"newAge": "stddev"}
df.agg(stddev_spec).show()

+--------------+
|stddev(newAge)|
+--------------+
|           5.0|
+--------------+



In [29]:
# Second sample table: (id, country) tuples keyed by the same ids as df.
data2 = [(1, 'usa'), (2, 'india'), (3, 'uk')]

# Build the schema field by field; both columns are non-nullable.
schema2 = (
    StructType()
    .add('id', IntegerType(), False)
    .add('country', StringType(), False)
)

df2 = spark.createDataFrame(data2, schema=schema2)
df2.show()

+---+-------+
| id|country|
+---+-------+
|  1|    usa|
|  2|  india|
|  3|     uk|
+---+-------+



In [31]:
# Joining DataFrames on a shared key column

In [30]:
# Join df with df2 on the shared id column; id appears once in the result.
joined = df.join(df2, on="id")
joined.show()



+---+-------+---+------+-------+
| id|   name|age|newAge|country|
+---+-------+---+------+-------+
|  1|  Alice| 25|    30|    usa|
|  2|    Bob| 30|    35|  india|
|  3|Charlie| 35|    40|     uk|
+---+-------+---+------+-------+



                                                                                

In [32]:
# Shut down the Spark session at the end of the notebook.
spark.stop()