In [9]:
from pyspark.sql import SparkSession 
from pyspark.sql.functions import col, lit, when, desc, asc, cast, like
from pyspark.sql.types import *

# Start a Spark session for this notebook, or reuse one that is already
# running in this process. `SparkSession.builder` is the documented entry
# point for configuring a session; the Builder class is not meant to be
# instantiated directly.
spark = SparkSession.builder.appName('col functions').getOrCreate()

In [3]:
# Sample employee records as (name, age, salary) tuples.
data = [
    ("Ajay", 23, 3000),
    ("Rohit", 27, 2000),
    ("Hema", 26, 2000),
    ("Huedsad", 26, 1233),
]

# Only column names are given, so Spark infers each column's type from the data.
schema = ["name", "age", "salary"]

df = spark.createDataFrame(data=data, schema=schema)
df.show()

                                                                                

+-------+---+------+
|   name|age|salary|
+-------+---+------+
|   Ajay| 23|  3000|
|  Rohit| 27|  2000|
|   Hema| 26|  2000|
|Huedsad| 26|  1233|
+-------+---+------+



In [5]:
# Project all three columns, renaming `name` and `salary` in the result.
df.select(
    col("name").alias("emp_name"),
    col("age"),
    col("salary").alias("emp_salary"),
).show()

+--------+---+----------+
|emp_name|age|emp_salary|
+--------+---+----------+
|    Ajay| 23|      3000|
|   Rohit| 27|      2000|
|    Hema| 26|      2000|
| Huedsad| 26|      1233|
+--------+---+----------+



In [7]:
# Order the rows by salary, lowest first.
df.orderBy(asc("salary")).show()

+-------+---+------+
|   name|age|salary|
+-------+---+------+
|Huedsad| 26|  1233|
|   Hema| 26|  2000|
|  Rohit| 27|  2000|
|   Ajay| 23|  3000|
+-------+---+------+



In [8]:
# Print each column's name, inferred data type and nullability.
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- salary: long (nullable = true)



In [19]:
# Cast `salary` from the inferred long type to integer; `name` and `age`
# pass through unchanged and the column order is preserved.
df1 = df.withColumn("salary", col("salary").cast(IntegerType()))
df1.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- salary: integer (nullable = true)



In [27]:
# Keep rows whose name matches the SQL LIKE pattern '%a%'
# (an "a" anywhere in the name).
df.where(col("name").like("%a%")).show()

+-------+---+------+
|   name|age|salary|
+-------+---+------+
|   Ajay| 23|  3000|
|   Hema| 26|  2000|
|Huedsad| 26|  1233|
+-------+---+------+



In [39]:
# filter() and where() are aliases, and each accepts either a SQL expression
# string or a Column predicate, so these four calls print the same single row.
df.filter('name = "Rohit"').show()
df.filter(col('name') == 'Rohit').show()
df.where('name = "Rohit"').show()
df.where(col('name') == 'Rohit').show()

# AND: combine predicates with `&` (each side must be parenthesised).
# A name cannot equal both values at once, so this prints an empty table.
df.where((col('name') == 'Rohit') & (col('name') == 'Ajay')).show()

# OR: combine predicates with `|`; rows matching either name are kept.
df.where((col('name') == 'Rohit') | (col('name') == 'Ajay')).show()

+-----+---+------+
| name|age|salary|
+-----+---+------+
|Rohit| 27|  2000|
+-----+---+------+

+-----+---+------+
| name|age|salary|
+-----+---+------+
|Rohit| 27|  2000|
+-----+---+------+

+-----+---+------+
| name|age|salary|
+-----+---+------+
|Rohit| 27|  2000|
+-----+---+------+

+-----+---+------+
| name|age|salary|
+-----+---+------+
|Rohit| 27|  2000|
+-----+---+------+



                                                                                

+----+---+------+
|name|age|salary|
+----+---+------+
+----+---+------+

+-----+---+------+
| name|age|salary|
+-----+---+------+
| Ajay| 23|  3000|
|Rohit| 27|  2000|
+-----+---+------+



In [41]:
# distinct() removes rows that are duplicated across every column; the sample
# data has no fully identical rows, so all four rows are shown.
df.distinct().show()



+-------+---+------+
|   name|age|salary|
+-------+---+------+
|   Ajay| 23|  3000|
|  Rohit| 27|  2000|
|   Hema| 26|  2000|
|Huedsad| 26|  1233|
+-------+---+------+



                                                                                

In [46]:
%%time
df.dropDuplicates(['salary']).show()

+-------+---+------+
|   name|age|salary|
+-------+---+------+
|Huedsad| 26|  1233|
|  Rohit| 27|  2000|
|   Ajay| 23|  3000|
+-------+---+------+

CPU times: user 6.13 ms, sys: 2.48 ms, total: 8.61 ms
Wall time: 639 ms


In [51]:
# sort() and orderBy() are interchangeable. First pair: ascending by name,
# then by salary.
df.sort(col("name"), col("salary")).show()
df.orderBy(col("name"), col("salary")).show()

# Second pair: the same two keys, both descending.
df.sort(desc("name"), desc("salary")).show()
df.orderBy(desc("name"), desc("salary")).show()

+-------+---+------+
|   name|age|salary|
+-------+---+------+
|   Ajay| 23|  3000|
|   Hema| 26|  2000|
|Huedsad| 26|  1233|
|  Rohit| 27|  2000|
+-------+---+------+

+-------+---+------+
|   name|age|salary|
+-------+---+------+
|   Ajay| 23|  3000|
|   Hema| 26|  2000|
|Huedsad| 26|  1233|
|  Rohit| 27|  2000|
+-------+---+------+

+-------+---+------+
|   name|age|salary|
+-------+---+------+
|  Rohit| 27|  2000|
|Huedsad| 26|  1233|
|   Hema| 26|  2000|
|   Ajay| 23|  3000|
+-------+---+------+

+-------+---+------+
|   name|age|salary|
+-------+---+------+
|  Rohit| 27|  2000|
|Huedsad| 26|  1233|
|   Hema| 26|  2000|
|   Ajay| 23|  3000|
+-------+---+------+



In [52]:
# Shut down the Spark session created at the top of the notebook.
spark.stop()