# PySpark `orderBy()` & `sort()`

By Aishwarya Raut

Use the `sort()` or `orderBy()` method of a PySpark DataFrame to sort it in ascending or descending order by one or more columns.

Note that `pyspark.sql.DataFrame.orderBy()` is an alias for `.sort()`.

In [2]:
from pyspark.sql import SparkSession
# Start (or reuse) the Spark session that every later cell relies on.
spark = SparkSession.builder.appName("PySpark").getOrCreate()

# Load the CSV, treating the first line as the header and letting Spark infer types.
# NOTE(review): this is an absolute, machine-specific path; adjust it before
# running the notebook anywhere else.
file_path = "C:\\Users\\pcc\\Desktop\\daily-website-visitors.csv"
raw_visits = spark.read.csv(file_path, header=True, inferSchema=True)

# Swap the dotted column names for underscore versions.
renamed_columns = {
    "Day.Of.Week": "Day_Of_Week",
    "Page.Loads": "Page_Loads",
    "Unique.Visits": "Unique_Visits",
    "First.Time.Visits": "First_Time_Visits",
    "Returning.Visits": "Returning_Visits",
}
df = raw_visits.withColumnsRenamed(renamed_columns)

# Quick look at the first five rows.
df.show(5)

+---+---------+-----------+---------+----------+-------------+-----------------+----------------+
|Row|      Day|Day_Of_Week|     Date|Page_Loads|Unique_Visits|First_Time_Visits|Returning_Visits|
+---+---------+-----------+---------+----------+-------------+-----------------+----------------+
|  1|   Sunday|          1|9/14/2014|      2146|         1582|             1430|             152|
|  2|   Monday|          2|9/15/2014|      3621|         2528|             2297|             231|
|  3|  Tuesday|          3|9/16/2014|      3698|         2630|             2352|             278|
|  4|Wednesday|          4|9/17/2014|      3667|         2614|             2327|             287|
|  5| Thursday|          5|9/18/2014|      3316|         2366|             2130|             236|
+---+---------+-----------+---------+----------+-------------+-----------------+----------------+
only showing top 5 rows



# 1. DataFrame sorting using the sort() function

In [3]:
# Sort by Day first, then by Date within each day, using plain column names.
# NOTE(review): Date appears to be a string column (the output lists 01-01-2016
# before 01-02-2015), so this ordering is lexicographic rather than chronological.
df.sort(["Day", "Date"]).show()

+----+------+-----------+----------+----------+-------------+-----------------+----------------+
| Row|   Day|Day_Of_Week|      Date|Page_Loads|Unique_Visits|First_Time_Visits|Returning_Visits|
+----+------+-----------+----------+----------+-------------+-----------------+----------------+
| 475|Friday|          6|01-01-2016|      1411|          960|              786|             174|
| 111|Friday|          6|01-02-2015|      1948|         1288|             1030|             258|
|1938|Friday|          6|01-03-2020|      2970|         2180|             1859|             321|
|1574|Friday|          6|01-04-2019|      3504|         2373|             1936|             437|
|1210|Friday|          6|01-05-2018|      3511|         2272|             1801|             471|
| 846|Friday|          6|01-06-2017|      3261|         2338|             1812|             526|
| 482|Friday|          6|01-08-2016|      3223|         2286|             1860|             426|
| 118|Friday|          6|01-09

In [6]:
from pyspark.sql.functions import col 
# Same sort() call, but the key is given as a Column object instead of a name.
by_first_time_visits = df.sort(col("First_Time_Visits"))
by_first_time_visits.show(truncate=False)

+----+--------+-----------+----------+----------+-------------+-----------------+----------------+
|Row |Day     |Day_Of_Week|Date      |Page_Loads|Unique_Visits|First_Time_Visits|Returning_Visits|
+----+--------+-----------+----------+----------+-------------+-----------------+----------------+
|103 |Thursday|5          |12/25/2014|1002      |667          |522              |145             |
|468 |Friday  |6          |12/25/2015|1017      |724          |586              |138             |
|833 |Saturday|7          |12/24/2016|1115      |825          |634              |191             |
|840 |Saturday|7          |12/31/2016|1188      |836          |643              |193             |
|1064|Saturday|7          |08-12-2017|1240      |888          |669              |219             |
|1022|Saturday|7          |07-01-2017|1277      |903          |675              |228             |
|1057|Saturday|7          |08-05-2017|1225      |901          |681              |220             |
|1036|Satu

# 2. DataFrame sorting using the orderBy() function


In [7]:
# orderBy() with the same two keys as the sort() example: Day, then Date.
# NOTE(review): Date appears to be compared as text (01-01-2016 precedes
# 01-02-2015 in the output), so the result is not in calendar order.
df.orderBy(["Day", "Date"]).show(truncate=False)

+----+------+-----------+----------+----------+-------------+-----------------+----------------+
|Row |Day   |Day_Of_Week|Date      |Page_Loads|Unique_Visits|First_Time_Visits|Returning_Visits|
+----+------+-----------+----------+----------+-------------+-----------------+----------------+
|475 |Friday|6          |01-01-2016|1411      |960          |786              |174             |
|111 |Friday|6          |01-02-2015|1948      |1288         |1030             |258             |
|1938|Friday|6          |01-03-2020|2970      |2180         |1859             |321             |
|1574|Friday|6          |01-04-2019|3504      |2373         |1936             |437             |
|1210|Friday|6          |01-05-2018|3511      |2272         |1801             |471             |
|846 |Friday|6          |01-06-2017|3261      |2338         |1812             |526             |
|482 |Friday|6          |01-08-2016|3223      |2286         |1860             |426             |
|118 |Friday|6          |01-09

In [8]:
# Order by Returning_Visits via a Column object and print only the first five rows.
by_returning_visits = df.orderBy(col("Returning_Visits"))
by_returning_visits.show(n=5)

+---+--------+-----------+----------+----------+-------------+-----------------+----------------+
|Row|     Day|Day_Of_Week|      Date|Page_Loads|Unique_Visits|First_Time_Visits|Returning_Visits|
+---+--------+-----------+----------+----------+-------------+-----------------+----------------+
|  7|Saturday|          7| 9/20/2014|      1658|         1118|              985|             133|
|468|  Friday|          6|12/25/2015|      1017|          724|              586|             138|
| 14|Saturday|          7| 9/27/2014|      1656|         1180|             1040|             140|
|103|Thursday|          5|12/25/2014|      1002|          667|              522|             145|
|112|Saturday|          7|01-03-2015|      1742|         1096|              946|             150|
+---+--------+-----------+----------+----------+-------------+-----------------+----------------+
only showing top 5 rows



# 3. Sort by Ascending (ASC)

In [9]:
# Spell out the ascending direction on each key with Column.asc():
# Day_Of_Week is the primary key, Day the secondary one.
df.sort(
    df["Day_Of_Week"].asc(),
    df["Day"].asc(),
).show()

+---+------+-----------+----------+----------+-------------+-----------------+----------------+
|Row|   Day|Day_Of_Week|      Date|Page_Loads|Unique_Visits|First_Time_Visits|Returning_Visits|
+---+------+-----------+----------+----------+-------------+-----------------+----------------+
|127|Sunday|          1| 1/18/2015|      2491|         1762|             1541|             221|
|267|Sunday|          1|06-07-2015|      3132|         2145|             1790|             355|
|134|Sunday|          1| 1/25/2015|      2802|         2010|             1740|             270|
|120|Sunday|          1|01-11-2015|      2245|         1611|             1391|             220|
|141|Sunday|          1|02-01-2015|      3050|         2122|             1825|             297|
| 15|Sunday|          1| 9/28/2014|      2465|         1806|             1613|             193|
|148|Sunday|          1|02-08-2015|      3314|         2435|             2100|             335|
| 29|Sunday|          1|10-12-2014|     

In [10]:
# Explicit ascending sort on Page_Loads; print five rows without truncating cells.
page_loads_ascending = col("Page_Loads").asc()
df.sort(page_loads_ascending).show(n=5, truncate=False)

+----+--------+-----------+----------+----------+-------------+-----------------+----------------+
|Row |Day     |Day_Of_Week|Date      |Page_Loads|Unique_Visits|First_Time_Visits|Returning_Visits|
+----+--------+-----------+----------+----------+-------------+-----------------+----------------+
|103 |Thursday|5          |12/25/2014|1002      |667          |522              |145             |
|468 |Friday  |6          |12/25/2015|1017      |724          |586              |138             |
|833 |Saturday|7          |12/24/2016|1115      |825          |634              |191             |
|840 |Saturday|7          |12/31/2016|1188      |836          |643              |193             |
|1057|Saturday|7          |08-05-2017|1225      |901          |681              |220             |
+----+--------+-----------+----------+----------+-------------+-----------------+----------------+
only showing top 5 rows



In [11]:
# Two explicit ascending keys with orderBy(): Date first, then Page_Loads.
# NOTE(review): the output groups 01-01-2015 ... 01-01-2019 together, which
# suggests Date is ordered as text rather than as a calendar date.
ascending_keys = [col("Date").asc(), col("Page_Loads").asc()]
df.orderBy(*ascending_keys).show(n=5, truncate=False)


+----+--------+-----------+----------+----------+-------------+-----------------+----------------+
|Row |Day     |Day_Of_Week|Date      |Page_Loads|Unique_Visits|First_Time_Visits|Returning_Visits|
+----+--------+-----------+----------+----------+-------------+-----------------+----------------+
|110 |Thursday|5          |01-01-2015|1265      |876          |715              |161             |
|475 |Friday  |6          |01-01-2016|1411      |960          |786              |174             |
|841 |Sunday  |1          |01-01-2017|1447      |1039         |832              |207             |
|1206|Monday  |2          |01-01-2018|1709      |1120         |878              |242             |
|1571|Tuesday |3          |01-01-2019|1715      |1168         |891              |277             |
+----+--------+-----------+----------+----------+-------------+-----------------+----------------+
only showing top 5 rows



# 4. Sort by Descending (DESC)


In [12]:
# Mixed directions: Returning_Visits ascending is the primary key, and
# Day descending only decides the order among rows with equal Returning_Visits.
df.sort(
    df["Returning_Visits"].asc(),
    df["Day"].desc(),
).show(truncate=False)

+----+---------+-----------+----------+----------+-------------+-----------------+----------------+
|Row |Day      |Day_Of_Week|Date      |Page_Loads|Unique_Visits|First_Time_Visits|Returning_Visits|
+----+---------+-----------+----------+----------+-------------+-----------------+----------------+
|7   |Saturday |7          |9/20/2014 |1658      |1118         |985              |133             |
|468 |Friday   |6          |12/25/2015|1017      |724          |586              |138             |
|14  |Saturday |7          |9/27/2014 |1656      |1180         |1040             |140             |
|103 |Thursday |5          |12/25/2014|1002      |667          |522              |145             |
|112 |Saturday |7          |01-03-2015|1742      |1096         |946              |150             |
|1   |Sunday   |1          |9/14/2014 |2146      |1582         |1430             |152             |
|1925|Saturday |7          |12/21/2019|1682      |1177         |1020             |157             |


In [13]:
# Same mixed-direction sort, with the keys built through col() instead of df.<name>.
mixed_direction_keys = [col("Returning_Visits").asc(), col("Day").desc()]
df.sort(*mixed_direction_keys).show(truncate=False)

+----+---------+-----------+----------+----------+-------------+-----------------+----------------+
|Row |Day      |Day_Of_Week|Date      |Page_Loads|Unique_Visits|First_Time_Visits|Returning_Visits|
+----+---------+-----------+----------+----------+-------------+-----------------+----------------+
|7   |Saturday |7          |9/20/2014 |1658      |1118         |985              |133             |
|468 |Friday   |6          |12/25/2015|1017      |724          |586              |138             |
|14  |Saturday |7          |9/27/2014 |1656      |1180         |1040             |140             |
|103 |Thursday |5          |12/25/2014|1002      |667          |522              |145             |
|112 |Saturday |7          |01-03-2015|1742      |1096         |946              |150             |
|1   |Sunday   |1          |9/14/2014 |2146      |1582         |1430             |152             |
|1925|Saturday |7          |12/21/2019|1682      |1177         |1020             |157             |


In [14]:
# orderBy() variant of the mixed-direction sort: ascending Returning_Visits,
# then descending Day for ties.
ordered_visits = df.orderBy(
    col("Returning_Visits").asc(),
    col("Day").desc(),
)
ordered_visits.show(truncate=False)

+----+---------+-----------+----------+----------+-------------+-----------------+----------------+
|Row |Day      |Day_Of_Week|Date      |Page_Loads|Unique_Visits|First_Time_Visits|Returning_Visits|
+----+---------+-----------+----------+----------+-------------+-----------------+----------------+
|7   |Saturday |7          |9/20/2014 |1658      |1118         |985              |133             |
|468 |Friday   |6          |12/25/2015|1017      |724          |586              |138             |
|14  |Saturday |7          |9/27/2014 |1656      |1180         |1040             |140             |
|103 |Thursday |5          |12/25/2014|1002      |667          |522              |145             |
|112 |Saturday |7          |01-03-2015|1742      |1096         |946              |150             |
|1   |Sunday   |1          |9/14/2014 |2146      |1582         |1430             |152             |
|1925|Saturday |7          |12/21/2019|1682      |1177         |1020             |157             |


# 5. Using Raw SQL

In [15]:
# Register the DataFrame as a temporary view so it can be queried with Spark SQL.
# createOrReplaceTempView is used instead of createTempView so this cell can be
# re-run in the same session: createTempView raises an error when a view named
# Daily_Data already exists.
df.createOrReplaceTempView("Daily_Data")

# Order the rows by Day (ascending) with a SQL ORDER BY clause and print the result.
spark.sql("select * from Daily_Data order by Day asc").show()

+---+------+-----------+----------+----------+-------------+-----------------+----------------+
|Row|   Day|Day_Of_Week|      Date|Page_Loads|Unique_Visits|First_Time_Visits|Returning_Visits|
+---+------+-----------+----------+----------+-------------+-----------------+----------------+
| 97|Friday|          6|12/19/2014|      2748|         1879|             1559|             320|
|237|Friday|          6|05-08-2015|      4696|         3366|             2824|             542|
|104|Friday|          6|12/26/2014|      1486|         1005|              808|             197|
|  6|Friday|          6| 9/19/2014|      2815|         1863|             1622|             241|
|111|Friday|          6|01-02-2015|      1948|         1288|             1030|             258|
| 20|Friday|          6|10-03-2014|      3005|         2097|             1856|             241|
|118|Friday|          6|01-09-2015|      2783|         1941|             1663|             278|
| 34|Friday|          6|10/17/2014|     