# Create `emp` table

In [0]:
from pyspark.sql.functions import when, col

# Read emp csv file
emp = spark.read.csv("/FileStore/tables/emp.csv", inferSchema=True, header=True)

# Replace 'NULL' strings with actual nulls
for column in emp.columns:
    emp = emp.withColumn(column, when(col(column)=="NULL", None).otherwise(col(column)))

# Create Temporary view for emp
emp.createOrReplaceTempView("emp")

# Create `dept` table

In [0]:
from pyspark.sql.functions import when, col

# Read emp csv file
dept = spark.read.csv("/FileStore/tables/dept.csv", inferSchema=True, header=True)

# Replace 'NULL' strings with actual nulls
for column in dept.columns:
    dept = dept.withColumn(column, when(col(column)=="NULL", None).otherwise(col(column)))

# Create Temporary view for emp
dept.createOrReplaceTempView("dept")

# Queries

- Customizing how query looks. By understanding how to control how result set is organized , more readable and meaningful data can be inferred

## 1. Returning Query Results in Specific Order

### A. `SQL`
- By default, `ORDER BY` uses ascending sort. To sort by descending, use `desc` keyword

In [0]:
%sql
select ename, job, sal
from emp
where deptno=10
order by ename

ename,job,sal
CLARK,MANAGER,4000
KING,PRESIDENT,4000
MILLER,CLERK,4000


In [0]:
%sql
select ename, job, sal
from emp
where deptno=10
order by ename desc

ename,job,sal
MILLER,CLERK,4000
KING,PRESIDENT,4000
CLARK,MANAGER,4000


### B. `PySpark`

In [0]:
from pyspark.sql.functions import col
display(emp.select('ename', 'job', 'sal').filter(emp.deptno=='10').orderBy(col('ename')))

ename,job,sal
CLARK,MANAGER,4000
KING,PRESIDENT,4000
MILLER,CLERK,4000


In [0]:
from pyspark.sql.functions import col
display(emp.select('ename', 'job', 'sal').filter(emp.deptno=='10').orderBy(col('ename').desc()))

ename,job,sal
MILLER,CLERK,4000
KING,PRESIDENT,4000
CLARK,MANAGER,4000


### C. `Pandas`

In [0]:
df_emp_pandas = emp.toPandas()

df_emp_pandas['deptno'] = df_emp_pandas[df_emp_pandas['deptno'].notna()]['deptno'].astype('int64')

display(df_emp_pandas[df_emp_pandas['deptno']==10][['ename', 'job', 'sal']].sort_values(by='ename', ascending=False))

ename,job,sal
MILLER,CLERK,4000
KING,PRESIDENT,4000
CLARK,MANAGER,4000


## 2. Sorting by Multiple Fields

### A. `SQL`
- Order of precedence in `ORDER BY` is from left to right
- Generally permitted to order by column not in `SELECT` list by explicitly naming the column but not possible if `GROUP BY` or `DISTINCT` is used

In [0]:
%sql
select empno, deptno
from emp
where deptno=10
order by empno, sal desc

empno,deptno
7782,10
7839,10
7934,10


### B. `PySpark`

In [0]:
from pyspark.sql.functions import col
display(emp.select('empno', 'deptno').filter(col('deptno')==10).orderBy(col('empno').desc(), col('sal').desc()))

empno,deptno
7934,10
7839,10
7782,10


### C. `Pandas`

In [0]:
df_emp_pandas = emp.toPandas()

df_emp_pandas['deptno'] = df_emp_pandas[df_emp_pandas['deptno'].notna()]['deptno'].astype('int64')

display(df_emp_pandas[df_emp_pandas['deptno']==10].sort_values(by=['empno', 'sal'], ascending=[False, False])[['empno', 'deptno']])

empno,deptno
7934,10.0
7839,10.0
7782,10.0


## 3. Sorting by Substrings

### A. `SQL`
- Sorting by last 2 characters in string
- psql uses `SUBSTR` function to get a substring where first argument is the column and second is the start of string

In [0]:
%sql
select ename, job
from emp
order by substr(job, length(job)-1)

ename,job
ALLEN,SALESMAN
WARD,SALESMAN
MARTIN,SALESMAN
TURNER,SALESMAN
YODA,JEDI
BLAKE,MANAGER
JONES,MANAGER
CLARK,MANAGER
KING,PRESIDENT
JAMES,CLERK


In [0]:
%sql
select ename, job
from emp
order by substr(job, -2)

ename,job
ALLEN,SALESMAN
WARD,SALESMAN
MARTIN,SALESMAN
TURNER,SALESMAN
YODA,JEDI
BLAKE,MANAGER
JONES,MANAGER
CLARK,MANAGER
KING,PRESIDENT
JAMES,CLERK


### B. `PySpark`
- Use negative indexes to start from last

In [0]:
from pyspark.sql.functions import col
display(emp.select('ename', 'job').orderBy(col('job').substr(-2, 2)))

ename,job
ALLEN,SALESMAN
WARD,SALESMAN
MARTIN,SALESMAN
TURNER,SALESMAN
YODA,JEDI
BLAKE,MANAGER
JONES,MANAGER
CLARK,MANAGER
KING,PRESIDENT
JAMES,CLERK


### C. `Pandas`

In [0]:
df_emp_pandas = emp.toPandas()

df_emp_pandas['sort_job'] = df_emp_pandas['job'].astype('str').str[-2:]

display(df_emp_pandas.sort_values(by='sort_job')[['ename', 'job']])

ename,job
ALLEN,SALESMAN
WARD,SALESMAN
MARTIN,SALESMAN
TURNER,SALESMAN
YODA,JEDI
BLAKE,MANAGER
JONES,MANAGER
CLARK,MANAGER
KING,PRESIDENT
JAMES,CLERK


## 4. Sorting Mixed Alphanumeric Data

### A. `SQL`

#### Creating a View

In [0]:
%sql
create temp view V
as
select ename||' '||deptno as data
from emp;

select * from V

data
ALLEN 30
WARD 30
MARTIN 30
BLAKE 30
TURNER 30
JAMES 30
""
""
SMITH 20
JONES 20


#### `TRANSLATE(string, from, to)` function
1) **string**
is a string subjected to translation.

2) **from**
is a set of characters in the first argument (string) that should be replaced.

3) **to**
is a set of characters that replaces the from in the string.

Notice that if from is longer than to, the TRANSLATE() function removes the occurrences of the extra characters in from.

#### `REPLACE(source, old_text, new_text)` function
1) **source** is a string where you want to replace.
2) **old_text** is the text that you want to search and replace. If the old_text appears multiple times in the string, all of its occurrences will be replaced.
3) **new_text** is the new text that will replace the old text (**old_text**).

> `TRANSLATE` replaces characters while `REPLACE` replaces the whole text

#### Order By Deptno

In [0]:
%sql
select data
from V
order by replace(data,replace(translate(data, '0123456789', '##########'),'#',''), '')

data
""
""
MILLER 10
KING 10
CLARK 10
SMITH 20
JONES 20
SCOTT 20
ADAMS 20
FORD 20


In [0]:
%sql
select data
from V
order by replace(translate(data, '0123456789', '##########'),'#','')

data
""
""
ADAMS 20
ALLEN 30
BLAKE 30
CLARK 10
FORD 20
JAMES 30
JONES 20
KING 10


### B. `PySpark`

In [0]:
import pyspark.sql.functions as F
df_v = emp.withColumn('data', F.concat(emp.ename, F.lit(' '), emp.deptno)).select(F.col('data'))
df_v = df_v.withColumn('transformed_data_1', F.translate('data', '0123456789', '##########'))
df_v = df_v.withColumn('transformed_data_2', F.regexp_replace('transformed_data_1','#',''))
df_v = df_v.withColumn('transformed_data_3', F.expr("regexp_replace(data, transformed_data_2, '')"))
display(df_v.orderBy('transformed_data_3').select('data'))

data
""
""
MILLER 10
KING 10
CLARK 10
SMITH 20
JONES 20
SCOTT 20
ADAMS 20
FORD 20


### C. `Pandas`

In [0]:
df_emp_pandas = emp.toPandas()

df_emp_pandas['ename'] = df_emp_pandas['ename'].astype('str')
df_emp_pandas['deptno'] = df_emp_pandas['deptno'].astype('str')

df_emp_pandas['data'] = df_emp_pandas['ename'] + ' ' + df_emp_pandas['deptno']

df_emp_pandas['transformed_data_1'] = df_emp_pandas['data'].str.replace(str(0), '#')
for i in range(1, 10):
    df_emp_pandas['transformed_data_1'] = df_emp_pandas['transformed_data_1'].str.replace(str(i), '#')


df_emp_pandas['transformed_data_2'] = df_emp_pandas['transformed_data_1'].str.replace('#', '')
df_emp_pandas['transformed_data_3'] = df_emp_pandas[['data', 'transformed_data_2']].apply(lambda x: x[0].replace(x[1], ''), axis=1)

display(df_emp_pandas.sort_values(by='transformed_data_3')[['data']])

data
YODA None
Jonathan None
MILLER 10
KING 10
CLARK 10
SMITH 20
JONES 20
SCOTT 20
ADAMS 20
FORD 20


In [0]:
display(df_emp_pandas)

empno,ename,job,mgr,hiredate,sal,comm,deptno,data,transformed_data_1,transformed_data_2,transformed_data_3
7499,ALLEN,SALESMAN,7698.0,1981-02-20,1600.0,300.0,30.0,ALLEN 30,ALLEN ##,ALLEN,30.0
7521,WARD,SALESMAN,7698.0,1981-02-22,1250.0,500.0,30.0,WARD 30,WARD ##,WARD,30.0
7654,MARTIN,SALESMAN,7698.0,1981-09-28,1250.0,1400.0,30.0,MARTIN 30,MARTIN ##,MARTIN,30.0
7698,BLAKE,MANAGER,7839.0,1981-05-01,2850.0,,30.0,BLAKE 30,BLAKE ##,BLAKE,30.0
7844,TURNER,SALESMAN,7698.0,1981-09-08,1500.0,0.0,30.0,TURNER 30,TURNER ##,TURNER,30.0
7900,JAMES,CLERK,7698.0,1981-12-03,950.0,,30.0,JAMES 30,JAMES ##,JAMES,30.0
1111,YODA,JEDI,,1981-11-17,5000.0,,,YODA None,YODA None,YODA None,
1,Jonathan,Editor,,,,,,Jonathan None,Jonathan None,Jonathan None,
7369,SMITH,CLERK,7902.0,1980-12-17,880.0,,20.0,SMITH 20,SMITH ##,SMITH,20.0
7566,JONES,MANAGER,7839.0,1981-04-02,3273.0,,20.0,JONES 20,JONES ##,JONES,20.0


## 5. Dealing with Nulls in Sorting
- There is a little trick to use when not trying to get below results

### 1. NON-NULL COMM SORTED ASCENDING, ALL NULLS LAST

#### A. `SQL`

In [0]:
%sql
select ename, sal, comm
from (
    select ename, sal, comm, case when comm is null then 0 else 1 end as is_null
    from emp
    ) a
order by is_null desc, cast(comm as int)

ename,sal,comm
TURNER,1500.0,0.0
ALLEN,1600.0,300.0
WARD,1250.0,500.0
MARTIN,1250.0,1400.0
MILLER,4000.0,2000.0
KING,4000.0,2000.0
CLARK,4000.0,2000.0
BLAKE,2850.0,
JAMES,950.0,
YODA,5000.0,


In [0]:
%sql
select ename, sal, comm
from emp
order by cast(comm as int) nulls last

ename,sal,comm
TURNER,1500.0,0.0
ALLEN,1600.0,300.0
WARD,1250.0,500.0
MARTIN,1250.0,1400.0
MILLER,4000.0,2000.0
KING,4000.0,2000.0
CLARK,4000.0,2000.0
BLAKE,2850.0,
JAMES,950.0,
YODA,5000.0,


#### B. `PySpark`

In [0]:
from pyspark.sql.types import IntegerType
emp.select('ename', 'sal', 'comm').orderBy(emp.comm.cast(IntegerType()).asc_nulls_last()).display()

ename,sal,comm
TURNER,1500.0,0.0
ALLEN,1600.0,300.0
WARD,1250.0,500.0
MARTIN,1250.0,1400.0
MILLER,4000.0,2000.0
KING,4000.0,2000.0
CLARK,4000.0,2000.0
BLAKE,2850.0,
JAMES,950.0,
YODA,5000.0,


#### C. `Pandas`

In [0]:
df_emp_pandas = emp.toPandas()
df_emp_pandas['comm'] = df_emp_pandas[df_emp_pandas['comm'].notna()]['comm'].astype('int64')

df_emp_pandas.sort_values(by='comm', na_position='last')[['ename', 'sal', 'comm']].display()

ename,sal,comm
TURNER,1500.0,0.0
ALLEN,1600.0,300.0
WARD,1250.0,500.0
MARTIN,1250.0,1400.0
MILLER,4000.0,2000.0
KING,4000.0,2000.0
CLARK,4000.0,2000.0
BLAKE,2850.0,
JAMES,950.0,
YODA,5000.0,


### 2. NON-NULL COMM SORTED DESCENDING, ALL NULLS FIRST

#### A. `SQL`

In [0]:
%sql
select ename, sal, comm
from (
    select ename, sal, comm, case when comm is null then 0 else 1 end as is_null
    from emp
    ) a
order by is_null, cast(comm as int) desc

ename,sal,comm
BLAKE,2850.0,
JAMES,950.0,
YODA,5000.0,
Jonathan,,
SMITH,880.0,
JONES,3273.0,
SCOTT,3300.0,
ADAMS,1210.0,
FORD,3300.0,
MILLER,4000.0,2000.0


In [0]:
%sql
select ename, sal, comm
from emp
order by cast(comm as int) desc nulls first

ename,sal,comm
BLAKE,2850.0,
JAMES,950.0,
YODA,5000.0,
Jonathan,,
SMITH,880.0,
JONES,3273.0,
SCOTT,3300.0,
ADAMS,1210.0,
FORD,3300.0,
MILLER,4000.0,2000.0


#### B. `PySpark`

In [0]:
from pyspark.sql.types import IntegerType
emp.select('ename', 'sal', 'comm').orderBy(emp.comm.cast(IntegerType()).desc_nulls_first()).display()

ename,sal,comm
BLAKE,2850.0,
JAMES,950.0,
YODA,5000.0,
Jonathan,,
SMITH,880.0,
JONES,3273.0,
SCOTT,3300.0,
ADAMS,1210.0,
FORD,3300.0,
MILLER,4000.0,2000.0


#### C. `Pandas`

In [0]:
df_emp_pandas = emp.toPandas()
df_emp_pandas['comm'] = df_emp_pandas[df_emp_pandas['comm'].notna()]['comm'].astype('int64')

df_emp_pandas.sort_values(by='comm',
                          ascending=False,
                          na_position='first')[['ename', 'sal', 'comm']].display()

ename,sal,comm
BLAKE,2850.0,
JAMES,950.0,
YODA,5000.0,
Jonathan,,
SMITH,880.0,
JONES,3273.0,
SCOTT,3300.0,
ADAMS,1210.0,
FORD,3300.0,
MILLER,4000.0,2000.0


### 3. NON-NULL COMM SORTED ASCENDING, ALL NULLS FIRST

#### A. `SQL`

In [0]:
%sql
select ename, sal, comm 
from emp
order by cast(comm as int) asc

ename,sal,comm
BLAKE,2850.0,
JAMES,950.0,
YODA,5000.0,
Jonathan,,
SMITH,880.0,
JONES,3273.0,
SCOTT,3300.0,
ADAMS,1210.0,
FORD,3300.0,
TURNER,1500.0,0.0


In [0]:
%sql
select ename, sal, comm 
from emp
order by cast(comm as int) nulls first

ename,sal,comm
BLAKE,2850.0,
JAMES,950.0,
YODA,5000.0,
Jonathan,,
SMITH,880.0,
JONES,3273.0,
SCOTT,3300.0,
ADAMS,1210.0,
FORD,3300.0,
TURNER,1500.0,0.0


In [0]:
%sql
select ename, sal, comm
from (
    select ename, sal, comm, case when comm is null then 0 else 1 end as is_null
    from emp
    ) a
order by is_null, cast(comm as int)

ename,sal,comm
BLAKE,2850.0,
JAMES,950.0,
YODA,5000.0,
Jonathan,,
SMITH,880.0,
JONES,3273.0,
SCOTT,3300.0,
ADAMS,1210.0,
FORD,3300.0,
TURNER,1500.0,0.0


#### B. `PySpark`

In [0]:
from pyspark.sql.types import IntegerType
emp.select('ename', 'sal', 'comm').orderBy(emp.comm.cast(IntegerType()).asc_nulls_first()).display()

ename,sal,comm
BLAKE,2850.0,
JAMES,950.0,
YODA,5000.0,
Jonathan,,
SMITH,880.0,
JONES,3273.0,
SCOTT,3300.0,
ADAMS,1210.0,
FORD,3300.0,
TURNER,1500.0,0.0


#### C. `Pandas`

In [0]:
df_emp_pandas = emp.toPandas()
df_emp_pandas['comm'] = df_emp_pandas[df_emp_pandas['comm'].notna()]['comm'].astype('int64')

df_emp_pandas.sort_values(by='comm', na_position='first')[['ename', 'sal', 'comm']].display()

ename,sal,comm
BLAKE,2850.0,
JAMES,950.0,
YODA,5000.0,
Jonathan,,
SMITH,880.0,
JONES,3273.0,
SCOTT,3300.0,
ADAMS,1210.0,
FORD,3300.0,
TURNER,1500.0,0.0


### 4. NON-NULL COMM SORTED DESCENDING, ALL NULLS LAST

#### A. `SQL`

In [0]:
%sql
select ename, sal, comm 
from emp
order by cast(comm as int) desc

ename,sal,comm
MILLER,4000.0,2000.0
KING,4000.0,2000.0
CLARK,4000.0,2000.0
MARTIN,1250.0,1400.0
WARD,1250.0,500.0
ALLEN,1600.0,300.0
TURNER,1500.0,0.0
BLAKE,2850.0,
JAMES,950.0,
YODA,5000.0,


In [0]:
%sql
select ename, sal, comm 
from emp
order by cast(comm as int) desc nulls last

ename,sal,comm
MILLER,4000.0,2000.0
KING,4000.0,2000.0
CLARK,4000.0,2000.0
MARTIN,1250.0,1400.0
WARD,1250.0,500.0
ALLEN,1600.0,300.0
TURNER,1500.0,0.0
BLAKE,2850.0,
JAMES,950.0,
YODA,5000.0,


In [0]:
%sql
select ename, sal, comm
from (
    select ename, sal, comm, case when comm is null then 0 else 1 end as is_null
    from emp
    ) a
order by is_null desc, cast(comm as int) desc

ename,sal,comm
MILLER,4000.0,2000.0
KING,4000.0,2000.0
CLARK,4000.0,2000.0
MARTIN,1250.0,1400.0
WARD,1250.0,500.0
ALLEN,1600.0,300.0
TURNER,1500.0,0.0
BLAKE,2850.0,
JAMES,950.0,
YODA,5000.0,


#### B. `PySpark`

In [0]:
from pyspark.sql.types import IntegerType
emp.select('ename', 'sal', 'comm').orderBy(emp.comm.cast(IntegerType()).desc_nulls_last()).display()

ename,sal,comm
MILLER,4000.0,2000.0
KING,4000.0,2000.0
CLARK,4000.0,2000.0
MARTIN,1250.0,1400.0
WARD,1250.0,500.0
ALLEN,1600.0,300.0
TURNER,1500.0,0.0
BLAKE,2850.0,
JAMES,950.0,
YODA,5000.0,


#### C. `Pandas`

In [0]:
df_emp_pandas = emp.toPandas()
df_emp_pandas['comm'] = df_emp_pandas[df_emp_pandas['comm'].notna()]['comm'].astype('int64')

df_emp_pandas.sort_values(by='comm',
                          ascending=False,
                          na_position='last')[['ename', 'sal', 'comm']].display()

ename,sal,comm
MILLER,4000.0,2000.0
KING,4000.0,2000.0
CLARK,4000.0,2000.0
MARTIN,1250.0,1400.0
WARD,1250.0,500.0
ALLEN,1600.0,300.0
TURNER,1500.0,0.0
BLAKE,2850.0,
JAMES,950.0,
YODA,5000.0,


## 6. Sorting on a Data-Dependent Key or Sorting on Conditional Logic

### A. `SQL`

In [0]:
%sql
select ename, sal, job , comm
from emp
order by case when job='SALESMAN' then cast(comm as int) else cast(sal as int) end

ename,sal,job,comm
Jonathan,,Editor,
TURNER,1500.0,SALESMAN,0.0
ALLEN,1600.0,SALESMAN,300.0
WARD,1250.0,SALESMAN,500.0
SMITH,880.0,CLERK,
JAMES,950.0,CLERK,
ADAMS,1210.0,CLERK,
MARTIN,1250.0,SALESMAN,1400.0
BLAKE,2850.0,MANAGER,
JONES,3273.0,MANAGER,


In [0]:
%sql
select ename, sal, job , comm, case when job='SALESMAN' then cast(comm as int) else cast(sal as int) end as ordered
from emp
order by ordered

ename,sal,job,comm,ordered
Jonathan,,Editor,,
TURNER,1500.0,SALESMAN,0.0,0.0
ALLEN,1600.0,SALESMAN,300.0,300.0
WARD,1250.0,SALESMAN,500.0,500.0
SMITH,880.0,CLERK,,880.0
JAMES,950.0,CLERK,,950.0
ADAMS,1210.0,CLERK,,1210.0
MARTIN,1250.0,SALESMAN,1400.0,1400.0
BLAKE,2850.0,MANAGER,,2850.0
JONES,3273.0,MANAGER,,3273.0


### B. `PySpark`

In [0]:
from pyspark.sql.functions import when
from pyspark.sql.types import IntegerType
emp.select('ename', 'sal', 'job', 'comm').orderBy(when(emp.job=='SALESMAN',emp.comm.cast(IntegerType())) \
                                               .otherwise(emp.sal.cast(IntegerType()))).display()

ename,sal,job,comm
Jonathan,,Editor,
TURNER,1500.0,SALESMAN,0.0
ALLEN,1600.0,SALESMAN,300.0
WARD,1250.0,SALESMAN,500.0
SMITH,880.0,CLERK,
JAMES,950.0,CLERK,
ADAMS,1210.0,CLERK,
MARTIN,1250.0,SALESMAN,1400.0
BLAKE,2850.0,MANAGER,
JONES,3273.0,MANAGER,


### C. `Pandas`

In [0]:
df_emp_pandas = emp.toPandas()

df_emp_pandas['sal'] = df_emp_pandas[df_emp_pandas['sal'].notna()]['sal'].astype('int64')
df_emp_pandas['comm'] = df_emp_pandas[df_emp_pandas['comm'].notna()]['comm'].astype('int64')


df_emp_pandas['ordered'] = df_emp_pandas[['sal', 'comm', 'job']].apply(lambda x: x[1] if x[2]=="SALESMAN" else x[0],
                                                                       axis=1)

df_emp_pandas.sort_values(by=['ordered'],
                          na_position='first')[['ename', 'sal', 'job', 'comm']].display()

ename,sal,job,comm
Jonathan,,Editor,
TURNER,1500.0,SALESMAN,0.0
ALLEN,1600.0,SALESMAN,300.0
WARD,1250.0,SALESMAN,500.0
SMITH,880.0,CLERK,
JAMES,950.0,CLERK,
ADAMS,1210.0,CLERK,
MARTIN,1250.0,SALESMAN,1400.0
BLAKE,2850.0,MANAGER,
JONES,3273.0,MANAGER,
