# Create `emp` table

In [0]:
from pyspark.sql.functions import when, col

# Read emp csv file
emp = spark.read.csv("/FileStore/tables/emp.csv", inferSchema=True, header=True)

# Replace 'NULL' strings with actual nulls
for column in emp.columns:
    emp = emp.withColumn(column, when(col(column)=="NULL", None).otherwise(col(column)))

# Create Temporary view for emp
emp.createOrReplaceTempView("emp")

# Create `dept` table

In [0]:
from pyspark.sql.functions import when, col

# Read emp csv file
dept = spark.read.csv("/FileStore/tables/dept.csv", inferSchema=True, header=True)

# Replace 'NULL' strings with actual nulls
for column in dept.columns:
    dept = dept.withColumn(column, when(col(column)=="NULL", None).otherwise(col(column)))

# Create Temporary view for emp
dept.createOrReplaceTempView("dept")

# Queries

- Customizing how query looks. By understanding how to control how result set is organized , more readable and meaningful data can be inferred

## 1. Returning Query Results in Specific Order

### A. `SQL`
- By default, `ORDER BY` uses ascending sort. To sort by descending, use `desc` keyword

In [0]:
%sql
select ename, job, sal
from emp
where deptno=10
order by ename

ename,job,sal
CLARK,MANAGER,4000
KING,PRESIDENT,4000
MILLER,CLERK,4000


In [0]:
%sql
select ename, job, sal
from emp
where deptno=10
order by ename desc

ename,job,sal
MILLER,CLERK,4000
KING,PRESIDENT,4000
CLARK,MANAGER,4000


### B. `PySpark`

In [0]:
from pyspark.sql.functions import col
display(emp.select('ename', 'job', 'sal').filter(emp.deptno=='10').orderBy(col('ename')))

ename,job,sal
CLARK,MANAGER,4000
KING,PRESIDENT,4000
MILLER,CLERK,4000


In [0]:
from pyspark.sql.functions import col
display(emp.select('ename', 'job', 'sal').filter(emp.deptno=='10').orderBy(col('ename').desc()))

ename,job,sal
MILLER,CLERK,4000
KING,PRESIDENT,4000
CLARK,MANAGER,4000


### C. `Pandas`

In [0]:
df_emp_pandas = emp.toPandas()

df_emp_pandas['deptno'] = df_emp_pandas[df_emp_pandas['deptno'].notna()]['deptno'].astype('int64')

display(df_emp_pandas[df_emp_pandas['deptno']==10][['ename', 'job', 'sal']].sort_values(by='ename', ascending=False))

ename,job,sal
MILLER,CLERK,4000
KING,PRESIDENT,4000
CLARK,MANAGER,4000


## 2. Sorting by Multiple Fields

### A. `SQL`
- Order of precedence in `ORDER BY` is from left to right
- Generally permitted to order by column not in `SELECT` list by explicitly naming the column but not possible if `GROUP BY` or `DISTINCT` is used

In [0]:
%sql
select empno, deptno
from emp
where deptno=10
order by empno, sal desc

empno,deptno
7782,10
7839,10
7934,10


### B. `PySpark`

In [0]:
from pyspark.sql.functions import col
display(emp.select('empno', 'deptno').filter(col('deptno')==10).orderBy(col('empno').desc(), col('sal').desc()))

empno,deptno
7934,10
7839,10
7782,10


### C. `Pandas`

In [0]:
df_emp_pandas = emp.toPandas()

df_emp_pandas['deptno'] = df_emp_pandas[df_emp_pandas['deptno'].notna()]['deptno'].astype('int64')

display(df_emp_pandas[df_emp_pandas['deptno']==10].sort_values(by=['empno', 'sal'], ascending=[False, False])[['empno', 'deptno']])

empno,deptno
7934,10.0
7839,10.0
7782,10.0


## 3. Sorting by Substrings

### A. `SQL`
- Sorting by last 2 characters in string
- psql uses `SUBSTR` function to get a substring where first argument is the column and second is the start of string

In [0]:
%sql
select ename, job
from emp
order by substr(job, length(job)-1)

ename,job
ALLEN,SALESMAN
WARD,SALESMAN
MARTIN,SALESMAN
TURNER,SALESMAN
YODA,JEDI
BLAKE,MANAGER
JONES,MANAGER
CLARK,MANAGER
KING,PRESIDENT
JAMES,CLERK


In [0]:
%sql
select ename, job
from emp
order by substr(job, -2)

ename,job
ALLEN,SALESMAN
WARD,SALESMAN
MARTIN,SALESMAN
TURNER,SALESMAN
YODA,JEDI
BLAKE,MANAGER
JONES,MANAGER
CLARK,MANAGER
KING,PRESIDENT
JAMES,CLERK


### B. `PySpark`
- Use negative indexes to start from last

In [0]:
from pyspark.sql.functions import col
display(emp.select('ename', 'job').orderBy(col('job').substr(-2, 2)))

ename,job
ALLEN,SALESMAN
WARD,SALESMAN
MARTIN,SALESMAN
TURNER,SALESMAN
YODA,JEDI
BLAKE,MANAGER
JONES,MANAGER
CLARK,MANAGER
KING,PRESIDENT
JAMES,CLERK


### C. `Pandas`

In [0]:
df_emp_pandas = emp.toPandas()

df_emp_pandas['sort_job'] = df_emp_pandas['job'].astype('str').str[-2:]

display(df_emp_pandas.sort_values(by='sort_job')[['ename', 'job']])

ename,job
ALLEN,SALESMAN
WARD,SALESMAN
MARTIN,SALESMAN
TURNER,SALESMAN
YODA,JEDI
BLAKE,MANAGER
JONES,MANAGER
CLARK,MANAGER
KING,PRESIDENT
JAMES,CLERK


## 4. Sorting Mixed Alphanumeric Data

### Creating a View

In [0]:
%sql
create temp view V
as
select ename||' '||deptno as data
from emp;

select * from V

data
ALLEN 30
WARD 30
MARTIN 30
BLAKE 30
TURNER 30
JAMES 30
""
""
SMITH 20
JONES 20


### `TRANSLATE(string, from, to)` function
1) **string**
is a string subjected to translation.

2) **from**
is a set of characters in the first argument (string) that should be replaced.

3) **to**
is a set of characters that replaces the from in the string.

Notice that if from is longer than to, the TRANSLATE() function removes the occurrences of the extra characters in from.

### `REPLACE(source, old_text, new_text)` function
1) **source** is a string where you want to replace.
2) **old_text** is the text that you want to search and replace. If the old_text appears multiple times in the string, all of its occurrences will be replaced.
3) **new_text** is the new text that will replace the old text (**old_text**).

> `TRANSLATE` replaces characters while `REPLACE` replaces the whole text

### Order By Deptno

In [0]:
%sql
select data
from V
order by replace(data,replace(translate(data, '0123456789', '##########'),'#',''), '')

data
""
""
MILLER 10
KING 10
CLARK 10
SMITH 20
JONES 20
SCOTT 20
ADAMS 20
FORD 20


In [0]:
%sql
select data
from V
order by replace(translate(data, '0123456789', '##########'),'#','')

data
""
""
ADAMS 20
ALLEN 30
BLAKE 30
CLARK 10
FORD 20
JAMES 30
JONES 20
KING 10


## 5. Dealing with Nulls in Sorting
- There is a little trick to use when not trying to get below results

### NON-NULL COMM SORTED ASCENDING, ALL NULLS LAST

In [0]:
%%sql
select ename, sal, comm
from emp
order by 3

 * postgresql://user_jupyter:***@127.0.0.1/Practice
14 rows affected.


ename,sal,comm
TURNER,1500,0.0
ALLEN,1600,300.0
WARD,1250,500.0
MARTIN,1250,1400.0
SCOTT,3000,
KING,5000,
ADAMS,1100,
JAMES,950,
FORD,3000,
SMITH,800,


### NON-NULL COMM SORTED DESCENDING, ALL NULLS FIRST

In [0]:
%%sql
select ename, sal, comm
from emp
order by 3 desc

 * postgresql://user_jupyter:***@127.0.0.1/Practice
14 rows affected.


ename,sal,comm
MILLER,1300,
ADAMS,1100,
JAMES,950,
FORD,3000,
SMITH,800,
JONES,2975,
BLAKE,2850,
CLARK,2450,
SCOTT,3000,
KING,5000,


### NON-NULL COMM SORTED ASCENDING, ALL NULLS FIRST

In [0]:
%%sql
select ename, sal, comm
from (
    select ename, sal, comm, case when comm is null then 0 else 1 end as is_null
    from emp
    ) a
order by is_null, comm

 * postgresql://user_jupyter:***@127.0.0.1/Practice
14 rows affected.


ename,sal,comm,is_null
MILLER,1300,,0
ADAMS,1100,,0
JAMES,950,,0
FORD,3000,,0
SMITH,800,,0
JONES,2975,,0
BLAKE,2850,,0
CLARK,2450,,0
SCOTT,3000,,0
KING,5000,,0


### NON-NULL COMM SORTED DESCENDING, ALL NULLS LAST

In [0]:
%%sql
select ename, sal, comm
from (
    select ename, sal, comm, case when comm is null then 0 else 1 end as is_null
    from emp
    ) a
order by is_null desc, comm desc

 * postgresql://user_jupyter:***@127.0.0.1/Practice
14 rows affected.


ename,sal,comm
MARTIN,1250,1400.0
WARD,1250,500.0
ALLEN,1600,300.0
TURNER,1500,0.0
SCOTT,3000,
KING,5000,
ADAMS,1100,
JAMES,950,
FORD,3000,
SMITH,800,


## 6. Sorting on a Data-Dependent Key or Sorting on Conditional Logic

In [0]:
%%sql
select ename, sal, job , comm
from emp
order by case when job='SALESMAN' then comm else sal end

 * postgresql://user_jupyter:***@127.0.0.1/Practice
14 rows affected.


ename,sal,job,comm
TURNER,1500,SALESMAN,0.0
ALLEN,1600,SALESMAN,300.0
WARD,1250,SALESMAN,500.0
SMITH,800,CLERK,
JAMES,950,CLERK,
ADAMS,1100,CLERK,
MILLER,1300,CLERK,
MARTIN,1250,SALESMAN,1400.0
CLARK,2450,MANAGER,
BLAKE,2850,MANAGER,


In [0]:
%%sql
select ename, sal, job , comm, case when job='SALESMAN' then comm else sal end as ordered
from emp
order by ordered

 * postgresql://user_jupyter:***@127.0.0.1/Practice
14 rows affected.


ename,sal,job,comm,ordered
TURNER,1500,SALESMAN,0.0,0
ALLEN,1600,SALESMAN,300.0,300
WARD,1250,SALESMAN,500.0,500
SMITH,800,CLERK,,800
JAMES,950,CLERK,,950
ADAMS,1100,CLERK,,1100
MILLER,1300,CLERK,,1300
MARTIN,1250,SALESMAN,1400.0,1400
CLARK,2450,MANAGER,,2450
BLAKE,2850,MANAGER,,2850
