# Create `emp` table

In [0]:
from pyspark.sql.functions import when, col

# Read emp csv file
emp = spark.read.csv("/FileStore/tables/emp.csv", inferSchema=True, header=True)

# Replace 'NULL' strings with actual nulls
for column in emp.columns:
    emp = emp.withColumn(column, when(col(column)=="NULL", None).otherwise(col(column)))

# Create Temporary view for emp
emp.createOrReplaceTempView("emp")

# Create `dept` table

In [0]:
from pyspark.sql.functions import when, col

# Read emp csv file
dept = spark.read.csv("/FileStore/tables/dept.csv", inferSchema=True, header=True)

# Replace 'NULL' strings with actual nulls
for column in dept.columns:
    dept = dept.withColumn(column, when(col(column)=="NULL", None).otherwise(col(column)))

# Create Temporary view for emp
dept.createOrReplaceTempView("dept")

# Queries

## 1. Retrieving All Rows and Columns from a Table

### A. `SQL` 
- `*` has special meaning in SQL - Returns every column for table
- Better to specify all columns for better readability for other people and for debugging purposes in case of change in data

In [0]:
%sql 
SELECT * FROM EMP

empno,ename,job,mgr,hiredate,sal,comm,deptno
7499,ALLEN,SALESMAN,7698.0,1981-02-20,1600.0,300.0,30.0
7521,WARD,SALESMAN,7698.0,1981-02-22,1250.0,500.0,30.0
7654,MARTIN,SALESMAN,7698.0,1981-09-28,1250.0,1400.0,30.0
7698,BLAKE,MANAGER,7839.0,1981-05-01,2850.0,,30.0
7844,TURNER,SALESMAN,7698.0,1981-09-08,1500.0,0.0,30.0
7900,JAMES,CLERK,7698.0,1981-12-03,950.0,,30.0
1111,YODA,JEDI,,1981-11-17,5000.0,,
1,Jonathan,Editor,,,,,
7369,SMITH,CLERK,7902.0,1980-12-17,880.0,,20.0
7566,JONES,MANAGER,7839.0,1981-04-02,3273.0,,20.0


### B. `PySpark`
  - Use `show` method function

In [0]:
emp.show()

+-----+--------+---------+----+----------+----+----+------+
|empno|   ename|      job| mgr|  hiredate| sal|comm|deptno|
+-----+--------+---------+----+----------+----+----+------+
| 7499|   ALLEN| SALESMAN|7698|1981-02-20|1600| 300|    30|
| 7521|    WARD| SALESMAN|7698|1981-02-22|1250| 500|    30|
| 7654|  MARTIN| SALESMAN|7698|1981-09-28|1250|1400|    30|
| 7698|   BLAKE|  MANAGER|7839|1981-05-01|2850|null|    30|
| 7844|  TURNER| SALESMAN|7698|1981-09-08|1500|   0|    30|
| 7900|   JAMES|    CLERK|7698|1981-12-03| 950|null|    30|
| 1111|    YODA|     JEDI|null|1981-11-17|5000|null|  null|
|    1|Jonathan|   Editor|null|      null|null|null|  null|
| 7369|   SMITH|    CLERK|7902|1980-12-17| 880|null|    20|
| 7566|   JONES|  MANAGER|7839|1981-04-02|3273|null|    20|
| 7788|   SCOTT|  ANALYST|7566|1982-12-09|3300|null|    20|
| 7876|   ADAMS|    CLERK|7788|1983-01-12|1210|null|    20|
| 7902|    FORD|  ANALYST|7566|1981-12-03|3300|null|    20|
| 7934|  MILLER|    CLERK|7782|1982-01-2

### C. `Pandas`
- Use `head` method or simply return the dataframe

In [0]:
emp.toPandas()

Unnamed: 0,empno,ename,job,mgr,hiredate,sal,comm,deptno
0,7499,ALLEN,SALESMAN,7698.0,1981-02-20,1600.0,300.0,30.0
1,7521,WARD,SALESMAN,7698.0,1981-02-22,1250.0,500.0,30.0
2,7654,MARTIN,SALESMAN,7698.0,1981-09-28,1250.0,1400.0,30.0
3,7698,BLAKE,MANAGER,7839.0,1981-05-01,2850.0,,30.0
4,7844,TURNER,SALESMAN,7698.0,1981-09-08,1500.0,0.0,30.0
5,7900,JAMES,CLERK,7698.0,1981-12-03,950.0,,30.0
6,1111,YODA,JEDI,,1981-11-17,5000.0,,
7,1,Jonathan,Editor,,,,,
8,7369,SMITH,CLERK,7902.0,1980-12-17,880.0,,20.0
9,7566,JONES,MANAGER,7839.0,1981-04-02,3273.0,,20.0


## 2. Retrieving a Subset of Rows from a Table

### A. `SQL`
- Use `WHERE` clause to specify which rows to keep

In [0]:
%sql
select * from EMP where deptno=10

empno,ename,job,mgr,hiredate,sal,comm,deptno
7934,MILLER,CLERK,7782.0,1982-01-23,4000,2000,10
7839,KING,PRESIDENT,,1981-11-17,4000,2000,10
7782,CLARK,MANAGER,7839.0,1981-06-09,4000,2000,10


In [0]:
%sql
describe emp

col_name,data_type,comment
empno,int,
ename,string,
job,string,
mgr,string,
hiredate,string,
sal,string,
comm,string,
deptno,string,


### B. `PySpark`
  - Use `filter` or `where` 
  - datatypes dont matter much like SQL

In [0]:
emp.filter(emp.deptno==10).show()

+-----+------+---------+----+----------+----+----+------+
|empno| ename|      job| mgr|  hiredate| sal|comm|deptno|
+-----+------+---------+----+----------+----+----+------+
| 7934|MILLER|    CLERK|7782|1982-01-23|4000|2000|    10|
| 7839|  KING|PRESIDENT|null|1981-11-17|4000|2000|    10|
| 7782| CLARK|  MANAGER|7839|1981-06-09|4000|2000|    10|
+-----+------+---------+----+----------+----+----+------+



In [0]:
emp.filter(emp.deptno=='10').show()

+-----+------+---------+----+----------+----+----+------+
|empno| ename|      job| mgr|  hiredate| sal|comm|deptno|
+-----+------+---------+----+----------+----+----+------+
| 7934|MILLER|    CLERK|7782|1982-01-23|4000|2000|    10|
| 7839|  KING|PRESIDENT|null|1981-11-17|4000|2000|    10|
| 7782| CLARK|  MANAGER|7839|1981-06-09|4000|2000|    10|
+-----+------+---------+----+----------+----+----+------+



In [0]:
emp.dtypes

Out[17]: [('empno', 'int'),
 ('ename', 'string'),
 ('job', 'string'),
 ('mgr', 'string'),
 ('hiredate', 'string'),
 ('sal', 'string'),
 ('comm', 'string'),
 ('deptno', 'string')]

### C. `Pandas`
- Use `masks`
- Be careful while using coverting as datatypes may change

In [0]:
df_emp = emp.toPandas()
df_emp[df_emp['deptno']=='10']

Unnamed: 0,empno,ename,job,mgr,hiredate,sal,comm,deptno
13,7934,MILLER,CLERK,7782.0,1982-01-23,4000,2000,10
14,7839,KING,PRESIDENT,,1981-11-17,4000,2000,10
15,7782,CLARK,MANAGER,7839.0,1981-06-09,4000,2000,10


In [0]:
df_emp.dtypes

Out[16]: empno        int32
ename       object
job         object
mgr         object
hiredate    object
sal         object
comm        object
deptno      object
dtype: object

## 3. Finding Rows that Satisfy Multiple Columns

### A. `SQL` 
- Find all employees in department 10, along with any employees who earn commission, along with any employees in department 20 who earn at most $2000

In [0]:
%sql
select * 
    from EMP
where
    deptno = 10
    or comm is not null
    or (deptno=20 and sal <=2000)

empno,ename,job,mgr,hiredate,sal,comm,deptno
7499,ALLEN,SALESMAN,7698.0,1981-02-20,1600,300.0,30
7521,WARD,SALESMAN,7698.0,1981-02-22,1250,500.0,30
7654,MARTIN,SALESMAN,7698.0,1981-09-28,1250,1400.0,30
7844,TURNER,SALESMAN,7698.0,1981-09-08,1500,0.0,30
7369,SMITH,CLERK,7902.0,1980-12-17,880,,20
7876,ADAMS,CLERK,7788.0,1983-01-12,1210,,20
7934,MILLER,CLERK,7782.0,1982-01-23,4000,2000.0,10
7839,KING,PRESIDENT,,1981-11-17,4000,2000.0,10
7782,CLARK,MANAGER,7839.0,1981-06-09,4000,2000.0,10


### B. `PySpark`

In [0]:
emp.filter( (emp.deptno==10) | (emp.comm.isNotNull()) | ( (emp.deptno==20) & (emp.sal <=2000) )).show()

+-----+------+---------+----+----------+----+----+------+
|empno| ename|      job| mgr|  hiredate| sal|comm|deptno|
+-----+------+---------+----+----------+----+----+------+
| 7499| ALLEN| SALESMAN|7698|1981-02-20|1600| 300|    30|
| 7521|  WARD| SALESMAN|7698|1981-02-22|1250| 500|    30|
| 7654|MARTIN| SALESMAN|7698|1981-09-28|1250|1400|    30|
| 7844|TURNER| SALESMAN|7698|1981-09-08|1500|   0|    30|
| 7369| SMITH|    CLERK|7902|1980-12-17| 880|null|    20|
| 7876| ADAMS|    CLERK|7788|1983-01-12|1210|null|    20|
| 7934|MILLER|    CLERK|7782|1982-01-23|4000|2000|    10|
| 7839|  KING|PRESIDENT|null|1981-11-17|4000|2000|    10|
| 7782| CLARK|  MANAGER|7839|1981-06-09|4000|2000|    10|
+-----+------+---------+----+----------+----+----+------+



### C. `Pandas`
- sal column is converted to integer and then worked upon

In [0]:
# Convert sal  to number
df_emp['sal'] = df_emp[df_emp['sal'].notna()]['sal'].astype('int64')

df_emp[ (df_emp.deptno=='10') | (df_emp.comm.notna()) | ( (df_emp.deptno=='20') & (df_emp.sal <=2000) )]

Unnamed: 0,empno,ename,job,mgr,hiredate,sal,comm,deptno
0,7499,ALLEN,SALESMAN,7698.0,1981-02-20,1600.0,300.0,30
1,7521,WARD,SALESMAN,7698.0,1981-02-22,1250.0,500.0,30
2,7654,MARTIN,SALESMAN,7698.0,1981-09-28,1250.0,1400.0,30
4,7844,TURNER,SALESMAN,7698.0,1981-09-08,1500.0,0.0,30
8,7369,SMITH,CLERK,7902.0,1980-12-17,880.0,,20
11,7876,ADAMS,CLERK,7788.0,1983-01-12,1210.0,,20
13,7934,MILLER,CLERK,7782.0,1982-01-23,4000.0,2000.0,10
14,7839,KING,PRESIDENT,,1981-11-17,4000.0,2000.0,10
15,7782,CLARK,MANAGER,7839.0,1981-06-09,4000.0,2000.0,10


## 4. Retrieving a Subset of Columns from a Table

### A. `SQL` 
- Ensures no extraneous data is returned
- Optimizes performance

In [0]:
%sql
select ename, deptno, sal
from emp

ename,deptno,sal
ALLEN,30.0,1600.0
WARD,30.0,1250.0
MARTIN,30.0,1250.0
BLAKE,30.0,2850.0
TURNER,30.0,1500.0
JAMES,30.0,950.0
YODA,,5000.0
Jonathan,,
SMITH,20.0,880.0
JONES,20.0,3273.0


### B. `PySpark`

In [0]:
emp.select('ename', 'deptno', 'sal').show()

+--------+------+----+
|   ename|deptno| sal|
+--------+------+----+
|   ALLEN|    30|1600|
|    WARD|    30|1250|
|  MARTIN|    30|1250|
|   BLAKE|    30|2850|
|  TURNER|    30|1500|
|   JAMES|    30| 950|
|    YODA|  null|5000|
|Jonathan|  null|null|
|   SMITH|    20| 880|
|   JONES|    20|3273|
|   SCOTT|    20|3300|
|   ADAMS|    20|1210|
|    FORD|    20|3300|
|  MILLER|    10|4000|
|    KING|    10|4000|
|   CLARK|    10|4000|
+--------+------+----+



### C. `Pandas`

In [0]:
emp.toPandas()[['ename', 'deptno', 'sal']]

Unnamed: 0,ename,deptno,sal
0,ALLEN,30.0,1600.0
1,WARD,30.0,1250.0
2,MARTIN,30.0,1250.0
3,BLAKE,30.0,2850.0
4,TURNER,30.0,1500.0
5,JAMES,30.0,950.0
6,YODA,,5000.0
7,Jonathan,,
8,SMITH,20.0,880.0
9,JONES,20.0,3273.0


## 5. Providing Meaningful Names for Columns

### A.`SQL`
- Meaningful names are more readable and understandable
- Using `as` keyword is known as aliasing columns. Creating good aliases can make query and its results understandable to others

In [0]:
%sql
select sal as salary, comm as commission
from emp

salary,commission
1600.0,300.0
1250.0,500.0
1250.0,1400.0
2850.0,
1500.0,0.0
950.0,
5000.0,
,
880.0,
3273.0,


### B.`PySpark`
- Use `alias`
- Or columns can be renamed by `withColumnRenamed`

In [0]:
display(emp.select(emp.sal.alias('salary'), emp.comm.alias('commission')))

salary,commission
1600.0,300.0
1250.0,500.0
1250.0,1400.0
2850.0,
1500.0,0.0
950.0,
5000.0,
,
880.0,
3273.0,


In [0]:
df_emp = emp.withColumnRenamed('sal', 'salary')
df_emp = df_emp.withColumnRenamed('comm', 'commission')
display(df_emp.select('salary', 'commission'))

salary,commission
1600.0,300.0
1250.0,500.0
1250.0,1400.0
2850.0,
1500.0,0.0
950.0,
5000.0,
,
880.0,
3273.0,


### C.`Pandas`

In [0]:
column_names = ['salary', 'commission']
df_emp = emp.select('sal', 'comm').toPandas()
df_emp.columns = column_names
display(df_emp)

salary,commission
1600.0,300.0
1250.0,500.0
1250.0,1400.0
2850.0,
1500.0,0.0
950.0,
5000.0,
,
880.0,
3273.0,


## 6. Referencing an Aliased Column in the Where clause

### A. `SQL`
- Need to use subqueries. otherwise will throw error
- `WHERE` clause is evaluated before the `SELECT`
- Alisases for Columns are therefore not evaluated until after `WHERE` clause
- Inline view is aliased `x`

In [0]:
%sql
select *
from (
    select sal as salary, comm as commission
    from EMP
    ) x
where salary < 5000

salary,commission
1600,300.0
1250,500.0
1250,1400.0
2850,
1500,0.0
950,
880,
3273,
3300,
1210,


### B. `PySpark`

In [0]:
spark_df_emp = emp.select(emp.sal.alias('salary'), emp.comm.alias('commission'))
spark_df_emp.filter(spark_df_emp.salary<5000).display()

salary,commission
1600,300.0
1250,500.0
1250,1400.0
2850,
1500,0.0
950,
880,
3273,
3300,
1210,


### C. `Pandas`
- Work by renaming columns and then by filtering

In [0]:
pandas_df_emp = emp.toPandas()
pandas_df_emp.rename(columns={'sal': 'salary', 'comm': 'commission'}, inplace=True)

pandas_df_emp['salary'] = pandas_df_emp[pandas_df_emp['salary'].notna()]['salary'].astype('int64')

display(pandas_df_emp[pandas_df_emp['salary']<5000][['salary', 'commission']])

salary,commission
1600.0,300.0
1250.0,500.0
1250.0,1400.0
2850.0,
1500.0,0.0
950.0,
880.0,
3273.0,
3300.0,
1210.0,


## 7. Concatenating Column Values

### A. `SQL`
- psql uses `||` as concatenation operator

In [0]:
%sql
select ename||' WORKS AS A '|| job as msg
from emp
where deptno=10

msg
MILLER WORKS AS A CLERK
KING WORKS AS A PRESIDENT
CLARK WORKS AS A MANAGER


### B. `PySpark`

In [0]:
from pyspark.sql.functions import concat, col, lit
display(emp.filter(col('deptno')==10).select(concat(col('ename'), lit(' WORKS AS A '), col('job')).alias('msg')))

msg
MILLER WORKS AS A CLERK
KING WORKS AS A PRESIDENT
CLARK WORKS AS A MANAGER


### C. `Pandas`

In [0]:
pandas_df_emp = emp.toPandas()
pandas_df_emp['msg'] = pandas_df_emp['ename'] + ' WORKS AS A ' + pandas_df_emp['job']
display(pandas_df_emp[pandas_df_emp['deptno']=='10'][['msg']])

msg
MILLER WORKS AS A CLERK
KING WORKS AS A PRESIDENT
CLARK WORKS AS A MANAGER


## 8. Use Conditional Logic in SELECT statement

### A. `SQL`
- Performing `IF-ELSE` operations on values in `SELECT` statement
- `ELSE` clause is optional. If omitted, `CASE` expression will return `NULL` for any row that doesn't satisfy the condition

In [0]:
%sql
select ename, sal,
    case when sal<=2000 then 'UNDERPAID'
    when sal>=4000 then 'OVERPAID'
    else 'OK' 
    end as status
from emp

ename,sal,status
ALLEN,1600.0,UNDERPAID
WARD,1250.0,UNDERPAID
MARTIN,1250.0,UNDERPAID
BLAKE,2850.0,OK
TURNER,1500.0,UNDERPAID
JAMES,950.0,UNDERPAID
YODA,5000.0,OVERPAID
Jonathan,,OK
SMITH,880.0,UNDERPAID
JONES,3273.0,OK


In [0]:
%sql
select ename, sal,
    case when sal<=2000 then 'UNDERPAID'
    when sal>=4000 then 'OVERPAID'
--    else 'OK' 
    end as status
from emp

ename,sal,status
ALLEN,1600.0,UNDERPAID
WARD,1250.0,UNDERPAID
MARTIN,1250.0,UNDERPAID
BLAKE,2850.0,
TURNER,1500.0,UNDERPAID
JAMES,950.0,UNDERPAID
YODA,5000.0,OVERPAID
Jonathan,,
SMITH,880.0,UNDERPAID
JONES,3273.0,


### B. `PySpark`
- Use `when` and `otherwise`

In [0]:
from pyspark.sql.functions import when, col, lit
display(emp.withColumn( 'status', when(col('sal')<=2000, 'UNDERPAID').when(col('sal')>=4000, 'OVERPAID').otherwise('OK')).select('ename', 'sal', 'status'))

ename,sal,status
ALLEN,1600.0,UNDERPAID
WARD,1250.0,UNDERPAID
MARTIN,1250.0,UNDERPAID
BLAKE,2850.0,OK
TURNER,1500.0,UNDERPAID
JAMES,950.0,UNDERPAID
YODA,5000.0,OVERPAID
Jonathan,,OK
SMITH,880.0,UNDERPAID
JONES,3273.0,OK


In [0]:
from pyspark.sql.functions import when, col, lit
display(emp.withColumn( 'status', when(col('sal')<=2000, 'UNDERPAID').when(col('sal')>=4000, 'OVERPAID')).select('ename', 'sal', 'status'))

ename,sal,status
ALLEN,1600.0,UNDERPAID
WARD,1250.0,UNDERPAID
MARTIN,1250.0,UNDERPAID
BLAKE,2850.0,
TURNER,1500.0,UNDERPAID
JAMES,950.0,UNDERPAID
YODA,5000.0,OVERPAID
Jonathan,,
SMITH,880.0,UNDERPAID
JONES,3273.0,


### C. `Pandas`

In [0]:
pandas_emp_df = emp.toPandas()

pandas_emp_df['sal'] = pandas_emp_df[pandas_emp_df['sal'].notna()]['sal'].astype('int64')

pandas_emp_df['status'] = pandas_emp_df['sal'].apply( lambda x: 'UNDERPAID' if x<=2000 else 'OVERPAID' if x>=4000 else 'OK')
display(pandas_emp_df[['ename', 'sal', 'status']])

ename,sal,status
ALLEN,1600.0,UNDERPAID
WARD,1250.0,UNDERPAID
MARTIN,1250.0,UNDERPAID
BLAKE,2850.0,OK
TURNER,1500.0,UNDERPAID
JAMES,950.0,UNDERPAID
YODA,5000.0,OVERPAID
Jonathan,,OK
SMITH,880.0,UNDERPAID
JONES,3273.0,OK


## 9. Limiting the Number of Rows Returned

### A. `SQL` 
- psql uses `LIMIT` keyword

In [0]:
%sql
select *
from EMP
limit 5

empno,ename,job,mgr,hiredate,sal,comm,deptno
7499,ALLEN,SALESMAN,7698,1981-02-20,1600,300.0,30
7521,WARD,SALESMAN,7698,1981-02-22,1250,500.0,30
7654,MARTIN,SALESMAN,7698,1981-09-28,1250,1400.0,30
7698,BLAKE,MANAGER,7839,1981-05-01,2850,,30
7844,TURNER,SALESMAN,7698,1981-09-08,1500,0.0,30


### B. `PySpark`

In [0]:
display(emp.head(5))

empno,ename,job,mgr,hiredate,sal,comm,deptno
7499,ALLEN,SALESMAN,7698,1981-02-20,1600,300.0,30
7521,WARD,SALESMAN,7698,1981-02-22,1250,500.0,30
7654,MARTIN,SALESMAN,7698,1981-09-28,1250,1400.0,30
7698,BLAKE,MANAGER,7839,1981-05-01,2850,,30
7844,TURNER,SALESMAN,7698,1981-09-08,1500,0.0,30


### C. `Pandas`

In [0]:
display(emp.toPandas().head(5))

empno,ename,job,mgr,hiredate,sal,comm,deptno
7499,ALLEN,SALESMAN,7698,1981-02-20,1600,300.0,30
7521,WARD,SALESMAN,7698,1981-02-22,1250,500.0,30
7654,MARTIN,SALESMAN,7698,1981-09-28,1250,1400.0,30
7698,BLAKE,MANAGER,7839,1981-05-01,2850,,30
7844,TURNER,SALESMAN,7698,1981-09-08,1500,0.0,30


## 10. Returning `n` Random Records from a Table

### A. `SQL`
- psql uses `RANDOM()` function in conjunction with `ORDER BY` and `LIMIT`
- `ORDER BY` clause can accept a function's return value and use it to change the order of result set

In [0]:
%sql
select ename, job
from emp
order by random() limit 5

ename,job
YODA,JEDI
BLAKE,MANAGER
SMITH,CLERK
SCOTT,ANALYST
ALLEN,SALESMAN


#### Here the function is `RANDOM`

In [0]:
%sql
select ename, random()
from emp

ename,rand()
ALLEN,0.2487228807619588
WARD,0.9520835282264356
MARTIN,0.8286903920452814
BLAKE,0.9445604480852378
TURNER,0.7375197132338616
JAMES,0.8953919839428858
YODA,0.8641300471501976
Jonathan,0.1670131462564733
SMITH,0.4889015133068058
JONES,0.7672532955725246


- Numeric constant in `ORDER BY` clause if used means that sorting is done according to the column in that ordinal position in the `SELECT` list
- When function is specified in `ORDER BY` clause, sort is performed on the result from the function as it is evaluated for each row

In [0]:
%sql
select *
from EMP
order by 1

empno,ename,job,mgr,hiredate,sal,comm,deptno
1,Jonathan,Editor,,,,,
1111,YODA,JEDI,,1981-11-17,5000.0,,
7369,SMITH,CLERK,7902.0,1980-12-17,880.0,,20.0
7499,ALLEN,SALESMAN,7698.0,1981-02-20,1600.0,300.0,30.0
7521,WARD,SALESMAN,7698.0,1981-02-22,1250.0,500.0,30.0
7566,JONES,MANAGER,7839.0,1981-04-02,3273.0,,20.0
7654,MARTIN,SALESMAN,7698.0,1981-09-28,1250.0,1400.0,30.0
7698,BLAKE,MANAGER,7839.0,1981-05-01,2850.0,,30.0
7782,CLARK,MANAGER,7839.0,1981-06-09,4000.0,2000.0,10.0
7788,SCOTT,ANALYST,7566.0,1982-12-09,3300.0,,20.0


In [0]:
%sql
select empno, sal+coalesce(comm, 0) as income
from emp
order by income

empno,income
1,
7369,880.0
7900,950.0
7876,1210.0
7844,1500.0
7521,1750.0
7499,1900.0
7654,2650.0
7698,2850.0
7566,3273.0


### B. `PySpark`
- Use `rand` function

In [0]:
from pyspark.sql.functions import rand
display(emp.select('ename', 'job').orderBy(rand().asc()).head(5))

ename,job
BLAKE,MANAGER
Jonathan,Editor
FORD,ANALYST
MILLER,CLERK
TURNER,SALESMAN


In [0]:
from pyspark.sql.functions import rand
display(emp.select('ename', 'job').orderBy(rand().desc()).head(5))

ename,job
WARD,SALESMAN
SMITH,CLERK
SCOTT,ANALYST
KING,PRESIDENT
BLAKE,MANAGER


- By a column(s)

In [0]:
display(emp.select('ename', 'job').orderBy(col('sal').desc(), col('comm').asc()).head(5))

ename,job
JAMES,CLERK
SMITH,CLERK
YODA,JEDI
MILLER,CLERK
KING,PRESIDENT


### C. `Pandas`

In [0]:
import numpy as np
pandas_emp_df = emp.toPandas()

pandas_emp_df['rand'] = np.random.rand(pandas_emp_df.shape[0])
display(pandas_emp_df.sort_values(by=['rand'], ascending=[True])[['ename', 'job']].head(5))

ename,job
KING,PRESIDENT
Jonathan,Editor
JAMES,CLERK
FORD,ANALYST
TURNER,SALESMAN


- By a column(s)

In [0]:
import numpy as np
pandas_emp_df = emp.toPandas()

pandas_emp_df['sal'] = pandas_emp_df[pandas_emp_df['sal'].notna()]['sal'].astype('int64')
pandas_emp_df['comm'] = pandas_emp_df[pandas_emp_df['comm'].notna()]['comm'].astype('int64')

display(pandas_emp_df.sort_values(by=['sal', 'comm'], ascending=[True, False])[['ename', 'job']].head(5))

ename,job
SMITH,CLERK
JAMES,CLERK
ADAMS,CLERK
MARTIN,SALESMAN
WARD,SALESMAN


## 11. Finding Null Values

### A. `SQL`
- `NULL` is never equal/not equal to anything, not even itself. Therefore, you cannot use `=` or `!=` for testing whether a column value is `NULL`
- To test, need to use `IS NULL` or `IS NOT NULL` to find rows without a null in the given column

In [0]:
%sql
select *
from EMP
where comm is not null

empno,ename,job,mgr,hiredate,sal,comm,deptno
7499,ALLEN,SALESMAN,7698.0,1981-02-20,1600,300,30
7521,WARD,SALESMAN,7698.0,1981-02-22,1250,500,30
7654,MARTIN,SALESMAN,7698.0,1981-09-28,1250,1400,30
7844,TURNER,SALESMAN,7698.0,1981-09-08,1500,0,30
7934,MILLER,CLERK,7782.0,1982-01-23,4000,2000,10
7839,KING,PRESIDENT,,1981-11-17,4000,2000,10
7782,CLARK,MANAGER,7839.0,1981-06-09,4000,2000,10


### B. `PySpark`

In [0]:
display(emp.filter(emp.comm.isNotNull()))

empno,ename,job,mgr,hiredate,sal,comm,deptno
7499,ALLEN,SALESMAN,7698.0,1981-02-20,1600,300,30
7521,WARD,SALESMAN,7698.0,1981-02-22,1250,500,30
7654,MARTIN,SALESMAN,7698.0,1981-09-28,1250,1400,30
7844,TURNER,SALESMAN,7698.0,1981-09-08,1500,0,30
7934,MILLER,CLERK,7782.0,1982-01-23,4000,2000,10
7839,KING,PRESIDENT,,1981-11-17,4000,2000,10
7782,CLARK,MANAGER,7839.0,1981-06-09,4000,2000,10


### C. `Python`

In [0]:
pandas_emp_df = emp.toPandas()
display(pandas_emp_df[pandas_emp_df['comm'].notna()])

empno,ename,job,mgr,hiredate,sal,comm,deptno
7499,ALLEN,SALESMAN,7698.0,1981-02-20,1600,300,30
7521,WARD,SALESMAN,7698.0,1981-02-22,1250,500,30
7654,MARTIN,SALESMAN,7698.0,1981-09-28,1250,1400,30
7844,TURNER,SALESMAN,7698.0,1981-09-08,1500,0,30
7934,MILLER,CLERK,7782.0,1982-01-23,4000,2000,10
7839,KING,PRESIDENT,,1981-11-17,4000,2000,10
7782,CLARK,MANAGER,7839.0,1981-06-09,4000,2000,10


## 12. Transforming Nulls into Real Values

### A. `SQL`
- `COALESCE` function takes one or more values as arguments
- function returns the first non-null value in the list
- `CASE` can also be used
- Easier and much succint to use `COALESCE`

In [0]:
%sql
select coalesce(comm,0) as commision
from emp

commision
300
500
1400
0
0
0
0
0
0
0


In [0]:
%sql
select coalesce(comm) as commission
from emp
order by commission

commission
""
""
""
""
""
""
""
""
""
0.0


In [0]:
%sql
select case when comm is null then 0
    else comm
    end as commission
from emp

commission
300
500
1400
0
0
0
0
0
0
0


### B. `PySpark`

In [0]:
from pyspark.sql.functions import coalesce

## 13. Searching for Patterns
- Use `LIKE` operator in conjunction with SQL wildcard operator(`%`)
- When used in `LIKE` pattern-match operation, the percent(`%`) operator matches any sequence of characters
- **Important**: Position of `%` matters

In [0]:
%%sql
select ename, job
from emp
where 
    deptno in (10,20)
    and (ename like '%I%' or job like '%ER')

 * postgresql://user_jupyter:***@127.0.0.1/Practice
5 rows affected.


ename,job
SMITH,CLERK
JONES,MANAGER
MILLER,CLERK
KING,PRESIDENT
CLARK,MANAGER
