# Gerando uma sequência de id/surrogate key

In [31]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window

In [7]:
# Build (or reuse, via getOrCreate) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("Ex7")
    .master("local")
    # -1 turns off automatic broadcast joins (per the Spark SQL configuration docs).
    .config("spark.sql.autoBroadcastJoinThreshold", -1)
    .config("spark.executor.memory", "500mb")
    .getOrCreate()
)

In [40]:
# Load the employee CSV. The literal text 'null' is treated as a missing value,
# the first line is the header, and column types are inferred from the data.
df = (
    spark.read
    .option('nullValue', 'null')
    .csv('./data/emp.csv', header=True, inferSchema=True)
    # Discard rows in which every column is null.
    .dropna(how='all')
    # Keep a single row per EMPNO so it can serve as the input to the keys below.
    .dropDuplicates(['EMPNO'])
)

In [41]:
# Preview the loaded data (show() defaults spelled out: 20 rows, long values truncated).
df.show(n=20, truncate=True)



+-----+------+---------+----+----------+----+----+------+------------+
|EMPNO| ENAME|      JOB| MGR|  HIREDATE| SAL|COMM|DEPTNO|UPDATED_DATE|
+-----+------+---------+----+----------+----+----+------+------------+
| 7499| ALLEN| SALESMAN|7698|20-02-1981|1600| 300|    30|  2022-01-01|
| 7788| SCOTT|  ANALYST|7566|19-04-1987|3000|null|    20|  2022-01-02|
| 7902|  FORD|  ANALYST|7566|12-03-1981|3000|null|    20|  2022-01-03|
| 7566| JONES|  MANAGER|7839|04-02-1981|2975|null|    20|  2022-01-05|
| 7876| ADAMS|    CLERK|7788|23-05-1987|1100|null|    20|  2022-01-03|
| 1234|SEKHAR|   doctor|7777|      null| 667|  78|    80|  2022-01-03|
| 7900| JAMES|    CLERK|7698|12-03-1981| 950|null|    30|  2022-01-03|
| 7698|   SGR|  MANAGER|7839|05-01-1981|2850|null|    30|  2022-01-04|
| 7521|  WARD| SALESMAN|7698|22-02-1981|1250| 500|    30|  2022-01-01|
| 7654|MARTIN| SALESMAN|7698|21-09-1981|1250|1400|    30|  2022-01-03|
| 7369| SMITH|    CLERK|7902|17-12-1980| 800|null|    20|  2022-01-01|
| 7839



## Com window function

In [42]:
# Sequential surrogate key: number the rows 1..N with row_number().
# Both the partition key and the sort key are the same constant literal, so
# every row lands in one window partition and no column drives the ordering.
# NOTE(review): with a constant sort key the assignment of numbers to rows is
# presumably whatever order Spark happens to produce -- confirm this is stable
# enough for the intended use.
df1 = df.withColumn(
    'row_number',
    row_number().over(
        Window.partitionBy(lit('')).orderBy(lit(''))
    ),
)

In [43]:
# Preview the rows with their row_number key (show() defaults spelled out).
df1.show(n=20, truncate=True)

                                                                                

+-----+------+---------+----+----------+----+----+------+------------+----------+
|EMPNO| ENAME|      JOB| MGR|  HIREDATE| SAL|COMM|DEPTNO|UPDATED_DATE|row_number|
+-----+------+---------+----+----------+----+----+------+------------+----------+
| 7499| ALLEN| SALESMAN|7698|20-02-1981|1600| 300|    30|  2022-01-01|         1|
| 7788| SCOTT|  ANALYST|7566|19-04-1987|3000|null|    20|  2022-01-02|         2|
| 7902|  FORD|  ANALYST|7566|12-03-1981|3000|null|    20|  2022-01-03|         3|
| 7566| JONES|  MANAGER|7839|04-02-1981|2975|null|    20|  2022-01-05|         4|
| 7876| ADAMS|    CLERK|7788|23-05-1987|1100|null|    20|  2022-01-03|         5|
| 1234|SEKHAR|   doctor|7777|      null| 667|  78|    80|  2022-01-03|         6|
| 7900| JAMES|    CLERK|7698|12-03-1981| 950|null|    30|  2022-01-03|         7|
| 7698|   SGR|  MANAGER|7839|05-01-1981|2850|null|    30|  2022-01-04|         8|
| 7521|  WARD| SALESMAN|7698|22-02-1981|1250| 500|    30|  2022-01-01|         9|
| 7654|MARTIN| S

## Com crc32

In [44]:
# Hash-based key: CRC32 checksum of EMPNO rendered as a string.
# The same EMPNO always yields the same key, independent of row order.
df2 = df.withColumn(
    'crc32_key',
    crc32(col('EMPNO').cast('string')),
)

In [45]:
# Preview the rows with their crc32_key (show() defaults spelled out).
df2.show(n=20, truncate=True)

+-----+------+---------+----+----------+----+----+------+------------+----------+
|EMPNO| ENAME|      JOB| MGR|  HIREDATE| SAL|COMM|DEPTNO|UPDATED_DATE| crc32_key|
+-----+------+---------+----+----------+----+----+------+------------+----------+
| 7499| ALLEN| SALESMAN|7698|20-02-1981|1600| 300|    30|  2022-01-01|1046173690|
| 7788| SCOTT|  ANALYST|7566|19-04-1987|3000|null|    20|  2022-01-02|1375856756|
| 7902|  FORD|  ANALYST|7566|12-03-1981|3000|null|    20|  2022-01-03|1888724584|
| 7566| JONES|  MANAGER|7839|04-02-1981|2975|null|    20|  2022-01-05| 683555987|
| 7876| ADAMS|    CLERK|7788|23-05-1987|1100|null|    20|  2022-01-03| 964533889|
| 1234|SEKHAR|   doctor|7777|      null| 667|  78|    80|  2022-01-03|2615402659|
| 7900| JAMES|    CLERK|7698|12-03-1981| 950|null|    30|  2022-01-03|2661140292|
| 7698|   SGR|  MANAGER|7839|05-01-1981|2850|null|    30|  2022-01-04|1255715586|
| 7521|  WARD| SALESMAN|7698|22-02-1981|1250| 500|    30|  2022-01-01|3535170612|
| 7654|MARTIN| S

## Com md5

In [46]:
# Hash-based key: MD5 digest of EMPNO rendered as a string.
df3 = df.withColumn(
    'md5_key',
    md5(col('EMPNO').cast('string')),
)

In [47]:
# Preview the rows with their md5_key; truncate=True (the default) shortens
# the long digest strings in the printed table.
df3.show(n=20, truncate=True)



+-----+------+---------+----+----------+----+----+------+------------+--------------------+
|EMPNO| ENAME|      JOB| MGR|  HIREDATE| SAL|COMM|DEPTNO|UPDATED_DATE|             md5_key|
+-----+------+---------+----+----------+----+----+------+------------+--------------------+
| 7499| ALLEN| SALESMAN|7698|20-02-1981|1600| 300|    30|  2022-01-01|7a2b33c672ce223b2...|
| 7788| SCOTT|  ANALYST|7566|19-04-1987|3000|null|    20|  2022-01-02|866c7ee013c58f01f...|
| 7902|  FORD|  ANALYST|7566|12-03-1981|3000|null|    20|  2022-01-03|66fe2bcc701bb627e...|
| 7566| JONES|  MANAGER|7839|04-02-1981|2975|null|    20|  2022-01-05|b937384a573b94c4d...|
| 7876| ADAMS|    CLERK|7788|23-05-1987|1100|null|    20|  2022-01-03|42dab56861d81108e...|
| 1234|SEKHAR|   doctor|7777|      null| 667|  78|    80|  2022-01-03|81dc9bdb52d04dc20...|
| 7900| JAMES|    CLERK|7698|12-03-1981| 950|null|    30|  2022-01-03|400c3241004b5db7c...|
| 7698|   SGR|  MANAGER|7839|05-01-1981|2850|null|    30|  2022-01-04|c570c225d1

## Com sha2

In [52]:
# Hash-based key: SHA-2 digest (256-bit variant) of EMPNO rendered as a string.
df4 = df.withColumn(
    'sha2_key',
    sha2(col('EMPNO').cast('string'), 256),
)

In [53]:
# Preview the rows with their sha2_key; truncate=True (the default) shortens
# the long digest strings in the printed table.
df4.show(n=20, truncate=True)

+-----+------+---------+----+----------+----+----+------+------------+--------------------+
|EMPNO| ENAME|      JOB| MGR|  HIREDATE| SAL|COMM|DEPTNO|UPDATED_DATE|            sha2_key|
+-----+------+---------+----+----------+----+----+------+------------+--------------------+
| 7499| ALLEN| SALESMAN|7698|20-02-1981|1600| 300|    30|  2022-01-01|4427dc2e32a1d099d...|
| 7788| SCOTT|  ANALYST|7566|19-04-1987|3000|null|    20|  2022-01-02|16740bf13991fe083...|
| 7902|  FORD|  ANALYST|7566|12-03-1981|3000|null|    20|  2022-01-03|7b721c50ff0220bb9...|
| 7566| JONES|  MANAGER|7839|04-02-1981|2975|null|    20|  2022-01-05|b2ca4f93866dc5f5a...|
| 7876| ADAMS|    CLERK|7788|23-05-1987|1100|null|    20|  2022-01-03|05a4cd58579909328...|
| 1234|SEKHAR|   doctor|7777|      null| 667|  78|    80|  2022-01-03|03ac674216f3e15c7...|
| 7900| JAMES|    CLERK|7698|12-03-1981| 950|null|    30|  2022-01-03|22191a9215264bb24...|
| 7698|   SGR|  MANAGER|7839|05-01-1981|2850|null|    30|  2022-01-04|83f9d8d707