# Como criar uma partição para mês e ano, a partir de uma data 

In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_date, col, date_format
from pyspark.sql.types import *

In [2]:
# Build (or reuse) the local-mode Spark session used by this example.
# A threshold of -1 turns off automatic broadcast joins.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Ex3")
    .config("spark.sql.autoBroadcastJoinThreshold", -1)
    .config("spark.executor.memory", "500mb")
    .getOrCreate()
)

22/07/04 16:11:43 WARN Utils: Your hostname, computador resolves to a loopback address: 127.0.1.1; using 10.0.0.135 instead (on interface wlp2s0)
22/07/04 16:11:43 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/07/04 16:11:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
# Load the employee CSV: the first row is the header, column types are
# inferred, and the literal text 'null' is read as a missing value.
df = (
    spark.read
    .option('nullValue', 'null')
    .csv('./data/emp.csv', header=True, inferSchema=True)
)

In [4]:
# Print a preview of the rows as loaded from the CSV file.
df.show()

+-----+------+---------+----+----------+----+----+------+------------+
|EMPNO| ENAME|      JOB| MGR|  HIREDATE| SAL|COMM|DEPTNO|UPDATED_DATE|
+-----+------+---------+----+----------+----+----+------+------------+
| 7369| SMITH|    CLERK|7902|17-12-1980| 800|null|    20|  2022-01-01|
| 7499| ALLEN| SALESMAN|7698|20-02-1981|1600| 300|    30|  2022-01-01|
| 7521|  WARD| SALESMAN|7698|22-02-1981|1250| 500|    30|  2022-01-01|
| 7566| JONES|  MANAGER|7839|04-02-1981|2975|null|    20|  2022-01-05|
| 7654|MARTIN| SALESMAN|7698|21-09-1981|1250|1400|    30|  2022-01-03|
| 7698|   SGR|  MANAGER|7839|05-01-1981|2850|null|    30|  2022-01-04|
| 7782|  RAVI|  MANAGER|7839|06-09-1981|2450|null|    10|  2022-01-02|
| 7788| SCOTT|  ANALYST|7566|19-04-1987|3000|null|    20|  2022-01-02|
| 7839|  KING|PRESIDENT|null|01-11-1981|5000|null|    10|  2022-01-02|
| 7844|TURNER| SALESMAN|7698|09-08-1981|1500|   0|    30|  2022-01-02|
| 7876| ADAMS|    CLERK|7788|23-05-1987|1100|null|    20|  2022-01-03|
| 7900

In [5]:
# Print the inferred schema; HIREDATE is still a string at this point.
df.printSchema()

root
 |-- EMPNO: integer (nullable = true)
 |-- ENAME: string (nullable = true)
 |-- JOB: string (nullable = true)
 |-- MGR: integer (nullable = true)
 |-- HIREDATE: string (nullable = true)
 |-- SAL: integer (nullable = true)
 |-- COMM: integer (nullable = true)
 |-- DEPTNO: integer (nullable = true)
 |-- UPDATED_DATE: string (nullable = true)



## Convertendo HIREDATE de string (dd-MM-yyyy) para o tipo date (yyyy-MM-dd)


In [6]:
# Parse the 'dd-MM-yyyy' HIREDATE strings into a date column, then request
# that any null HIREDATE be replaced by the sentinel value '9999-12-31'.
hiredate_parsed = to_date('HIREDATE', 'dd-MM-yyyy')
df = (
    df
    .withColumn('HIREDATE', hiredate_parsed)
    .fillna({'HIREDATE': '9999-12-31'})
)

In [7]:
# Print a preview of the rows after HIREDATE was converted.
df.show()

+-----+------+---------+----+----------+----+----+------+------------+
|EMPNO| ENAME|      JOB| MGR|  HIREDATE| SAL|COMM|DEPTNO|UPDATED_DATE|
+-----+------+---------+----+----------+----+----+------+------------+
| 7369| SMITH|    CLERK|7902|1980-12-17| 800|null|    20|  2022-01-01|
| 7499| ALLEN| SALESMAN|7698|1981-02-20|1600| 300|    30|  2022-01-01|
| 7521|  WARD| SALESMAN|7698|1981-02-22|1250| 500|    30|  2022-01-01|
| 7566| JONES|  MANAGER|7839|1981-02-04|2975|null|    20|  2022-01-05|
| 7654|MARTIN| SALESMAN|7698|1981-09-21|1250|1400|    30|  2022-01-03|
| 7698|   SGR|  MANAGER|7839|1981-01-05|2850|null|    30|  2022-01-04|
| 7782|  RAVI|  MANAGER|7839|1981-09-06|2450|null|    10|  2022-01-02|
| 7788| SCOTT|  ANALYST|7566|1987-04-19|3000|null|    20|  2022-01-02|
| 7839|  KING|PRESIDENT|null|1981-11-01|5000|null|    10|  2022-01-02|
| 7844|TURNER| SALESMAN|7698|1981-08-09|1500|   0|    30|  2022-01-02|
| 7876| ADAMS|    CLERK|7788|1987-05-23|1100|null|    20|  2022-01-03|
| 7900

In [8]:
# Print the schema again to check the HIREDATE column type.
df.printSchema()

root
 |-- EMPNO: integer (nullable = true)
 |-- ENAME: string (nullable = true)
 |-- JOB: string (nullable = true)
 |-- MGR: integer (nullable = true)
 |-- HIREDATE: date (nullable = true)
 |-- SAL: integer (nullable = true)
 |-- COMM: integer (nullable = true)
 |-- DEPTNO: integer (nullable = true)
 |-- UPDATED_DATE: string (nullable = true)



## Criando duas colunas, YEAR (ano) e MONTH (mês), a partir dos dados da coluna HIREDATE


In [9]:
# Derive the partition keys from HIREDATE: YEAR uses the 'yyyy' pattern and
# MONTH uses the 'MM' pattern (both are produced by date_format).
year_col = date_format('HIREDATE', 'yyyy')
month_col = date_format('HIREDATE', 'MM')
df = (
    df
    .withColumn('YEAR', year_col)
    .withColumn('MONTH', month_col)
)

In [10]:
# Print a preview of the rows including the new YEAR and MONTH columns.
df.show()

+-----+------+---------+----+----------+----+----+------+------------+----+-----+
|EMPNO| ENAME|      JOB| MGR|  HIREDATE| SAL|COMM|DEPTNO|UPDATED_DATE|YEAR|MONTH|
+-----+------+---------+----+----------+----+----+------+------------+----+-----+
| 7369| SMITH|    CLERK|7902|1980-12-17| 800|null|    20|  2022-01-01|1980|   12|
| 7499| ALLEN| SALESMAN|7698|1981-02-20|1600| 300|    30|  2022-01-01|1981|   02|
| 7521|  WARD| SALESMAN|7698|1981-02-22|1250| 500|    30|  2022-01-01|1981|   02|
| 7566| JONES|  MANAGER|7839|1981-02-04|2975|null|    20|  2022-01-05|1981|   02|
| 7654|MARTIN| SALESMAN|7698|1981-09-21|1250|1400|    30|  2022-01-03|1981|   09|
| 7698|   SGR|  MANAGER|7839|1981-01-05|2850|null|    30|  2022-01-04|1981|   01|
| 7782|  RAVI|  MANAGER|7839|1981-09-06|2450|null|    10|  2022-01-02|1981|   09|
| 7788| SCOTT|  ANALYST|7566|1987-04-19|3000|null|    20|  2022-01-02|1987|   04|
| 7839|  KING|PRESIDENT|null|1981-11-01|5000|null|    10|  2022-01-02|1981|   11|
| 7844|TURNER| S

In [11]:
# Print the schema to check the types of the new YEAR and MONTH columns.
df.printSchema()

root
 |-- EMPNO: integer (nullable = true)
 |-- ENAME: string (nullable = true)
 |-- JOB: string (nullable = true)
 |-- MGR: integer (nullable = true)
 |-- HIREDATE: date (nullable = true)
 |-- SAL: integer (nullable = true)
 |-- COMM: integer (nullable = true)
 |-- DEPTNO: integer (nullable = true)
 |-- UPDATED_DATE: string (nullable = true)
 |-- YEAR: string (nullable = true)
 |-- MONTH: string (nullable = true)



In [14]:
# Save the DataFrame as the CSV table 'emp_part', partitioned by YEAR and
# then MONTH; 'overwrite' mode replaces any existing data for the table.
(
    df.write
    .format('csv')
    .mode('overwrite')
    .partitionBy('YEAR', 'MONTH')
    .saveAsTable('emp_part')
)

                                                                                