In [6]:
import os
import sys
from pyspark.sql import SparkSession

# Make PySpark (workers and driver) use the same Python interpreter as this kernel.
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

# Point Spark at the local Java install and at the Hadoop directory.
# NOTE(review): the original note says this is Java 17 and that HADOOP_HOME holds
# winutils; neither is checked here -- confirm on the target machine.
os.environ["JAVA_HOME"] = "C:\\java"
os.environ["HADOOP_HOME"] = "C:\\hadoop"

# Add HADOOP_HOME\bin to PATH only when it is not already there, so re-running
# this cell does not keep appending duplicate entries.
_hadoop_bin = os.path.join(os.environ["HADOOP_HOME"], "bin")
_current_path = os.environ.get("PATH", "")
# Compare on delimiter-wrapped strings so only a whole PATH entry counts as a match.
if (os.pathsep + _hadoop_bin + os.pathsep) not in (os.pathsep + _current_path + os.pathsep):
    os.environ["PATH"] = (
        _current_path + os.pathsep + _hadoop_bin if _current_path else _hadoop_bin
    )

In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [8]:
# Configure the session step by step, then create it (or reuse the one that
# is already running in this kernel).
_builder = SparkSession.builder.appName("Conhecimentos_Pyspark_02")
_builder = _builder.master("local[*]")  # run Spark locally
_builder = _builder.config("spark.sql.shuffle.partitions", "4")
spark = _builder.getOrCreate()

In [9]:
# Load the ';'-separated names file: the first row is the header and no schema
# inference is requested, so the columns are read as strings.
df = (
    spark.read
    .format("csv")
    .options(header=True, inferSchema=False, sep=';')
    .load('Arquivos/nomes.csv')
)

In [10]:
# Preview the rows exactly as loaded (defaults spelled out: up to 20 rows, long values truncated).
df.show(n=20, truncate=True)

+--------------+----------+
|          Nome|      data|
+--------------+----------+
|       CAMILLY|03/31/2002|
|        YasMIM|06/13/2002|
|Fernado pessoa|12/25/1990|
|       mariana|09/11/1985|
|       peDRASO|08/15/2022|
+--------------+----------+



In [11]:
# Normalise the capitalisation of each name ("YasMIM" -> "Yasmim").
df = df.withColumn('Nome', initcap('Nome'))

In [12]:
def _date_part(date_col, index):
    """Return the `index`-th '/'-separated piece of a date-string column.

    Built from Spark's native `split`, so it runs inside the engine instead of
    a Python UDF, and a null input yields null rather than the AttributeError
    that `None.split('/')` raised inside the previous lambdas.

    Args:
        date_col: column name or Column holding strings such as "03/31/2002".
        index: zero-based position of the piece to extract.

    Returns:
        A string Column with the requested piece.
    """
    return split(date_col, '/').getItem(index)


def mes(date_col):
    """Month: first piece of a month/day/year string column."""
    return _date_part(date_col, 0)


def dia(date_col):
    """Day: second piece of a month/day/year string column."""
    return _date_part(date_col, 1)


def ano(date_col):
    """Year: third piece of a month/day/year string column."""
    return _date_part(date_col, 2)

In [13]:
# Derive the day, month and year columns from the raw 'data' string in one chain.
df = (
    df
    .withColumn('Dia', dia('data'))
    .withColumn('Mês', mes('data'))
    .withColumn('Ano', ano('data'))
)

In [14]:
# Check the derived Dia / Mês / Ano columns (defaults spelled out: up to 20 rows, truncated).
df.show(n=20, truncate=True)

+--------------+----------+---+---+----+
|          Nome|      data|Dia|Mês| Ano|
+--------------+----------+---+---+----+
|       Camilly|03/31/2002| 31| 03|2002|
|        Yasmim|06/13/2002| 13| 06|2002|
|Fernado Pessoa|12/25/1990| 25| 12|1990|
|       Mariana|09/11/1985| 11| 09|1985|
|       Pedraso|08/15/2022| 15| 08|2022|
+--------------+----------+---+---+----+



In [15]:
# Reassemble the pieces as a year-month-day string joined by '-'.
df = df.withColumn(
    'Data_nova',
    concat_ws('-', col('Ano'), col('Mês'), col('Dia')),
)

In [16]:
# Keep only the name and the rebuilt date; the helper columns are dropped.
df = df.select(['Nome', 'Data_nova'])

In [17]:
# Give the date column its final, human-readable header.
df = df.withColumnRenamed(existing='Data_nova', new='Data de Nascimento')

In [18]:
# Final check of the two output columns before writing (defaults spelled out).
df.show(n=20, truncate=True)

+--------------+------------------+
|          Nome|Data de Nascimento|
+--------------+------------------+
|       Camilly|        2002-03-31|
|        Yasmim|        2002-06-13|
|Fernado Pessoa|        1990-12-25|
|       Mariana|        1985-09-11|
|       Pedraso|        2022-08-15|
+--------------+------------------+



In [19]:
# Destination of the exported CSV. The original machine-specific location stays
# the default; set the NOMES_OUTPUT_DIR environment variable to write somewhere
# else without editing the notebook.
output_path = os.environ.get(
    "NOMES_OUTPUT_DIR",
    "C:\\Users\\camil\\Documents\\Portifolio\\Pyspark\\saida_csv",
)

In [20]:
# Collapse to a single partition and write it as ';'-separated CSV with a
# header row, replacing whatever already exists at output_path.
(
    df.coalesce(1)
    .write
    .mode("overwrite")
    .option("header", True)
    .option("sep", ';')
    .csv(output_path)
)

In [21]:
# Report where the export was written.
print("✅ Arquivo CSV salvo com sucesso em:\n{}".format(output_path))

✅ Arquivo CSV salvo com sucesso em:
C:\Users\camil\Documents\Portifolio\Pyspark\saida_csv
