### Importação de bibliotecas e sessão do PySpark:

In [None]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.sql.window import Window
import pandas as pd
import functools


# Obtain the SparkSession used by every cell below: getOrCreate() returns the
# already-active session when there is one, otherwise it builds a new one.
spark = SparkSession.builder.getOrCreate()


### Definindo schemas:

In [None]:
# Explicit schema for the client CSV files; every column is declared nullable.
schema = StructType([
    StructField('Id', IntegerType(), True),
    StructField('Nome', StringType(), True),
    StructField('Email', StringType(), True),
    StructField('Data_cadastro', TimestampType(), True),
    StructField('Telefone', StringType(), True),
])

# Read the CSV files under ./data/clients/ (header row, ';' as the separator)
# using the schema above instead of letting Spark infer the types.
df_clients = (
    spark.read
    .format('csv')
    .options(header=True, delimiter=';')
    .schema(schema)
    .load('./data/clients/')
)

# Quick preview of the loaded data.
df_clients.show()

# Last expression of the cell: the first 30 rows as a list of Row objects.
df_clients.take(30)


In [None]:
# Schema shared by the transaction CSV files in both the in/ and out/ folders;
# every column is declared nullable.
# NOTE(review): 'Valor' is read as FloatType (single precision) — consider
# DoubleType or DecimalType if these are monetary amounts; confirm with the data owner.
schema2 = StructType() \
    .add('Id', IntegerType(), True) \
    .add('Cliente_id', IntegerType(), True) \
    .add('Valor', FloatType(), True) \
    .add('Datahora', TimestampType(), True)

# Transactions read from ./data/transaction/in/ (header row, ';' separator).
df_transaction_in = spark.read.format('csv') \
    .option('header', True) \
    .schema(schema2) \
    .load('./data/transaction/in/', delimiter=';')

# Transactions read from ./data/transaction/out/ (header row, ';' separator).
df_transaction_out = spark.read.format('csv') \
    .option('header', True) \
    .schema(schema2) \
    .load('./data/transaction/out/', delimiter=';')

# Flip the sign of the values coming from the out/ folder. Spark DataFrames are
# immutable: withColumn returns a NEW DataFrame, so the result has to be
# assigned back or the negation is silently lost. The column is addressed with
# the schema's own casing ('Valor') so the column name stays unchanged.
df_transaction_out = df_transaction_out.withColumn('Valor', -F.col('Valor'))

### Unindo dataframes de transações:

In [None]:
def unionAll(dfs):
    """Stack a sequence of DataFrames on top of each other.

    Each DataFrame after the first is re-projected onto the column order of
    the accumulated result before being appended, so inputs whose columns are
    ordered differently still line up by name.

    Args:
        dfs: Non-empty iterable of DataFrames exposing the same column names.

    Returns:
        A single DataFrame with the rows of every input, in input order. When
        only one DataFrame is supplied it is returned unchanged.

    Raises:
        TypeError: If ``dfs`` is empty (raised by ``functools.reduce``).
    """
    def _append(accumulated, incoming):
        # Align the incoming columns with the accumulated frame, then append.
        aligned = incoming.select(accumulated.columns)
        return accumulated.union(aligned)

    return functools.reduce(_append, dfs)


# Combine the transactions from the in/ and out/ folders into one DataFrame
# (rows from df_transaction_in first) and preview the result.
df_transaction = unionAll([df_transaction_in, df_transaction_out])
df_transaction.show()

### Normalização dos dados:

In [None]:
# Normalize client names: lowercase them and strip leading/trailing whitespace.
# `lower` is not available as a bare name in this file (only the `F` alias for
# pyspark.sql.functions is imported), so it must be called as F.lower.
df_clients = df_clients.withColumn('Nome', F.trim(F.lower(F.col('Nome'))))

# Show up to 100 rows to inspect the normalized names.
df_clients.show(100)

### Adicionando colunas:

In [None]:
# Preview the rows whose phone string is shorter than 16 characters.
# `col` is not imported as a bare name in this file; use the `F` alias.
df_clients.filter(F.length(F.col('Telefone')) < 16).show()

# Derive two columns from fixed positions of the phone string (F.substring is
# 1-based): 'Country_code' = the first 3 characters, 'DDD' = 2 characters
# starting at position 5.
# NOTE(review): this assumes every phone follows one fixed layout — confirm the
# positions against the actual data, especially for the short rows listed above.
df_clients = df_clients \
    .withColumn('DDD', F.substring('Telefone', 5, 2)) \
    .withColumn('Country_code', F.substring('Telefone', 1, 3))


df_clients.show()

In [None]:

# Break the transaction timestamp into hour / minute / second plus a calendar
# date column. The timestamp column is named 'Datahora' in the transaction
# schema (there is no 'timeStamp' column), and `col` has to be reached through
# the `F` alias because it is not imported as a bare name.
df_transaction = df_transaction \
    .withColumn('hora', F.hour(F.col('Datahora'))) \
    .withColumn('minuto', F.minute(F.col('Datahora'))) \
    .withColumn('segundo', F.second(F.col('Datahora'))) \
    .withColumn('dia', F.to_date(F.col('Datahora')))

df_transaction.show()

### Criar CSV com dados tratados:

In [None]:
# Spark-native alternative kept for reference (disabled in the original):
# df_clients.coalesce(1).write.csv('./data/clients/clients_clean')

# Convert the cleaned Spark DataFrame to a pandas DataFrame. pd.concat expects
# an iterable of pandas objects and raises TypeError when handed a Spark
# DataFrame; toPandas() is the Spark API for this conversion.
# Note that toPandas() collects every row onto the driver.
df = df_clients.toPandas()
print(df)

In [None]:
# Write the enriched transactions as CSV under ./data/transaction/transaction_clean;
# coalesce(1) collapses the data to a single partition before writing.
# NOTE(review): no writer options are set here, so header/separator/save-mode
# fall back to Spark's defaults (unlike the ';'-separated, header-bearing
# inputs) — confirm that is the intended output format.
df_transaction.coalesce(1).write.csv('./data/transaction/transaction_clean')

### Conexão com SQL Server:

In [None]:
import os
from dotenv import load_dotenv
import pyodbc
import pandas as pd

# Load variables from a .env file into the process environment (python-dotenv).
load_dotenv()

# Required connection settings; a missing variable raises KeyError.
# NOTE(review): USERNAME is commonly predefined by the OS (e.g. on Windows) and
# load_dotenv() is called without override — confirm the .env value is the one
# actually picked up.
server, database, username, password = (
    os.environ[key] for key in ('SERVER', 'DATABASE', 'USERNAME', 'PASSWORD')
)

# Open the SQL Server connection and a cursor shared by the code below.
cnxn = pyodbc.connect(
    f'DRIVER={{ODBC Driver 18 for SQL Server}};SERVER={server};DATABASE={database}'
    f';ENCRYPT=yes;UID={username};PWD={password}'
)
cursor = cnxn.cursor()



# Helper that inserts one person into the SQL Server table.
def inserir_pessoa(pessoa):
    """Insert a single row into the ``pessoas`` table and commit it.

    Uses the module-level ``cursor`` opened above.

    Args:
        pessoa: Object exposing ``Nome`` and ``Idade`` attributes, bound to the
            ``nome`` and ``idade`` columns respectively.

    NOTE(review): none of the schemas or derived columns defined in this file
    contains an ``Idade`` field — confirm which data this function is meant
    to receive.
    """
    insert_sql = "INSERT INTO pessoas (nome, idade) VALUES (?, ?)"
    valores = (pessoa.Nome, pessoa.Idade)
    cursor.execute(insert_sql, *valores)
    # Commit after every row so each insert is persisted immediately.
    cursor.commit()

# Insert every client row through inserir_pessoa, iterating on the driver.
# `df` above is produced with pandas, which has no `.foreach`; and Spark's
# DataFrame.foreach would run the function on the executors, where the
# driver-side pyodbc cursor is not available. toLocalIterator() streams the
# rows back to the driver one partition at a time instead.
# NOTE(review): inserir_pessoa reads `pessoa.Idade`, which is not a column of
# df_clients in this file — confirm the target table/columns before running.
for pessoa in df_clients.toLocalIterator():
    inserir_pessoa(pessoa)