### Importação das bibliotecas

In [0]:

from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import col, sum, regexp_replace, when


In [0]:

spark = SparkSession.builder \
    .appName("Tratamento de Dados Database Type Ecommerce UI Bakery") \
    .getOrCreate()

### Funções

In [0]:
def exibir_info_df(df: DataFrame) -> None:
    """
    Imprime o número de linhas, número de colunas e o esquema do DataFrame fornecido.

    Parâmetros:
        df (DataFrame): O DataFrame a ser analisado.

    Returns:
        None
    """

    print(f'\nLinhas = {df.count()} \nColunas = {len(df.columns)}\n')
    df.printSchema()


In [0]:
def verificar_dados_nulos(df: DataFrame) -> None:
    """
    Função que recebe um DataFrame e imprime a contagem numérica de valores nulos em cada coluna.

    Parâmetros:
        df (DataFrame): O DataFrame a ser analisado.

    Returns:
        None
    """

    lista = []

    for i in df.columns:
        contagem_nulos = sum(col(i).isNull().cast("int")).alias(i)
        lista.append(contagem_nulos)

    df.agg(*lista).display()

### Tratamento dos Dados

#### Customers

In [0]:
df_customers = spark.read.format('delta').load('/join/bronze/df_customers')

exibir_info_df(df_customers)




Linhas = 122 
Colunas = 13

root
 |-- customer_number: integer (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- contact_last_name: string (nullable = true)
 |-- contact_first_name: string (nullable = true)
 |-- phone: string (nullable = true)
 |-- address_line1: string (nullable = true)
 |-- address_line2: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- postal_code: string (nullable = true)
 |-- country: string (nullable = true)
 |-- sales_rep_employee_number: integer (nullable = true)
 |-- credit_limit: decimal(10,2) (nullable = true)



In [0]:
df_customers.limit(5).display()

customer_number,customer_name,contact_last_name,contact_first_name,phone,address_line1,address_line2,city,state,postal_code,country,sales_rep_employee_number,credit_limit
103,Jake,King,Carine,40.32.2555,"54, rue Royale",,Nantes,Victoria,44000,France,1370,21000.0
112,Signal Gift Store,King,Jean,7025551838,8489 Strong St.,,Las Vegas,New York,83030,USA,1166,71800.0
114,"Australian Collectors, Co.",Ferguson,Peter Sr.,03 9520 4555,636 St Kilda Road,Level 3,Melbourne,Victoria,3004,Australia,1611,117300.0
119,La Rochelle Gifts,Labrune,Janine,40.67.8555,"67, rue des Cinquante Otages",,Nantes,,44000,France,1370,118200.0
121,Baane Mini Imports,Bergulfsen,Jonas,07-98 9555,Erling Skakkes gate 78,,Stavern,,4110,Norway,1504,81700.0


In [0]:
verificar_dados_nulos(df_customers)

customer_number,customer_name,contact_last_name,contact_first_name,phone,address_line1,address_line2,city,state,postal_code,country,sales_rep_employee_number,credit_limit
0,0,0,0,0,0,100,0,72,7,0,22,0


- Coluna <b><i>address_line2</b></i>:  Removendo devido a quantidade de valores nulos

In [0]:
df_customers = df_customers.drop('address_line2')

- Coluna <b><i>state</b></i>: Substituindo os valores null por Não informado (Uninformed)

In [0]:
df_customers = df_customers.na.fill({'state': 'Uninformed'})

- Coluna <i><b>sales_rep_employee_number</b></i>: Preenchendo os dados nulos com um valor padrão <b>9999 </b>

In [0]:
df_customers.filter('sales_rep_employee_number is Null').limit(5).display()

customer_number,customer_name,contact_last_name,contact_first_name,phone,address_line1,city,state,postal_code,country,sales_rep_employee_number,credit_limit
125,Havel & Zbyszek Co,Piestrzeniewicz,Zbyszek,(26) 642-7555,ul. Filtrowa 68,Warszawa,Uninformed,01-012,Poland,,0.0
169,Porto Imports Co.,de Castro,Isabel,(1) 356-5555,Estrada da sa�de n. 58,Lisboa,Uninformed,1756,Portugal,,0.0
206,"Asian Shopping Network, Co",Walker,Brydey,+612 9411 1555,Suntec Tower Three,Singapore,Uninformed,038988,Singapore,,0.0
223,Nat�rlich Autos,Kloss,Horst,0372-555188,Taucherstra�e 10,Cunewalde,Uninformed,01307,Germany,,0.0
237,ANG Resellers,Camino,Alejandra,(91) 745 6555,"Gran V�a, 1",Madrid,Uninformed,28001,Spain,,0.0


- Coluna <b><i>postal_code</b></i>: Substituindo os valores null por Não informado (Uninformed)

In [0]:
df_customers = df_customers.na.fill({'postal_code': 'Uninformed'})

In [0]:
verificar_dados_nulos(df_customers)

customer_number,customer_name,contact_last_name,contact_first_name,phone,address_line1,city,state,postal_code,country,sales_rep_employee_number,credit_limit
0,0,0,0,0,0,0,0,0,0,22,0


In [0]:
df_customers.write.mode('overwrite').format('delta').option('mergeSchema', 'True').save('/join/silver/df_customers')

#### Employees

In [0]:
df_employees = spark.read.format('delta').load('/join/bronze/df_employees')

exibir_info_df(df_employees)




Linhas = 23 
Colunas = 8

root
 |-- employee_number: integer (nullable = true)
 |-- last_name: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- extension: string (nullable = true)
 |-- email: string (nullable = true)
 |-- office_code: string (nullable = true)
 |-- reports_to: integer (nullable = true)
 |-- job_Title: string (nullable = true)



In [0]:
df_employees.limit(5).display()

employee_number,last_name,first_name,extension,email,office_code,reports_to,job_Title
1002,Murphy,Diane,x5800,dmurphy@classicmodelcars.com,1,,President
1056,Patterson,Mary,x4611,mpatterso@classicmodelcars.com,1,1002.0,VP Sales
1076,Firrelli,Jeff,x9273,jfirrelli@classicmodelcars.com,1,1002.0,VP Marketing
1088,Patterson,William,x4871,wpatterson@classicmodelcars.com,6,1056.0,Sales Manager (APAC)
1102,Bondur,Gerard,x5408,gbondur@classicmodelcars.com,4,1056.0,Sale Manager (EMEA)


In [0]:
verificar_dados_nulos(df_employees)

employee_number,last_name,first_name,extension,email,office_code,reports_to,job_Title
0,0,0,0,0,0,1,0


- Coluna <i><b>reports_to</b></i>: Substituindo os valor null por 0, pois o Presidente não reporta para nenhum outro colaborador

In [0]:
df_employees = df_employees.na.fill({'reports_to': 0})

In [0]:
df_employees.write.mode('overwrite').format('delta').option('mergeSchema', 'True').save('/join/silver/df_employees')

#### Offices

In [0]:
df_offices = spark.read.format('delta').load('/join/bronze/df_offices')

exibir_info_df(df_offices)


Linhas = 7 
Colunas = 9

root
 |-- office_code: string (nullable = true)
 |-- city: string (nullable = true)
 |-- phone: string (nullable = true)
 |-- address_line1: string (nullable = true)
 |-- address_line2: string (nullable = true)
 |-- state: string (nullable = true)
 |-- country: string (nullable = true)
 |-- postal_code: string (nullable = true)
 |-- territory: string (nullable = true)



In [0]:
df_offices.display()

office_code,city,phone,address_line1,address_line2,state,country,postal_code,territory
1,San Francisco11,+1 650 219 4782,100 Market Street,Suite 300,CA,USA,94080,
2,Boston,+1 215 837 0825,1550 Court Place,Suite 102,MA,USA,02107,
3,NYC1,+1 212 555 3000,523 East 53rd Street,apt. 5A,NY,USA,10022,
4,Paris,+33 14 723 4404,43 Rue Jouffroy D'abbans,,,France,75017,EMEA
5,Tokyo,+81 33 224 5000,4-1 Kioicho,,Chiyoda-Ku,Japan,102-8578,Japan
6,Sydney1,+61 2 9264 2451,5-11 Wentworth Avenue,Floor #2,,Australia,NSW 2010,APAC
7,London1,+44 20 7877 2041,25 Old Broad Street,Level 7,,UK,EC2N 1HN,EMEA


In [0]:
verificar_dados_nulos(df_offices)

office_code,city,phone,address_line1,address_line2,state,country,postal_code,territory
0,0,0,0,2,3,0,0,0


- Coluna <b>address_line2</b>: Substituindo os valores nulos por Não informado
 - Coluna <b>state</b>: Substituindo os valores nulos por capital
 - Coluna <b>city</b>: Retirando o caracter '1' do nome de algumas cidades

In [0]:
df_offices = df_offices.na.fill({'address_line2': 'Uninformed', 'state': 'capital'})

In [0]:
df_offices = df_offices.withColumn('city', regexp_replace('city', '1', ''))

In [0]:
df_offices.display()

office_code,city,phone,address_line1,address_line2,state,country,postal_code,territory
1,San Francisco,+1 650 219 4782,100 Market Street,Suite 300,CA,USA,94080,
2,Boston,+1 215 837 0825,1550 Court Place,Suite 102,MA,USA,02107,
3,NYC,+1 212 555 3000,523 East 53rd Street,apt. 5A,NY,USA,10022,
4,Paris,+33 14 723 4404,43 Rue Jouffroy D'abbans,Uninformed,capital,France,75017,EMEA
5,Tokyo,+81 33 224 5000,4-1 Kioicho,Uninformed,Chiyoda-Ku,Japan,102-8578,Japan
6,Sydney,+61 2 9264 2451,5-11 Wentworth Avenue,Floor #2,capital,Australia,NSW 2010,APAC
7,London,+44 20 7877 2041,25 Old Broad Street,Level 7,capital,UK,EC2N 1HN,EMEA


In [0]:
df_offices.write.mode('overwrite').format('delta').option('mergeSchema', 'True').save('/join/silver/df_offices')

#### Order Details

In [0]:
df_orderdetails = spark.read.format('delta').load('/join/bronze/df_orderdetails')

exibir_info_df(df_orderdetails)



Linhas = 2997 
Colunas = 5

root
 |-- order_number: integer (nullable = true)
 |-- product_code: string (nullable = true)
 |-- quantity_ordered: integer (nullable = true)
 |-- price_each: decimal(10,2) (nullable = true)
 |-- order_line_number: short (nullable = true)



In [0]:
df_orderdetails.limit(5).display()

order_number,product_code,quantity_ordered,price_each,order_line_number
10100,S18_1749,30,136.0,3
10100,S18_2248,50,55.09,2
10100,S18_4409,22,75.46,4
10100,S24_3969,49,35.29,1
10101,S18_2325,25,108.06,4


In [0]:
verificar_dados_nulos(df_orderdetails)

order_number,product_code,quantity_ordered,price_each,order_line_number
0,0,0,0,0


In [0]:
df_orderdetails.write.mode('overwrite').format('delta').option('mergeSchema', 'True').save('/join/silver/df_orderdetails')

#### Orders

In [0]:
df_orders = spark.read.format('delta').load('/join/bronze/df_orders')

exibir_info_df(df_orders)



Linhas = 329 
Colunas = 7

root
 |-- order_number: integer (nullable = true)
 |-- order_date: date (nullable = true)
 |-- required_date: date (nullable = true)
 |-- shipped_date: date (nullable = true)
 |-- status: string (nullable = true)
 |-- comments: string (nullable = true)
 |-- customer_number: integer (nullable = true)



In [0]:
df_orders.limit(5).display()

order_number,order_date,required_date,shipped_date,status,comments,customer_number
10100,2003-01-06,2003-01-13,2003-01-10,Resolved,,363
10101,2003-01-09,2003-01-18,2003-01-11,Shipped,Check on availability.,128
10102,2003-01-10,2003-01-18,2003-01-14,Shipped,,181
10103,2003-01-29,2003-02-07,2003-02-02,Shipped,,121
10104,2003-01-31,2003-02-09,2003-02-01,Shipped,,141


In [0]:
verificar_dados_nulos(df_orders)

order_number,order_date,required_date,shipped_date,status,comments,customer_number
0,0,0,10,0,247,0


- Coluna <b><i>comments</b></i>: Substituindo os valores nulos por Sem comentários (No comments)

In [0]:
df_orders = df_orders.na.fill({'comments': 'No comments'})
df_orders = df_orders.withColumn('comments', when(df_orders.comments == '', 'No comments').otherwise(df_orders.comments))

- Coluna <b><i>shipped_date</b></i>: Substituindo os valores nulos com uma data padrão que indica o status da ordem como Cancelada ou em Espera

In [0]:
df_orders.filter('shipped_date is Null').display()

order_number,order_date,required_date,shipped_date,status,comments,customer_number
10167,2003-10-23,2003-10-30,,Cancelled,Customer called to cancel. The warehouse was notified in time and the order didn't ship. They have a new VP of Sales and are shifting their sales model. Our VP of Sales should contact them.,448
10248,2004-05-07,2004-05-14,,Cancelled,Order was mistakenly placed. The warehouse noticed the lack of documentation.,131
10260,2004-06-16,2004-06-22,,Cancelled,Customer heard complaints from their customers and called to cancel this order. Will notify the Sales Manager.,357
10262,2004-06-24,2004-07-01,,Cancelled,This customer found a better offer from one of our competitors. Will call back to renegotiate.,141
10334,2004-11-19,2004-11-28,,On Hold,The outstaniding balance for this customer exceeds their credit limit. Order will be shipped when a payment is received.,144
10401,2005-04-03,2005-04-14,,On Hold,Customer credit limit exceeded. Will ship when a payment is received.,328
10407,2005-04-22,2005-05-04,,On Hold,Customer credit limit exceeded. Will ship when a payment is received.,450
10414,2005-05-06,2005-05-13,,On Hold,Customer credit limit exceeded. Will ship when a payment is received.,362
10420,2005-05-29,2005-06-07,,Resolved,No comments,282
10421,2005-05-29,2005-06-06,,In Process,Custom shipping instructions were sent to warehouse,124


In [0]:
df_orders = df_orders.na.fill({'shipped_date': '2023-01-01'})

In [0]:
verificar_dados_nulos(df_orders)

order_number,order_date,required_date,shipped_date,status,comments,customer_number
0,0,0,0,0,0,0


In [0]:
df_orders.write.mode('overwrite').format('delta').option('mergeSchema', 'True').save('/join/silver/df_orders')

#### Payments

In [0]:
df_payments = spark.read.format('delta').load('/join/bronze/df_payments')

exibir_info_df(df_payments)



Linhas = 273 
Colunas = 4

root
 |-- customer_number: integer (nullable = true)
 |-- check_number: string (nullable = true)
 |-- payment_date: date (nullable = true)
 |-- amount: decimal(10,2) (nullable = true)



In [0]:
df_payments.limit(5).display()

customer_number,check_number,payment_date,amount
103,HQ336336,2004-10-19,6066.78
103,JM555205,2003-06-05,14571.44
103,OM314933,2004-12-18,1676.14
112,BO864823,2004-12-17,14191.12
112,HQ55022,2003-06-06,32641.98


In [0]:
verificar_dados_nulos(df_payments)

customer_number,check_number,payment_date,amount
0,0,0,0


In [0]:
df_payments.write.mode('overwrite').format('delta').option('mergeSchema', 'True').save('/join/silver/df_payments')

#### Product Lines

In [0]:

df_product_lines = spark.read.format('delta').load('/join/bronze/df_product_lines')

exibir_info_df(df_product_lines)



Linhas = 7 
Colunas = 4

root
 |-- product_line: string (nullable = true)
 |-- text_description: string (nullable = true)
 |-- html_description: string (nullable = true)
 |-- image: binary (nullable = true)



In [0]:
df_product_lines.display()

product_line,text_description,html_description,image
Classic Cars,"Attention car enthusiasts: Make your wildest car ownership dreams come true. Whether you are looking for classic muscle cars, dream sports cars or movie-inspired miniatures, you will find great choices in this category. These replicas feature superb attention to detail and craftsmanship and offer features such as working steering system, opening forward compartment, opening rear trunk with removable spare wheel, 4-wheel independent spring suspension, and so on. The models range in size from 1:10 to 1:24 scale and include numerous limited edition and several out-of-production vehicles. All models include a certificate of authenticity from their manufacturers and come fully assembled and ready for display in the home or office.",,
Motorcycles,"Our motorcycles are state of the art replicas of classic as well as contemporary motorcycle legends such as Harley Davidson, Ducati and Vespa. Models contain stunning details such as official logos, rotating wheels, working kickstand, front suspension, gear-shift lever, footbrake lever, and drive chain. Materials used include diecast and plastic. The models range in size from 1:10 to 1:50 scale and include numerous limited edition and several out-of-production vehicles. All models come fully assembled and ready for display in the home or office. Most include a certificate of authenticity.",,
Planes,"Unique, diecast airplane and helicopter replicas suitable for collections, as well as home, office or classroom decorations. Models contain stunning details such as official logos and insignias, rotating jet engines and propellers, retractable wheels, and so on. Most come fully assembled and with a certificate of authenticity from their manufacturers.",,
Ships,"The perfect holiday or anniversary gift for executives, clients, friends, and family. These handcrafted model ships are unique, stunning works of art that will be treasured for generations! They come fully assembled and ready for display in the home or office. We guarantee the highest quality, and best value.",,
Trains,"Model trains are a rewarding hobby for enthusiasts of all ages. Whether you're looking for collectible wooden trains, electric streetcars or locomotives, you'll find a number of great choices for any budget within this category. The interactive aspect of trains makes toy trains perfect for young children. The wooden train sets are ideal for children under the age of 5.",,
Trucks and Buses,"The Truck and Bus models are realistic replicas of buses and specialized trucks produced from the early 1920s to present. The models range in size from 1:12 to 1:50 scale and include numerous limited edition and several out-of-production vehicles. Materials used include tin, diecast and plastic. All models include a certificate of authenticity from their manufacturers and are a perfect ornament for the home and office.",,
Vintage Cars,"Our Vintage Car models realistically portray automobiles produced from the early 1900s through the 1940s. Materials used include Bakelite, diecast, plastic and wood. Most of the replicas are in the 1:18 and 1:24 scale sizes, which provide the optimum in detail and accuracy. Prices range from $30.00 up to $180.00 for some special limited edition replicas. All models include a certificate of authenticity from their manufacturers and come fully assembled and ready for display in the home or office.",,


In [0]:
verificar_dados_nulos(df_product_lines)

product_line,text_description,html_description,image
0,0,7,7


- Coluna <b><i>html_description</b></i>: Remoção da coluna
- Coluna <b><i>image</b></i>: Remoção da coluna

In [0]:
df_product_lines.write.mode('overwrite').format('delta').option('mergeSchema', 'True').save('/join/silver/df_product_lines')

#### Products

In [0]:

df_products = spark.read.format('delta').load('/join/bronze/df_products')

exibir_info_df(df_products)




Linhas = 110 
Colunas = 9

root
 |-- product_code: string (nullable = true)
 |-- product_name: string (nullable = true)
 |-- product_line: string (nullable = true)
 |-- product_scale: string (nullable = true)
 |-- product_vendor: string (nullable = true)
 |-- product_description: string (nullable = true)
 |-- quantity_in_stock: short (nullable = true)
 |-- buy_price: decimal(10,2) (nullable = true)
 |-- msrp: decimal(10,2) (nullable = true)



In [0]:
df_products.limit(5).display()

product_code,product_name,product_line,product_scale,product_vendor,product_description,quantity_in_stock,buy_price,msrp
S10_1678,1969 Harley Davidson Ultimate Chopper,Motorcycles,1:10,Min Lin Diecast,"This replica features working kickstand, front suspension, gear-shift lever, footbrake lever, drive chain, wheels and steering. All parts are particularly delicate due to their precise scale and require special care and attention.",7933,48.81,95.7
S10_1949,Alpine Renault 1300,Classic Cars,1:10,Classic Metal Creations,Turnable front wheels; steering function; detailed interior; detailed engine; opening hood; opening trunk; opening doors; and detailed chassis.,7305,98.58,214.3
S10_2016,1996 Moto Guzzi 1100i,Motorcycles,1:10,Highway 66 Mini Classics,"Official Moto Guzzi logos and insignias, saddle bags located on side of motorcycle, detailed engine, working steering, working suspension, two leather seats, luggage rack, dual exhaust pipes, small saddle bag located on handle bars, two-tone paint with chrome accents, superior die-cast detail , rotating wheels , working kick stand, diecast metal with plastic parts and baked enamel finish.",6625,68.99,118.94
S10_4698,2003 Harley-Davidson Eagle Drag Bike,Motorcycles,1:10,Red Start Diecast,"Model features, official Harley Davidson logos and insignias, detachable rear wheelie bar, heavy diecast metal with resin parts, authentic multi-color tampo-printed graphics, separate engine drive belts, free-turning front fork, rotating tires and rear racing slick, certificate of authenticity, detailed engine, display stand\r\n, precision diecast replica, baked enamel finish, 1:10 scale model, removable fender, seat and tank cover piece for displaying the superior detail of the v-twin engine",5582,91.02,193.66
S10_4757,1972 Alfa Romeo GTA,Classic Cars,1:10,Motor City Art Classics,Features include: Turnable front wheels; steering function; detailed interior; detailed engine; opening hood; opening trunk; opening doors; and detailed chassis.,3252,85.68,136.0


In [0]:
verificar_dados_nulos(df_products)

product_code,product_name,product_line,product_scale,product_vendor,product_description,quantity_in_stock,buy_price,msrp
0,0,0,0,0,0,0,0,0


In [0]:
df_products.write.mode('overwrite').format('delta').option('mergeSchema', 'True').save('/join/silver/df_products')