<a href="https://colab.research.google.com/github/AndresMontesDeOca/Zubale/blob/main/Mid_LevelDataEngineer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 0 Libraries

In [86]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

# 1 Challenge 1: CSV File Manipulation

## 1.2 Load Files

In [92]:
# Products: load the catalogue and rename the generic `id` column (a regular
# column, not the index) to `product_id`.
data_products = (
    pd.read_csv('products.csv')
    .rename(columns={'id': 'product_id'})
)

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
data_products.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   product_id  20 non-null     int64  
 1   name        20 non-null     object 
 2   category    20 non-null     object 
 3   price       20 non-null     float64
dtypes: float64(1), int64(1), object(2)
memory usage: 772.0+ bytes
None


In [88]:
# Orders: load the order lines, parsing `created_date` as datetimes, and
# rename the generic `id` column (a regular column, not the index) to
# `order_id`.
data_orders = (
    pd.read_csv('orders.csv', parse_dates=['created_date'])
    .rename(columns={'id': 'order_id'})
)

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
data_orders.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   order_id      50 non-null     int64         
 1   product_id    50 non-null     int64         
 2   quantity      50 non-null     int64         
 3   created_date  50 non-null     datetime64[ns]
dtypes: datetime64[ns](1), int64(3)
memory usage: 1.7 KB
None 


## 1.3 EDA Products

In [89]:
# Preview both ends of the product table (first five and last five rows).
display(data_products.head(), data_products.tail())

# Observation: in the rows shown, each product_id pairs with the
# correspondingly numbered product name.

Unnamed: 0,product_id,name,category,price
0,1,Product_1,Pants,92.55
1,2,Product_2,Shirts,43.11
2,3,Product_3,Jackets,59.02
3,4,Product_4,Shoes,49.65
4,5,Product_5,Pants,44.59


Unnamed: 0,product_id,name,category,price
15,16,Product_16,Dresses,38.08
16,17,Product_17,Shirts,98.51
17,18,Product_18,Jackets,10.11
18,19,Product_19,Shirts,51.94
19,20,Product_20,Jackets,37.85


In [90]:
# Check that no product is repeated. The test must target the `product_id`
# column: the frame's index is the automatic RangeIndex from read_csv, which is
# unique by construction and would hide duplicated ids.
print('Product id unique?:', data_products['product_id'].is_unique)
print('Product name unique?:', data_products['name'].is_unique, '\n')

# Number of products per category
print(data_products['category'].value_counts())

Product id unique?: True
Product name unique?: True 

category
Shirts     6
Pants      4
Jackets    4
Shoes      3
Dresses    3
Name: count, dtype: int64


## 1.4 EDA Orders

In [91]:
# Preview the first five order rows.
display(data_orders.head(5))

Unnamed: 0,order_id,product_id,quantity,created_date
0,1,11,1,2024-12-01
1,2,17,2,2024-12-01
2,3,19,1,2024-12-01
3,4,12,1,2024-12-01
4,5,11,2,2024-12-01


In [54]:
# Check that no order is repeated. The test must target the `order_id` column:
# the frame's index is the automatic RangeIndex from read_csv, which is unique
# by construction and would hide duplicated ids.
print('Order id unique?:', data_orders['order_id'].is_unique)

# Each order relates with a single Product ID

Order id unique?: True


## 1.5 Merge

In [98]:
# Join every order to its product on the shared `product_id` column.
# - how='inner' (the pandas default, spelled out here): an order whose
#   product_id is missing from the catalogue is dropped from the result.
# - validate='many_to_one': raise MergeError if a product_id appears more than
#   once in data_products, which would otherwise silently duplicate order rows
#   and inflate the totals.
data_merged = (
    pd.merge(
        data_orders,
        data_products,
        on='product_id',
        how='inner',
        validate='many_to_one',
    )
    # Line total = unit price * ordered quantity
    .assign(total_price=lambda df: df['price'] * df['quantity'])
    # Output-facing column names
    .rename(columns={'created_date': 'order_created_date', 'name': 'product_name'})
)

# Final Dataframe: only the columns required in the exported report, in order
data_final = data_merged[['order_created_date', 'order_id', 'product_name', 'quantity', 'total_price']]

display(data_final.head())

Unnamed: 0,order_created_date,order_id,product_name,quantity,total_price
0,2024-12-01,1,Product_11,1,69.06
1,2024-12-01,2,Product_17,2,197.02
2,2024-12-01,3,Product_19,1,51.94
3,2024-12-01,4,Product_12,1,50.99
4,2024-12-01,5,Product_11,2,138.12


## 1.6 Export to CSV

In [99]:
# Write the final order report to disk; index=False omits the positional
# row index so the file contains only the selected columns.
data_final.to_csv(path_or_buf='order_full_information.csv', index=False)