In [1]:
import pandas as pd

#### Step 1: Load and Verify the Cleaned Dataset

*Objective: *Ensure the cleaned dataset is ready for analysis by loading it and verifying the presence of required columns.**

In [2]:
root_path = "../Data_Clean/Data_Cleaned/"

data = pd.read_csv(root_path + 'merged_olist_dataset.csv')

**1. Define required columns**

In [3]:
required_columns = [
    'order_purchase_timestamp', 'payment_value', 'payment_type', 'order_status',
    'price', 'freight_value', 'order_approved_at', 'order_delivered_customer_date',
    'order_estimated_delivery_date', 'order_id'
]

**2. Check for missing columns**

In [4]:
missing_columns = [col for col in required_columns if col not in data.columns]
if missing_columns:
    print(f"Missing columns: {missing_columns}")
else:
    print("All required columns are present.")

# Display basic info to verify data integrity
print(data.info())

All required columns are present.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119143 entries, 0 to 119142
Data columns (total 57 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   order_id                       119143 non-null  object 
 1   customer_id                    119143 non-null  object 
 2   order_status                   119143 non-null  object 
 3   order_purchase_timestamp       119143 non-null  object 
 4   order_approved_at              118966 non-null  object 
 5   order_delivered_carrier_date   117057 non-null  object 
 6   order_delivered_customer_date  115722 non-null  object 
 7   order_estimated_delivery_date  119143 non-null  object 
 8   delivery_time_days             115722 non-null  float64
 9   order_item_id                  118310 non-null  float64
 10  product_id                     118310 non-null  object 
 11  seller_id                      118310 non-null  object 
 

---
---

#### Step 2: Handle Data Types and Final Cleaning
**Objective: *Ensure data types are correct and perform any final cleaning to prepare for analysis.***

*1. Convert date columns to datetime*

In [6]:
date_columns = [
    'order_purchase_timestamp', 'order_approved_at',
    'order_delivered_customer_date', 'order_estimated_delivery_date'
]
for col in date_columns:
    data[col] = pd.to_datetime(data[col], errors='coerce')

**2. Check for missing values in critical columns**

In [7]:
critical_columns = ['order_id', 'order_status', 'price', 'freight_value', 'order_purchase_timestamp']
print("Missing values in critical columns:\n", data[critical_columns].isnull().sum())

Missing values in critical columns:
 order_id                      0
order_status                  0
price                       833
freight_value               833
order_purchase_timestamp      0
dtype: int64


**3. Handle missing values**

In [8]:
data = data.dropna(subset=critical_columns)

**4. Create total_order_value column**

In [9]:
data['total_order_value'] = data['price'] + data['freight_value']

**5. Verify the new column**

In [10]:
print(data[['price', 'freight_value', 'total_order_value']].head())

    price  freight_value  total_order_value
0   29.99           8.72              38.71
1   29.99           8.72              38.71
2   29.99           8.72              38.71
3  118.70          22.76             141.46
4  159.90          19.22             179.12


---
---

#### Step 3: Perform Reconciliation Criteria Calculations
**Objective: *Calculate sales performance metrics as per the challenge requirements.***

**1. Total Revenue (delivered orders)**

In [11]:
total_revenue = data[data['order_status'] == 'delivered']['total_order_value'].sum()

**2. Expected Revenue (approved orders)**

In [12]:
expected_revenue = data[data['order_approved_at'].notnull()]['total_order_value'].sum()

**3. Canceled Orders**

In [13]:
canceled_orders = data[data['order_status'] == 'canceled'].shape[0]

**4. Late Deliveries**

In [14]:
data['is_late'] = data['order_delivered_customer_date'] > data['order_estimated_delivery_date']
late_deliveries = data[data['is_late']].shape[0]

**Print results**

In [15]:
print(f"Total Revenue: {total_revenue:.2f}")
print(f"Expected Revenue: {expected_revenue:.2f}")
print(f"Canceled Orders: {canceled_orders}")
print(f"Late Deliveries: {late_deliveries}")

Total Revenue: 16188779.23
Expected Revenue: 16641776.70
Canceled Orders: 570
Late Deliveries: 9068


---
---