In [1]:
import pandas as pd
import numpy as np

# Read the raw supply-chain CSV, decoding it as latin-1
df = pd.read_csv('DataCoSupplyChainDataset.csv', encoding='latin-1')

# Parse both timestamp columns into datetime values
for date_col in ('order date (DateOrders)', 'shipping date (DateOrders)'):
    df[date_col] = pd.to_datetime(df[date_col])

# Order the rows by order id, then by order timestamp, so that the
# per-order 'first' aggregation below picks values deterministically
df = df.sort_values(by=['Order Id', 'order date (DateOrders)'])

# Collapse the data to one row per order: for every retained column,
# keep the first value seen within that order's group.
first_value_columns = [
    'order date (DateOrders)',
    'shipping date (DateOrders)',
    'Customer Id',
    'Customer Segment',
    'Department Id',
    'Shipping Mode',
    'Delivery Status',
]
agg_df = (
    df.groupby('Order Id')
      .agg(dict.fromkeys(first_value_columns, 'first'))
      .reset_index()  # turn 'Order Id' back into a regular column
)

# Map each original 'Delivery Status' value to the ordered list of
# shipment states an order passes through. Only cancelled shipments end
# in 'Cancelled'; every other status ends in 'Delivered'.
delivered_flow = ['Processing', 'In Transit', 'Delivered']
cancelled_flow = ['Processing', 'In Transit', 'Cancelled']

# Each key receives its own copy of the list so the entries stay independent.
state_mapping = {
    'Late delivery': list(delivered_flow),
    'Shipping on time': list(delivered_flow),
    'Shipping canceled': list(cancelled_flow),
    'Advance shipping': list(delivered_flow),
}

# Define the function to calculate the intervals for each shipping mode
def get_intervals(shipping_mode, total_duration, num_intervals=3):
    """Split a total duration into equal, floor-divided intervals.

    The original implementation branched on ``shipping_mode`` but both
    branches returned the same value, so the branch has been removed.

    Parameters
    ----------
    shipping_mode : str
        Shipping mode of the order. Currently unused: every mode is split
        evenly. Kept so existing callers keep working.
    total_duration : int or float
        Total duration to split, in whatever unit the caller uses.
    num_intervals : int, default 3
        Number of equal intervals to produce.

    Returns
    -------
    list
        ``num_intervals`` copies of ``total_duration // num_intervals``.
    """
    return [total_duration // num_intervals] * num_intervals

# Define a function to create the new records
def create_shipment_records(row):
    """Expand one aggregated order row into one record per shipment state.

    The states come from the module-level ``state_mapping`` entry for the
    row's 'Delivery Status'. Their timestamps are spread evenly from the
    order date to the shipping date (both inclusive), for every shipping
    mode.

    Parameters
    ----------
    row : mapping-like (e.g. one row of ``agg_df``)
        Must provide 'Order Id', 'Customer Id', 'Customer Segment',
        'Department Id', 'Shipping Mode', 'Delivery Status',
        'order date (DateOrders)' and 'shipping date (DateOrders)'.

    Returns
    -------
    list of dict
        One dict per state, with prefixed ids ('O…', 'C…'), the descriptive
        columns, a 'Date', and one boolean flag per possible state
        ('Processing', 'In Transit', 'Cancelled', 'Delivered').

    Raises
    ------
    KeyError
        If the row's 'Delivery Status' is not a key of ``state_mapping``.
    """
    order_date = row['order date (DateOrders)']
    shipping_date = row['shipping date (DateOrders)']
    delivery_status = row['Delivery Status']

    # Determine the states based on the original delivery status
    if delivery_status not in state_mapping:
        raise KeyError(
            f"Unknown Delivery Status {delivery_status!r} for order "
            f"{row['Order Id']!r}; expected one of {list(state_mapping)}"
        )
    states = state_mapping[delivery_status]

    # Evenly spaced timestamps, one per state. The original computed a
    # per-mode duration and intervals here but never used them, and built
    # the same range in both branches, so that dead code is removed.
    timestamps = list(
        pd.date_range(start=order_date, end=shipping_date, periods=len(states))
    )
    # Ensure the last date is exactly the shipping date
    timestamps[-1] = shipping_date

    # Fields shared by every record of this order
    common_fields = {
        'Order Id': 'O' + str(row['Order Id']),        # prefix 'O' on order id
        'Customer Id': 'C' + str(row['Customer Id']),  # prefix 'C' on customer id
        'Customer Segment': row['Customer Segment'],
        'Department Id': row['Department Id'],
        'Shipping Mode': row['Shipping Mode'],
    }

    return [
        {
            **common_fields,
            'Date': timestamp,
            'Processing': state == 'Processing',
            'In Transit': state == 'In Transit',
            'Cancelled': state == 'Cancelled',
            'Delivered': state == 'Delivered',
        }
        for state, timestamp in zip(states, timestamps)
    ]

# Build the per-order record lists (one list of dicts per aggregated order)
per_order_records = agg_df.apply(create_shipment_records, axis=1)

# Flatten the per-order lists into a single list of records
shipment_records = []
for order_records in per_order_records:
    shipment_records.extend(order_records)

# Assemble the final shipment fact DataFrame
new_df = pd.DataFrame(shipment_records)

# Show the first few rows as a sanity check
print(new_df.head())

  Order Id Customer Id Customer Segment  Department Id   Shipping Mode  \
0       O1      C11599         Consumer              7  Standard Class   
1       O1      C11599         Consumer              7  Standard Class   
2       O1      C11599         Consumer              7  Standard Class   
3       O2        C256         Consumer              7  Standard Class   
4       O2        C256         Consumer              7  Standard Class   

                 Date  Processing  In Transit  Cancelled  Delivered  
0 2015-01-01 00:00:00        True       False      False      False  
1 2015-01-02 00:00:00       False        True      False      False  
2 2015-01-03 00:00:00       False       False      False       True  
3 2015-01-01 00:21:00        True       False      False      False  
4 2015-01-02 12:21:00       False        True      False      False  


In [2]:
# Preview the first ten records shipped via Standard Class
standard_class_mask = new_df['Shipping Mode'].eq('Standard Class')
new_df.loc[standard_class_mask].head(10)

Unnamed: 0,Order Id,Customer Id,Customer Segment,Department Id,Shipping Mode,Date,Processing,In Transit,Cancelled,Delivered
0,O1,C11599,Consumer,7,Standard Class,2015-01-01 00:00:00,True,False,False,False
1,O1,C11599,Consumer,7,Standard Class,2015-01-02 00:00:00,False,True,False,False
2,O1,C11599,Consumer,7,Standard Class,2015-01-03 00:00:00,False,False,False,True
3,O2,C256,Consumer,7,Standard Class,2015-01-01 00:21:00,True,False,False,False
4,O2,C256,Consumer,7,Standard Class,2015-01-02 12:21:00,False,True,False,False
5,O2,C256,Consumer,7,Standard Class,2015-01-04 00:21:00,False,False,False,True
6,O4,C8827,Home Office,6,Standard Class,2015-01-01 01:03:00,True,False,False,False
7,O4,C8827,Home Office,6,Standard Class,2015-01-03 13:03:00,False,True,False,False
8,O4,C8827,Home Office,6,Standard Class,2015-01-06 01:03:00,False,False,False,True
9,O5,C11318,Consumer,4,Standard Class,2015-01-01 01:24:00,True,False,False,False


In [3]:
# Preview the first ten records shipped via First Class
first_class_mask = new_df['Shipping Mode'].eq('First Class')
new_df.loc[first_class_mask].head(10)

Unnamed: 0,Order Id,Customer Id,Customer Segment,Department Id,Shipping Mode,Date,Processing,In Transit,Cancelled,Delivered
54,O21,C2711,Consumer,2,First Class,2015-01-01 07:00:00,True,False,False,False
55,O21,C2711,Consumer,2,First Class,2015-01-02 07:00:00,False,True,False,False
56,O21,C2711,Consumer,2,First Class,2015-01-03 07:00:00,False,False,False,True
81,O33,C5793,Consumer,4,First Class,2015-01-01 11:12:00,True,False,False,False
82,O33,C5793,Consumer,4,First Class,2015-01-02 11:12:00,False,True,False,False
83,O33,C5793,Consumer,4,First Class,2015-01-03 11:12:00,False,False,False,True
84,O34,C4189,Consumer,7,First Class,2015-01-01 11:33:00,True,False,False,False
85,O34,C4189,Consumer,7,First Class,2015-01-02 11:33:00,False,True,False,False
86,O34,C4189,Consumer,7,First Class,2015-01-03 11:33:00,False,False,False,True
102,O41,C8136,Home Office,7,First Class,2015-01-01 14:00:00,True,False,False,False


In [4]:
# Inspect a positional slice (rows 618-621) of the Second Class records
second_class_mask = new_df['Shipping Mode'].eq('Second Class')
new_df.loc[second_class_mask].iloc[618:622]

Unnamed: 0,Order Id,Customer Id,Customer Segment,Department Id,Shipping Mode,Date,Processing,In Transit,Cancelled,Delivered
2985,O1179,C3326,Corporate,5,Second Class,2015-01-18 04:42:00,True,False,False,False
2986,O1179,C3326,Corporate,5,Second Class,2015-01-20 16:42:00,False,True,False,False
2987,O1179,C3326,Corporate,5,Second Class,2015-01-23 04:42:00,False,False,False,True
2997,O1184,C5580,Home Office,7,Second Class,2015-01-18 06:27:00,True,False,False,False


In [5]:
# Inspect a positional slice (rows 1068-1074) of the Same Day records
same_day_mask = new_df['Shipping Mode'].eq('Same Day')
new_df.loc[same_day_mask].iloc[1068:1075]

Unnamed: 0,Order Id,Customer Id,Customer Segment,Department Id,Shipping Mode,Date,Processing,In Transit,Cancelled,Delivered
19668,O7857,C5185,Corporate,5,Same Day,2015-04-25 16:18:00,True,False,False,False
19669,O7857,C5185,Corporate,5,Same Day,2015-04-25 22:18:00,False,True,False,False
19670,O7857,C5185,Corporate,5,Same Day,2015-04-26 04:18:00,False,False,False,True
19671,O7858,C9095,Consumer,5,Same Day,2015-04-25 16:39:00,True,False,False,False
19672,O7858,C9095,Consumer,5,Same Day,2015-04-25 22:39:00,False,True,False,False
19673,O7858,C9095,Consumer,5,Same Day,2015-04-26 04:39:00,False,False,False,True
19875,O7934,C7859,Home Office,4,Same Day,2015-04-26 19:17:00,True,False,False,False


In [None]:
# Prefix department ids with 'D'. Values that already start with 'D' are
# left alone, so re-running this cell cannot produce 'DD…' ids.
# NOTE(review): this cell shows no execution count in the saved notebook;
# confirm it runs before the Excel export if the prefix is wanted there.
department_ids = new_df['Department Id'].astype(str)
new_df['Department Id'] = department_ids.where(
    department_ids.str.startswith('D'),  # keep ids that are already prefixed
    'D' + department_ids,
)

In [None]:
# Show the first five rows of the shipment fact table
new_df.head(n=5)

In [6]:
# Number of records in the shipment fact table
new_df.shape[0]

197256

In [7]:
# Write the shipment fact table to an Excel workbook, omitting the index
output_path = 'ProcessedShipmentFact.xlsx'
new_df.to_excel(output_path, index=False)

print("Shipment Fact table saved successfully")

Shipment Fact table saved successfully
