In [1]:
import numpy as np
import pandas as pd
import uuid
import random
from datetime import timedelta

In [2]:
open_orders = dict()
event_types = ['order_created', 'order_fulfilled']
periods = [ts for ts in pd.date_range(start='1/1/2008', end='31/12/2018') if random.random()>0.8]

In [3]:
def new_random_order():
    return {'order_id': uuid.uuid4(),
            'amount': random.random() * 1000,
            'created_at': random.choice(periods)} 

In [4]:
def new_random_event():
    event_type = random.choice(event_types)
    order_info = new_random_order()
    if event_type=='order_created':
        open_orders[order_info['order_id']] = order_info
    else:
        if len(open_orders):
            order_id = random.choice(list(open_orders.keys()))
            order_info = open_orders[order_id]
            order_info['created_at'] = order_info['created_at'] + timedelta(random.randrange(0,180))
        open_orders.pop(order_info['order_id'], None)
        
    return {"event_id": uuid.uuid4(),
            "event_type": event_type,
            "order_id": order_info['order_id'],
            "amount": order_info['amount'],
            "created_at": order_info['created_at']}

In [5]:
df = pd.DataFrame([new_random_event() for _ in range(0, 100)])

In [6]:
df.head()

Unnamed: 0,event_id,event_type,order_id,amount,created_at
0,8062355a-23f4-4b14-8034-d997874f6f07,order_fulfilled,850b9094-79cf-4477-bb4e-0ac9ff216cb0,433.527827,2013-01-21
1,808a9cdf-bb74-4296-a8ba-8787bd09db0b,order_created,eb33363f-62ee-425d-9174-28f812269d2f,882.737114,2009-08-03
2,5a8dd9cb-e254-449c-8c93-e78254fb4745,order_created,ad2ea0d5-ed66-4d5f-a334-f18c07c2f9ee,440.740871,2008-06-25
3,2310fce1-7863-48ab-8fb9-66f9d13ade7f,order_created,28cff0c9-27cd-4d60-8489-0608388cf83f,858.093322,2017-11-19
4,824d8ef7-72ec-43db-9d5b-778edc4ab7d7,order_created,1e4c5e6a-7d3e-4d28-ac56-ae4c4133b917,494.250091,2017-01-01


In [7]:
df.shape

(100, 5)

Q1. List all events with "event_type" as "order_created" where "order_fulfilled" is at least 30 days later than the "order_created" (dates are based on "created_at").

In [8]:
#Creating a new dataframe for order_created and fulfillefd event type
created_df = df.loc[df["event_type"] == "order_created"]
fulfilled_df = df.loc[df["event_type"] == "order_fulfilled"]

In [9]:
print(created_df.head(), '\n' , created_df.shape)

                               event_id     event_type  \
1  808a9cdf-bb74-4296-a8ba-8787bd09db0b  order_created   
2  5a8dd9cb-e254-449c-8c93-e78254fb4745  order_created   
3  2310fce1-7863-48ab-8fb9-66f9d13ade7f  order_created   
4  824d8ef7-72ec-43db-9d5b-778edc4ab7d7  order_created   
9  2586858e-f32c-4762-8b8d-80856991803f  order_created   

                               order_id      amount created_at  
1  eb33363f-62ee-425d-9174-28f812269d2f  882.737114 2009-08-03  
2  ad2ea0d5-ed66-4d5f-a334-f18c07c2f9ee  440.740871 2008-06-25  
3  28cff0c9-27cd-4d60-8489-0608388cf83f  858.093322 2017-11-19  
4  1e4c5e6a-7d3e-4d28-ac56-ae4c4133b917  494.250091 2017-01-01  
9  98fb367b-24ae-4819-8e13-217b8992ede5  892.752257 2017-05-22   
 (53, 5)


In [10]:
print(fulfilled_df.head(), '\n' , fulfilled_df.shape)

                               event_id       event_type  \
0  8062355a-23f4-4b14-8034-d997874f6f07  order_fulfilled   
5  40c0e153-566a-4720-9c03-5e023e2c27c9  order_fulfilled   
6  ef3ba005-1c87-443f-ac99-ecc0038db073  order_fulfilled   
7  d48fbfa0-750b-4d42-865b-33fdeed88548  order_fulfilled   
8  28858538-87b5-4cd6-8079-2170f1d6c3a3  order_fulfilled   

                               order_id      amount created_at  
0  850b9094-79cf-4477-bb4e-0ac9ff216cb0  433.527827 2013-01-21  
5  28cff0c9-27cd-4d60-8489-0608388cf83f  858.093322 2017-12-13  
6  eb33363f-62ee-425d-9174-28f812269d2f  882.737114 2009-08-25  
7  ad2ea0d5-ed66-4d5f-a334-f18c07c2f9ee  440.740871 2008-08-27  
8  1e4c5e6a-7d3e-4d28-ac56-ae4c4133b917  494.250091 2017-02-16   
 (47, 5)


In [11]:
#Merging both dataframes together to form a new dataframe

merged_df = pd.merge(created_df, fulfilled_df, on = ["order_id"])

In [12]:
merged_df.head()

Unnamed: 0,event_id_x,event_type_x,order_id,amount_x,created_at_x,event_id_y,event_type_y,amount_y,created_at_y
0,808a9cdf-bb74-4296-a8ba-8787bd09db0b,order_created,eb33363f-62ee-425d-9174-28f812269d2f,882.737114,2009-08-03,ef3ba005-1c87-443f-ac99-ecc0038db073,order_fulfilled,882.737114,2009-08-25
1,5a8dd9cb-e254-449c-8c93-e78254fb4745,order_created,ad2ea0d5-ed66-4d5f-a334-f18c07c2f9ee,440.740871,2008-06-25,d48fbfa0-750b-4d42-865b-33fdeed88548,order_fulfilled,440.740871,2008-08-27
2,2310fce1-7863-48ab-8fb9-66f9d13ade7f,order_created,28cff0c9-27cd-4d60-8489-0608388cf83f,858.093322,2017-11-19,40c0e153-566a-4720-9c03-5e023e2c27c9,order_fulfilled,858.093322,2017-12-13
3,824d8ef7-72ec-43db-9d5b-778edc4ab7d7,order_created,1e4c5e6a-7d3e-4d28-ac56-ae4c4133b917,494.250091,2017-01-01,28858538-87b5-4cd6-8079-2170f1d6c3a3,order_fulfilled,494.250091,2017-02-16
4,2586858e-f32c-4762-8b8d-80856991803f,order_created,98fb367b-24ae-4819-8e13-217b8992ede5,892.752257,2017-05-22,6350a4d4-ec31-4ae6-9170-0735108196f4,order_fulfilled,892.752257,2017-05-27


In [13]:
merged_df.shape

(33, 9)

In [14]:
#Computing the delayed time and storing it as an integer
merged_df["delayed_time"] = (merged_df["created_at_y"] - merged_df["created_at_x"])
merged_df["delayed_time"] = (merged_df["delayed_time"])/ np.timedelta64(1, "D")
merged_df["delayed_time"] = merged_df["delayed_time"].astype(int)

merged_df

Unnamed: 0,event_id_x,event_type_x,order_id,amount_x,created_at_x,event_id_y,event_type_y,amount_y,created_at_y,delayed_time
0,808a9cdf-bb74-4296-a8ba-8787bd09db0b,order_created,eb33363f-62ee-425d-9174-28f812269d2f,882.737114,2009-08-03,ef3ba005-1c87-443f-ac99-ecc0038db073,order_fulfilled,882.737114,2009-08-25,22
1,5a8dd9cb-e254-449c-8c93-e78254fb4745,order_created,ad2ea0d5-ed66-4d5f-a334-f18c07c2f9ee,440.740871,2008-06-25,d48fbfa0-750b-4d42-865b-33fdeed88548,order_fulfilled,440.740871,2008-08-27,63
2,2310fce1-7863-48ab-8fb9-66f9d13ade7f,order_created,28cff0c9-27cd-4d60-8489-0608388cf83f,858.093322,2017-11-19,40c0e153-566a-4720-9c03-5e023e2c27c9,order_fulfilled,858.093322,2017-12-13,24
3,824d8ef7-72ec-43db-9d5b-778edc4ab7d7,order_created,1e4c5e6a-7d3e-4d28-ac56-ae4c4133b917,494.250091,2017-01-01,28858538-87b5-4cd6-8079-2170f1d6c3a3,order_fulfilled,494.250091,2017-02-16,46
4,2586858e-f32c-4762-8b8d-80856991803f,order_created,98fb367b-24ae-4819-8e13-217b8992ede5,892.752257,2017-05-22,6350a4d4-ec31-4ae6-9170-0735108196f4,order_fulfilled,892.752257,2017-05-27,5
5,439110f1-1e3e-4796-905c-d11fefbd9472,order_created,edffbd5d-5d68-434f-9d62-ed245ee6141b,821.681154,2018-12-15,cbb2bd3a-929f-4344-9cda-07d8210f3a80,order_fulfilled,821.681154,2019-04-27,133
6,a8e38d2d-5220-476c-9465-908b7c14e548,order_created,fcd383f5-08bf-4936-a192-a6b310f334b1,761.541548,2011-12-14,12fe7e99-3cc7-4163-b12e-a8d8b717cb81,order_fulfilled,761.541548,2012-05-01,139
7,abe28e04-a644-4a97-8ba2-28d94bd311d5,order_created,a72e277f-a651-446c-9428-1a84a1dd4730,141.663843,2008-01-28,17968f7c-d30d-4c84-8495-ad22b37b3b4c,order_fulfilled,141.663843,2008-06-18,142
8,b4af99cc-8afe-499a-a4f7-06ec21b0c804,order_created,221304de-bbd0-44f1-9a46-44d5fa00b2c7,195.16401,2010-11-10,79050113-0505-4150-aba2-aa5d4bcb5e40,order_fulfilled,195.16401,2011-03-04,114
9,abbc8b8e-25c4-47dc-8350-121e3e5242b4,order_created,b035b4b1-a2f4-4c9c-87cd-ebf659db4288,275.944331,2017-01-15,af8a82fb-4b01-4d87-b58c-8c87b462e8a3,order_fulfilled,275.944331,2017-02-13,29


In [15]:
#Deleting unwanted columns
columns_to_drop = ["event_id_x", "event_type_x", "event_id_y", "event_type_y", "amount_x",
                  "amount_y"]
merged_df = merged_df.drop(labels= columns_to_drop, axis=1)

In [16]:
#Renaming appropraitely for when orders where created and when they were fulfilled
merged_df.rename(columns= {"created_at_x": "order_date", 
                           "created_at_y": "order_fulfilled"}, inplace = True)

#storing only colums with delayed_time>= 30
df_30 = merged_df.loc[merged_df["delayed_time"] >= 30]
df_30

Unnamed: 0,order_id,order_date,order_fulfilled,delayed_time
1,ad2ea0d5-ed66-4d5f-a334-f18c07c2f9ee,2008-06-25,2008-08-27,63
3,1e4c5e6a-7d3e-4d28-ac56-ae4c4133b917,2017-01-01,2017-02-16,46
5,edffbd5d-5d68-434f-9d62-ed245ee6141b,2018-12-15,2019-04-27,133
6,fcd383f5-08bf-4936-a192-a6b310f334b1,2011-12-14,2012-05-01,139
7,a72e277f-a651-446c-9428-1a84a1dd4730,2008-01-28,2008-06-18,142
8,221304de-bbd0-44f1-9a46-44d5fa00b2c7,2010-11-10,2011-03-04,114
11,528930bb-4bd4-4202-abb2-20f19d669ed0,2017-11-19,2018-05-04,166
12,9166a8f1-2ba4-4528-b8ca-a4983e366759,2008-10-14,2008-11-27,44
13,1e865ade-65df-483b-8781-a5468e2b43fe,2016-10-11,2017-03-26,166
14,7d28c7bc-d4b2-4677-b284-a6a4836a4dff,2016-08-12,2016-09-18,37


In [17]:
#Converting the delayed time into a list
df_30_dict = df_30.to_dict()

Q2: Determine the average, minimum, maximum and count delay per month per calendar year where the delay is greater than 30 days.

In [18]:
df_30.describe()

Unnamed: 0,delayed_time
count,26.0
mean,114.384615
std,44.800962
min,37.0
25%,79.5
50%,115.0
75%,153.25
max,177.0


In [19]:
#Extracting the months and the year from the ordered date
df_30["delayed_month"] = df_30['order_date'].dt.month
df_30["delayed_year"] = df_30['order_date'].dt.year

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [20]:
df_30

Unnamed: 0,order_id,order_date,order_fulfilled,delayed_time,delayed_month,delayed_year
1,ad2ea0d5-ed66-4d5f-a334-f18c07c2f9ee,2008-06-25,2008-08-27,63,6,2008
3,1e4c5e6a-7d3e-4d28-ac56-ae4c4133b917,2017-01-01,2017-02-16,46,1,2017
5,edffbd5d-5d68-434f-9d62-ed245ee6141b,2018-12-15,2019-04-27,133,12,2018
6,fcd383f5-08bf-4936-a192-a6b310f334b1,2011-12-14,2012-05-01,139,12,2011
7,a72e277f-a651-446c-9428-1a84a1dd4730,2008-01-28,2008-06-18,142,1,2008
8,221304de-bbd0-44f1-9a46-44d5fa00b2c7,2010-11-10,2011-03-04,114,11,2010
11,528930bb-4bd4-4202-abb2-20f19d669ed0,2017-11-19,2018-05-04,166,11,2017
12,9166a8f1-2ba4-4528-b8ca-a4983e366759,2008-10-14,2008-11-27,44,10,2008
13,1e865ade-65df-483b-8781-a5468e2b43fe,2016-10-11,2017-03-26,166,10,2016
14,7d28c7bc-d4b2-4677-b284-a6a4836a4dff,2016-08-12,2016-09-18,37,8,2016


In [21]:
#Grouping the delayed_time metrics by delayed_year and delayed_month
df_30_agg = df_30.groupby(["delayed_year", "delayed_month"]).agg(
    {"delayed_time": ["mean", "min", "max", "count"]})
df_30_agg

Unnamed: 0_level_0,Unnamed: 1_level_0,delayed_time,delayed_time,delayed_time,delayed_time
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,min,max,count
delayed_year,delayed_month,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2008,1,142.0,142,142,1
2008,6,63.0,63,63,1
2008,10,44.0,44,44,1
2008,12,173.0,173,173,1
2009,1,105.0,105,105,1
2009,3,56.0,56,56,1
2009,9,169.0,161,177,2
2010,11,114.0,114,114,1
2010,12,103.0,103,103,1
2011,12,139.0,139,139,1
