In [1]:
import pandas as pd
import numpy as np
import uuid
import random
from datetime import timedelta

open_orders = dict()
event_types = ['order_created', 'order_fulfilled']
periods = [ts for ts in pd.date_range(start='1/1/2008', end='31/12/2018') if random.random()>0.8]

def new_random_order():
    return {
        'order_id': uuid.uuid4(),
        'amount': random.random() * 1000,
        'created_at': random.choice(periods)
    } 

def new_random_event():
    event_type = random.choice(event_types)
    order_info = new_random_order()
    if event_type=='order_created':
        open_orders[order_info['order_id']] = order_info
    else:
        if len(open_orders):
            order_id = random.choice(list(open_orders.keys()))
            order_info = open_orders[order_id]
            order_info['created_at'] = order_info['created_at'] + timedelta(random.randrange(0,180))
        open_orders.pop(order_info['order_id'], None)
        
    return {
        "event_id": uuid.uuid4(),
        "event_type": event_type,
        "order_id": order_info['order_id'],
        "amount": order_info['amount'],
        "created_at": order_info['created_at']
    }

df = pd.DataFrame([new_random_event() for _ in range(0, 100)])
df

Unnamed: 0,event_id,event_type,order_id,amount,created_at
0,f8e349de-76d7-443f-bd0b-0cfc2a86bdd4,order_fulfilled,7720be7a-8b80-4ed7-86f9-7553f7e82899,993.215476,2009-07-02
1,bf9a0790-e7ad-4848-a085-50e7ab5b7ad9,order_fulfilled,8382caf3-d347-4484-b170-b0d2d33cecec,259.342769,2014-06-19
2,e4e69272-82f3-426f-91a7-f56a2db36d49,order_created,ad1c85df-9bd0-4ab2-b5b0-1a7392dbbdfa,954.594327,2010-09-07
3,391e3205-a62d-49e0-a71c-c8af4d2ccb92,order_fulfilled,ad1c85df-9bd0-4ab2-b5b0-1a7392dbbdfa,954.594327,2010-10-11
4,485e35bf-56d7-4409-9c2d-c123744d1c34,order_created,7d0dec50-709a-4ee6-a9a9-c549eca66636,740.740429,2010-06-13
...,...,...,...,...,...
95,4dd2b552-5f32-4a66-b8d4-d7800a958b0e,order_fulfilled,a75ee4f0-b3be-41e7-8827-b242f9b83a93,791.165922,2018-04-14
96,5f09aca8-da06-4abd-ba8d-d74bcbc0b586,order_fulfilled,9f066b8d-cca2-44cc-9afb-1ba4f7816065,156.367702,2017-01-10
97,8a98059b-f27e-43f7-9a9e-28ac612df7f7,order_fulfilled,0bf4ed13-8d4e-44ec-81bb-6fc15d9470d5,667.351500,2008-03-31
98,72b1590f-0d44-45cb-af1f-74e3407d695a,order_created,b5c1470c-c9ff-4c5c-a266-a4788a1186d7,186.988844,2008-01-19


## Q1. List all events with "event_type" as "order_created" where "order_fulfilled" is at least 30 days later than the "order_created" (dates are based on "created_at").


In [2]:
#Creating separate dataframes for order_created and order_fulfilled
Created_df=df.loc[ df["event_type"] == "order_created"]

Fulfilled_df= df.loc[df["event_type"] == "order_fulfilled"]

In [3]:
#Merging the separated dataframes together
new_merged_df=pd.merge(Created_df, Fulfilled_df, on = ["order_id","amount"], how = "inner")

new_merged_df

Unnamed: 0,event_id_x,event_type_x,order_id,amount,created_at_x,event_id_y,event_type_y,created_at_y
0,e4e69272-82f3-426f-91a7-f56a2db36d49,order_created,ad1c85df-9bd0-4ab2-b5b0-1a7392dbbdfa,954.594327,2010-09-07,391e3205-a62d-49e0-a71c-c8af4d2ccb92,order_fulfilled,2010-10-11
1,485e35bf-56d7-4409-9c2d-c123744d1c34,order_created,7d0dec50-709a-4ee6-a9a9-c549eca66636,740.740429,2010-06-13,e5fb7a5b-81e9-4578-bf5d-df70c46e2118,order_fulfilled,2010-07-18
2,9ec5185b-28fe-48c9-8738-ef2a06812c2f,order_created,933aa650-6643-4ab0-9dd6-d33bc9cc85ef,680.612859,2016-07-08,0ee9ab2c-3d0e-4bf5-bd97-df4d89da6e08,order_fulfilled,2016-08-01
3,398393ff-bc3e-46fc-9361-03dba9863d83,order_created,f00b2831-6418-473e-bc66-f54fec5a4635,146.312325,2010-01-24,13998892-ca9c-4d88-afe7-b43abc7b002d,order_fulfilled,2010-02-05
4,75d8ccb3-3e14-4080-a806-43df95289481,order_created,ed462576-8e26-4355-8c10-e8b9caa632d6,260.71539,2012-10-04,febcdfcb-1c61-4c6b-a253-abfe25b52286,order_fulfilled,2013-02-05
5,9898be49-613a-4ff8-9dcd-4f3d48bab80d,order_created,353f7238-1a39-438d-80e3-cf6d4134d0f3,933.103569,2018-07-26,df86d7ed-3d83-4aff-9cad-388a9a526fa1,order_fulfilled,2018-11-27
6,9ccb8d39-4670-45b1-a9b7-ab5bd42eb935,order_created,713bb1cb-d751-4ef4-9619-dd00316d1290,929.416258,2012-02-06,3f62acd3-26ee-42c1-a596-64c369f92173,order_fulfilled,2012-06-06
7,b840277a-b43e-4f93-8a23-6196b161d77a,order_created,a1277e1d-8ea5-4331-b42d-84ed07b53a6b,455.44105,2010-01-16,4f541257-a69e-4132-83f9-bbf06b5a78d4,order_fulfilled,2010-07-08
8,5c9c2c90-5cba-4ca5-aa6f-701b6f4525bb,order_created,bef8a7fe-cb56-48e0-adc0-2ff67bed519f,886.367861,2011-09-11,d6bda02a-8ae1-4e82-939f-b82498b86062,order_fulfilled,2011-12-24
9,587c8d72-4699-43fe-943b-1df46b863da5,order_created,516975fd-2d7e-4db5-b654-8077a66e20c8,746.532258,2013-01-30,743b8835-ff1b-4f3f-80d3-c61f7443862d,order_fulfilled,2013-06-11


In [4]:
#Getting delay time 
new_merged_df["delay"] = (new_merged_df["created_at_y"] - new_merged_df["created_at_x"]) / np.timedelta64(1, "D")
new_merged_df["delay"] = new_merged_df["delay"].astype(int)

new_merged_df

Unnamed: 0,event_id_x,event_type_x,order_id,amount,created_at_x,event_id_y,event_type_y,created_at_y,delay
0,e4e69272-82f3-426f-91a7-f56a2db36d49,order_created,ad1c85df-9bd0-4ab2-b5b0-1a7392dbbdfa,954.594327,2010-09-07,391e3205-a62d-49e0-a71c-c8af4d2ccb92,order_fulfilled,2010-10-11,34
1,485e35bf-56d7-4409-9c2d-c123744d1c34,order_created,7d0dec50-709a-4ee6-a9a9-c549eca66636,740.740429,2010-06-13,e5fb7a5b-81e9-4578-bf5d-df70c46e2118,order_fulfilled,2010-07-18,35
2,9ec5185b-28fe-48c9-8738-ef2a06812c2f,order_created,933aa650-6643-4ab0-9dd6-d33bc9cc85ef,680.612859,2016-07-08,0ee9ab2c-3d0e-4bf5-bd97-df4d89da6e08,order_fulfilled,2016-08-01,24
3,398393ff-bc3e-46fc-9361-03dba9863d83,order_created,f00b2831-6418-473e-bc66-f54fec5a4635,146.312325,2010-01-24,13998892-ca9c-4d88-afe7-b43abc7b002d,order_fulfilled,2010-02-05,12
4,75d8ccb3-3e14-4080-a806-43df95289481,order_created,ed462576-8e26-4355-8c10-e8b9caa632d6,260.71539,2012-10-04,febcdfcb-1c61-4c6b-a253-abfe25b52286,order_fulfilled,2013-02-05,124
5,9898be49-613a-4ff8-9dcd-4f3d48bab80d,order_created,353f7238-1a39-438d-80e3-cf6d4134d0f3,933.103569,2018-07-26,df86d7ed-3d83-4aff-9cad-388a9a526fa1,order_fulfilled,2018-11-27,124
6,9ccb8d39-4670-45b1-a9b7-ab5bd42eb935,order_created,713bb1cb-d751-4ef4-9619-dd00316d1290,929.416258,2012-02-06,3f62acd3-26ee-42c1-a596-64c369f92173,order_fulfilled,2012-06-06,121
7,b840277a-b43e-4f93-8a23-6196b161d77a,order_created,a1277e1d-8ea5-4331-b42d-84ed07b53a6b,455.44105,2010-01-16,4f541257-a69e-4132-83f9-bbf06b5a78d4,order_fulfilled,2010-07-08,173
8,5c9c2c90-5cba-4ca5-aa6f-701b6f4525bb,order_created,bef8a7fe-cb56-48e0-adc0-2ff67bed519f,886.367861,2011-09-11,d6bda02a-8ae1-4e82-939f-b82498b86062,order_fulfilled,2011-12-24,104
9,587c8d72-4699-43fe-943b-1df46b863da5,order_created,516975fd-2d7e-4db5-b654-8077a66e20c8,746.532258,2013-01-30,743b8835-ff1b-4f3f-80d3-c61f7443862d,order_fulfilled,2013-06-11,132


In [5]:
#Renamong columns and removing unnecessary columns

new_merged_df.rename(columns= {"created_at_x": "Order_date", "created_at_y": "Fulfilled_date"}, inplace = True)

new_merged_df=new_merged_df.drop(["event_type_x" , "event_id_x" , "event_id_y","event_type_y"],axis=1)




In [6]:
new_merged_df

Unnamed: 0,order_id,amount,Order_date,Fulfilled_date,delay
0,ad1c85df-9bd0-4ab2-b5b0-1a7392dbbdfa,954.594327,2010-09-07,2010-10-11,34
1,7d0dec50-709a-4ee6-a9a9-c549eca66636,740.740429,2010-06-13,2010-07-18,35
2,933aa650-6643-4ab0-9dd6-d33bc9cc85ef,680.612859,2016-07-08,2016-08-01,24
3,f00b2831-6418-473e-bc66-f54fec5a4635,146.312325,2010-01-24,2010-02-05,12
4,ed462576-8e26-4355-8c10-e8b9caa632d6,260.71539,2012-10-04,2013-02-05,124
5,353f7238-1a39-438d-80e3-cf6d4134d0f3,933.103569,2018-07-26,2018-11-27,124
6,713bb1cb-d751-4ef4-9619-dd00316d1290,929.416258,2012-02-06,2012-06-06,121
7,a1277e1d-8ea5-4331-b42d-84ed07b53a6b,455.44105,2010-01-16,2010-07-08,173
8,bef8a7fe-cb56-48e0-adc0-2ff67bed519f,886.367861,2011-09-11,2011-12-24,104
9,516975fd-2d7e-4db5-b654-8077a66e20c8,746.532258,2013-01-30,2013-06-11,132


In [7]:
#filtering orders with delay time < 30

df_30=new_merged_df.loc[new_merged_df["delay"] >= 30]
df_30

Unnamed: 0,order_id,amount,Order_date,Fulfilled_date,delay
0,ad1c85df-9bd0-4ab2-b5b0-1a7392dbbdfa,954.594327,2010-09-07,2010-10-11,34
1,7d0dec50-709a-4ee6-a9a9-c549eca66636,740.740429,2010-06-13,2010-07-18,35
4,ed462576-8e26-4355-8c10-e8b9caa632d6,260.71539,2012-10-04,2013-02-05,124
5,353f7238-1a39-438d-80e3-cf6d4134d0f3,933.103569,2018-07-26,2018-11-27,124
6,713bb1cb-d751-4ef4-9619-dd00316d1290,929.416258,2012-02-06,2012-06-06,121
7,a1277e1d-8ea5-4331-b42d-84ed07b53a6b,455.44105,2010-01-16,2010-07-08,173
8,bef8a7fe-cb56-48e0-adc0-2ff67bed519f,886.367861,2011-09-11,2011-12-24,104
9,516975fd-2d7e-4db5-b654-8077a66e20c8,746.532258,2013-01-30,2013-06-11,132
11,da196065-982c-4319-b5f5-57ed09925f87,822.953208,2014-09-14,2014-12-13,90
12,436feb8c-3aee-429a-aeaf-301d23e4f100,943.861502,2008-11-20,2008-12-21,31


In [8]:
#converting df_30 to dictionary

result=df_30.to_dict()

result

{'order_id': {0: UUID('ad1c85df-9bd0-4ab2-b5b0-1a7392dbbdfa'),
  1: UUID('7d0dec50-709a-4ee6-a9a9-c549eca66636'),
  4: UUID('ed462576-8e26-4355-8c10-e8b9caa632d6'),
  5: UUID('353f7238-1a39-438d-80e3-cf6d4134d0f3'),
  6: UUID('713bb1cb-d751-4ef4-9619-dd00316d1290'),
  7: UUID('a1277e1d-8ea5-4331-b42d-84ed07b53a6b'),
  8: UUID('bef8a7fe-cb56-48e0-adc0-2ff67bed519f'),
  9: UUID('516975fd-2d7e-4db5-b654-8077a66e20c8'),
  11: UUID('da196065-982c-4319-b5f5-57ed09925f87'),
  12: UUID('436feb8c-3aee-429a-aeaf-301d23e4f100'),
  13: UUID('f995a4af-9162-46af-b841-6dbc85282121'),
  14: UUID('6bab98e8-a338-49bc-b5ea-3d977f5264f1'),
  15: UUID('3d5cd9a7-c8bf-4a9c-85ca-93fa9336972c'),
  16: UUID('74edff44-ce1c-4be9-b9fd-4a1fc1ad8355'),
  17: UUID('2de3bb61-7ee6-4b07-bd99-c83561ece683'),
  18: UUID('4a95cd62-ff79-4f0b-82e8-cf0a790f41e4'),
  19: UUID('c9085fd6-9bf0-42f6-a6a3-7f514d725993'),
  20: UUID('7548556a-e929-48ff-8f0e-81a2563904d4'),
  21: UUID('f0823950-a667-4a72-9f0a-a44576c74215'),
  22: UU

## Q2. Determine the average, minimum, maximum and count delay per month per calendar year where the delay is greater than 30 days.

In [9]:
df_30['delay'].describe()

count     37.000000
mean     105.135135
std       43.627949
min       31.000000
25%       74.000000
50%      104.000000
75%      139.000000
max      173.000000
Name: delay, dtype: float64

In [10]:
#getting year,month from df_30

df_30["Fulfilled_year"] = df_30["Fulfilled_date"].dt.year
df_30["Fulfilled_month"] = df_30["Fulfilled_date"].dt.month
df_30

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_30["Fulfilled_year"] = df_30["Fulfilled_date"].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_30["Fulfilled_month"] = df_30["Fulfilled_date"].dt.month


Unnamed: 0,order_id,amount,Order_date,Fulfilled_date,delay,Fulfilled_year,Fulfilled_month
0,ad1c85df-9bd0-4ab2-b5b0-1a7392dbbdfa,954.594327,2010-09-07,2010-10-11,34,2010,10
1,7d0dec50-709a-4ee6-a9a9-c549eca66636,740.740429,2010-06-13,2010-07-18,35,2010,7
4,ed462576-8e26-4355-8c10-e8b9caa632d6,260.71539,2012-10-04,2013-02-05,124,2013,2
5,353f7238-1a39-438d-80e3-cf6d4134d0f3,933.103569,2018-07-26,2018-11-27,124,2018,11
6,713bb1cb-d751-4ef4-9619-dd00316d1290,929.416258,2012-02-06,2012-06-06,121,2012,6
7,a1277e1d-8ea5-4331-b42d-84ed07b53a6b,455.44105,2010-01-16,2010-07-08,173,2010,7
8,bef8a7fe-cb56-48e0-adc0-2ff67bed519f,886.367861,2011-09-11,2011-12-24,104,2011,12
9,516975fd-2d7e-4db5-b654-8077a66e20c8,746.532258,2013-01-30,2013-06-11,132,2013,6
11,da196065-982c-4319-b5f5-57ed09925f87,822.953208,2014-09-14,2014-12-13,90,2014,12
12,436feb8c-3aee-429a-aeaf-301d23e4f100,943.861502,2008-11-20,2008-12-21,31,2008,12


In [11]:
result1 = df_30.groupby(["Fulfilled_year", "Fulfilled_month"]).agg({"delay": ["mean", "min", "max", "count"]})
result1

Unnamed: 0_level_0,Unnamed: 1_level_0,delay,delay,delay,delay
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,min,max,count
Fulfilled_year,Fulfilled_month,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2008,10,89.0,89,89,1
2008,12,31.0,31,31,1
2009,2,89.0,89,89,1
2009,5,74.0,74,74,1
2010,3,148.0,148,148,1
2010,7,103.25,35,173,4
2010,10,34.0,34,34,1
2011,12,104.0,104,104,1
2012,4,166.0,166,166,1
2012,6,121.0,121,121,1
