In [2]:
import numpy as np
import pandas as pd

from datetime import datetime
from tqdm import tqdm
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


##### Import Dataset  &  Preprocessing


In [14]:
df = pd.read_csv('datasets/order_brush_order.csv')
df.shape
df.head()
df.isnull().sum()

for c in df.columns:
    if c != 'event_time':
        print(f'{c} has {df[c].unique().shape[0]} unique values.\n')
        
# type transformation
df['orderid'] = df['orderid'].astype(str)
df['shopid'] = df['shopid'].astype(str)
df['userid'] = df['userid'].astype(str)
print(f'type of event_time: {type(df.event_time.values[0])}')
df['event_time'] = pd.to_datetime(df['event_time'])
print(f'type of event_time: {type(df.event_time.values[0])}')



(222750, 4)

Unnamed: 0,orderid,shopid,userid,event_time
0,31076582227611,93950878,30530270,2019-12-27 00:23:03
1,31118059853484,156423439,46057927,2019-12-27 11:54:20
2,31123355095755,173699291,67341739,2019-12-27 13:22:35
3,31122059872723,63674025,149380322,2019-12-27 13:01:00
4,31117075665123,127249066,149493217,2019-12-27 11:37:55


orderid       0
shopid        0
userid        0
event_time    0
dtype: int64

orderid has 222750 unique values.

shopid has 18770 unique values.

userid has 93883 unique values.

type of event_time: <class 'str'>
type of event_time: <class 'numpy.datetime64'>


#### Get orders per shop per user within 1hr

In [15]:
df = df.set_index(pd.DatetimeIndex(df['event_time'])).drop(['event_time'], axis=1).sort_index()
order_byshop_byuser = df.groupby(['shopid','userid', pd.Grouper(freq='H', label='left', base=0)]).count()
order_byshop_byuser.shape
order_byshop_byuser.head()

brush_order = order_byshop_byuser.query('orderid >= 3')
brush_order.shape
brush_order.head()


The new arguments that you should use are 'offset' or 'origin'.

>>> df.resample(freq="3s", base=2)

becomes:

>>> df.resample(freq="3s", offset="2s")

  


(215877, 1)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,orderid
shopid,userid,event_time,Unnamed: 3_level_1
100028340,170326390,2019-12-31 10:00:00,1
100028604,46481617,2019-12-28 18:00:00,1
100028732,116420312,2019-12-30 09:00:00,1
100028732,153538082,2019-12-27 02:00:00,1
100028732,27847434,2019-12-27 16:00:00,1


(742, 1)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,orderid
shopid,userid,event_time,Unnamed: 3_level_1
100446829,2434757,2019-12-28 10:00:00,4
10159,214988798,2019-12-29 20:00:00,3
10199219,8405753,2019-12-31 14:00:00,3
10206302,95058664,2019-12-29 01:00:00,4
102111330,59440029,2019-12-27 15:00:00,3


#### Collect users brushed order

In [18]:
suspicious_users = list()
brush_order.reset_index().groupby(['shopid'])['userid'].apply(lambda x: suspicious_users.append(x.values))
suspicious_users[:10]

submit_users = list()
for item in suspicious_users:
    submit_users.append('&'.join(str(x) for x in item))
submit_users[:10]

shopid
100446829    None
10159        None
10199219     None
10206302     None
102111330    None
             ... 
9676950      None
98481320     None
99067259     None
99833363     None
99836251     None
Name: userid, Length: 296, dtype: object

[array(['2434757'], dtype=object),
 array(['214988798'], dtype=object),
 array(['8405753'], dtype=object),
 array(['95058664'], dtype=object),
 array(['59440029'], dtype=object),
 array(['198097381', '214226569'], dtype=object),
 array(['214226569'], dtype=object),
 array(['77819'], dtype=object),
 array(['188187242'], dtype=object),
 array(['80682958'], dtype=object)]

['2434757',
 '214988798',
 '8405753',
 '95058664',
 '59440029',
 '198097381&214226569',
 '214226569',
 '77819',
 '188187242',
 '80682958']

#### Output submission file

In [21]:
df_output = pd.DataFrame({
    'shopid': brush_order.reset_index().shopid.unique(),
    'userid': submit_users
})
df_output.shape
df_output.head()

df_origin = pd.DataFrame({
    'shopid': df.shopid.unique(),
    'userid': 0
})

df_output = pd.concat([df_origin[~df_origin['shopid'].isin(df_output.shopid.unique())], df_output])
df_output.shape
df_output.head()
df_output.to_csv('output/Week1-submission.csv', index=False)


(296, 2)

Unnamed: 0,shopid,userid
0,100446829,2434757
1,10159,214988798
2,10199219,8405753
3,10206302,95058664
4,102111330,59440029


(18770, 2)

Unnamed: 0,shopid,userid
0,6042309,0
1,104804492,0
3,190969466,0
4,2859407,0
5,94479614,0
