In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

plt.rcParams["figure.figsize"] = (12,6)

## Read file

In [135]:
df = pd.read_csv("data/monday.csv", sep = ';')

In [37]:
df.shape

(4884, 3)

In [15]:
df.head(20)

Unnamed: 0,timestamp,customer_no,location
0,2019-09-02 07:03:00,1,dairy
1,2019-09-02 07:03:00,2,dairy
2,2019-09-02 07:04:00,3,dairy
3,2019-09-02 07:04:00,4,dairy
4,2019-09-02 07:04:00,5,spices
5,2019-09-02 07:04:00,6,spices
6,2019-09-02 07:04:00,7,spices
7,2019-09-02 07:04:00,8,fruit
8,2019-09-02 07:05:00,1,checkout
9,2019-09-02 07:05:00,5,checkout


In [136]:
df['timestamp'] = pd.to_datetime(df.timestamp)
df.set_index('timestamp', inplace = True)

## See example customer and resample

In [17]:
df[df['customer_no'] == 7]

Unnamed: 0_level_0,customer_no,location
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-09-02 07:04:00,7,spices
2019-09-02 07:05:00,7,drinks
2019-09-02 07:09:00,7,spices
2019-09-02 07:11:00,7,fruit
2019-09-02 07:13:00,7,checkout


In [137]:
df[df['customer_no'] == 7].resample('T').ffill()

Unnamed: 0_level_0,customer_no,location
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-09-02 07:04:00,7,spices
2019-09-02 07:05:00,7,drinks
2019-09-02 07:06:00,7,drinks
2019-09-02 07:07:00,7,drinks
2019-09-02 07:08:00,7,drinks
2019-09-02 07:09:00,7,spices
2019-09-02 07:10:00,7,spices
2019-09-02 07:11:00,7,fruit
2019-09-02 07:12:00,7,fruit
2019-09-02 07:13:00,7,checkout


## Resample all customers and forward-fill

In [138]:
df = df.groupby(['customer_no']).resample('T').ffill()

In [41]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,customer_no,location
customer_no,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2019-09-02 07:03:00,1,dairy
1,2019-09-02 07:04:00,1,dairy
1,2019-09-02 07:05:00,1,checkout
2,2019-09-02 07:03:00,2,dairy
2,2019-09-02 07:04:00,2,dairy
...,...,...,...
1444,2019-09-02 21:48:00,1444,spices
1444,2019-09-02 21:49:00,1444,checkout
1445,2019-09-02 21:49:00,1445,dairy
1446,2019-09-02 21:50:00,1446,dairy


In [139]:
df = df.drop(columns = ['customer_no']).reset_index().set_index('timestamp')

## Create 'before' column by shifting

In [140]:
df['before'] = df['location'].shift(1)
df

Unnamed: 0_level_0,customer_no,location,before
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-09-02 07:03:00,1,dairy,
2019-09-02 07:04:00,1,dairy,dairy
2019-09-02 07:05:00,1,checkout,dairy
2019-09-02 07:03:00,2,dairy,checkout
2019-09-02 07:04:00,2,dairy,dairy
...,...,...,...
2019-09-02 21:48:00,1444,spices,dairy
2019-09-02 21:49:00,1444,checkout,spices
2019-09-02 21:49:00,1445,dairy,checkout
2019-09-02 21:50:00,1446,dairy,dairy


## Check dataframe

In [56]:
df.head(20)

Unnamed: 0_level_0,customer_no,location,before
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-09-02 07:03:00,1,dairy,
2019-09-02 07:04:00,1,dairy,dairy
2019-09-02 07:05:00,1,checkout,dairy
2019-09-02 07:03:00,2,dairy,checkout
2019-09-02 07:04:00,2,dairy,dairy
2019-09-02 07:05:00,2,dairy,dairy
2019-09-02 07:06:00,2,checkout,dairy
2019-09-02 07:04:00,3,dairy,checkout
2019-09-02 07:05:00,3,dairy,dairy
2019-09-02 07:06:00,3,checkout,dairy


## Remove checkouts in 'before' column

In [142]:
#df = df[df['before'] != 'checkout']
df['before'] = df.apply(lambda row: 'entrance' if row['before'] == 'checkout' else row['before'], axis = 1)

In [144]:
df['before'].iloc[0] = 'entrance'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [145]:
df.head(10)

Unnamed: 0_level_0,customer_no,location,before
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-09-02 07:03:00,1,dairy,entrance
2019-09-02 07:04:00,1,dairy,dairy
2019-09-02 07:05:00,1,checkout,dairy
2019-09-02 07:03:00,2,dairy,entrance
2019-09-02 07:04:00,2,dairy,dairy
2019-09-02 07:05:00,2,dairy,dairy
2019-09-02 07:06:00,2,checkout,dairy
2019-09-02 07:04:00,3,dairy,entrance
2019-09-02 07:05:00,3,dairy,dairy
2019-09-02 07:06:00,3,checkout,dairy


## Create first location column

In [63]:
df['first_location'] = df.groupby(['customer_no']).location.transform('first')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [66]:
df.head(50)

Unnamed: 0_level_0,customer_no,location,before,first_location
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-09-02 07:03:00,1,dairy,,dairy
2019-09-02 07:04:00,1,dairy,dairy,dairy
2019-09-02 07:05:00,1,checkout,dairy,dairy
2019-09-02 07:04:00,2,dairy,dairy,dairy
2019-09-02 07:05:00,2,dairy,dairy,dairy
2019-09-02 07:06:00,2,checkout,dairy,dairy
2019-09-02 07:05:00,3,dairy,dairy,dairy
2019-09-02 07:06:00,3,checkout,dairy,dairy
2019-09-02 07:05:00,4,dairy,dairy,dairy
2019-09-02 07:06:00,4,dairy,dairy,dairy


## 252 customers go directly to checkout

In [70]:
df.groupby(['first_location']).count()

Unnamed: 0_level_0,customer_no,location,before
first_location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
checkout,252,252,252
dairy,3369,3369,3368
drinks,1775,1775,1775
fruit,2765,2765,2765
spices,1167,1167,1167


In [74]:
np.sum(df[df['first_location'] == 'checkout'].groupby(['customer_no']).count().location > 1)

0

## Remove checkout customers

In [76]:
df = df[df['first_location'] != 'checkout']

## Create following location column

In [78]:
df['timestamp'] = df.index

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [80]:
df['first_location_time'] = df.groupby(['customer_no']).timestamp.transform('first')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [81]:
df

Unnamed: 0_level_0,customer_no,location,before,first_location,timestamp,first_location_time
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-09-02 07:03:00,1,dairy,,dairy,2019-09-02 07:03:00,2019-09-02 07:03:00
2019-09-02 07:04:00,1,dairy,dairy,dairy,2019-09-02 07:04:00,2019-09-02 07:03:00
2019-09-02 07:05:00,1,checkout,dairy,dairy,2019-09-02 07:05:00,2019-09-02 07:03:00
2019-09-02 07:04:00,2,dairy,dairy,dairy,2019-09-02 07:04:00,2019-09-02 07:04:00
2019-09-02 07:05:00,2,dairy,dairy,dairy,2019-09-02 07:05:00,2019-09-02 07:04:00
...,...,...,...,...,...,...
2019-09-02 21:48:00,1443,dairy,fruit,dairy,2019-09-02 21:48:00,2019-09-02 21:48:00
2019-09-02 21:48:00,1444,spices,dairy,spices,2019-09-02 21:48:00,2019-09-02 21:48:00
2019-09-02 21:49:00,1444,checkout,spices,spices,2019-09-02 21:49:00,2019-09-02 21:48:00
2019-09-02 21:50:00,1446,dairy,dairy,dairy,2019-09-02 21:50:00,2019-09-02 21:50:00


In [88]:
np.sum(df[df['first_location_time'] == df.index].groupby('customer_no').location.count() > 1)

0

In [111]:
df['following_location'] = df.apply(lambda row: row['location'] if row['first_location_time'] != row.timestamp else '', axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


## Compare group probabilities

In [115]:
df.groupby('first_location').location.count() / df.location.count()

first_location
dairy     0.371199
drinks    0.195571
fruit     0.304650
spices    0.128581
Name: location, dtype: float64

In [116]:
df.groupby('following_location').location.count() / df.location.count()

following_location
            0.131446
checkout    0.130564
dairy       0.298810
drinks      0.170119
fruit       0.182129
spices      0.086933
Name: location, dtype: float64

## Create transition probabilities

In [147]:
P = pd.crosstab(df['location'], df['before'], normalize=0)
P

before,dairy,drinks,entrance,fruit,spices
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
checkout,0.215727,0.296451,0.0,0.364649,0.123173
dairy,0.744206,0.006581,0.11731,0.064378,0.067525
drinks,0.106497,0.61065,0.11236,0.064485,0.106009
fruit,0.070758,0.072713,0.202111,0.607506,0.046912
spices,0.150651,0.131922,0.228827,0.099349,0.389251


In [131]:
from random import choices

STATES = ['checkout', 'dairy', 'drinks', 'fruit', 'spices']

def mcmc(state, transition_probs):
    """runs a Monte-Carlo Markov-Chain simulation on the supermarket with a checkout as the terminal state."""
    history = [state]
    while state != 'checkout':
        probs = list(P[state])
        state = choices(STATES, probs)[0]
        history.append(state)
    return history

In [148]:
mcmc('entrance', P)

['entrance',
 'fruit',
 'fruit',
 'fruit',
 'fruit',
 'spices',
 'spices',
 'checkout']

In [152]:
markov_chain = []

for i in range(1000):
    
    markov_chain.append(mcmc('entrance', P))

In [157]:
df_markov = pd.DataFrame(markov_chain)

In [177]:
df_markov

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,entrance,spices,spices,drinks,drinks,drinks,spices,spices,checkout,,...,,,,,,,,,,
1,entrance,spices,dairy,checkout,,,,,,,...,,,,,,,,,,
2,entrance,fruit,dairy,dairy,checkout,,,,,,...,,,,,,,,,,
3,entrance,drinks,checkout,,,,,,,,...,,,,,,,,,,
4,entrance,dairy,dairy,fruit,spices,spices,spices,spices,dairy,dairy,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,entrance,dairy,dairy,dairy,dairy,checkout,,,,,...,,,,,,,,,,
996,entrance,fruit,checkout,,,,,,,,...,,,,,,,,,,
997,entrance,fruit,fruit,fruit,fruit,drinks,checkout,,,,...,,,,,,,,,,
998,entrance,fruit,drinks,checkout,,,,,,,...,,,,,,,,,,


In [188]:
df_markov.groupby(8).count()[0]

8
checkout    32
dairy       39
drinks      51
fruit       33
spices      49
Name: 0, dtype: int64