# 1. Load libraries and data

In [1]:
#!pwd

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datetime import timedelta
import os
pd.options.mode.chained_assignment = None 

## Append csv files

In [3]:
def add_dataframes():
    df = pd.DataFrame()
    for file_name in os.listdir('../data/'):
        df_weekday = pd.read_csv('../data/'+file_name, delimiter=";")
        #########
        df=pd.concat([df,df_weekday],axis=0).reset_index(drop=True)
    df=df.sort_values(['timestamp'],axis = 0, ascending = True).reset_index(drop=True)
    return df

In [4]:
df_raw=add_dataframes()
df_raw

Unnamed: 0,timestamp,customer_no,location,"entry,dairy,spices,drinks,fruit,checkout"
0,2019-09-02 07:03:00,1.0,dairy,
1,2019-09-02 07:03:00,2.0,dairy,
2,2019-09-02 07:04:00,3.0,dairy,
3,2019-09-02 07:04:00,4.0,dairy,
4,2019-09-02 07:04:00,5.0,spices,
...,...,...,...,...
24878,,,,"0.0,0.7370226646420336,0.05134694336651482,0.0..."
24879,,,,"0.0,0.19321439949028352,0.4021981522777955,0.1..."
24880,,,,"0.0,0.010899742930591259,0.08699228791773779,0..."
24881,,,,"0.0,0.09574384391471953,0.05066477853827393,0...."


# 2. Inspect Data

In [5]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24883 entries, 0 to 24882
Data columns (total 4 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   timestamp                                 24877 non-null  object 
 1   customer_no                               24877 non-null  float64
 2   location                                  24877 non-null  object 
 3   entry,dairy,spices,drinks,fruit,checkout  6 non-null      object 
dtypes: float64(1), object(3)
memory usage: 777.7+ KB


In [6]:
#convert column timestamp to datetime
df_raw['timestamp'] = pd.to_datetime(df_raw['timestamp'])
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24883 entries, 0 to 24882
Data columns (total 4 columns):
 #   Column                                    Non-Null Count  Dtype         
---  ------                                    --------------  -----         
 0   timestamp                                 24877 non-null  datetime64[ns]
 1   customer_no                               24877 non-null  float64       
 2   location                                  24877 non-null  object        
 3   entry,dairy,spices,drinks,fruit,checkout  6 non-null      object        
dtypes: datetime64[ns](1), float64(1), object(2)
memory usage: 777.7+ KB


## create df missing checkouts

In [7]:
# number of state checkout
customer_checkout=df_raw['location'].value_counts()[0]
customer_checkout

7417

In [8]:
# number of all customers in the week
customer_total=df_raw.groupby([df_raw['timestamp'].dt.date, 'customer_no']).first().shape[0]
customer_total

7445

In [9]:
#customers without checkout timestemp
not_checkout=customer_total-customer_checkout
not_checkout

28

In [10]:
#create column with last recorded timestamp for each customer per day (grouped by day+customer_no)
df_raw['timestamp_last'] = df_raw.groupby([df_raw['timestamp'].dt.date, df_raw['customer_no']])['timestamp'].transform('last')
df_raw

Unnamed: 0,timestamp,customer_no,location,"entry,dairy,spices,drinks,fruit,checkout",timestamp_last
0,2019-09-02 07:03:00,1.0,dairy,,2019-09-02 07:05:00
1,2019-09-02 07:03:00,2.0,dairy,,2019-09-02 07:06:00
2,2019-09-02 07:04:00,3.0,dairy,,2019-09-02 07:06:00
3,2019-09-02 07:04:00,4.0,dairy,,2019-09-02 07:08:00
4,2019-09-02 07:04:00,5.0,spices,,2019-09-02 07:05:00
...,...,...,...,...,...
24878,NaT,,,"0.0,0.7370226646420336,0.05134694336651482,0.0...",NaT
24879,NaT,,,"0.0,0.19321439949028352,0.4021981522777955,0.1...",NaT
24880,NaT,,,"0.0,0.010899742930591259,0.08699228791773779,0...",NaT
24881,NaT,,,"0.0,0.09574384391471953,0.05066477853827393,0....",NaT


In [11]:
df_missing_checkouts = df_raw.loc[(df_raw['timestamp'] == df_raw['timestamp_last']) & (df_raw['location'] != 'checkout')]
df_missing_checkouts

Unnamed: 0,timestamp,customer_no,location,"entry,dairy,spices,drinks,fruit,checkout",timestamp_last
4861,2019-09-02 21:46:00,1439.0,fruit,,2019-09-02 21:46:00
4866,2019-09-02 21:47:00,1437.0,dairy,,2019-09-02 21:47:00
4868,2019-09-02 21:48:00,1443.0,dairy,,2019-09-02 21:48:00
4875,2019-09-02 21:49:00,1430.0,fruit,,2019-09-02 21:49:00
4876,2019-09-02 21:49:00,1433.0,fruit,,2019-09-02 21:49:00
4877,2019-09-02 21:49:00,1440.0,spices,,2019-09-02 21:49:00
4878,2019-09-02 21:49:00,1441.0,spices,,2019-09-02 21:49:00
4881,2019-09-02 21:49:00,1445.0,dairy,,2019-09-02 21:49:00
4882,2019-09-02 21:50:00,1446.0,dairy,,2019-09-02 21:50:00
4883,2019-09-02 21:50:00,1447.0,fruit,,2019-09-02 21:50:00


## create df missing entries

In [12]:
#create column with last recorded timestamp for each customer per day (grouped by day+customer_no)
df_raw['timestamp_first'] = df_raw.groupby([df_raw['timestamp'].dt.date, df_raw['customer_no']])['timestamp'].transform('first')
df_raw

Unnamed: 0,timestamp,customer_no,location,"entry,dairy,spices,drinks,fruit,checkout",timestamp_last,timestamp_first
0,2019-09-02 07:03:00,1.0,dairy,,2019-09-02 07:05:00,2019-09-02 07:03:00
1,2019-09-02 07:03:00,2.0,dairy,,2019-09-02 07:06:00,2019-09-02 07:03:00
2,2019-09-02 07:04:00,3.0,dairy,,2019-09-02 07:06:00,2019-09-02 07:04:00
3,2019-09-02 07:04:00,4.0,dairy,,2019-09-02 07:08:00,2019-09-02 07:04:00
4,2019-09-02 07:04:00,5.0,spices,,2019-09-02 07:05:00,2019-09-02 07:04:00
...,...,...,...,...,...,...
24878,NaT,,,"0.0,0.7370226646420336,0.05134694336651482,0.0...",NaT,NaT
24879,NaT,,,"0.0,0.19321439949028352,0.4021981522777955,0.1...",NaT,NaT
24880,NaT,,,"0.0,0.010899742930591259,0.08699228791773779,0...",NaT,NaT
24881,NaT,,,"0.0,0.09574384391471953,0.05066477853827393,0....",NaT,NaT


In [13]:
df_missing_entries = df_raw.loc[(df_raw['timestamp'] == df_raw['timestamp_first'])]
df_missing_entries

Unnamed: 0,timestamp,customer_no,location,"entry,dairy,spices,drinks,fruit,checkout",timestamp_last,timestamp_first
0,2019-09-02 07:03:00,1.0,dairy,,2019-09-02 07:05:00,2019-09-02 07:03:00
1,2019-09-02 07:03:00,2.0,dairy,,2019-09-02 07:06:00,2019-09-02 07:03:00
2,2019-09-02 07:04:00,3.0,dairy,,2019-09-02 07:06:00,2019-09-02 07:04:00
3,2019-09-02 07:04:00,4.0,dairy,,2019-09-02 07:08:00,2019-09-02 07:04:00
4,2019-09-02 07:04:00,5.0,spices,,2019-09-02 07:05:00,2019-09-02 07:04:00
...,...,...,...,...,...,...
24864,2019-09-06 21:48:00,1506.0,dairy,,2019-09-06 21:48:00,2019-09-06 21:48:00
24865,2019-09-06 21:48:00,1507.0,dairy,,2019-09-06 21:50:00,2019-09-06 21:48:00
24866,2019-09-06 21:48:00,1508.0,dairy,,2019-09-06 21:50:00,2019-09-06 21:48:00
24873,2019-09-06 21:50:00,1509.0,drinks,,2019-09-06 21:50:00,2019-09-06 21:50:00


In [14]:
#all customers have no Entry timestep?
df_missing_entries.shape[0]==customer_total

True

# 3. Preprocessing

## 3.1. Missing checkouts/entries

In [15]:
def insert_rows(df_original,df_checkout,df_entry):
    #Name location to checkout
    df_checkout.loc[:,['location']] = ['checkout']  
    #change time-stamp higher
    df_checkout['timestamp'] = df_checkout['timestamp'] + pd.Timedelta(minutes=1)
    
    #Name location to entry
    df_entry.loc[:,['location']] = ['entry']  
    #change time-stamp lower
    df_entry['timestamp'] = df_entry['timestamp'] + pd.Timedelta(minutes=-1)
    
    #concat to original
    df_new=pd.concat([df_original,df_checkout,df_entry],axis=0)
    
    print(f' Sum rows of all df`s is equal to rows df_clean: {df_original.shape[0]+df_checkout.shape[0]+df_entry.shape[0]==df_new.shape[0]}')
    
    return df_new

In [16]:
# insert missing checkout rows
df_clean=insert_rows(df_raw,df_missing_checkouts,df_missing_entries)
df_clean

 Sum rows of all df`s is equal to rows df_clean: True


Unnamed: 0,timestamp,customer_no,location,"entry,dairy,spices,drinks,fruit,checkout",timestamp_last,timestamp_first
0,2019-09-02 07:03:00,1.0,dairy,,2019-09-02 07:05:00,2019-09-02 07:03:00
1,2019-09-02 07:03:00,2.0,dairy,,2019-09-02 07:06:00,2019-09-02 07:03:00
2,2019-09-02 07:04:00,3.0,dairy,,2019-09-02 07:06:00,2019-09-02 07:04:00
3,2019-09-02 07:04:00,4.0,dairy,,2019-09-02 07:08:00,2019-09-02 07:04:00
4,2019-09-02 07:04:00,5.0,spices,,2019-09-02 07:05:00,2019-09-02 07:04:00
...,...,...,...,...,...,...
24864,2019-09-06 21:47:00,1506.0,entry,,2019-09-06 21:48:00,2019-09-06 21:48:00
24865,2019-09-06 21:47:00,1507.0,entry,,2019-09-06 21:50:00,2019-09-06 21:48:00
24866,2019-09-06 21:47:00,1508.0,entry,,2019-09-06 21:50:00,2019-09-06 21:48:00
24873,2019-09-06 21:49:00,1509.0,entry,,2019-09-06 21:50:00,2019-09-06 21:50:00


In [17]:
#df_clean['shifty']=df_clean.groupby([df_clean['timestamp'].dt.date,'customer_no'])[['location']].count()
df_clean.loc[df_clean['customer_no'] == 1].sort_values('timestamp')

Unnamed: 0,timestamp,customer_no,location,"entry,dairy,spices,drinks,fruit,checkout",timestamp_last,timestamp_first
0,2019-09-02 07:02:00,1.0,entry,,2019-09-02 07:05:00,2019-09-02 07:03:00
0,2019-09-02 07:03:00,1.0,dairy,,2019-09-02 07:05:00,2019-09-02 07:03:00
8,2019-09-02 07:05:00,1.0,checkout,,2019-09-02 07:05:00,2019-09-02 07:03:00
4885,2019-09-03 07:01:00,1.0,entry,,2019-09-03 07:12:00,2019-09-03 07:02:00
4885,2019-09-03 07:02:00,1.0,fruit,,2019-09-03 07:12:00,2019-09-03 07:02:00
4887,2019-09-03 07:05:00,1.0,drinks,,2019-09-03 07:12:00,2019-09-03 07:02:00
4913,2019-09-03 07:12:00,1.0,checkout,,2019-09-03 07:12:00,2019-09-03 07:02:00
9603,2019-09-04 06:59:00,1.0,entry,,2019-09-04 07:02:00,2019-09-04 07:00:00
9603,2019-09-04 07:00:00,1.0,fruit,,2019-09-04 07:02:00,2019-09-04 07:00:00
9607,2019-09-04 07:02:00,1.0,checkout,,2019-09-04 07:02:00,2019-09-04 07:00:00


## 3.2. Forward fill every minute

In [18]:
#Create date (yyyy-mm-dd) column
df_clean['date']=df_clean['timestamp'].dt.date
df_clean

Unnamed: 0,timestamp,customer_no,location,"entry,dairy,spices,drinks,fruit,checkout",timestamp_last,timestamp_first,date
0,2019-09-02 07:03:00,1.0,dairy,,2019-09-02 07:05:00,2019-09-02 07:03:00,2019-09-02
1,2019-09-02 07:03:00,2.0,dairy,,2019-09-02 07:06:00,2019-09-02 07:03:00,2019-09-02
2,2019-09-02 07:04:00,3.0,dairy,,2019-09-02 07:06:00,2019-09-02 07:04:00,2019-09-02
3,2019-09-02 07:04:00,4.0,dairy,,2019-09-02 07:08:00,2019-09-02 07:04:00,2019-09-02
4,2019-09-02 07:04:00,5.0,spices,,2019-09-02 07:05:00,2019-09-02 07:04:00,2019-09-02
...,...,...,...,...,...,...,...
24864,2019-09-06 21:47:00,1506.0,entry,,2019-09-06 21:48:00,2019-09-06 21:48:00,2019-09-06
24865,2019-09-06 21:47:00,1507.0,entry,,2019-09-06 21:50:00,2019-09-06 21:48:00,2019-09-06
24866,2019-09-06 21:47:00,1508.0,entry,,2019-09-06 21:50:00,2019-09-06 21:48:00,2019-09-06
24873,2019-09-06 21:49:00,1509.0,entry,,2019-09-06 21:50:00,2019-09-06 21:50:00,2019-09-06


In [19]:
#Set index to timestamp
df_test=df_clean.set_index('timestamp').sort_index()
df_test
#df_test.loc[df_test['customer_no'] == 1] # .sort_values('timestamp')

Unnamed: 0_level_0,customer_no,location,"entry,dairy,spices,drinks,fruit,checkout",timestamp_last,timestamp_first,date
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-09-02 07:02:00,1.0,entry,,2019-09-02 07:05:00,2019-09-02 07:03:00,2019-09-02
2019-09-02 07:02:00,2.0,entry,,2019-09-02 07:06:00,2019-09-02 07:03:00,2019-09-02
2019-09-02 07:03:00,3.0,entry,,2019-09-02 07:06:00,2019-09-02 07:04:00,2019-09-02
2019-09-02 07:03:00,4.0,entry,,2019-09-02 07:08:00,2019-09-02 07:04:00,2019-09-02
2019-09-02 07:03:00,5.0,entry,,2019-09-02 07:05:00,2019-09-02 07:04:00,2019-09-02
...,...,...,...,...,...,...
NaT,,,"0.0,0.7370226646420336,0.05134694336651482,0.0...",NaT,NaT,NaT
NaT,,,"0.0,0.19321439949028352,0.4021981522777955,0.1...",NaT,NaT,NaT
NaT,,,"0.0,0.010899742930591259,0.08699228791773779,0...",NaT,NaT,NaT
NaT,,,"0.0,0.09574384391471953,0.05066477853827393,0....",NaT,NaT,NaT


In [None]:
list(df_test.groupby(['customer_no','date']))

In [21]:
# Forward fill missing timesteps (every minute is necessary)
# resample gets applied to each dataframe inside the groupby
df_resampled = df_test.groupby(['customer_no','date']).resample('60S').first().ffill()
df_resampled

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,customer_no,location,"entry,dairy,spices,drinks,fruit,checkout",timestamp_last,timestamp_first,date
customer_no,date,timestamp,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1.0,2019-09-02,2019-09-02 07:02:00,1.0,entry,,2019-09-02 07:05:00,2019-09-02 07:03:00,2019-09-02
1.0,2019-09-02,2019-09-02 07:03:00,1.0,dairy,,2019-09-02 07:05:00,2019-09-02 07:03:00,2019-09-02
1.0,2019-09-02,2019-09-02 07:04:00,1.0,dairy,,2019-09-02 07:05:00,2019-09-02 07:03:00,2019-09-02
1.0,2019-09-02,2019-09-02 07:05:00,1.0,checkout,,2019-09-02 07:05:00,2019-09-02 07:03:00,2019-09-02
1.0,2019-09-03,2019-09-03 07:01:00,1.0,entry,,2019-09-03 07:12:00,2019-09-03 07:02:00,2019-09-03
...,...,...,...,...,...,...,...,...
1534.0,2019-09-05,2019-09-05 21:48:00,1534.0,fruit,,2019-09-05 21:49:00,2019-09-05 21:48:00,2019-09-05
1534.0,2019-09-05,2019-09-05 21:49:00,1534.0,checkout,,2019-09-05 21:49:00,2019-09-05 21:48:00,2019-09-05
1535.0,2019-09-05,2019-09-05 21:47:00,1535.0,entry,,2019-09-05 21:49:00,2019-09-05 21:48:00,2019-09-05
1535.0,2019-09-05,2019-09-05 21:48:00,1535.0,spices,,2019-09-05 21:49:00,2019-09-05 21:48:00,2019-09-05


 # 4. EDA

### Total number of customers in each section
//bei uhrzeit gruppieren

In [None]:
mon1 = mon.copy()
mon1

In [None]:
#mon1.reset_index(drop=True, inplace=True)
mon1_count_customer_location_series = mon1.groupby(by=["location"])["location"].count()
mon1_count_customer_location_series

In [None]:
mon1_count_customer_location_df = mon1.groupby(by=["location"]).count()
mon1_count_customer_location_df

In [None]:
type(mon1)

In [None]:
mon.shape

### Total number of customers in each section over time

In [None]:
mon2 = mon.copy()

In [None]:
mon2_grouped_column = mon2.groupby(by=["location"]).count()

In [None]:
mon2_grouped_column

Unnamed: 0_level_0,customer_no
location,Unnamed: 1_level_1
checkout,1437
dairy,3495
drinks,2047
fruit,2558
spices,1228


In [None]:
#locations = ["drinks", "dairy", "spices", "fruit", "checkout"]

In [None]:
#for location in locations:
#    mon2[location+"_cu#"] = ""

In [None]:
mon2

Unnamed: 0_level_0,customer_no,location
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-09-02 07:03:00,1.0,dairy
2019-09-02 07:04:00,1.0,dairy
2019-09-02 07:05:00,1.0,checkout
2019-09-02 07:03:00,2.0,dairy
2019-09-02 07:04:00,2.0,dairy
...,...,...
2019-09-02 21:48:00,1444.0,spices
2019-09-02 21:49:00,1444.0,checkout
2019-09-02 21:49:00,1445.0,dairy
2019-09-02 21:50:00,1446.0,dairy


In [None]:
# Display the number of customers at checkout over time                  FLORIAN
mon_df.groupby(['timestamp','location'])[['customer_no']].count()

In [None]:
# Calculate the total number of customers in the supermarket over time.  FLORIAN
cust_over_time = pd.DataFrame(mon_df['customer_no'].groupby('timestamp').count())
cust_over_time

# 5. Markov Chain Model

## 5.1. Shift

In [22]:
#Creat shifted column
df_resampled['location_next'] = df_resampled['location'].shift(-1)
df_resampled

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,customer_no,location,"entry,dairy,spices,drinks,fruit,checkout",timestamp_last,timestamp_first,date,location_next
customer_no,date,timestamp,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1.0,2019-09-02,2019-09-02 07:02:00,1.0,entry,,2019-09-02 07:05:00,2019-09-02 07:03:00,2019-09-02,dairy
1.0,2019-09-02,2019-09-02 07:03:00,1.0,dairy,,2019-09-02 07:05:00,2019-09-02 07:03:00,2019-09-02,dairy
1.0,2019-09-02,2019-09-02 07:04:00,1.0,dairy,,2019-09-02 07:05:00,2019-09-02 07:03:00,2019-09-02,checkout
1.0,2019-09-02,2019-09-02 07:05:00,1.0,checkout,,2019-09-02 07:05:00,2019-09-02 07:03:00,2019-09-02,entry
1.0,2019-09-03,2019-09-03 07:01:00,1.0,entry,,2019-09-03 07:12:00,2019-09-03 07:02:00,2019-09-03,fruit
...,...,...,...,...,...,...,...,...,...
1534.0,2019-09-05,2019-09-05 21:48:00,1534.0,fruit,,2019-09-05 21:49:00,2019-09-05 21:48:00,2019-09-05,checkout
1534.0,2019-09-05,2019-09-05 21:49:00,1534.0,checkout,,2019-09-05 21:49:00,2019-09-05 21:48:00,2019-09-05,entry
1535.0,2019-09-05,2019-09-05 21:47:00,1535.0,entry,,2019-09-05 21:49:00,2019-09-05 21:48:00,2019-09-05,spices
1535.0,2019-09-05,2019-09-05 21:48:00,1535.0,spices,,2019-09-05 21:49:00,2019-09-05 21:48:00,2019-09-05,checkout


In [23]:
#avoid checkout --> entry pairs
#instead self-reference checkout-->checkout
df_resampled['location_next']
df_resampled["location_next"][(df_resampled['location_next']=='entry')] = 'checkout'
df_resampled[['customer_no','location','location_next']]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,customer_no,location,location_next
customer_no,date,timestamp,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1.0,2019-09-02,2019-09-02 07:02:00,1.0,entry,dairy
1.0,2019-09-02,2019-09-02 07:03:00,1.0,dairy,dairy
1.0,2019-09-02,2019-09-02 07:04:00,1.0,dairy,checkout
1.0,2019-09-02,2019-09-02 07:05:00,1.0,checkout,checkout
1.0,2019-09-03,2019-09-03 07:01:00,1.0,entry,fruit
...,...,...,...,...,...
1534.0,2019-09-05,2019-09-05 21:48:00,1534.0,fruit,checkout
1534.0,2019-09-05,2019-09-05 21:49:00,1534.0,checkout,checkout
1535.0,2019-09-05,2019-09-05 21:47:00,1535.0,entry,spices
1535.0,2019-09-05,2019-09-05 21:48:00,1535.0,spices,checkout


## 5.2. Transition matrix

In [24]:
# If passed ‘index’ will normalize over each row.
# default is False , which will give you the count

P = pd.crosstab(
    df_resampled['location'], 
    df_resampled['location_next'], normalize='index')
P

location_next,checkout,dairy,drinks,fruit,spices
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
checkout,1.0,0.0,0.0,0.0,0.0
dairy,0.103466,0.73675,0.058569,0.049848,0.051367
drinks,0.21571,0.010899,0.598499,0.087909,0.086983
entry,0.0,0.287576,0.153526,0.377435,0.181464
fruit,0.201605,0.095924,0.054847,0.596947,0.050677
spices,0.150685,0.193214,0.163109,0.090953,0.402039


In [25]:
# insert a new column "entry" for next-state to create a 6x6 Matrix
P['entry']=0.00000
P

location_next,checkout,dairy,drinks,fruit,spices,entry
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
checkout,1.0,0.0,0.0,0.0,0.0,0.0
dairy,0.103466,0.73675,0.058569,0.049848,0.051367,0.0
drinks,0.21571,0.010899,0.598499,0.087909,0.086983,0.0
entry,0.0,0.287576,0.153526,0.377435,0.181464,0.0
fruit,0.201605,0.095924,0.054847,0.596947,0.050677,0.0
spices,0.150685,0.193214,0.163109,0.090953,0.402039,0.0


In [29]:
P.to_csv("transition_probability_matrix.csv", sep=";")

In [27]:
#test one customer for just step+1
initial_state = np.array([0,0,1,0,0,0])

# perform matrix dot multiplication
np.dot(initial_state, P)

array([0.21571047, 0.01089862, 0.59849887, 0.0879087 , 0.08698334,
       0.        ])

In [28]:
#forecast 10 minutes
initial_state = np.array([0,0,0,1,0,0]) 
states = [initial_state] # random walk 
next_state = np.nan

for i in range(10):
    next_state = states[i].dot(P)
    states.append(next_state)

next_state

array([0.66095253, 0.16109686, 0.07674031, 0.06830655, 0.03290376,
       0.        ])