In [1]:
import sys

sys.path.append('../')

In [2]:
from typing import Text
import pandas as pd

In [3]:
def create_mock_raw_dataframe() -> pd.DataFrame:
    """Build the multi-sequence mock input used by the examples.

    Returns:
        A frame with one row per event and the columns ``ID``,
        ``Time Points`` and ``Items``. It holds four sequences
        (``S1`` .. ``S4``, 31 rows in total); a time point that appears on
        several rows of a sequence carries several items.
    """
    # (time point, item) events, grouped by the sequence they belong to.
    events_by_sequence = {
        'S1': [(0, 'a'), (1, 'a'), (1, 'b'), (1, 'c'), (2, 'a'),
               (2, 'c'), (3, 'd'), (4, 'c'), (4, 'f')],
        'S2': [(5, 'a'), (5, 'd'), (6, 'c'), (7, 'b'), (7, 'c'),
               (8, 'a'), (8, 'e')],
        'S3': [(9, 'e'), (9, 'f'), (10, 'a'), (10, 'b'), (11, 'd'),
               (11, 'f'), (12, 'c'), (13, 'b')],
        'S4': [(14, 'e'), (15, 'g'), (16, 'a'), (16, 'f'), (17, 'c'),
               (18, 'b'), (19, 'c')],
    }

    # Flatten into the three parallel columns, preserving insertion order.
    sequence_ids = []
    time_points = []
    items = []
    for sequence_id, events in events_by_sequence.items():
        for time_point, item in events:
            sequence_ids.append(sequence_id)
            time_points.append(time_point)
            items.append(item)

    return pd.DataFrame({
        'ID': sequence_ids,
        'Time Points': time_points,
        'Items': items,
    })

def create_mock_raw_dataframe_nosep() -> pd.DataFrame:
    """Build the single-sequence mock input used by the NOSEP example.

    Returns:
        A frame with the columns ``ID``, ``Time Points`` and ``Items``:
        sixteen rows, all in sequence ``S1``, at consecutive time points
        1..16, each carrying one of the items ``A``, ``C``, ``G`` or ``T``.
    """
    # One character per event, in time order.
    item_string = 'AAGTACGACGCATCTA'
    event_count = len(item_string)

    columns = {
        'ID': ['S1'] * event_count,
        'Time Points': list(range(1, event_count + 1)),
        'Items': list(item_string),
    }
    return pd.DataFrame(columns)

# Relative path to a text file under the sibling tests/test_files directory.
# NOTE(review): not referenced by any cell visible in this section - confirm
# it is still needed by a later cell.
test_file_path = '../tests/test_files/contextPrefixSpan.txt'

In [4]:
# Raw input: one row per (sequence ID, time point, item).
# Sequence IDs and items can have arbitrary labels, and a time point may be
# repeated within a sequence to record simultaneous events.
mock_df_raw = create_mock_raw_dataframe()
mock_df_raw

Unnamed: 0,ID,Time Points,Items
0,S1,0,a
1,S1,1,a
2,S1,1,b
3,S1,1,c
4,S1,2,a
5,S1,2,c
6,S1,3,d
7,S1,4,c
8,S1,4,f
9,S2,5,a


In [5]:
# Raw input for the NOSEP example: a single sequence (ID 'S1') with exactly
# one item per time point.
create_mock_raw_dataframe_nosep()

Unnamed: 0,ID,Time Points,Items
0,S1,1,A
1,S1,2,A
2,S1,3,G
3,S1,4,T
4,S1,5,A
5,S1,6,C
6,S1,7,G
7,S1,8,A
8,S1,9,C
9,S1,10,G


### Sequential Pattern Mining Examples

In [6]:
from spmf import PrefixSpan

# Run PrefixSpan on a freshly built copy of the mock data and display the
# resulting frame.
# NOTE(review): min_support=0.5 looks like a relative threshold (a fraction
# of the 4 input sequences) - the patterns shown below all have support >= 2.
# Confirm against the PrefixSpan wrapper.
prefixspan = PrefixSpan(min_support=0.5)
mock_df = create_mock_raw_dataframe()
output = prefixspan.run_pandas(mock_df)
output

Unnamed: 0,Frequent sequential pattern,Support
0,a,4
1,a b,2
2,a b -> c,2
3,a b -> d,2
4,a b -> d -> c,2
5,a b -> f,2
6,a -> a,2
7,a -> b,4
8,a -> b c,2
9,a -> b c -> a,2


In [7]:
from spmf import SPADE

# Run SPADE on a freshly built copy of the mock data and display the
# resulting frame (min_support=0.5, the same value as the PrefixSpan example).
spade = SPADE(min_support=0.5)
mock_df = create_mock_raw_dataframe()
output = spade.run_pandas(mock_df)
output

Unnamed: 0,Frequent sequential pattern,Support
0,a,4
1,b,4
2,c,4
3,d,3
4,e,3
5,f,3
6,e -> f,2
7,f -> c,2
8,b -> f,2
9,f -> b,2


In [8]:
from spmf import CMSPADE

# Run CMSPADE on a freshly built copy of the mock data and display the
# resulting frame. The rows shown below match the SPADE example's rows.
cmspade = CMSPADE(min_support=0.5)
mock_df = create_mock_raw_dataframe()
output = cmspade.run_pandas(mock_df)
output

Unnamed: 0,Frequent sequential pattern,Support
0,a,4
1,b,4
2,c,4
3,d,3
4,e,3
5,f,3
6,e -> f,2
7,f -> c,2
8,b -> f,2
9,f -> b,2


In [9]:
from spmf import SPAM

# Run SPAM on a freshly built copy of the mock data and display the
# resulting frame (min_support=0.5, as in the previous examples).
spam = SPAM(min_support=0.5)
mock_df = create_mock_raw_dataframe()
output = spam.run_pandas(mock_df)
output

Unnamed: 0,Frequent sequential pattern,Support
0,a,4
1,b,4
2,c,4
3,d,3
4,e,3
5,f,3
6,a -> a,2
7,a -> b,4
8,a -> b -> a,2
9,a -> b -> c,2


In [10]:
from spmf import CMClaSP

# Run CMClaSP on a freshly built copy of the mock data and display the
# resulting frame.
# NOTE(review): the run also prints a count of patterns found "before
# removing NonClosed patterns" (see the output below), so the returned frame
# presumably contains closed patterns only - confirm.
cmclasp = CMClaSP(min_support=0.5)
mock_df = create_mock_raw_dataframe()
output = cmclasp.run_pandas(mock_df)
output

Before removing NonClosed patterns there are 45 patterns


Unnamed: 0,Frequent sequential pattern,Support
0,a -> b c -> a,2
1,a b -> d -> c,2
2,a b -> f,2
3,a -> b -> c,2
4,d -> c -> b,2
5,d -> c,3
6,a -> c -> c,3
7,e -> a -> c -> b,2
8,e -> b -> c,2
9,e -> f -> c -> b,2


In [11]:
# NOTE(review): VMSP is imported from spmf.seq_pat, whereas the other
# examples import from the spmf package root - confirm whether VMSP is also
# exported at the top level and, if so, use the same import style.
from spmf.seq_pat import VMSP

# Run VMSP on a freshly built copy of the mock data and display the
# resulting frame.
vmsp = VMSP(min_support=0.5)
mock_df = create_mock_raw_dataframe()
output = vmsp.run_pandas(mock_df)
output

Unnamed: 0,Frequent sequential pattern,Support
0,f -> b -> c,2
1,e -> b -> c,2
2,a b -> f,2
3,d -> c -> b,2
4,a -> c -> c,3
5,a -> b -> c,2
6,e -> f -> c -> b,2
7,e -> a -> c -> b,2
8,a b -> d -> c,2
9,a -> b c -> a,2


In [12]:
from spmf import VGEN

# Run VGEN on a freshly built copy of the mock data and display the
# resulting frame.
# NOTE(review): the first row of the output below has an empty pattern
# string with support 4 - confirm this is intended and not a parsing artifact.
vgen = VGEN(min_support=0.5)
mock_df = create_mock_raw_dataframe()
output = vgen.run_pandas(mock_df)
output

Unnamed: 0,Frequent sequential pattern,Support
0,,4
1,d,3
2,f,3
3,e,3
4,a -> a,2
5,b -> a,2
6,b c,2
7,c -> a,2
8,a -> d,2
9,a -> f,2


In [13]:
from spmf import NOSEP

# Run NOSEP on the single-sequence mock data and display the resulting frame.
# Pattern length is bounded to 1..20 and the gap parameters to 0..2.
# NOTE(review): min_support=3 is an integer here, unlike the 0.5 used in the
# other examples; the patterns shown below all have support >= 3, so it is
# presumably an absolute occurrence count - confirm against the NOSEP wrapper.
nosep = NOSEP(min_pattern_length=1, max_pattern_length=20, min_gap=0, max_gap=2, min_support=3)
mock_df = create_mock_raw_dataframe_nosep()
output = nosep.run_pandas(mock_df)
output

Unnamed: 0,Frequent sequential pattern,Support
0,A,6
1,C,4
2,G,3
3,T,3
4,A -> A,3
5,A -> C,3
6,A -> G,3
7,C -> A,3
8,C -> C,3
9,G -> A,3


In [14]:
from spmf import TKS

# Run TKS on a freshly built copy of the mock data and display the
# resulting frame.
# NOTE(review): k=5 presumably requests the k highest-support patterns - the
# output below has exactly five rows. Confirm against the TKS wrapper.
tks = TKS(k=5)
mock_df = create_mock_raw_dataframe()
output = tks.run_pandas(mock_df)
output

Unnamed: 0,Frequent sequential pattern,Support
0,c,4
1,a -> b,4
2,b,4
3,a -> c,4
4,a,4
