In [92]:
import ast
import json

import pandas as pd
from gsppy.gsp import GSP
from prefixspan import PrefixSpan

In [93]:
# Scenery identifiers processed by the main loop. Each name is turned into an
# input path by read_params ("./sceneries/<name>.json") and is reused as the
# key in `sceneries` and as the output file name passed to write_result.
sceneries_names = [
    "1-first",
    "2-second",
    "3-third",
    "4-fourth",
    "5-fifth",
    "6-sixth",
    "7-seventh",
    "8-eighth",
]

In [94]:
def gsp_mining(sequences_list: list, minsup: float = 0.08) -> list:
    """Mine ``sequences_list`` with GSP.

    Args:
        sequences_list: The sequences handed to the ``GSP`` constructor.
        minsup: Support threshold forwarded unchanged to ``GSP.search``.

    Returns:
        Whatever ``GSP(sequences_list).search(minsup)`` returns.
    """
    miner = GSP(sequences_list)
    return miner.search(minsup)

In [95]:
def prefix_mining(sequences_list: list, minsup: float = 0.08, subsequence = None) -> list:
    """Mine frequent patterns from ``sequences_list`` with PrefixSpan.

    Args:
        sequences_list: The sequences handed to ``PrefixSpan``.
        minsup: Relative threshold; it is multiplied by the number of
            sequences before being passed to ``PrefixSpan.frequent``.
        subsequence: Optional pattern. When truthy, only mined patterns for
            which ``is_subsequence(subsequence, pattern)`` holds are kept.

    Returns:
        * With ``subsequence``: the raw ``frequent`` result (this code indexes
          its entries as ``(support, pattern)`` pairs elsewhere).
        * Without it: a list of dicts grouped by pattern length, where the
          dict at position ``n - 1`` maps the string form of each length-``n``
          pattern to its support.
    """
    miner = PrefixSpan(sequences_list)
    # Turn the relative threshold into an absolute number of sequences.
    absolute_support = len(sequences_list) * minsup

    if subsequence:
        return miner.frequent(
            absolute_support,
            filter=lambda patt, matches: is_subsequence(subsequence, patt),
        )

    frequent_patterns = miner.frequent(absolute_support)

    longest = max((len(pattern) for _, pattern in frequent_patterns), default=0)
    by_length = [{} for _ in range(longest)]

    for support, pattern in frequent_patterns:
        # Patterns are lists, so their string form is used as the dict key.
        by_length[len(pattern) - 1][f'{pattern}'] = support

    return by_length

In [96]:
def format_tf_data(s) -> list:
    """Reduce each record of ``s`` to the list of its event names.

    Args:
        s: Anything ``pd.DataFrame`` accepts as ``data``; every resulting
            record must have an ``"events"`` entry holding items with an
            ``"event"`` key.

    Returns:
        One list per record containing the ``"event"`` value of each item in
        that record's ``"events"``, in the original order.
    """
    records = pd.DataFrame(data=s).to_dict(orient="records")
    return [
        [step["event"] for step in record["events"]]
        for record in records
    ]

In [97]:
def read_params(file: str) -> str:
    """Build the path of a scenery input file.

    Despite the name, nothing is read here: the function only formats
    ``file`` into a ``./sceneries/<file>.json`` path and returns it.
    """
    scenery_path = f"./sceneries/{file}.json"
    return scenery_path

In [98]:
def write_result(data, file: str, path = 'sceneries_mining_results') -> None:
    """Dump ``data`` as JSON to ``./<path>/<file>.json``.

    Args:
        data: Object passed to ``json.dump``.
        file: File name without the ``.json`` extension.
        path: Directory, relative to the current working directory, that
            receives the file.
    """
    target = f"./{path}/{file}.json"
    with open(target, "w+") as handle:
        json.dump(data, handle)

In [99]:
def remap_keys(user_sequences: dict, seq_quantities) -> dict:
    """Turn a ``{pattern_string: total}`` mapping into a list of records.

    The keys are expected to be the string form of a list of events (for
    example ``"['a', 'b']"``). They are parsed back with ``ast.literal_eval``
    so that event names containing commas, quotes, brackets or surrounding
    whitespace survive the round trip unchanged, and non-string items keep
    their type.

    Args:
        user_sequences: Mapping from the string form of a pattern to its
            total.
        seq_quantities: Value interpolated into the single result key
            ``"<seq_quantities>_sequences"``.

    Returns:
        ``{"<seq_quantities>_sequences": [{"sequence": [...], "total": v}, ...]}``
        with one record per input key, in the mapping's iteration order.
    """
    def process_key(key):
        try:
            parsed = ast.literal_eval(key)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            parsed = None
        if isinstance(parsed, list):
            return parsed
        # Fallback for keys that are not a list literal: strip brackets and
        # single quotes, then split on commas (the legacy behaviour).
        modified_key = key.replace('[', '').replace(']', '').replace("'", "")
        return [part.strip() for part in modified_key.split(',')]

    return {f"{seq_quantities}_sequences": [
        {"sequence": process_key(k), "total": v} for k, v in user_sequences.items()
    ]}


In [100]:
def is_subsequence(subseq, sequence):
    """Tell whether ``subseq`` appears in ``sequence`` in order.

    The items of ``subseq`` must occur in ``sequence`` in the same relative
    order, but not necessarily next to each other. Items are compared with
    ``==``. An empty ``subseq`` always matches.

    Returns:
        ``True`` when every item of ``subseq`` was matched, else ``False``.
    """
    needed = len(subseq)
    if needed > len(sequence):
        return False

    matched = 0
    for item in sequence:
        if matched == needed:
            break
        if subseq[matched] == item:
            matched += 1

    return matched == needed

In [101]:
def is_sublist_with_gap(sub, main, gap=2):
    """Search ``main`` for ``sub`` in order, tolerating a bounded gap.

    Every position of ``main`` that leaves room for ``sub`` is tried as a
    start. From a start, items of ``sub`` are matched in order; a
    non-matching item ends the attempt once its offset from the start reaches
    ``matched_so_far * (gap + 1)``. Two consequences of that rule:

    * the first item of ``sub`` has to sit exactly at the start position;
    * the allowance is cumulative over the matches made so far, not a limit
      on each individual gap.

    Args:
        sub: Items to look for; an empty ``sub`` always matches.
        main: Sequence that is scanned.
        gap: Allowance used in the stop rule above.

    Returns:
        ``True`` as soon as one start position matches all of ``sub``,
        otherwise ``False``.
    """
    if not sub:
        return True

    needed = len(sub)
    total = len(main)
    allowance_per_match = gap + 1

    for start in range(total - needed + 1):
        matched = 0
        position = start
        while position < total:
            if main[position] == sub[matched]:
                matched += 1
                if matched == needed:
                    return True
            elif position - start >= matched * allowance_per_match:
                break
            position += 1

    return False

In [102]:
def extract_events(event_list):
    """Return the ``'event'`` value of every item in ``event_list``, in order."""
    return [entry['event'] for entry in event_list]

In [103]:
def get_supports(target, data, minsup: float = 0.08):
    """Summarise the frequent patterns of ``data`` that contain ``target``.

    Runs ``prefix_mining(data, minsup, target)``, which keeps only the mined
    patterns that have ``target`` as a subsequence, and aggregates the result.

    Earlier versions ignored both arguments and read notebook-level variables
    instead; the function now depends only on what it is given.

    Args:
        target: Pattern that the mined patterns must contain.
        data: Sequences to mine.
        minsup: Relative support threshold forwarded to ``prefix_mining``.
            Defaults to the same value as ``prefix_mining``; pass it
            explicitly when mining with a different threshold.

    Returns:
        A ``(f, i)`` tuple: ``f`` is the number of matching patterns and
        ``i`` is the sum of their supports (the first item of each entry).
    """
    matching = prefix_mining(data, minsup, target)
    f_support = len(matching)
    i_support = sum(entry[0] for entry in matching)
    return f_support, i_support

In [104]:
def get_sequence_ids(target, data):
    """Collect the keys of the rows whose events contain ``target``.

    Args:
        target: Pattern tested with ``is_subsequence`` against each row.
        data: DataFrame with an ``'events'`` and a ``'key'`` column.

    Returns:
        The ``'key'`` value of every row for which
        ``is_subsequence(target, row['events'])`` is true, in row order.
    """
    return [
        row['key']
        for _, row in data.iterrows()
        if is_subsequence(target, row['events'])
    ]

In [105]:
def get_sequence_grade(ids: list[int], data) -> float:
    """Average the grade of the rows whose key is listed in ``ids``.

    Args:
        ids: Keys of the rows to include.
        data: DataFrame with a ``'key'`` and a ``'grade'`` column.

    Returns:
        The sum of the matching rows' grades divided by ``len(ids)``, rounded
        to two decimals. ``0.0`` when ``ids`` is empty, where the previous
        implementation raised ``ZeroDivisionError``.
    """
    if len(ids) == 0:
        return 0.0

    wanted = set(ids)  # constant-time membership instead of scanning the list
    total = 0.0
    # Walk the two columns directly; rows are visited in the same order as
    # before, so the floating-point sum is accumulated identically.
    for key, grade in zip(data['key'], data['grade']):
        if key in wanted:
            total += grade

    # The divisor is the number of requested ids, not the number of rows hit.
    return round(total / len(ids), 2)

In [106]:
# Results per scenery: the processing loop stores each scenery's
# final_result dict here under the scenery name.
sceneries = {}

In [107]:
# For every scenery: load its sequences, mine the frequent patterns, enrich
# each pattern with supports, matching ids and average grade, then store the
# result in `sceneries` and write it to disk.
for scenery in sceneries_names:
    print(scenery)
    minsup = 0.08
    file_name = read_params(scenery)
    all_sequences = pd.read_json(file_name)

    # The first row's max_grade is copied onto every pattern entry below.
    max_grade = int(all_sequences['max_grade'].iloc[0])

    # format_tf_data reads the raw event dicts, so it has to run before the
    # 'events' column is overwritten with plain event names.
    formatted = format_tf_data(all_sequences)
    all_sequences['events'] = all_sequences['events'].apply(extract_events)

    mining_result_prefix = prefix_mining(formatted, minsup)

    total_sequences = sum(len(sequences) for sequences in mining_result_prefix)

    # One {"<n>_sequences": [...]} dict per pattern length n.
    result = [
        remap_keys(patterns, size)
        for size, patterns in enumerate(mining_result_prefix, start=1)
    ]

    # Per pattern length: every pattern with its I-support, F-support and
    # grade data, plus the most and least repeated patterns (by 'total').
    final_result = {f'{i}_sequences': {'sequences': [], 'most_repeated': {}, 'least_repeated': {}} for i in range(1, len(result) + 1)}
    for freq in result:
        for key, sequences in freq.items():
            most_repeated = {'total': 0}
            least_repeated = {'total': 9999999}
            for seq in sequences:
                sequence_list = seq["sequence"]
                # get_supports returns the F value first and the I value
                # second; unpacking them the other way round stored each
                # value under the other one's key.
                f_support, i_support = get_supports(sequence_list, formatted)
                ids = get_sequence_ids(sequence_list, all_sequences)
                grade = get_sequence_grade(ids, all_sequences)
                sequence = {
                    'sequence_size': len(sequence_list),
                    'sequence': sequence_list,
                    'total': seq['total'],
                    'i_support': i_support,
                    'f_support': f_support,
                    's_support': len(ids),
                    'ids': ids,
                    'avg_grade': grade,
                    'max_grade': max_grade,
                }
                final_result[key]['sequences'].append(sequence)
                # Strict comparisons keep the first pattern seen on ties.
                if sequence['total'] > most_repeated['total']:
                    most_repeated = sequence
                if sequence['total'] < least_repeated['total']:
                    least_repeated = sequence
            final_result[key]['most_repeated'] = most_repeated
            final_result[key]['least_repeated'] = least_repeated

    sceneries[scenery] = final_result
    write_result(final_result, scenery, 'sceneries_results')

1-first
2-second
3-third
4-fourth
5-fifth
6-sixth
7-seventh
8-eighth


In [108]:
# print(json.dumps(final_result['2_sequences'], indent=2, default=lambda o: str(o)))