In [None]:
import pandas as pd
from collections import defaultdict
from itertools import combinations


## Preprocess Data

In [21]:
def preprocess_triplegs(df, timestamp_col='started_at', start_date='1900-01-01', days=30):
    """
    Build one chronologically ordered sequence of tripleg geometries per user.

    Only triplegs whose timestamp lies in the half-open window
    [start_date, start_date + days) are kept. The input DataFrame is left
    untouched; all parsing and filtering happens on a working copy.

    Parameters:
    df (pd.DataFrame): The triplegs DataFrame. Must contain the columns
        'user_id', 'geom' and `timestamp_col`.
    timestamp_col (str): The name of the column containing the timestamp.
    start_date (str or datetime-like): Start of the time window. A value
        without timezone information is interpreted as UTC.
    days (int): Length of the time window in days.

    Returns:
    list: One list of 'geom' values per user (users in ascending user_id
        order, triplegs in ascending timestamp order). Empty if no tripleg
        falls inside the window.
    """
    # Work on a positional copy so the caller's frame is never mutated and
    # duplicate index labels cannot interfere with alignment.
    frame = df[['user_id', 'geom']].reset_index(drop=True)
    # utc=True makes naive and tz-aware inputs comparable with the UTC bounds.
    frame['_timestamp'] = pd.to_datetime(df[timestamp_col], utc=True).reset_index(drop=True)

    window_start = pd.Timestamp(start_date)
    if window_start.tzinfo is None:
        window_start = window_start.tz_localize('UTC')
    else:
        window_start = window_start.tz_convert('UTC')
    window_end = window_start + pd.Timedelta(days=days)

    in_window = (frame['_timestamp'] >= window_start) & (frame['_timestamp'] < window_end)
    # A stable sort keeps the original relative order of rows that share a
    # timestamp while still guaranteeing chronological sequences.
    ordered = frame[in_window].sort_values('_timestamp', kind='stable')

    # Iterating the groupby (instead of .apply(list)) also behaves for an
    # empty selection and simply yields an empty list.
    return [geoms.tolist() for _, geoms in ordered.groupby('user_id')['geom']]

In [None]:
def split_long_triplegs(df, max_length=1000):  # test with different values
    """
    Split long triplegs into shorter sub-triplegs.

    Each 'geom' value is cut into chunks of at most `max_length`
    comma-separated points. When the value has a simple WKT-style wrapper
    such as 'LINESTRING (p1, p2, ...)', every chunk is re-wrapped with the
    same prefix and closing parenthesis so each sub-tripleg is still a
    well-formed string. Values that already fit are returned unchanged.

    Note: chunks are disjoint (no shared end point), so the final chunk of a
    geometry may contain a single point.

    Parameters:
    df (pd.DataFrame): The triplegs DataFrame. Must contain a 'geom' column.
    max_length (int): The maximum number of points per tripleg (>= 1).

    Returns:
    pd.DataFrame: A new DataFrame with one row per sub-tripleg. All other
        columns are repeated for each sub-tripleg and the original index
        labels are kept (and therefore repeated). The input is not modified.

    Raises:
    ValueError: If `max_length` is smaller than 1.
    """
    if max_length < 1:
        raise ValueError(f"max_length must be at least 1, got {max_length}")

    def split_geom(geom):
        # Missing geometries (NaN/None) cannot be split; pass them through.
        if not isinstance(geom, str):
            return [geom]

        # Detect a single-level "PREFIX (points)" wrapper. Nested
        # parentheses (multi-part geometries) are not treated as wrapped.
        open_idx = geom.find('(')
        close_idx = geom.rfind(')')
        wrapped = (
            0 <= open_idx < close_idx
            and geom.count('(') == 1
            and geom.count(')') == 1
        )
        if wrapped:
            head = geom[:open_idx + 1]
            body = geom[open_idx + 1:close_idx]
            tail = geom[close_idx:]
        else:
            head, body, tail = '', geom, ''

        points = body.split(',')
        if len(points) <= max_length:
            # Nothing to split: keep the original text byte-for-byte.
            return [geom]

        chunks = [
            ','.join(points[start:start + max_length])
            for start in range(0, len(points), max_length)
        ]
        if wrapped:
            # strip() removes the separator whitespace left at chunk edges.
            return [head + chunk.strip() + tail for chunk in chunks]
        return chunks

    result = df.copy()
    result['geom'] = result['geom'].map(split_geom)
    # explode() turns each list of sub-triplegs into one row per element.
    return result.explode('geom')

## Apply GSP

In [31]:
def generate_candidates(sequences, length):
    """Collect every distinct contiguous window of `length` items found in `sequences`.

    Each window is returned as a tuple; sequences shorter than `length`
    contribute nothing.
    """
    return {
        tuple(sequence[start:start + length])
        for sequence in sequences
        for start in range(len(sequence) - length + 1)
    }

def count_support(candidates, sequences):
    """Tally, for each candidate, how many sequences contain it as a subsequence.

    Candidates that occur in no sequence are not added to the result.
    """
    tally = defaultdict(int)
    for pattern in candidates:
        hits = sum(1 for sequence in sequences if is_subsequence(pattern, sequence))
        if hits:
            # Only touch the mapping for supported patterns so unsupported
            # ones stay absent, exactly as with per-hit increments.
            tally[pattern] += hits
    return tally

def is_subsequence(candidate, sequence):
    """Return True if the items of `candidate` appear in `sequence` in order (gaps allowed)."""
    remaining = iter(sequence)
    for wanted in candidate:
        # Advance the shared iterator until this item is found; whatever is
        # consumed here is no longer available to later items.
        for seen in remaining:
            if seen is wanted or seen == wanted:
                break
        else:
            # Iterator exhausted without a match.
            return False
    return True

def prune_candidates(support_count, min_support):
    """Keep only the sequences whose support count reaches `min_support`."""
    kept = {}
    for pattern, support in support_count.items():
        if support < min_support:
            continue
        kept[pattern] = support
    return kept

def generate_new_candidates(frequent_sequences, length):
    """Generate new candidate sequences by joining frequent sequences.

    Uses the GSP join for ordered sequences: `left` and `right` are joined
    when `left` without its first item equals `right` without its last item,
    producing `left + (right[-1],)`. Every ordered pair is considered
    (including a sequence with itself), so for frequent items a and b all of
    (a, a), (a, b), (b, a) and (b, b) become candidates. The result does not
    depend on the iteration order of `frequent_sequences`.

    Parameters:
    frequent_sequences (iterable of tuple): Frequent sequences of equal
        length (a dict keyed by sequence works as well).
    length (int): Length of the candidates to build. Kept for interface
        compatibility; the join itself determines the candidate length.

    Returns:
    set: Candidate sequences one item longer than the inputs.
    """
    # Empty tuples have no last item to append, so they cannot be joined.
    patterns = [pattern for pattern in frequent_sequences if pattern]

    # Index by "all but the last item" so each left-hand pattern finds its
    # join partners with one lookup instead of scanning every pair.
    by_prefix = defaultdict(list)
    for pattern in patterns:
        by_prefix[pattern[:-1]].append(pattern)

    new_candidates = set()
    for left in patterns:
        for right in by_prefix.get(left[1:], ()):
            new_candidates.add(left + (right[-1],))
    return new_candidates

def gsp(sequences, min_support):
    """Mine sequential patterns level by level with the GSP scheme.

    Starts from all length-1 candidates, then repeatedly counts support,
    keeps the candidates meeting `min_support`, and joins the survivors into
    candidates one item longer, until no candidates remain. Returns the
    frequent sequences of every level in discovery order.
    """
    mined = []
    level = 1
    candidates = generate_candidates(sequences, level)

    while candidates:
        survivors = prune_candidates(count_support(candidates, sequences), min_support)
        mined += list(survivors)
        level += 1
        candidates = generate_new_candidates(survivors, level)

    return mined

## Save Output to CSV

In [24]:
def save_gsp_results(gsp_results, output_file, separator=' -> '):
    """
    Save the GSP results to a CSV file with a single 'Sequence' column.

    Each frequent sequence is written as one row; its items are converted to
    text and joined with `separator`. Building the column explicitly avoids
    pandas spreading a multi-item tuple over several columns, which would
    fail against the single 'Sequence' header. A one-item sequence is
    written as the item itself.

    Parameters:
    gsp_results (list): A list of frequent sequences (tuples of items).
    output_file (str): The path to the output CSV file.
    separator (str): Text placed between consecutive items of a sequence.
    """
    def as_text(sequence):
        # A bare string is a single item, not a sequence of characters.
        if isinstance(sequence, str):
            return sequence
        return separator.join(str(item) for item in sequence)

    sequences_df = pd.DataFrame({'Sequence': [as_text(sequence) for sequence in gsp_results]})
    sequences_df.to_csv(output_file, index=False)
    print(f"GSP results saved to {output_file}")

In [None]:
# Load the triplegs CSV; nrows caps the read at the first 50,000 rows
df = pd.read_csv('./triplegsD.csv', nrows =50000) # subset of the data for testing

# Split long triplegs into shorter sub-triplegs (default max_length)
df = split_long_triplegs(df)

# Preprocess the data and limit to the first month; yields one list of geoms per user
sequences = preprocess_triplegs(df)

In [44]:
# Display the per-user sequences built by preprocess_triplegs (shows the full list)
sequences

[['LINESTRING (115.0000000000000000 93.0000000000000000, 114.0000000000000000 114.0000000000000000, 119.0000000000000000 120.0000000000000000, 137.0000000000000000 101.0000000000000000, 132.0000000000000000 99.0000000000000000, 133.0000000000000000 103.0000000000000000, 122.0000000000000000 99.0000000000000000)',
  'LINESTRING (121.0000000000000000 102.0000000000000000, 115.0000000000000000 109.0000000000000000, 118.0000000000000000 96.0000000000000000)',
  'LINESTRING (119.0000000000000000 99.0000000000000000, 131.0000000000000000 103.0000000000000000, 144.0000000000000000 98.0000000000000000, 143.0000000000000000 100.0000000000000000, 136.0000000000000000 122.0000000000000000, 123.0000000000000000 131.0000000000000000, 117.0000000000000000 134.0000000000000000, 124.0000000000000000 136.0000000000000000, 141.0000000000000000 105.0000000000000000, 144.0000000000000000 97.0000000000000000, 144.0000000000000000 98.0000000000000000, 164.0000000000000000 96.0000000000000000, 156.0000000000

In [None]:
# Define the minimum support threshold dynamically as 5% of the number of user sequences.
# int() truncates, so fewer than 20 sequences gives a threshold of 0.
min_support = int(0.05 * len(sequences)) # test with different values

# Apply the GSP algorithm to find frequent sequences
frequent_sequences = gsp(sequences, min_support)



In [43]:
# Display the frequent sequences returned by gsp
frequent_sequences

[]

In [None]:
# Save the GSP results to a CSV file (path is relative to the working directory)
output_file = 'gsp_cityD.csv'
save_gsp_results(frequent_sequences, output_file)