In [2]:
import pandas as pd

In [4]:
# Load the workbook for inspection; the same file name is passed to
# create_aggregated_database in the last cell of this notebook.
source = pd.read_excel('fwod_representations_clean.xlsx')

In [5]:
# Display the loaded frame; for a table this long the notebook renders only
# the first and last rows (one row per file/sequence pair, 16 step_* columns).
source

Unnamed: 0,file,sequence,class,step_0,step_1,step_2,step_3,step_4,step_5,step_6,step_7,step_8,step_9,step_10,step_11,step_12,step_13,step_14,step_15
0,magenta midi/pop/105_pop_132_beat_4-4.mid,0,pop,0.000000,0.000000,0.000000,0.053571,0.000000,0.000000,0.000000,1.000000,0.178571,0.071429,0.000000,0.428571,0.000000,0.058036,0.000000,0.000000
1,magenta midi/pop/105_pop_132_beat_4-4.mid,1,pop,0.897638,0.188976,0.000000,0.000000,1.000000,0.133858,0.000000,1.000000,0.000000,0.212598,0.000000,1.000000,0.000000,0.259843,0.000000,1.000000
2,magenta midi/pop/105_pop_132_beat_4-4.mid,2,pop,0.000000,0.377953,0.000000,1.000000,0.000000,0.000000,0.000000,0.157480,0.892388,0.049869,0.000000,0.889764,0.000000,0.068241,0.000000,0.774278
3,magenta midi/pop/105_pop_132_beat_4-4.mid,3,pop,0.000000,0.480315,0.000000,1.000000,0.000000,0.393701,0.000000,0.333333,0.000000,0.482940,0.000000,1.000000,0.000000,0.041995,0.000000,0.703412
4,magenta midi/pop/105_pop_132_beat_4-4.mid,4,pop,0.094488,0.433071,0.000000,1.000000,0.000000,0.000000,0.377953,0.299213,0.000000,0.419948,0.000000,0.666667,0.314961,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19769,magenta midi/jazz/funk/4_jazz-funk_116_beat_4-...,32,jazz,0.584830,0.668663,0.287425,0.059880,0.702595,1.000000,0.147705,0.622754,0.119760,0.604790,0.000000,0.481038,0.327345,0.510978,0.215569,0.433134
19770,magenta midi/jazz/funk/4_jazz-funk_116_beat_4-...,33,jazz,0.083650,0.692015,0.692015,0.904943,0.266160,0.714829,0.737643,0.486692,0.463878,0.798479,0.536122,0.882129,0.585551,0.832700,1.000000,0.524715
19771,magenta midi/jazz/funk/4_jazz-funk_116_beat_4-...,34,jazz,0.258427,0.561798,0.387640,0.730337,0.000000,1.000000,0.140449,0.500000,0.148876,0.629213,0.311798,0.626404,0.154494,0.603933,0.337079,0.587079
19772,magenta midi/jazz/funk/4_jazz-funk_116_beat_4-...,35,jazz,0.323129,0.778912,0.448980,0.894558,0.187075,0.772109,0.217687,0.948980,0.000000,1.000000,0.418367,0.833333,0.183673,0.721088,0.163265,0.323129


In [8]:
def create_aggregated_database(input_file: str, output_file: str, pattern_quantity: int) -> None:
    """
    Create a new database by aggregating sequences of MIDI patterns from the same file,
    discarding leftover rows that cannot form a complete group.

    Rows are grouped by `file`, ordered by `sequence`, and cut into consecutive,
    non-overlapping chunks of `pattern_quantity` rows. Each chunk becomes one output
    row whose `feature_*` columns are the chunk's `step_0..step_15` values laid end to
    end (first pattern -> feature_0..15, second pattern -> feature_16..31, ...).

    Note: a later cell in this notebook defines a function with the same name that
    adds an `inclusive` option; once that cell runs it replaces this definition.

    Args:
        input_file (str): Path to the input Excel file.
        output_file (str): Path to save the output Excel file.
        pattern_quantity (int): Number of patterns to combine into a single row.
            Must be at least 1.

    Returns:
        None

    Raises:
        ValueError: If `pattern_quantity` is smaller than 1, if a required column is
            missing from the input, or if one chunk mixes several classes.
    """
    # Fail fast, before touching any file: 0 used to surface as ZeroDivisionError and
    # negative values silently produced an empty workbook.
    if pattern_quantity < 1:
        raise ValueError(f"pattern_quantity must be a positive integer, got {pattern_quantity!r}")

    steps_per_pattern = 16  # Each input row carries step_0 .. step_15
    step_names = [f"step_{k}" for k in range(steps_per_pattern)]
    feature_names = [f"feature_{n}" for n in range(pattern_quantity * steps_per_pattern)]
    output_columns = ["file", "sequence", "class", *feature_names]

    # Read the input file
    data = pd.read_excel(input_file)

    # Validate that required columns exist (ordered list -> deterministic message)
    expected = ["file", "sequence", "class", *step_names]
    missing = [name for name in expected if name not in data.columns]
    if missing:
        raise ValueError(
            f"Input dataset must contain the columns: {expected} (missing: {missing})"
        )

    new_rows = []

    # Group rows by the `file` column to ensure blending happens only within the same file
    for file, group in data.groupby("file"):
        # Stable sort keeps the original row order for duplicated sequence numbers
        group = group.sort_values("sequence", kind="mergesort")

        # Only whole batches are processed; trailing rows that do not fill a batch are dropped
        full_batches = (len(group) // pattern_quantity) * pattern_quantity

        for start in range(0, full_batches, pattern_quantity):
            chunk = group.iloc[start:start + pattern_quantity]

            # All rows merged into one output row must share the same class
            if chunk["class"].nunique(dropna=False) > 1:
                raise ValueError(f"Found multiple classes in the same file ({file}).")

            aggregated_row = {
                "file": file,
                "sequence": "-".join(map(str, chunk["sequence"])),
                "class": chunk["class"].iloc[0],
            }

            # Plain tuples restricted to the step columns: no reliance on namedtuple
            # field naming, and the values come out in step order row after row.
            flat_steps = [
                value
                for pattern in chunk[step_names].itertuples(index=False, name=None)
                for value in pattern
            ]
            aggregated_row.update(zip(feature_names, flat_steps))

            new_rows.append(aggregated_row)

    # Passing the columns explicitly keeps the schema even when no chunk was produced
    new_data = pd.DataFrame(new_rows, columns=output_columns)
    new_data.to_excel(output_file, index=False)
    print(f"New database saved to {output_file}")


In [10]:
def create_aggregated_database(input_file: str, output_file: str, pattern_quantity: int, inclusive: bool = False) -> None:
    """
    Create a new database by aggregating sequences of MIDI patterns from the same file.
    Supports both non-overlapping and overlapping (inclusive) combinations.

    Rows are grouped by `file` and ordered by `sequence`. A window of
    `pattern_quantity` rows is moved over each group: by `pattern_quantity` rows at a
    time when `inclusive` is False (disjoint chunks), or by one row at a time when
    `inclusive` is True (overlapping chunks). Windows that would run past the end of
    the group are not emitted. Each window becomes one output row whose `feature_*`
    columns are the window's `step_0..step_15` values laid end to end
    (first pattern -> feature_0..15, second pattern -> feature_16..31, ...).

    Args:
        input_file (str): Path to the input Excel file.
        output_file (str): Path to save the output Excel file.
        pattern_quantity (int): Number of patterns to combine into a single row.
            Must be at least 1.
        inclusive (bool): Whether to use overlapping combinations. Default is False.

    Returns:
        None

    Raises:
        ValueError: If `pattern_quantity` is smaller than 1, if a required column is
            missing from the input, or if one window mixes several classes.
    """
    # Fail fast, before touching any file: 0 used to surface as a cryptic range() /
    # IndexError failure and negative values sliced meaningless windows.
    if pattern_quantity < 1:
        raise ValueError(f"pattern_quantity must be a positive integer, got {pattern_quantity!r}")

    steps_per_pattern = 16  # Each input row carries step_0 .. step_15
    step_names = [f"step_{k}" for k in range(steps_per_pattern)]
    feature_names = [f"feature_{n}" for n in range(pattern_quantity * steps_per_pattern)]
    output_columns = ["file", "sequence", "class", *feature_names]

    # Read the input file
    data = pd.read_excel(input_file)

    # Validate that required columns exist (ordered list -> deterministic message)
    expected = ["file", "sequence", "class", *step_names]
    missing = [name for name in expected if name not in data.columns]
    if missing:
        raise ValueError(
            f"Input dataset must contain the columns: {expected} (missing: {missing})"
        )

    # Overlapping windows advance one row at a time, disjoint ones a whole window
    stride = 1 if inclusive else pattern_quantity

    new_rows = []

    # Group rows by the `file` column to ensure blending happens only within the same file
    for file, group in data.groupby("file"):
        # Stable sort keeps the original row order for duplicated sequence numbers
        group = group.sort_values("sequence", kind="mergesort")

        # The upper bound guarantees every window holds exactly `pattern_quantity` rows
        last_start = len(group) - pattern_quantity
        for start in range(0, last_start + 1, stride):
            window = group.iloc[start:start + pattern_quantity]

            # All rows merged into one output row must share the same class
            if window["class"].nunique(dropna=False) > 1:
                raise ValueError(f"Found multiple classes in the same file ({file}).")

            aggregated_row = {
                "file": file,
                "sequence": "-".join(map(str, window["sequence"])),
                "class": window["class"].iloc[0],
            }

            # Plain tuples restricted to the step columns: no reliance on namedtuple
            # field naming, and the values come out in step order row after row.
            flat_steps = [
                value
                for pattern in window[step_names].itertuples(index=False, name=None)
                for value in pattern
            ]
            aggregated_row.update(zip(feature_names, flat_steps))

            new_rows.append(aggregated_row)

    # Passing the columns explicitly keeps the schema even when no window was produced
    new_data = pd.DataFrame(new_rows, columns=output_columns)
    new_data.to_excel(output_file, index=False)
    print(f"New database saved to {output_file}")

# # Example usage:
# # Non-overlapping (default)
# create_aggregated_database("input_dataset.xlsx", "output_dataset_non_overlapping.xlsx", pattern_quantity=2)

# # Overlapping (inclusive=True)
# create_aggregated_database("input_dataset.xlsx", "output_dataset_inclusive.xlsx", pattern_quantity=2, inclusive=True)


In [None]:
# Build the overlapping 5-pattern dataset: with inclusive=True the window advances one
# pattern at a time, and each output row concatenates 5 patterns x 16 steps = 80 features.
# The commented-out call below is the non-overlapping 2-pattern variant.
# create_aggregated_database("fwod_representations_clean.xlsx", "data_pattern_2.xlsx", pattern_quantity=2)
create_aggregated_database("fwod_representations_clean.xlsx", "data_pattern_5_inclusive.xlsx", pattern_quantity=5, inclusive=True)

New database saved to data_pattern_5_inclusive.xlsx
