In [1]:
import pandas as pd

In [2]:
# Read the workbook into a DataFrame so it can be inspected in the notebook.
source = pd.read_excel('balanced_all_data_1_bar.xlsx')

In [None]:
# Show the loaded DataFrame as this cell's output.
source

In [5]:
def create_aggregated_database(input_file: str, output_file: str, pattern_quantity: int, inclusive: bool = False) -> None:
    """
    Create a new database by aggregating sequences of MIDI patterns from the same file.
    Supports both non-overlapping and overlapping (inclusive) combinations.

    Rows are grouped by ``file`` and ordered by ``sequence``; every window of
    ``pattern_quantity`` consecutive rows is flattened into one output row with
    the columns ``file``, ``sequence`` (the window's sequence values joined by
    ``-``), ``class`` and ``feature_0`` .. ``feature_{16 * pattern_quantity - 1}``.
    Rows at the end of a file that do not fill a complete window are dropped,
    and files with fewer than ``pattern_quantity`` rows contribute nothing.

    Args:
        input_file (str): Path to the input Excel file. It must contain the
            columns ``file``, ``sequence``, ``class`` and ``step_0`` .. ``step_15``.
        output_file (str): Path to save the output Excel file.
        pattern_quantity (int): Number of patterns to combine into a single row.
            Must be at least 1.
        inclusive (bool): Whether to use overlapping combinations (window
            advances by one row instead of by ``pattern_quantity``). Default is False.

    Returns:
        None

    Raises:
        ValueError: If ``pattern_quantity`` is smaller than 1, if a required
            column is missing, or if a window mixes rows of different classes.
    """
    steps_per_pattern = 16  # each input row carries step_0 .. step_15

    # Reject bad window sizes up front, before any file I/O: 0 would otherwise
    # surface as an obscure range()/IndexError, and negative values would
    # silently produce an empty output file.
    if pattern_quantity < 1:
        raise ValueError(f"pattern_quantity must be a positive integer, got {pattern_quantity!r}")

    # Read the input file
    data = pd.read_excel(input_file)

    # Validate that required columns exist (ordered lists keep the message deterministic)
    required_columns = ["file", "sequence", "class"]
    step_columns = [f"step_{i}" for i in range(steps_per_pattern)]
    all_columns = required_columns + step_columns
    missing_columns = [name for name in all_columns if name not in data.columns]
    if missing_columns:
        raise ValueError(
            f"Input dataset must contain the columns: {all_columns} "
            f"(missing: {missing_columns})"
        )

    # Output feature names are the same for every window, so build them once
    feature_columns = [f"feature_{n}" for n in range(pattern_quantity * steps_per_pattern)]

    # Determine the step size for the sliding window
    step_size = 1 if inclusive else pattern_quantity

    # Initialize a list to store new rows
    new_rows = []

    # Group rows by the `file` column to ensure blending happens only within the same file
    for file, group in data.groupby("file"):
        # Sort by sequence to ensure correct order
        group = group.sort_values("sequence")
        num_rows = len(group)

        # Iterate over the group with a sliding window
        for start in range(0, num_rows - pattern_quantity + 1, step_size):
            chunk = group.iloc[start:start + pattern_quantity]

            # Validate that all rows in the chunk belong to the same class
            # (NaN counts as a distinct value, as with len(unique()))
            if chunk["class"].nunique(dropna=False) > 1:
                raise ValueError(f"Found multiple classes in the same file ({file}).")

            aggregated_row = {
                "file": file,
                "sequence": "-".join(map(str, chunk["sequence"])),
                "class": chunk["class"].iloc[0],
            }

            # Concatenate the step values of each row in the chunk, row by row.
            # Plain tuples (name=None) over just the step columns avoid relying on
            # namedtuple attribute names, which pandas rewrites for invalid
            # identifiers such as "class".
            step_rows = chunk[step_columns].itertuples(index=False, name=None)
            flat_values = [value for step_row in step_rows for value in step_row]
            aggregated_row.update(zip(feature_columns, flat_values))

            # Add the new row to the list
            new_rows.append(aggregated_row)

    # Create a new DataFrame and save it to an Excel file. Passing `columns`
    # keeps the header row even when no window could be formed.
    new_data = pd.DataFrame(new_rows, columns=required_columns + feature_columns)
    new_data.to_excel(output_file, index=False)
    print(f"New database saved to {output_file}")

# # Example usage:
# # Non-overlapping (default)
# create_aggregated_database("input_dataset.xlsx", "output_dataset_non_overlapping.xlsx", pattern_quantity=2)

# # Overlapping (inclusive=True)
# create_aggregated_database("input_dataset.xlsx", "output_dataset_inclusive.xlsx", pattern_quantity=2, inclusive=True)


In [8]:
# Alternative invocation on a different input file with 2-pattern windows (kept disabled):
# create_aggregated_database("fwod_representations_clean.xlsx", "data_pattern_2.xlsx", pattern_quantity=2)
# Build overlapping (inclusive) 4-pattern windows from the balanced dataset and write them to a new workbook.
create_aggregated_database("balanced_all_data_1_bar.xlsx", "data_pattern_4_inclusive_balanced.xlsx", pattern_quantity=4, inclusive=True)

New database saved to data_pattern_4_inclusive_balanced.xlsx
