In [1]:
import os

os.chdir("..")

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [3]:
data_path = "data/catalogues_Ovary_SBS.tsv"
cosmic_path = "data/COSMIC_v3.4_SBS_GRCh37.txt"
output_folder = "data/processed"
output_filename = "Ordered_Ovary_SBS.csv"
ordered_data_path = os.path.join(output_folder, output_filename)

In [4]:
data = pd.read_csv(ordered_data_path, index_col = 0)

In [6]:
print(data.head())

         GEL-2343183-11  GEL-2036180-11  GEL-2000678-11  GEL-2722354-11  \
Type                                                                      
A[C>A]A             332              78             130              82   
A[C>A]C             228              58             124              70   
A[C>A]G              27              13              15              15   
A[C>A]T             301              46             100              56   
A[C>G]A             272              19              95              35   

         GEL-2265651-11  GEL-2010990-11  GEL-2684573-11  GEL-2347321-11  \
Type                                                                      
A[C>A]A              95             139             158             154   
A[C>A]C              89              84             118             106   
A[C>A]G               8              23              25              20   
A[C>A]T              89              91             118              94   
A[C>G]A              58 

In [7]:
from functions.data_handling import data_augmentation

In [8]:
data_aug = data_augmentation(data, augmentation= 2)

In [9]:
data_aug

Unnamed: 0,GEL-2343183-11_aug_0,GEL-2036180-11_aug_0,GEL-2000678-11_aug_0,GEL-2722354-11_aug_0,GEL-2265651-11_aug_0,GEL-2010990-11_aug_0,GEL-2684573-11_aug_0,GEL-2347321-11_aug_0,GEL-2636707-11_aug_0,GEL-2587944-11_aug_0,...,GEL-2158012-11_aug_1,GEL-2788568-11_aug_1,GEL-2360348-11_aug_1,GEL-2955659-11_aug_1,GEL-2486623-11_aug_1,GEL-2797572-11_aug_1,GEL-2250658-11_aug_1,GEL-2960800-11_aug_1,GEL-2415629-11_aug_1,GEL-2596443-11_aug_1
0,345,65,114,77,87,132,159,132,229,148,...,540,156,107,290,92,104,296,98,268,277
1,229,56,124,70,90,87,129,101,219,74,...,431,79,55,198,81,89,197,42,196,174
2,28,12,15,13,5,29,20,28,33,24,...,62,31,11,29,14,13,75,46,26,44
3,342,55,102,55,81,76,123,95,184,77,...,598,107,51,224,58,73,234,49,247,246
4,252,11,84,44,60,53,100,65,99,47,...,269,113,30,170,50,52,219,29,240,178
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,6199,36,107,69,72,87,100,227,132,72,...,313,76,54,202,80,55,120,223,169,200
92,59,11,46,9,22,32,49,45,43,15,...,105,29,12,57,17,25,59,5,73,47
93,69,18,32,7,24,14,22,58,59,23,...,64,14,15,30,15,18,68,5,53,79
94,398,23,62,14,23,32,56,49,51,30,...,204,30,29,109,31,39,84,26,61,82


In [16]:
import numpy as np
import pandas as pd

def data_augmentation2(X: pd.DataFrame, augmentation: int = 5) -> pd.DataFrame:
    '''
    Performs data augmentation by bootstrapping each tumour (column) `augmentation` times using a multinomial 
    distribution M(N, p), where:
    - N is the total mutation count for the tumour.
    - p is the relative frequency of each of the 96 mutational classes.

    Parameters:
    X (pd.DataFrame): Input count data (96 mutational signatures as rows, patients as columns).
    augmentation (int): Number of bootstrap samples to generate per tumour.

    Returns:
    pd.DataFrame: A new DataFrame containing only the augmented data (96 rows × (patients * augmentation) columns).
    '''

    augmented_columns = []

    for i in range(augmentation):
        X_bootstrapped = X.copy()  # Copy structure

        for col in X.columns:  # Iterate over patients
            N = np.sum(X[col])  # Total number of mutations for this patient
            if N == 0:
                # If no mutations, return zero vector instead of multinomial sampling
                X_bootstrapped[col] = np.zeros_like(X[col])
            else:
                p = X[col] / np.sum(X[col])  # Properly normalized probabilities
                X_bootstrapped[col] = np.random.multinomial(N, p)

        # Rename columns to indicate augmentation round
        X_bootstrapped.columns = [str(col) + '_aug_' + str(i) for col in X.columns]
        augmented_columns.append(X_bootstrapped)

    # Concatenate all augmented versions **horizontally**
    X_augmented = pd.concat(augmented_columns, axis=1)

    return X_augmented


In [19]:
data_aug2 = data_augmentation2(data, augmentation= 1)

In [22]:
data_aug2.head()

Unnamed: 0_level_0,GEL-2343183-11_aug_0,GEL-2036180-11_aug_0,GEL-2000678-11_aug_0,GEL-2722354-11_aug_0,GEL-2265651-11_aug_0,GEL-2010990-11_aug_0,GEL-2684573-11_aug_0,GEL-2347321-11_aug_0,GEL-2636707-11_aug_0,GEL-2587944-11_aug_0,...,GEL-2158012-11_aug_0,GEL-2788568-11_aug_0,GEL-2360348-11_aug_0,GEL-2955659-11_aug_0,GEL-2486623-11_aug_0,GEL-2797572-11_aug_0,GEL-2250658-11_aug_0,GEL-2960800-11_aug_0,GEL-2415629-11_aug_0,GEL-2596443-11_aug_0
Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A[C>A]A,338,69,130,88,108,118,164,171,212,161,...,620,180,84,315,102,116,276,86,244,300
A[C>A]C,231,48,104,90,93,83,117,105,219,80,...,451,95,60,252,77,82,193,40,219,212
A[C>A]G,32,9,20,10,9,23,25,21,34,26,...,77,29,4,46,21,12,76,50,30,42
A[C>A]T,297,51,89,54,83,100,121,87,220,73,...,587,105,78,206,59,72,222,40,280,228
A[C>G]A,298,15,87,42,45,48,101,74,124,68,...,254,109,36,184,41,46,214,36,240,196


In [24]:
data.head()

Unnamed: 0_level_0,GEL-2343183-11,GEL-2036180-11,GEL-2000678-11,GEL-2722354-11,GEL-2265651-11,GEL-2010990-11,GEL-2684573-11,GEL-2347321-11,GEL-2636707-11,GEL-2587944-11,...,GEL-2158012-11,GEL-2788568-11,GEL-2360348-11,GEL-2955659-11,GEL-2486623-11,GEL-2797572-11,GEL-2250658-11,GEL-2960800-11,GEL-2415629-11,GEL-2596443-11
Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A[C>A]A,332,78,130,82,95,139,158,154,229,154,...,578,166,98,311,99,106,298,91,265,280
A[C>A]C,228,58,124,70,89,84,118,106,242,73,...,456,78,63,220,74,84,203,37,222,201
A[C>A]G,27,13,15,15,8,23,25,20,35,20,...,65,32,6,37,25,12,73,49,28,44
A[C>A]T,301,46,100,56,89,91,118,94,193,84,...,587,111,57,212,62,72,238,47,253,243
A[C>G]A,272,19,95,35,58,54,105,71,116,39,...,269,99,34,180,56,47,213,30,223,184
