In [4]:
import numpy as np
import pandas as pd
import glob
import os
from scipy.stats import pearsonr

In [6]:
# Spreadsheet that maps each sample ID to its cancer type
id_to_type_path = r"/mnt/c/Users/donna/Downloads/Thesis/TSSClassification/brca_correlaties/id_to_type.xlsx"

# Read 'Sheet1' using its first row as the header; dtype='object' keeps every
# cell as a plain Python object instead of letting pandas infer numeric types.
sample_data = pd.read_excel(
    id_to_type_path,
    sheet_name='Sheet1',
    header=0,
    dtype='object',
)

# Column headers may carry stray leading/trailing whitespace, so normalise them
sample_data.columns = sample_data.columns.str.strip()

In [7]:
# Filter the sample data so it only contains IDs of BRCA and COAD samples.
# Column 0 holds the sample ID and column 1 the cancer type. Whitespace is
# stripped BEFORE filtering: filtering on the raw values (and stripping only
# afterwards) silently drops rows whose label is padded, e.g. 'BRCA '.
sample_ids = sample_data.iloc[:, 0].str.strip()
cancer_types = sample_data.iloc[:, 1].str.strip()
is_brca_or_coad = cancer_types.isin(['BRCA', 'COAD'])
filtered_data = sample_data[is_brca_or_coad]

# Create dictionary with sample id as keys and cancer types as values
# (if an ID occurs more than once, the last occurrence wins)
sample_to_type = dict(zip(sample_ids[is_brca_or_coad], cancer_types[is_brca_or_coad]))

# Making different containers for the sample ids for each cancer type
brca_samples = [sample for sample, cancer_type in sample_to_type.items() if cancer_type == 'BRCA']
coad_samples = [sample for sample, cancer_type in sample_to_type.items() if cancer_type == 'COAD']

print(brca_samples)

['TCGA-3C-AALJ', 'TCGA-4H-AAAK', 'TCGA-A2-A0CX', 'TCGA-A2-A0ES', 'TCGA-A2-A0ET', 'TCGA-A2-A0EV', 'TCGA-A2-A0EW', 'TCGA-A2-A0EX', 'TCGA-A2-A0EY', 'TCGA-A2-A0SV', 'TCGA-A2-A0SW', 'TCGA-A2-A0SX', 'TCGA-A2-A0T4', 'TCGA-A2-A0T5', 'TCGA-A2-A0T6', 'TCGA-A2-A0T7', 'TCGA-A2-A0YC', 'TCGA-A2-A0YD', 'TCGA-A2-A0YF', 'TCGA-A2-A0YG', 'TCGA-A2-A0YH', 'TCGA-A2-A0YJ', 'TCGA-A2-A0YK', 'TCGA-A2-A0YL', 'TCGA-A2-A0YT', 'TCGA-A2-A4RX', 'TCGA-A7-A0CH', 'TCGA-A7-A0D9', 'TCGA-A7-A13E', 'TCGA-A7-A13F', 'TCGA-A8-A06N', 'TCGA-A8-A08J', 'TCGA-A8-A094', 'TCGA-AO-A03L', 'TCGA-AO-A03N', 'TCGA-AO-A0J5', 'TCGA-AO-A0J8', 'TCGA-AO-A0JB', 'TCGA-AO-A0JG', 'TCGA-AO-A0JM', 'TCGA-AO-A124', 'TCGA-AO-A12D', 'TCGA-AO-A12F', 'TCGA-AQ-A04L', 'TCGA-AR-A0TP', 'TCGA-AR-A0TV', 'TCGA-AR-A0U0', 'TCGA-AR-A0U4', 'TCGA-BH-A0B1', 'TCGA-BH-A0B5', 'TCGA-BH-A0BA', 'TCGA-BH-A0BZ', 'TCGA-BH-A0C1', 'TCGA-BH-A0DL', 'TCGA-BH-A0DP', 'TCGA-BH-A0DV', 'TCGA-BH-A0E0', 'TCGA-BH-A0EE', 'TCGA-BH-A0HP', 'TCGA-BH-A1EV', 'TCGA-C8-A12K', 'TCGA-C8-A12M', 'TCGA-C

In [None]:
# Helper that truncates a sample ID to its leading three dash-separated fields
def get_first_three_parts(sample_id):
    """Return the first three '-'-delimited fields of ``sample_id``, re-joined with '-'.

    An ID with fewer than three fields is returned unchanged.
    """
    # Splitting at most three times is enough: anything after the third
    # dash lands in a fourth element that is discarded by the slice.
    leading_fields = sample_id.split('-', 3)[:3]
    return '-'.join(leading_fields)

In [None]:
# Path to the large data matrix (file named TCGA_ATAC_peak_values, no extension).
# NOTE(review): this points into ".../Thesis/brca_correlaties", whereas the
# id_to_type spreadsheet is loaded from ".../Thesis/TSSClassification/brca_correlaties"
# -- confirm that both locations are intended.
file_path = r"/mnt/c/Users/donna/Downloads/Thesis/brca_correlaties/TCGA_ATAC_peak_values"

In [None]:
# Number of peak rows to load for the subset matrices. pandas counts `nrows`
# in data rows (the header line is not included), so the previous nrows=999
# produced 999 peaks rather than the 1000 promised by the variable/file names.
N_PEAK_ROWS = 1000

# Directory that receives the extracted CSV files
output_dir = r"/mnt/c/Users/donna/Downloads/Thesis/brca_correlaties"

# Load the first line of the large matrix to get the full list of sample IDs (header only)
with open(file_path, 'r') as f:
    header_line = f.readline().strip()

# Extract sample IDs from the header (the first column is 'sample', so we skip it)
all_sample_ids = header_line.split('\t')[1:]

# Sets give constant-time membership tests; scanning the sample lists for every
# header column is needlessly quadratic.
brca_sample_set = set(brca_samples)
coad_sample_set = set(coad_samples)

# Keep the header columns whose first three ID parts match a BRCA / COAD sample ID
brca_columns = [sample_id for sample_id in all_sample_ids if get_first_three_parts(sample_id) in brca_sample_set]
coad_columns = [sample_id for sample_id in all_sample_ids if get_first_three_parts(sample_id) in coad_sample_set]

# Fail early with a clear message instead of an IndexError further down
if not brca_columns or not coad_columns:
    raise ValueError(
        f"No matching columns found in {file_path!r}: "
        f"{len(brca_columns)} BRCA and {len(coad_columns)} COAD columns matched"
    )

# Load only the necessary columns for each cancer type, limited to the first N_PEAK_ROWS peaks
brca_1000_peak_values = pd.read_csv(file_path, sep='\t', usecols=brca_columns, nrows=N_PEAK_ROWS, dtype='object')
print(brca_1000_peak_values.head(10))
coad_1000_peak_values = pd.read_csv(file_path, sep='\t', usecols=coad_columns, nrows=N_PEAK_ROWS, dtype='object')

# Load the full peak map from 1 sample of each type. Both columns are read in a
# single pass so the large matrix is scanned once instead of twice.
first_brca_column = brca_columns[0]
first_coad_column = coad_columns[0]
full_single = pd.read_csv(file_path, sep='\t', usecols=[first_brca_column, first_coad_column], dtype='object')
brca_full_single = full_single[[first_brca_column]]
coad_full_single = full_single[[first_coad_column]]

# Save the extracted matrices to CSV files (without the row index)
frames_to_save = {
    "brca_1000_peak_values.csv": brca_1000_peak_values,
    "coad_1000_peak_values.csv": coad_1000_peak_values,
    "brca_full_single.csv": brca_full_single,
    "coad_full_single.csv": coad_full_single,
}
for csv_name, frame in frames_to_save.items():
    frame.to_csv(os.path.join(output_dir, csv_name), index=False)