# Merging ATAC-Seq raw counts from Gabriel, Corces cancer and hepatocyte samples

## Import required libraries

In [None]:
import pandas as pd
import re
import glob
import os

# Load the raw count matrix

In [None]:
# Read the tab-separated raw count matrix, then move the row index into an
# ordinary column called "region".
# NOTE(review): presumably the file stores the region identifiers as row
# labels (header one field shorter than the data rows), so that pandas picks
# them up as the index even with index_col=None -- confirm against the file.
raw_df = (
    pd.read_csv(
        "/mnt/DATA3/daniel/project/01_ATAC_preprocessing/data/gabriel/markers_identification_input_files/raw_counts.txt",
        sep='\t',
        header=0,
        index_col=None,
    )
    .reset_index()
    .rename(columns={"index": "region"})
)

In [None]:
# Show the freshly loaded matrix, including the new "region" column
print(raw_df)

# Prepare chrom, start, end for merging

In [None]:
# Split "region" strings of the form chr<name>:<start>-<end> into their three
# parts. Rows that do not match the pattern get NaN in all three columns.
region_pattern = r'^(chr[^:]+):(\d+)-(\d+)$'
region_parts = raw_df['region'].str.extract(region_pattern)

# The extracted frame has one positional column (0, 1, 2) per capture group.
for group_position, coord_name in enumerate(('chrom', 'start', 'end')):
    raw_df[coord_name] = region_parts[group_position]

In [None]:
# The extracted coordinates are strings; cast both to integers in one call
raw_df = raw_df.astype({'start': int, 'end': int})

In [None]:
# Coordinates first, then every remaining column in its existing order.
# The original "region" string is dropped because it is now redundant.
leading_columns = ['chrom', 'start', 'end']
skipped_columns = set(leading_columns) | {'region'}
desired_order = leading_columns + [
    name for name in raw_df.columns if name not in skipped_columns
]
raw_df = raw_df[desired_order]

In [None]:
# Read the tab-separated sample metadata table
meta_df = pd.read_csv(
    '/mnt/DATA3/daniel/project/01_ATAC_preprocessing/data/gabriel/markers_identification_input_files/metadata.txt',
    sep='\t',
)

In [None]:
# Collapse T-cell subgroups into their main group; any other value in
# "groups" is left untouched.
subgroup_to_main_group = {
    **dict.fromkeys(['Naive_CD8_Tcells', 'Non_Naive_CD8_Tcells'], 'CD8_Tcells'),
    **dict.fromkeys(
        ['Naive_CD4_Tcells', 'Non_Naive_CD4_Tcells', 'Tregs'], 'CD4_Tcells'
    ),
}
meta_df['groups'] = meta_df['groups'].replace(subgroup_to_main_group)

In [None]:
# Build "<group>_<sample>" labels, store them in the metadata table and keep
# a sample -> label lookup for renaming the count columns.
group_labels = meta_df['groups'] + '_' + meta_df['sample']
meta_df['group_with_sample'] = group_labels
sample_to_group_with_sample = dict(zip(meta_df['sample'], group_labels))

In [None]:
# Only columns that appear as a sample in the metadata are renamed; all
# other columns (e.g. chrom/start/end) keep their names.
mapped_columns = {
    name: sample_to_group_with_sample[name]
    for name in raw_df.columns
    if name in sample_to_group_with_sample
}

raw_df = raw_df.rename(columns=mapped_columns)

In [None]:
# Check the renamed columns on the first few rows
print(raw_df.head())

In [None]:
# Summary statistics of the numeric columns
print(raw_df.describe())

# Load the raw hepatocyte counts

In [None]:
# Directory that is searched for the hepatocyte read count CSV files
# (file names are expected to start with "hepatocytes_")
hepatocyte_dir = "/mnt/DATA3/daniel/project/01_ATAC_preprocessing/data/subset_trimmed_data_reference_hepa/"

In [None]:
# Get all hepatocyte read count CSV files.
# glob returns matches in arbitrary order, so sort them to keep the sample
# column order of the merged matrix reproducible between runs.
hepatocyte_files = sorted(glob.glob(os.path.join(hepatocyte_dir, "hepatocytes_*.csv")))

In [None]:
# One DataFrame per hepatocyte sample, in the order of hepatocyte_files
hepatocyte_data = []

# Compiled once outside the loop: the SRR ID embedded in the file name
srr_pattern = re.compile(r'(SRR\d+)')

for file in hepatocyte_files:
    # Search the file name only, so an "SRR..." fragment in a parent
    # directory can never be mistaken for the sample ID.
    match = srr_pattern.search(os.path.basename(file))
    if match is None:
        # Report skipped files instead of dropping them silently
        print(f"Skipping {file}: no SRR ID found in file name")
        continue

    srr_id = match.group(1)
    sample_name = f"hepatocytes_{srr_id}"

    # Load the file
    df = pd.read_csv(file)

    # The last column is renamed to the sample name so that each sample
    # ends up with its own uniquely named column after merging.
    # NOTE(review): presumably the last column holds the read counts --
    # confirm against the CSV layout.
    df = df.rename(columns={df.columns[-1]: sample_name})

    hepatocyte_data.append(df)

## Merge files based on chrom, start and end

In [None]:
# Merge all hepatocyte DataFrames on their genomic coordinates.
# A column-wise concat would align rows by position only, which silently
# pairs counts with the wrong region whenever two files differ in row order
# or row set; an outer merge on the key columns aligns by coordinate.
key_columns = ['chrom', 'start', 'end']

if not hepatocyte_data:
    raise ValueError("No hepatocyte count files were loaded; nothing to merge")

first_sample_df = hepatocyte_data[0]
merged_hepatocyte_df = first_sample_df.loc[:, ~first_sample_df.columns.duplicated()]

for sample_df in hepatocyte_data[1:]:
    sample_df = sample_df.loc[:, ~sample_df.columns.duplicated()]
    # Bring in only columns that are not present yet, so a column shared by
    # several files is kept once (first occurrence wins) instead of getting
    # _x/_y suffixes.
    new_columns = [
        name for name in sample_df.columns
        if name not in merged_hepatocyte_df.columns
    ]
    merged_hepatocyte_df = pd.merge(
        merged_hepatocyte_df,
        sample_df[key_columns + new_columns],
        on=key_columns,
        how='outer',
    )

# Print summary statistics
print(merged_hepatocyte_df.describe())

In [None]:
# Save the dataframe to a CSV file
# merged_hepatocyte_df.to_csv("hepatocyte_data.csv", index=False)

## Merge hepatocytes with the raw count matrix

In [None]:
# Outer-join the hepatocyte columns onto the raw count matrix by genomic
# coordinate, so regions present in only one of the two tables are kept.
coordinate_columns = ['chrom', 'start', 'end']
combined_df = raw_df.merge(
    merged_hepatocyte_df,
    on=coordinate_columns,
    how='outer',
)

# Coordinates first, all sample columns after them in their current order
final_columns = coordinate_columns + [
    name for name in combined_df.columns if name not in coordinate_columns
]
combined_df = combined_df[final_columns]

In [None]:
# Display the raw count matrix after the hepatocyte columns were merged in
print(combined_df)

## Load Corces cancer samples

In [None]:
# Parent directory; each cancer type has its own "<TYPE>_per_sample_scores"
# sub-folder below it
parent_dir = "/mnt/DATA3/daniel/project/01_ATAC_preprocessing/data/corces/"

In [None]:
# Sub-folders to process, one per cancer type, in processing order
cancer_folders = [
    f"{cancer_type}_per_sample_scores"
    for cancer_type in ("COAD", "BRCA", "LUAD", "LUSC")
]

In [None]:
# Load a single COAD sample file on its own; it is not used by the merging
# steps visible in this notebook (presumably kept for inspecting the layout)
cancer_sample = pd.read_csv("/mnt/DATA3/daniel/project/01_ATAC_preprocessing/data/corces/COAD_per_sample_scores/COAD_0B139EBC_D372_4EC3_90DB_4CC9BC6F38DC_X006_S05_merged_weighted_scores.csv")

In [None]:
# print(cancer_sample)

In [None]:
# Collects one DataFrame per cancer sample file
# (columns: chrom, start, end, <sample name>)
all_cancer_data = []

In [None]:
# Load every per-sample CSV from each cancer folder
for folder in cancer_folders:
    folder_path = os.path.join(parent_dir, folder)
    # glob returns matches in arbitrary order; sorting keeps the sample
    # column order of the merged matrix reproducible between runs.
    files = sorted(glob.glob(os.path.join(folder_path, "*.csv")))

    for file in files:
        # Sample name = file name without the fixed suffix
        sample_name = os.path.basename(file).replace("_merged_weighted_scores.csv", "")

        # Load data
        df = pd.read_csv(file)

        # Rename the coordinate columns to match the hepatocyte samples
        df = df.rename(columns={"Chromosome": "chrom", "Start": "start", "End": "end"})

        # Keep the coordinates plus the last column, which is renamed to
        # the sample name.
        # NOTE(review): presumably the last column holds the per-sample
        # score -- confirm against the CSV layout.
        score_column = df.columns[-1]
        df = df[['chrom', 'start', 'end', score_column]]
        df = df.rename(columns={score_column: sample_name})

        # Append to list
        all_cancer_data.append(df)

## Merge Corces cancer samples with raw count matrix and hepatocytes

In [None]:
# Start from the first sample and outer-join every further sample onto it by
# genomic coordinate, so regions missing from some samples are still kept.
combined_cancer_df = all_cancer_data[0]
for df in all_cancer_data[1:]:
    combined_cancer_df = combined_cancer_df.merge(
        df,
        on=['chrom', 'start', 'end'],
        how='outer',
    )

In [None]:
# print(combined_cancer_df)

In [None]:
# Summary statistics of the merged cancer sample columns
print(combined_cancer_df.describe())

In [None]:
# Save the dataframe to a CSV file
# combined_cancer_df.to_csv("combined_cancer_df.csv", index=False)

In [None]:
# Outer-join the cancer samples onto the matrix that already holds the raw
# counts and hepatocyte columns, again keyed by genomic coordinate.
coordinate_columns = ['chrom', 'start', 'end']
all_combined_df = combined_df.merge(
    combined_cancer_df,
    on=coordinate_columns,
    how='outer',
)

# Coordinates first, all sample columns after them in their current order
final_columns = coordinate_columns + [
    name for name in all_combined_df.columns if name not in coordinate_columns
]
all_combined_df = all_combined_df[final_columns]

In [None]:
# Display the fully merged matrix (raw counts, hepatocytes, cancer samples)
print(all_combined_df)

# Remove the monocytes from the combined dataset

In [None]:
# Drop every column whose name contains "Monocytes" (case-sensitive)
monocyte_columns = [
    name for name in all_combined_df.columns if "Monocytes" in name
]
filtered_df = all_combined_df.drop(columns=monocyte_columns)

# Report the resulting dimensions and a preview of the remaining columns
print(filtered_df.shape)
print(filtered_df.head())

# Save merged raw count matrix

In [None]:
# Write the monocyte-free merged matrix to CSV; index=False keeps the
# DataFrame's row index out of the file
filtered_df.to_csv("/mnt/DATA3/daniel/project/01_ATAC_preprocessing/data/raw_counts_matrix.csv", index=False)