In [1]:
import pandas as pd 
import os 
import glob

In [None]:
def extract_snp_and_chromosome(folder_path, sep="\t"):
    """Collect SNP IDs and chromosomes from every ``.clumps`` file in a folder.

    Each file in ``folder_path`` whose name ends with ``.clumps`` is read with
    ``pd.read_csv``. Files that fail to load, or that lack an ``ID`` or
    ``#CHROM`` column, are reported with ``print`` and skipped.

    Parameters
    ----------
    folder_path : str
        Directory to scan (non-recursive).
    sep : str, default "\\t"
        Field separator passed to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``ID`` and ``chromosome`` (renamed from ``#CHROM``), with
        the rows of all usable files stacked under a fresh 0..n-1 index.
        If no file contributes any rows, an empty frame with those two
        columns is returned instead of raising.
    """
    wanted = ["ID", "#CHROM"]
    frames = []

    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of the result reproducible between runs and machines.
    for file in sorted(os.listdir(folder_path)):
        if not file.endswith(".clumps"):
            continue

        file_path = os.path.join(folder_path, file)
        try:
            df = pd.read_csv(file_path, sep=sep)
        except Exception as e:
            # Best-effort load: one unreadable file must not abort the scan.
            print(f"Error loading {file}: {e}")
            continue

        if not all(col in df.columns for col in wanted):
            print(f"Skipping {file}: Required columns not found.")
            continue

        frames.append(df[wanted].rename(columns={"#CHROM": "chromosome"}))

    # pd.concat([]) raises "No objects to concatenate"; hand back an empty,
    # correctly-shaped table so callers can still merge against it.
    if not frames:
        return pd.DataFrame(columns=["ID", "chromosome"])

    return pd.concat(frames, ignore_index=True)

In [None]:
def non_neg_matrix(matrix):
    """Split a signed matrix into side-by-side non-negative halves.

    For every column ``c`` of ``matrix`` the result has a ``c_pos`` column
    (the original values with negatives replaced by 0) and a ``c_neg`` column
    (the sign-flipped values with negatives replaced by 0). All ``_pos``
    columns come first, followed by all ``_neg`` columns, each group in the
    input's column order. The input frame is not modified.

    Parameters
    ----------
    matrix : pandas.DataFrame
        Numeric table to split.

    Returns
    -------
    pandas.DataFrame
        Same index as ``matrix`` and twice as many columns.
    """
    # Positive half: keep values as they are, zero out anything below 0.
    pos_part = matrix.mask(matrix < 0, 0)
    pos_part.columns = [f"{col}_pos" for col in matrix.columns]

    # Negative half: flip the sign first, then zero out what is now below 0
    # (i.e. the entries that were positive in the input).
    flipped = -matrix
    neg_part = flipped.mask(flipped < 0, 0)
    neg_part.columns = [f"{col}_neg" for col in matrix.columns]

    return pd.concat([pos_part, neg_part], axis=1)

In [None]:
# SNP list after pruning: gather the ID / chromosome pairs from every
# .clumps file found in folder_path (tab-separated input).
# NOTE(review): absolute, user-specific path — this cell only runs on a
# machine that has this directory; consider a configurable data directory.
folder_path="/Users/adama/Downloads/Tchandjieu_Lab/adama_snps_all"
pruned_df=extract_snp_and_chromosome(folder_path, sep="\t")

In [5]:
# Show the pruned SNP table (ID and chromosome columns) as the cell output.
pruned_df

Unnamed: 0,ID,chromosome
0,15:85383145:C:G,15
1,15:48885877:G:A,15
2,15:84917314:T:C,15
3,15:99269878:T:A,15
4,15:71612514:T:G,15
...,...,...
591,17:8382184:G:T,17
592,17:41837719:G:A,17
593,17:27962393:T:G,17
594,17:57929535:A:G,17


In [None]:
# Combine the pruned SNP table with the combined GWAS summary-stats file.
# The summary-stats CSV is read in chunks; for each chunk a
# CHROM:GENPOS:ALLELE1:ALLELE0 key is built and inner-merged against the
# 'ID' column of pruned_df, and matching rows are appended to output_file.
pruned_df['ID'] = pruned_df['ID'].astype(str)

# Large file
input_file = "/Users/adama/Downloads/Tchandjieu_Lab/NMF_python/raw_data/all_cardiac_traits_raw.csv"  # or .zip
output_file = "pruned_gwas_summary_stats.csv"

chunksize = 100_000
first_write = True
required_cols = ["CHROM", "GENPOS", "ALLELE1", "ALLELE0"]

# One row per SNP ID: a repeated ID on the right side of an inner merge would
# multiply every matching GWAS row.
pruned_lookup = pruned_df.drop_duplicates(subset="ID")

# Rows are appended chunk by chunk, so a file left over from a previous run
# would otherwise receive a second header plus duplicate rows on re-run.
if os.path.exists(output_file):
    os.remove(output_file)

for chunk in pd.read_csv(input_file, chunksize=chunksize):
    # All chunks share the file's header, so a missing column means no chunk
    # can ever match; fail loudly rather than skipping the whole file.
    missing_cols = [col for col in required_cols if col not in chunk.columns]
    if missing_cols:
        raise ValueError(
            f"{input_file} is missing required columns: {missing_cols}"
        )

    # Convert to string and create full_id_10
    chunk['full_id_10'] = (
        chunk["CHROM"].astype(str) + ":" +
        chunk["GENPOS"].astype(str) + ":" +
        chunk["ALLELE1"].astype(str) + ":" +
        chunk["ALLELE0"].astype(str)
    )

    # Merge with pruned table
    merged = pd.merge(chunk, pruned_lookup, left_on='full_id_10', right_on='ID', how='inner')

    # Save only if there's something to save; the header goes out once.
    if not merged.empty:
        merged.to_csv(output_file, mode='a', index=False, header=first_write)
        first_write = False

if first_write:
    # Nothing matched, so no file exists; do not claim one was saved.
    print("⚠️ No GWAS rows matched the pruned SNP IDs; no output file was written.")
else:
    print("✅ Merging complete. Output saved to:", output_file)

✅ Merging complete. Output saved to: pre_bnmf_merged_output.csv


In [None]:
# Load the merged summary statistics and build the non-negative z-score
# matrix. Per the original note, `keep` holds only traits with correlation
# less than 0.8.
# NOTE(review): 'ventrical_volume_LVCO' is spelled differently from the other
# 'ventricular_volume_*' entries — confirm it matches the trait label in the
# data before changing it.
keep = [
    'AAo_max_area',
    'DAo_max_area',
    'WT_AHA_12',
    'WT_AHA_2',
    'WT_AHA_6',
    'atrial_volume_LAEF',
    'atrial_volume_LASV',
    'atrial_volume_RAEF',
    'atrial_volume_RASV',
    'thickness_WT_Global',
    'ventrical_volume_LVCO',
    'ventricular_volume_LVEDV',
    'ventricular_volume_LVEF',
    'ventricular_volume_LVESV',
    'ventricular_volume_LVM',
    'ventricular_volume_LVSV',
    'ventricular_volume_RVEDV',
    'ventricular_volume_RVEF',
    'ventricular_volume_RVESV',
    'ventricular_volume_RVSV',
    'volume_LAV_max',
    'volume_RAV_max',
]

# Read the table and attach the z-score (effect size divided by its standard
# error) in one step.
# NOTE(review): this absolute path presumably points at the file written by
# the merge cell, which uses a relative name — confirm the working directory.
data = (
    pd.read_csv('/Users/adama/Downloads/Tchandjieu_Lab/NMF_python/pruned_gwas_summary_stats.csv')
    .assign(z_score=lambda frame: frame['BETA'] / frame['SE'])
)

# Reshape to one row per 'ID_y' and one column per trait. pivot_table's
# default aggregation (mean) applies if an ID/trait pair occurs more than once.
# NOTE(review): 'ID_y' looks like a merge-suffixed ID column — verify.
pivot_df = pd.pivot_table(data, values='z_score', index='ID_y', columns='trait')

# Restrict to the cardiac traits listed in `keep`.
new = pivot_df[keep]

# Split into non-negative *_pos / *_neg columns and write the result to disk.
final_use = non_neg_matrix(new)

file_name_1 = 'all_2606__bnmf_matrix.csv'
final_use.to_csv(file_name_1)