### Calculate a module score and compare

In [2]:
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.decomposition import PCA

In [6]:
###########################################################################
# CREATE RNA MODULES "module_scores_model_input_all_self_made"
###########################################################################

def create_RNa_module_scores(read_count_path, module_scores_path, output_path_path):
    """Compute one PCA-based score per gene module and save them in wide format.

    For every module in the module list, the read counts of the module's
    genes are standardized per gene and projected onto their first principal
    component; that projection is the module score of each sample. Sample IDs
    of the form ``<Vaccinee>_EXP<digits>`` are then split so that the output
    has one row per vaccinee and one ``<Module>.<Function>_EXP<digits>``
    column per module and time point.

    Args:
        read_count_path: CSV with samples as rows (sample ID in the first
            column) and genes as columns.
        module_scores_path: CSV with the columns 'Module', 'Gene' and
            'Function'.
        output_path_path: Destination CSV for the wide-format scores.

    Raises:
        ValueError: If no module shares a gene with the read-count table, or
            (raised by ``DataFrame.pivot``) if a vaccinee/time-point pair
            occurs more than once.
    """
    # Load data
    read_counts = pd.read_csv(read_count_path, index_col=0)  # Samples as rows, genes as columns
    module_list = pd.read_csv(module_scores_path)  # Contains columns 'Module', 'Gene', 'Function'

    # Gene names are compared as strings on both sides
    read_counts.columns = read_counts.columns.astype(str)

    module_scores = []

    # One pass over the module list; sort=False keeps the modules in order of
    # first appearance.
    for module, module_rows in module_list.groupby('Module', sort=False):
        # The function label is taken from the module's first row
        module_function = module_rows['Function'].iloc[0]

        # Cast to str to match the read-count columns, and drop repeated
        # genes so that no gene is weighted twice in the PCA.
        module_genes = dict.fromkeys(module_rows['Gene'].dropna().astype(str))
        common_genes = [gene for gene in module_genes if gene in read_counts.columns]

        if not common_genes:
            print(f"No overlapping genes found for module {module}. Skipping.")
            continue

        # Subset read_counts for the genes in this module
        module_data = read_counts[common_genes]

        # Standardize each gene, then keep the first principal component
        standardized_data = StandardScaler().fit_transform(module_data)
        pca_scores = PCA(n_components=1).fit_transform(standardized_data)

        # Store results with module name and function
        module_scores.append(pd.DataFrame(
            pca_scores,
            index=read_counts.index,
            columns=[f"{module}.{module_function}"]
        ))

    if not module_scores:
        # pd.concat would fail with an unhelpful "No objects to concatenate"
        raise ValueError(
            "No module shares any gene with the read-count table; "
            "no module scores could be computed."
        )

    # Combine all module scores into a single DataFrame
    module_scores_df = pd.concat(module_scores, axis=1)

    # ---- pivoting logic -----

    # Extract 'Vaccinee' and 'TimePoint' from the sample IDs in the index
    module_scores_df['TimePoint'] = module_scores_df.index.str.extract(r'_EXP(\d+)$', expand=False)
    module_scores_df['Vaccinee'] = module_scores_df.index.str.replace(r'_EXP\d+$', '', regex=True)

    # Reshape the data from long to wide format (the old index is no longer needed)
    module_scores_df = module_scores_df.reset_index(drop=True)
    wide_format = module_scores_df.pivot(index='Vaccinee', columns='TimePoint')

    # Flatten the (module column, time point) MultiIndex into single names
    wide_format.columns = [
        f"{col[0]}_EXP{col[1]}" for col in wide_format.columns
    ]

    # Reset index to make 'Vaccinee' a column
    wide_format.reset_index(inplace=True)

    # Save the transformed data
    wide_format.to_csv(output_path_path, index=False)
    print(f"Transformed data saved as '{output_path_path}'")

In [3]:
# Compute the module scores for the Measles data set and write them to CSV.
create_RNa_module_scores(read_count_path = "../data/Measles/processed_readcounts_collapsed.csv", 
                         module_scores_path= "../data/Measles/module_list.csv", 
                         output_path_path= "../data/Measles/module_scores_model_input_all_self_made.csv")

# Read the file that was just written and display it as the cell output.
rna_data = pd.read_csv("../data/Measles/module_scores_model_input_all_self_made.csv")
rna_data

Transformed data saved as '../data/Measles/module_scores_model_input_all_self_made.csv'


Unnamed: 0,Vaccinee,M3.1.Cell cycle_EXP0,M3.1.Cell cycle_EXP3,M3.1.Cell cycle_EXP7,M8.1.TBD_EXP0,M8.1.TBD_EXP3,M8.1.TBD_EXP7,M8.2.Prostanoids_EXP0,M8.2.Prostanoids_EXP3,M8.2.Prostanoids_EXP7,...,M16.108.TBD_EXP7,M16.109.Platelet_EXP0,M16.109.Platelet_EXP3,M16.109.Platelet_EXP7,M16.110.TBD_EXP0,M16.110.TBD_EXP3,M16.110.TBD_EXP7,M16.111.TBD_EXP0,M16.111.TBD_EXP3,M16.111.TBD_EXP7
0,M1,-2.522262,0.881673,-5.404821,-0.48421,-3.720734,-1.55702,-3.907561,-3.757232,-3.63005,...,-0.450383,-2.25647,-2.491798,-1.975561,-2.598271,-0.918194,-3.674222,2.039189,1.810428,1.898279
1,M11,-4.182731,-1.459013,-4.429286,-0.059521,-2.638241,-4.346022,1.248508,5.946061,0.007107,...,0.619234,0.846232,3.890965,0.147093,-0.246082,-1.611822,-2.048544,0.949527,1.027765,2.90677
2,M12,1.397387,5.71236,2.434849,-2.213685,-0.776933,0.155515,-2.496329,-2.185208,-4.935804,...,-0.597422,-1.200718,-0.831733,-2.063667,0.446297,2.945176,1.040333,0.677244,-2.051895,-0.895161
3,M13,2.388776,3.126155,7.933149,0.363241,-0.961904,0.017693,1.128747,2.880515,6.966034,...,-1.649004,0.649138,1.45053,3.699781,0.765366,0.47369,1.50031,-0.54871,-0.248264,-0.77681
4,M14,-2.284118,4.20683,4.700574,0.821419,0.139564,-0.273377,-2.012061,-0.164194,-1.457068,...,-0.715638,-1.598598,-0.527154,-0.966508,-1.674525,2.67561,2.292286,0.973271,-0.949314,-1.071366
5,M15,-1.137537,-3.612768,1.371846,-4.069802,0.098081,1.520127,0.878996,-1.24481,4.099065,...,1.62447,2.072852,-0.091485,4.374526,-1.765087,-0.473182,1.143304,1.397377,1.253369,-0.063493
6,M16,-0.902596,3.414276,2.025397,-3.684126,1.907031,0.854278,3.684257,6.938477,3.915958,...,-0.933602,1.634019,3.626563,2.380808,-3.468656,0.170367,0.098171,2.882787,-0.186265,0.023479
7,M17,-3.749819,1.055172,-4.17214,-2.77859,-0.752138,-3.823554,-1.732554,-0.711434,-2.114882,...,3.685237,-1.119589,-0.278757,-0.857087,-2.744048,-0.538845,-3.047758,2.72567,0.391222,1.963925
8,M18,3.508123,2.043994,-5.782424,0.401817,0.329645,-1.245246,-1.88464,-2.127251,-6.273425,...,2.028009,-1.350256,0.571715,-2.75366,2.054978,1.576613,-3.108876,-1.022834,-0.30423,1.250474
9,M19,-3.393572,-0.009168,4.860194,-0.744129,0.160436,-0.955246,-2.673159,1.184812,5.193388,...,-1.13573,-1.522689,0.363375,3.57433,-0.114569,0.445636,2.648063,2.193562,0.425512,-0.271096


In [4]:
def gen_rnaseq_metadata_csv(
    readcounts_path='../data/Hepatitis B/readcounts.csv',
    meta_path='../data/Hepatitis B/Meta.csv',
    output_path='../data/Hepatitis B/rnaseq_metadata.csv',
):
    """Derive a per-sample RNA-seq metadata table from the read-count header.

    Every column name of the read-count file that looks like
    ``<sample>_EXP<day>_<replicate>_<suffix>`` (e.g. ``H10_EXP0_1_S22``)
    becomes one output row with the columns 'run', 'name', 'replicates',
    'sample', 'day', 'Gender' and 'Age'. Column names that do not follow this
    pattern (such as 'genename') are skipped. 'Gender' and 'Age' are looked up
    by sample ID in the patient metadata and left empty when the sample is
    not listed there.

    Args:
        readcounts_path: CSV whose header holds the sample names.
        meta_path: CSV with the columns 'Vaccinee', 'Gender' and 'Age'.
        output_path: Destination CSV for the generated metadata.
    """
    # Only the header is needed, so do not parse the count matrix itself
    sample_names = list(pd.read_csv(readcounts_path, nrows=0).columns)
    print("Sample names:", sample_names)

    # Read the metadata file with patient info (must include columns "Vaccinee", "Gender", "Age")
    meta_ab = pd.read_csv(meta_path)

    # Lookup of Gender and Age keyed by the patient/sample ID (Vaccinee);
    # the first row wins when a vaccinee is listed more than once.
    meta_info = (
        meta_ab.drop_duplicates(subset="Vaccinee")
        .set_index('Vaccinee')[['Gender', 'Age']]
        .to_dict('index')
    )
    unknown_patient = {'Gender': None, 'Age': None}

    rows = []
    for s in sample_names:
        # Expected format: "H10_EXP0_1_S22"
        parts = s.split('_')
        if len(parts) < 4:
            continue  # skip if the format doesn't match the expectation

        sample_id, day_str, replicate = parts[:3]  # e.g. "H10", "EXP0", "1"

        # Remove the "EXP" prefix to extract the day and convert to integer
        try:
            day = int(day_str.replace('EXP', ''))
        except ValueError:
            # Enough fields, but the day is not numeric: not a sample column
            print("Skipping column with unparseable day field:", s)
            continue

        patient = meta_info.get(sample_id, unknown_patient)

        rows.append({
            'run': len(rows),  # running number over the accepted samples
            'name': s,
            'replicates': replicate,
            'sample': sample_id,
            'day': day,
            'Gender': patient['Gender'],
            'Age': patient['Age'],
        })

    # Fix the columns explicitly so the header is written even with no rows
    rnaseq_metadata = pd.DataFrame(
        rows,
        columns=['run', 'name', 'replicates', 'sample', 'day', 'Gender', 'Age'],
    )

    # Write out the new metadata file to CSV
    rnaseq_metadata.to_csv(output_path, index=False)
    print("rnaseq_metadata.csv file written to:", output_path)
            
# Run the generator; it writes '../data/Hepatitis B/rnaseq_metadata.csv'.
gen_rnaseq_metadata_csv()

Sample names: ['genename', 'H10_EXP0_1_S22', 'H10_EXP0_2_S22', 'H10_EXP3_1_S23', 'H10_EXP3_2_S23', 'H10_EXP7_1_S24', 'H10_EXP7_2_S24', 'H11_EXP0_1_S25', 'H11_EXP0_2_S25', 'H11_EXP3_1_S26', 'H11_EXP3_2_S26', 'H11_EXP7_1_S27', 'H11_EXP7_2_S27', 'H13_EXP0_1_S31', 'H13_EXP0_2_S31', 'H13_EXP3_1_S32', 'H13_EXP3_2_S32', 'H13_EXP7_1_S33', 'H13_EXP7_2_S33', 'H14_EXP0_1_S34', 'H14_EXP0_2_S34', 'H14_EXP3_1_S35', 'H14_EXP3_2_S35', 'H14_EXP7_1_S36', 'H14_EXP7_2_S36', 'H17_EXP0_1_S37', 'H17_EXP0_2_S37', 'H17_EXP3_1_S38', 'H17_EXP3_2_S38', 'H17_EXP7_1_S39', 'H17_EXP7_2_S39', 'H18_EXP0_1_S40', 'H18_EXP0_2_S40', 'H18_EXP3_1_S1', 'H18_EXP3_2_S1', 'H18_EXP7_1_S2', 'H18_EXP7_2_S2', 'H19_EXP0_1_S3', 'H19_EXP0_2_S3', 'H19_EXP3_1_S4', 'H19_EXP3_2_S4', 'H19_EXP7_1_S5', 'H19_EXP7_2_S5', 'H20_EXP0_1_S6', 'H20_EXP0_2_S6', 'H20_EXP3_1_S7', 'H20_EXP3_2_S7', 'H20_EXP7_1_S8', 'H20_EXP7_2_S8', 'H21_EXP0_1_S9', 'H21_EXP0_2_S9', 'H21_EXP3_1_S10', 'H21_EXP3_2_S10', 'H21_EXP7_1_S11', 'H21_EXP7_2_S11', 'H22_EXP0_1_S12', '

In [7]:
# Compute the module scores for the Hepatitis B data set and write them to CSV.
create_RNa_module_scores(read_count_path = "../data/Hepatitis B/processed_readcounts_collapsed.csv", 
                         module_scores_path= "../data/Hepatitis B/module_list.csv", 
                         output_path_path= "../data/Hepatitis B/module_scores_model_input_all_self_made.csv")

# Read the written file and display it as the cell output.
# NOTE(review): the recorded output of this cell is a KeyboardInterrupt, so
# this run did not complete; re-run before relying on the file.
rna_data = pd.read_csv("../data/Hepatitis B/module_scores_model_input_all_self_made.csv")
rna_data

KeyboardInterrupt: 