# In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
from matplotlib.backends.backend_pdf import PdfPages

# In [2]:
def preprocess_data(filename, index_col=0):
    """Read an expression table from CSV and keep only its abundance columns.

    The file is read with ``pd.read_csv``. If at least one column name starts
    with ``abundance``, only those columns are kept and a leading
    ``abundance.`` prefix is stripped from their names. Otherwise the table
    is returned as read.

    Args:
        filename: Path or file-like object accepted by ``pd.read_csv``.
        index_col: Column used as the row index (default: the first column).

    Returns:
        The DataFrame, column-filtered and renamed when abundance columns
        are present.
    """
    df = pd.read_csv(filename, index_col=index_col)
    if df.columns.str.startswith('abundance').any():
        df = df.filter(regex='^abundance')
        # Anchor and escape the prefix and pass regex=True explicitly: the
        # default of `regex` differs between pandas versions, an unescaped
        # '.' matches any character, and an unanchored pattern would also
        # remove 'abundance.' from the middle of a sample name.
        df.columns = df.columns.str.replace(r'^abundance\.', '', regex=True)
    return df

def rename_columns(df, rename_dict):
    """Return a new DataFrame with columns relabelled through ``rename_dict``.

    Each column label is looked up by the text before its first ``.``;
    labels without a matching key are kept unchanged.
    """
    def _relabel(label):
        # Only the part before the first '.' is used as the lookup key.
        base = label.partition('.')[0]
        if base in rename_dict:
            return rename_dict[base]
        return label

    return df.rename(columns=_relabel)

def filter_genes(df, genes):
    """Keep the rows whose index label matches any of the gene patterns.

    The entries of ``genes`` are joined with ``|`` and used as a single
    regular expression, so a label matches when any pattern occurs anywhere
    in it.

    Args:
        df: DataFrame with string index labels.
        genes: Iterable of pattern strings.

    Returns:
        The subset of ``df`` whose index labels match.
    """
    gene_pattern = '|'.join(genes)
    # na=False treats missing index labels as non-matching; without it the
    # mask contains NaN and boolean indexing raises.
    row_mask = df.index.str.contains(gene_pattern, na=False)
    return df[row_mask]

def read_files_for_ids(directory, file_id, suffix_1, suffix_2):
    """Load the two data files belonging to ``file_id``.

    The files are expected at ``<directory>/<file_id>_<suffix_1>`` (read via
    ``preprocess_data``) and ``<directory>/<file_id>_<suffix_2>`` (read as a
    tab-separated table).

    Args:
        directory: Directory containing the files.
        file_id: Dataset identifier; a ``mouse_`` prefix marks mouse data.
        suffix_1: File name suffix of the CSV read by ``preprocess_data``.
        suffix_2: File name suffix of the tab-separated table.

    Returns:
        A 3-tuple ``(df1, df2, organism)`` where ``organism`` is ``'mouse'``
        or ``'human'``. If either file is missing, a message is printed and
        ``(None, None, None)`` is returned.
    """
    file_path_1 = os.path.join(directory, f"{file_id}_{suffix_1}")
    file_path_2 = os.path.join(directory, f"{file_id}_{suffix_2}")

    if not (os.path.isfile(file_path_1) and os.path.isfile(file_path_2)):
        print(f"Files not found for ID {file_id}")
        # Same arity as the success path, so callers that unpack three
        # values do not fail when a file is missing.
        return None, None, None

    df1 = preprocess_data(file_path_1)
    df2 = pd.read_csv(file_path_2, sep="\t")
    organism = 'mouse' if file_id.startswith('mouse_') else 'human'
    return df1, df2, organism

def create_plots(df, file_id, rename_dict, output_dir):
    """Write one boxplot page per ``group_*`` column into a single PDF.

    For every column of ``df`` whose name starts with ``group_``, the gene
    columns named by the values of ``rename_dict`` are melted to long form
    and drawn as a boxplot coloured by that group column. All pages go to
    ``<output_dir>/<file_id>_<gene names>_plots.pdf``.

    Args:
        df: DataFrame holding the ``group_*`` columns and the gene columns.
        file_id: Dataset identifier, used in the file name and plot titles.
        rename_dict: Mapping whose values are the gene column names to plot.
        output_dir: Directory for the PDF; created if it does not exist.

    Raises:
        KeyError: If a gene column named in ``rename_dict`` is not in ``df``
            (only when at least one ``group_*`` column exists).
    """
    # Plot the genes the caller asked for instead of a hard-coded name;
    # dict.fromkeys removes repeated names while keeping their order.
    gene_columns = list(dict.fromkeys(rename_dict.values()))
    gene_names = '_'.join(rename_dict.values())

    if output_dir:
        # PdfPages does not create missing directories.
        os.makedirs(output_dir, exist_ok=True)
    pdf_path = os.path.join(output_dir, f"{file_id}_{gene_names}_plots.pdf")

    group_columns = [col for col in df.columns if col.startswith('group_')]

    with PdfPages(pdf_path) as pdf:
        for group_col in group_columns:
            df_sub = df[[group_col, *gene_columns]]
            df_melted = pd.melt(
                df_sub,
                id_vars=[group_col],
                var_name='Gene',
                value_name='Expression',
            )
            # Use an explicit figure/axes pair so saving and closing never
            # depend on which figure pyplot considers current.
            fig, ax = plt.subplots(figsize=(10, 6))
            sns.boxplot(x='Gene', y='Expression', hue=group_col,
                        data=df_melted, ax=ax)
            ax.set_title(f"{file_id} - {group_col}")
            ax.set_ylim(bottom=0)
            pdf.savefig(fig)
            plt.close(fig)


# In [3]:
def main():
    """Generate the RUBCN boxplot PDF for every configured dataset.

    For each dataset ID, the expression table and the run table are loaded,
    the expression table is reduced to the organism's RUBCN gene, transposed
    so samples become rows, joined to the run table on its ``Run`` column,
    and passed to ``create_plots``. Datasets whose files are missing are
    skipped.
    """
    directory = '/hdd1/projects/bulk_expression/processed'
    output_dir = '/hdd1/projects/bulk_expression/output_plots/'
    # NOTE(review): 'mouse_GSE156052' and 'mouse_SRP336874' are each listed
    # twice; confirm whether other dataset IDs were intended.
    ids = [
        'GSE123661', 'GSE130970', 'GSE147304', 'GSE202069', 'SRP165898',
        'SRP174668', 'SRP186450', 'SRP217231', 'SRP318203',
        'mouse_GSE156052', 'mouse_GSE156052', 'mouse_SRP271293',
        'mouse_SRP336874', 'mouse_SRP336874',
    ]
    suffix_1 = 'salmon.csv'
    suffix_2 = 'SraRunTable.txt'

    human_genes_to_filter = ['ENSG00000145016']
    human_rename_dict = {'ENSG00000145016': 'RUBCN'}
    mouse_genes_to_filter = ['ENSMUSG00000035629']
    mouse_rename_dict = {'ENSMUSG00000035629': 'RUBCN'}

    # dict.fromkeys drops repeated IDs (keeping order) so no dataset is
    # read and plotted twice into the same output file.
    for file_id in dict.fromkeys(ids):
        loaded = read_files_for_ids(directory, file_id, suffix_1, suffix_2)
        # Only the first three values are used; slicing keeps the unpacking
        # valid even if the loader returns extra values.
        df1, df2, organism = loaded[:3]

        if df1 is None or df2 is None:
            continue

        if organism == 'human':
            genes_to_filter = human_genes_to_filter
            rename_dict = human_rename_dict
        else:
            genes_to_filter = mouse_genes_to_filter
            rename_dict = mouse_rename_dict

        df1 = filter_genes(df1, genes_to_filter)
        # Transpose so that samples are rows and genes are columns.
        df1_t = rename_columns(df1.T, rename_dict)

        df_w_group = df2.merge(df1_t, right_index=True, left_on='Run', how='inner')
        create_plots(df_w_group, file_id, rename_dict, output_dir)

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()