In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
from matplotlib.backends.backend_pdf import PdfPages

In [6]:

def preprocess_data(filename, index_col=0):
    """Read and preprocess the data from CSV file."""
    df = pd.read_csv(filename, index_col=index_col)
    if any(df.columns.str.startswith('abundance')):
        df = df.filter(regex='^abundance')
        df.columns = df.columns.str.replace('abundance.', '')
    return df

def rename_columns(df, rename_dict):
    """Rename DataFrame columns based on the rename dictionary."""
    return df.rename(columns=lambda col: rename_dict.get(col.split('.')[0], col))

def filter_genes(df, genes):
    """Filter DataFrame rows based on gene patterns."""
    gene_pattern = '|'.join(genes)
    return df[df.index.str.contains(gene_pattern)]

def read_files_for_ids(directory, file_id, suffix_1, suffix_2):
    """Read and preprocess data files based on the file ID."""
    file_path_1 = os.path.join(directory, f"{file_id}_{suffix_1}")
    file_path_2 = os.path.join(directory, f"{file_id}_{suffix_2}")
    
    if os.path.isfile(file_path_1) and os.path.isfile(file_path_2):
        df1 = preprocess_data(file_path_1)
        df2 = pd.read_csv(file_path_2, sep="\t")
        
        if file_id.startswith('mouse_'):
            return df1, df2, 'mouse'
        else:
            return df1, df2, 'human'
    else:
        print(f"Files not found for ID {file_id}")
        return None, None, None

def create_plots(df, file_id, rename_dict, output_dir):
    """Create and save boxplot PDFs for each group column in the DataFrame."""
    gene_names = '_'.join(rename_dict.values())
    pdf_path = os.path.join(output_dir, f"{file_id}_{gene_names}_plots.pdf")
    
    with PdfPages(pdf_path) as pdf:
        group_columns = [col for col in df.columns if col.startswith('group_')]
        
        for group_col in group_columns:
            for gene in rename_dict.values():
                if gene in df.columns:
                    df_sub = df[[group_col, gene]]
                    df_melted = pd.melt(df_sub, id_vars=[group_col], var_name='Gene', value_name='Expression')
                    
                    plt.figure(figsize=(12, 6))  # Increased figure size for better space
                    sns.boxplot(x='Gene', y='Expression', hue=group_col, data=df_melted)
                    
                    # Add dots for each sample without duplicating the legend
                    sns.stripplot(x='Gene', y='Expression', hue=group_col, data=df_melted, 
                                  dodge=True, marker='o', alpha=0.6, color='black', legend=False)
                                  
                    plt.title(f"{file_id} - {group_col} - {gene}")
                    plt.ylim(0)
                    
                    # Move legend outside and change orientation to horizontal
                    plt.legend(loc='upper left', bbox_to_anchor=(1, 1), title=group_col, ncol=2, borderpad=1)
                    
                    # Adjust layout to ensure everything fits
                    plt.tight_layout()
                    
                    pdf.savefig()
                    plt.close()
                    
def deseq_output(directory, ids, human_genes, human_rename, mouse_genes, mouse_rename, output_dir):
    for file_id in ids:
        collected_data = []

        # Determine the gene names for the filename
        gene_ids = human_genes if human_genes else mouse_genes
        gene_names = [human_rename.get(g, g) if "ENSG" in g else mouse_rename.get(g, g) for g in gene_ids]

        # Format gene names in filename
        if len(gene_names) > 2:
            gene_name_str = f"{gene_names[0]}_{gene_names[1]}_more"
        else:
            gene_name_str = "_".join(gene_names)

        output_file = os.path.join(output_dir, f"{file_id}_{gene_name_str}_deseq.csv")

        pattern = f"{file_id}_samples."

        for file in os.listdir(directory):
            if file.startswith(pattern):
                file_path = os.path.join(directory, file)
                df = pd.read_csv(file_path)  # Adjust delimiter if needed

                # Determine organism and relevant genes
                genes_to_keep = human_genes if 'ENSG' in df['V1'].iloc[0] else mouse_genes

                # Filter rows based on gene names
                df['V1_clean'] = df['V1'].str.split('.').str[0]  # Remove version suffix
                df_filtered = df[df['V1_clean'].isin(genes_to_keep)].copy()
                if df_filtered.empty:
                    continue

                # Extract comparison name and remove .csv
                comparison = file.split("samples.")[-1].rsplit('.', 1)[0]
                
                # Add metadata columns
                df_filtered["Comparison"] = comparison
                df_filtered["ID"] = file_id
                
                # Collect results
                collected_data.append(df_filtered)

        # Save final output
        if collected_data:
            final_df = pd.concat(collected_data, ignore_index=True)
            final_df.to_csv(output_file, index=False)
            print(f"Saved filtered DESeq2 output to {output_file}")
        else:
            print(f"No matching data found for {file_id}")

In [7]:
def main():
    directory = '/hdd1/projects/bulk_expression/processed'
    output_dir = '/hdd1/projects/bulk_expression/output_plots/'
    #ids = [
    #    'GSE123661', 'GSE130970', 'GSE147304', 'GSE202069', 'SRP165898',
    #    'SRP174668', 'SRP186450', 'SRP217231', 'SRP318203',
    #    'mouse_GSE156052', 'mouse_SRP271293', 
    #    'mouse_SRP336874', 'mouse_PRJNA868929', 'mouse_PRJNA714630', 'mouse_PRJNA940436', 'mouse_PRJNA1019376', 'mouse_GSE229189', 'mouse_GSE218026','mouse_GSE226496','mouse_GSE211105','mouse_GSE243681','mouse_SRP410543','mouse_GSE168937','GSE246421', 'GSE183754','GSE189849'
    #]
    ids = ['mouse_PRJNA1019376','mouse_PRJNA940436','GSE123661']
    
    suffix_1 = 'salmon.csv'
    suffix_2 = 'SraRunTable.txt'
        
    human_genes_to_filter = ['ENSG00000140107', 'ENSG00000213886'] # Add genes here
    human_rename_dict = {'ENSG00000140107': 'SLC25A47', 'ENSG00000213886':'UBD'} # Add rename mappings here
    mouse_genes_to_filter = ['ENSMUSG00000048856','ENSMUSG00000035186'] # Add genes here
    mouse_rename_dict = {'ENSMUSG00000048856': 'SLC25A47', 'ENSMUSG00000035186':'UBD'} # Add rename mappings here

    for file_id in ids:
        df1, df2, organism = read_files_for_ids(directory, file_id, suffix_1, suffix_2)
            
        if df1 is not None and df2 is not None:
            if organism == 'human':
                genes_to_filter = human_genes_to_filter
                rename_dict = human_rename_dict
            else:
                genes_to_filter = mouse_genes_to_filter
                rename_dict = mouse_rename_dict
            df1 = filter_genes(df1, genes_to_filter)
            df1_t = df1.T
            df1_t = rename_columns(df1_t, rename_dict)

            df_w_group = df2.merge(df1_t, right_index=True, left_on='Run', how='inner')
            create_plots(df_w_group, file_id, rename_dict, output_dir)
    # Run DESeq output processing
    deseq_output(directory, ids, human_genes_to_filter, human_rename_dict, mouse_genes_to_filter, mouse_rename_dict, output_dir)

if __name__ == "__main__":
    main()


Setting a gradient palette using color= is deprecated and will be removed in v0.14.0. Set `palette='dark:black'` for the same effect.

  sns.stripplot(x='Gene', y='Expression', hue=group_col, data=df_melted,

Setting a gradient palette using color= is deprecated and will be removed in v0.14.0. Set `palette='dark:black'` for the same effect.

  sns.stripplot(x='Gene', y='Expression', hue=group_col, data=df_melted,

Setting a gradient palette using color= is deprecated and will be removed in v0.14.0. Set `palette='dark:black'` for the same effect.

  sns.stripplot(x='Gene', y='Expression', hue=group_col, data=df_melted,

Setting a gradient palette using color= is deprecated and will be removed in v0.14.0. Set `palette='dark:black'` for the same effect.

  sns.stripplot(x='Gene', y='Expression', hue=group_col, data=df_melted,

Setting a gradient palette using color= is deprecated and will be removed in v0.14.0. Set `palette='dark:black'` for the same effect.

  sns.stripplot(x='Gene', y=

Saved filtered DESeq2 output to /hdd1/projects/bulk_expression/output_plots/mouse_PRJNA1019376_SLC25A47_UBD_deseq.csv
Saved filtered DESeq2 output to /hdd1/projects/bulk_expression/output_plots/mouse_PRJNA940436_SLC25A47_UBD_deseq.csv
Saved filtered DESeq2 output to /hdd1/projects/bulk_expression/output_plots/GSE123661_SLC25A47_UBD_deseq.csv
