# Pathway analysis

In [None]:
import os, sys, glob, re, math, pickle
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from mpl_toolkits.axes_grid1 import make_axes_locatable
import csv
import gseapy
# Directory that receives the figures saved by the plotting helpers below.
pfp = '/vast/palmer/pi/lim_janghoo/cl2292/SCA1_OL/results/'


# Notebook-wide matplotlib defaults, applied in one update.
plt.rcParams.update({
    'font.size': 9,
    'font.family': 'sans serif',
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
    'text.usetex': False,
    'legend.frameon': False,
    'axes.grid': False,
    'legend.markerscale': 0.5,
    'savefig.dpi': 600,
})
# Applied last, after the rcParams above, exactly as in the original order.
sns.set_style("ticks")


def filterdeg(df, ctype, timepoint=None):
    """Return the 'Gene' entries of the rows that match a cell type.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with at least the columns 'Cell type' and 'Gene'; a
        'timepoint' column is also needed when ``timepoint`` is given.
    ctype : value compared for equality against the 'Cell type' column.
    timepoint : optional value compared against the 'timepoint' column.
        ``None`` (the default) disables the timepoint filter.

    Returns
    -------
    list
        The 'Gene' values of the matching rows, in table order.

    Raises
    ------
    ValueError
        If a required column is missing.
    """
    has_required = {'Cell type', 'Gene'}.issubset(df.columns)
    if not has_required:
        raise ValueError("DataFrame must contain 'Cell type' and 'Gene' columns.")

    keep = df['Cell type'] == ctype
    if timepoint is not None:
        if 'timepoint' not in df.columns:
            raise ValueError("DataFrame must contain 'timepoint' column for filtering by timepoint.")
        keep = keep & (df['timepoint'] == timepoint)

    return df.loc[keep, 'Gene'].to_list()

def enrichr(genes, title = 'Title',geneset = 'GO_Biological_Process_2023', save = None):
    """Run Enrichr on a gene list and plot the top 10 terms by Combined Score.

    Terms with 'Adjusted P-value' < 0.05 are ranked by 'Combined Score'
    (descending); the ten best are drawn as horizontal bars whose length is
    the Combined Score and whose colour encodes -log10(Adjusted P-value).

    Parameters
    ----------
    genes : gene identifiers, passed to ``gseapy.enrichr`` as ``gene_list``.
    title : str
        Axes title.
    geneset : str
        Enrichr library name, passed as ``gene_sets``.
    save : str or None
        File name inside the module-level ``pfp`` directory. When ``None``
        the figure is only shown.

    Returns
    -------
    None
        A message is printed instead of plotting when there is nothing to
        query or no term passes the significance filter.
    """
    # An empty collection has nothing to enrich; skip the remote query.
    if not isinstance(genes, str) and len(genes) == 0:
        print("No genes supplied; skipping enrichment.")
        return

    res = gseapy.enrichr(gene_list=genes, organism='Mouse', gene_sets=geneset, cutoff=0.05)
    significant = res.res2d[res.res2d['Adjusted P-value'] < 0.05]
    if significant.empty:
        print("No significant GO terms to plot.")
        return

    # Work on an explicit copy so the new columns are not written to a slice
    # of the gseapy result table.
    top = significant.sort_values(by=['Combined Score'], ascending=False).head(10).copy()
    top['-log10(Adjusted P-value)'] = top['Adjusted P-value'].apply(np.log10) * -1
    # Drop a trailing ' (GO:xxxxxxx)' identifier; terms without one are kept
    # intact instead of losing their last 13 characters.
    top['Term'] = top['Term'].str.replace(r'\s*\(GO:\d+\)$', '', regex=True)

    terms = top['Term'].to_list()
    scores = top['Combined Score'].to_list()
    color_values = top['-log10(Adjusted P-value)'].to_list()

    fig, ax = plt.subplots(1, 1, figsize=(2, 2))
    bars = ax.barh(terms, scores, color='gray')

    # Map the significance values onto the colormap and recolour each bar.
    norm = plt.Normalize(vmin=min(color_values), vmax=max(color_values))
    sm = plt.cm.ScalarMappable(cmap='viridis', norm=norm)
    for bar, value in zip(bars, color_values):
        bar.set_color(sm.to_rgba(value))

    # Narrow axis on the right of the main plot that hosts the colorbar.
    divider = make_axes_locatable(ax)
    cax = divider.append_axes("right", size="5%", pad=0.05)
    cbar = plt.colorbar(sm, cax=cax)
    cbar.set_label('-log10(Adjusted P-value)')

    ax.set_xlabel('Combined Score')
    ax.set_ylabel('GO Biological Process')
    ax.set_title(title)

    # Style the bar axis explicitly: plt.xticks() targets the *current* Axes,
    # which is no longer guaranteed to be `ax` once the colorbar Axes exists.
    ax.tick_params(axis='x', labelsize=8)

    if save is not None:
        fig.savefig(os.path.join(pfp, save), dpi=300, bbox_inches='tight')
    plt.show()

def intersection(lst1, lst2):
    """Return the items of ``lst1`` that also occur in ``lst2``.

    The order and any duplicates of ``lst1`` are preserved.

    Parameters
    ----------
    lst1, lst2 : sequences of mutually comparable items.

    Returns
    -------
    list
    """
    try:
        # Set membership is O(1) per item instead of a scan of lst2.
        lookup = set(lst2)
        return [value for value in lst1 if value in lookup]
    except TypeError:
        # Unhashable items (e.g. lists): fall back to the linear scan.
        return [value for value in lst1 if value in lst2]


# Differential expression table for SCA1-flwCre vs SCA1-flwoCre.
dge = pd.read_csv('/vast/palmer/pi/lim_janghoo/cl2292/SCA1_OL/results/250414_dge_SCA1-flwCre vs SCA1-flwoCre.csv')

# Rows with pval_corrected < 0.01, split by the sign of 'emd' using a
# strict +/-0.1 cut-off.
is_significant = dge['pval_corrected'] < 0.01
downsig = dge[(dge['emd'] < -0.1) & is_significant]
upsig = dge[(dge['emd'] > 0.1) & is_significant]


In [None]:
#sort by -log(adj_p)
def enrichr2(genes, title = 'Title',geneset = 'GO_Biological_Process_2023', save = None):
    """Run Enrichr on a gene list and plot the top 10 terms by significance.

    Terms with 'Adjusted P-value' < 0.05 are ranked by
    -log10(Adjusted P-value) (descending); the ten best are drawn as
    horizontal bars whose length is -log10(Adjusted P-value) and whose colour
    encodes the 'Combined Score'.

    Parameters
    ----------
    genes : gene identifiers, passed to ``gseapy.enrichr`` as ``gene_list``.
    title : str
        Axes title.
    geneset : str
        Enrichr library name, passed as ``gene_sets``.
    save : str or None
        File name inside the module-level ``pfp`` directory. When ``None``
        the figure is only shown.

    Returns
    -------
    None
        A message is printed instead of plotting when there is nothing to
        query or no term passes the significance filter.
    """
    # An empty collection has nothing to enrich; skip the remote query.
    if not isinstance(genes, str) and len(genes) == 0:
        print("No genes supplied; skipping enrichment.")
        return

    res = gseapy.enrichr(gene_list=genes, organism='Mouse', gene_sets=geneset, cutoff=0.05)
    # Explicit copy: the columns added below must not be written to a slice
    # of the gseapy result table.
    significant = res.res2d[res.res2d['Adjusted P-value'] < 0.05].copy()
    if significant.empty:
        print("No significant GO terms to plot.")
        return

    significant['-log10(Adjusted P-value)'] = significant['Adjusted P-value'].apply(np.log10) * -1
    top = significant.sort_values(by=['-log10(Adjusted P-value)'], ascending=False).head(10).copy()
    # Drop a trailing ' (GO:xxxxxxx)' identifier; terms without one are kept
    # intact instead of losing their last 13 characters.
    top['Term'] = top['Term'].str.replace(r'\s*\(GO:\d+\)$', '', regex=True)

    terms = top['Term'].to_list()
    significance = top['-log10(Adjusted P-value)'].to_list()
    color_values = top['Combined Score'].to_list()

    fig, ax = plt.subplots(1, 1, figsize=(2, 2))
    bars = ax.barh(terms, significance, color='gray')

    # Map the Combined Score onto the colormap and recolour each bar.
    norm = plt.Normalize(vmin=min(color_values), vmax=max(color_values))
    sm = plt.cm.ScalarMappable(cmap='viridis', norm=norm)
    for bar, value in zip(bars, color_values):
        bar.set_color(sm.to_rgba(value))

    # Narrow axis on the right of the main plot that hosts the colorbar.
    divider = make_axes_locatable(ax)
    cax = divider.append_axes("right", size="5%", pad=0.05)
    cbar = plt.colorbar(sm, cax=cax)
    cbar.set_label('Combined Score')

    ax.set_xlabel('-log10(Adjusted P-value)')
    ax.set_ylabel('GO Biological Process')
    ax.set_title(title)

    # Style the bar axis explicitly: plt.xticks() targets the *current* Axes,
    # which is no longer guaranteed to be `ax` once the colorbar Axes exists.
    ax.tick_params(axis='x', labelsize=8)

    if save is not None:
        fig.savefig(os.path.join(pfp, save), dpi=300, bbox_inches='tight')
    plt.show()

In [None]:
## PC: cellular-component enrichment of the up-regulated genes
geneset = 'GO_Cellular_Component_2025'

c_list = ['PC']
for c in c_list:
    for t in ('30wk',):
        # Both directions are collected; only the "up" plot is drawn here.
        cKIdown = set(filterdeg(downsig, ctype=c, timepoint=t))
        cKIup = set(filterdeg(upsig, ctype=c, timepoint=t))

        # Disabled counterpart for the down-regulated genes:
        # enrichr(list(cKIdown), title=f'{c}, {t} down cKI', geneset=geneset,
        #         save=f'250415_enrichR_{c}_{t}_cKI down_CC.pdf')

        up_title = f'{c}, {t} up cKI'
        up_pdf = f'250415_enrichR_{c}_{t}_cKI up_CC_sortbyp.pdf'
        enrichr2(list(cKIup), title=up_title, geneset=geneset, save=up_pdf)

    

In [None]:
## OPC and OL: biological-process enrichment of down- and up-regulated genes
geneset = 'GO_Biological_Process_2025'

c_list = ['OPC', 'OL']
for c in c_list:
    for t in ('30wk',):
        cKIdown = set(filterdeg(downsig, ctype=c, timepoint=t))
        cKIup = set(filterdeg(upsig, ctype=c, timepoint=t))

        # One plot per direction, "down" first and then "up".
        for direction, gene_set in (('down', cKIdown), ('up', cKIup)):
            enrichr(
                list(gene_set),
                title=f'{c}, {t} {direction} cKI',
                geneset=geneset,
                save=f'250415_enrichR_{c}_{t}_cKI {direction}_BP.pdf',
            )

    

In [None]:
## Biological Process: diverging bar plot of down (left) and up (right) terms
#
# For every timepoint the significant Enrichr terms (Adjusted P-value < 0.05)
# of the down- and up-regulated genes of each cell type are collected. The
# ranking metric of the down-regulated terms is negated so that their bars
# extend to the left of zero.

geneset = 'GO_Biological_Process_2025'
sort_by = 'p_value'  # 'p_value' or 'combined_score'
# downsig = cKIdown
# upsig = cKIup

c_list = ['OL']

# Fail before any remote query is made if the ranking mode is unknown.
if sort_by not in ('combined_score', 'p_value'):
    raise ValueError("sort_by must be 'combined_score' or 'p_value'")

for t in ['30wk']:
    all_results=[]
    for c in c_list:
        down = set(filterdeg(downsig, ctype=c, timepoint=t))
        up = set(filterdeg(upsig, ctype=c, timepoint=t))

        # Copy both filtered tables so the columns added below are not
        # written to slices of the gseapy result.
        res = gseapy.enrichr(gene_list=list(down), organism='mouse', gene_sets=geneset, cutoff=0.05)
        df1 = res.res2d[res.res2d['Adjusted P-value'] < 0.05].copy()
        res = gseapy.enrichr(gene_list=list(up), organism='mouse', gene_sets=geneset, cutoff=0.05)
        df2 = res.res2d[res.res2d['Adjusted P-value'] < 0.05].copy()

        for frame in (df1, df2):
            frame['Cell type'] = c
            frame['Timepoint'] = t

        if sort_by == 'combined_score':
            # Down terms carry a negated score, so the strongest ones are
            # the most negative: rank them ascending before taking the top.
            df1['Combined Score'] = df1['Combined Score']*-1
            df1 = df1.sort_values(by=['Combined Score'], ascending=True).head(5)
            df2 = df2.sort_values(by=['Combined Score'], ascending=False).head(5)
            df = pd.concat([df1, df2])
            df = df.sort_values(by=['Combined Score'], ascending = False)
            df['-log10(Adjusted P-value)'] = df['Adjusted P-value'].apply(np.log10) * -1
        else:  # sort_by == 'p_value'
            # Down terms keep the raw (negative) log10 p-value, so the most
            # significant ones are the most negative: rank them ascending.
            df1['-log10(Adjusted P-value)'] = df1['Adjusted P-value'].apply(np.log10)
            df2['-log10(Adjusted P-value)'] = df2['Adjusted P-value'].apply(np.log10) * -1

            df1 = df1.sort_values(by=['-log10(Adjusted P-value)'], ascending=True).head(10)
            df2 = df2.sort_values(by=['-log10(Adjusted P-value)'], ascending=False).head(10)
            df = pd.concat([df1, df2])
            df = df.sort_values(by=['-log10(Adjusted P-value)'], ascending = False)

        all_results.append(df)

    df_combined = pd.concat(all_results) if all_results else pd.DataFrame()

    # min()/max() below would fail on an empty table.
    if df_combined.empty:
        print(f"No significant {geneset} terms to plot for {t}.")
        continue

    if sort_by == 'combined_score':
        x_data = df_combined['Combined Score'].to_list()
        color_data = df_combined['-log10(Adjusted P-value)'].to_list()
        x_label = 'Combined Score'
        cmap_label = '-log10(Adjusted P-value)'
    else:  # sort_by == 'p_value'
        x_data = df_combined['-log10(Adjusted P-value)'].to_list()
        color_data = df_combined['Combined Score'].to_list()
        x_label = '-log10(Adjusted P-value)'
        cmap_label = 'Combined Score'

    # Drop a trailing ' (GO:xxxxxxx)' identifier; terms without one are kept
    # intact instead of losing their last 13 characters.
    df_combined['Term'] = df_combined['Term'].str.replace(r'\s*\(GO:\d+\)$', '', regex=True)
#     df_combined['Term2'] = df_combined['Timepoint']+'_'+df_combined['Cell type']+ ' '+df_combined['Term']
    GO_biological_processes = df_combined['Term'].to_list()

    # Create figure and axis
    fig, ax = plt.subplots(1, 1, figsize=(2,2))

    # Create horizontal bars
    bars = ax.barh(GO_biological_processes, x_data, color='gray')

    # Create a ScalarMappable for color mapping
    sm = plt.cm.ScalarMappable(cmap='viridis', norm=plt.Normalize(vmin=min(color_data), vmax=max(color_data)))

    # Set colors for the bars
    colors = [sm.to_rgba(score) for score in color_data]
    for bar, color in zip(bars, colors):
        bar.set_color(color)


#     # Add labels for cell types
#     for bar, cell_type in zip(bars, df_combined['Cell type']):
#         ax.text(
#             bar.get_width(),  # x-coordinate of the label
#             bar.get_y() + bar.get_height() / 2,  # y-coordinate of the label (centered vertically)
#             cell_type,  # Label text
#             va='center',  # Vertical alignment
#             ha='left',  # Horizontal alignment
#             fontsize=8,  # Font size for the label
#             color='black'  # Text color
#         )


    # Create a divider to make room for the colorbar
    divider = make_axes_locatable(ax)

    # Append an axis for the colorbar on the right side of the main plot
    cax = divider.append_axes("right", size="5%", pad=0.05)

    # Add a colorbar to the plot
    cbar = plt.colorbar(sm, cax=cax)
    cbar.set_label(cmap_label, color = 'black')

    # Set labels and title
    ax.set_xlabel(x_label, color = 'black')
    ax.set_ylabel(geneset, color = 'black')
    ax.set_title(t, color = 'black')
    ax.grid(False)

    # Style the bar axis explicitly: plt.xticks()/plt.yticks() target the
    # *current* Axes, which is no longer guaranteed to be `ax` once the
    # colorbar Axes exists.
    ax.tick_params(axis='x', colors='black', labelsize=8)
    ax.tick_params(axis='y', colors='black')
    # Zero line separating the down (left) from the up (right) bars.
    ax.axvline(x=0, color = 'black', linestyle='-', linewidth=0.5)

    # Show or save the plot
#     fig.savefig(os.path.join(pfp, '250415_cKI_OL_enrichr_'+t+'_pval.pdf'), dpi=300, bbox_inches='tight')
    plt.show()
 

In [None]:
import os, sys, glob, re, math, pickle
import pandas as pd
import time,random,datetime
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib_venn
import scanpy as sc
import warnings
import csv
%matplotlib inline
%load_ext memory_profiler

# settings
plt.rc('font', size = 8)
plt.rc('font', family='sans serif')
plt.rcParams['pdf.fonttype']=42
plt.rcParams['ps.fonttype']=42
plt.rcParams['text.usetex']=False
plt.rcParams['legend.frameon']=False
plt.rcParams['axes.grid']=False
plt.rcParams['legend.markerscale']=0.5
sc.set_figure_params(dpi=300,dpi_save=600,
                     frameon=False,
                     fontsize=8)
plt.rcParams['savefig.dpi']=600
sc.settings.verbosity=2
# Configure n_jobs through the public settings object, like verbosity above,
# rather than by assigning an attribute on the private ScanpyConfig class.
sc.settings.n_jobs=-1

# reproducibility
# np.random.seed() returns None, so keep the seed value itself in `rs`.
rs = 42
np.random.seed(rs)

# fps
dfp = '/vast/palmer/pi/lim_janghoo/cl2292/'
pfp = '/vast/palmer/pi/lim_janghoo/cl2292/SCA1_OL/results/'
pdfp = '/vast/palmer/pi/lim_janghoo/cl2292/SCA1_OL/data/'
sc.settings.figdir = pfp

# load DEGs
# Each table is split into down/up sets with pval_corrected < 0.01 and an
# inclusive +/-0.1 cut-off on 'emd'.
# NOTE(review): the first cell of this notebook applies the same cut-off
# with strict inequalities (< -0.1 / > 0.1) -- confirm which is intended.

#OL-SCA1-cKI 30W

dge = pd.read_csv(os.path.join(pfp, '250414_dge_SCA1-flwCre vs SCA1-flwoCre.csv'))
downsig_cKI = dge.loc[(dge['emd']<=-0.1)&(dge['pval_corrected']<0.01),:]
upsig_cKI = dge.loc[(dge['emd']>=0.1)&(dge['pval_corrected']<0.01),:]

#SCA1-KI 5-30W

KI = pd.read_csv(os.path.join(pfp, '250414_dge_KI_imp.csv'))
downsig_KI = KI.loc[(KI['emd']<=-0.1)&(KI['pval_corrected']<0.01),:]
upsig_KI = KI.loc[(KI['emd']>=0.1)&(KI['pval_corrected']<0.01),:]

def filterdeg(df, ctype, timepoint=None):
    """List the genes recorded for one cell type, optionally at one timepoint.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'Cell type' and 'Gene', plus 'timepoint'
        whenever ``timepoint`` is not ``None``.
    ctype : value matched (``==``) against the 'Cell type' column.
    timepoint : value matched (``==``) against the 'timepoint' column, or
        ``None`` to ignore timepoints.

    Returns
    -------
    list
        'Gene' values of the selected rows, in their original order.

    Raises
    ------
    ValueError
        When a column needed for the requested filter is absent.
    """
    columns = df.columns
    if not ('Cell type' in columns and 'Gene' in columns):
        raise ValueError("DataFrame must contain 'Cell type' and 'Gene' columns.")

    if timepoint is not None and 'timepoint' not in columns:
        raise ValueError("DataFrame must contain 'timepoint' column for filtering by timepoint.")

    selected = df['Cell type'] == ctype
    if timepoint is not None:
        selected &= df['timepoint'] == timepoint

    genes = df.loc[selected, 'Gene']
    return genes.to_list()

# Venn diagrams of the DEG overlap between the two comparisons, one panel
# per cell type; the DOWN and UP sets go to separate figures.
# NOTE(review): the cell types are taken from the cKI *down* table only and
# are reused for the UP figure -- confirm that this is intended.
c_list = downsig_cKI['Cell type'].unique().tolist()
nrow = len(c_list)

# squeeze=False keeps `axs` two-dimensional, so indexing also works when
# there is exactly one cell type (plt.subplots would otherwise return a
# bare Axes).
fig, axs = plt.subplots(nrow, 1, figsize=(2, nrow*2), squeeze=False)
for i, c in enumerate(c_list):
    # Get sets of genes for venn diagrams (all timepoints of each table)
    cKIdown = set(filterdeg(downsig_cKI, ctype=c))
    KIdown = set(filterdeg(downsig_KI, ctype=c))

    # Plot the Venn diagram for "DOWN"
    ax = axs[i, 0]  # Select the corresponding subplot
    matplotlib_venn.venn2([cKIdown, KIdown], ('OL-SCA1-cKI 30W DOWN', 'SCA1-KI 5-30W DOWN'),
                          set_colors=('#FF9999','#9999FF'), alpha = 0.8, ax=ax)
    matplotlib_venn.venn2_circles([cKIdown, KIdown], linewidth=0.6, ax=ax)
    ax.set_title(f"{c} DOWN")

fig.savefig(os.path.join(pfp, '250415_venn_overlappaing DEG_down.pdf'), dpi=300, bbox_inches='tight')

fig, axs = plt.subplots(nrow, 1, figsize=(2, nrow*2), squeeze=False)
for i, c in enumerate(c_list):
    # Get sets of genes for venn diagrams (all timepoints of each table)
    cKIup = set(filterdeg(upsig_cKI, ctype=c))
    KIup = set(filterdeg(upsig_KI, ctype=c))

    # Plot the Venn diagram for "UP"
    ax = axs[i, 0]  # Select the corresponding subplot
    matplotlib_venn.venn2([cKIup, KIup], ('OL-SCA1-cKI 30W UP', 'SCA1-KI 5-30W UP'),
                          set_colors=('#FF9999','#9999FF'), alpha = 0.8, ax=ax)
    matplotlib_venn.venn2_circles([cKIup, KIup], linewidth=0.6, ax=ax)
    ax.set_title(f"{c} UP")

fig.savefig(os.path.join(pfp, '250415_venn_overlappaing DEG_up.pdf'), dpi=300, bbox_inches='tight')