# Topyfic

Vignette built on Sep 16, 2024 with Topyfic version 0.4.17.

The aim of this example is to show how to prepare and run Topyfic on your dataset.

## load libraries

In [None]:
import pandas as pd
import numpy as np
import anndata as ad
import scanpy as sc
import scipy as sp
import mudata as md

## Preprocess and Input data

The input data is a MuData (`.h5mu`) or AnnData (`.h5ad`) file that contains a raw or normalized count matrix. However, we recommend applying [depth normalization](https://www.biorxiv.org/content/10.1101/2022.05.06.490859v1.full) first.

In [None]:
def depth_normalization(mtx):
    """Apply PF -> log1p -> PF depth normalization to a count matrix.

    Each row is first rescaled so that its total equals the mean row total
    (proportional fitting, "PF"), the result is passed through ``log1p``,
    and the proportional fitting is applied once more.

    Parameters
    ----------
    mtx : scipy sparse matrix/array or 2-D numpy.ndarray
        Count matrix. Totals are taken along ``axis=1``; presumably rows are
        cells and columns are genes (confirm against the input file).

    Returns
    -------
    Matrix with the same shape as ``mtx``. It is the result of
    ``scipy.sparse.diags(...) @ matrix``, so a sparse input gives a sparse
    output.

    Notes
    -----
    Rows whose total is zero receive a scale factor of 0 instead of ``inf``,
    so they stay all-zero and do not produce divide-by-zero warnings or NaN.
    """

    def _proportional_fitting(counts):
        # np.asarray handles both the np.matrix returned by sparse matrices
        # and the plain ndarray returned by sparse arrays / dense input
        # (the latter has no ``.A`` attribute).
        depth = np.asarray(counts.sum(axis=1), dtype=float).ravel()
        scale = np.zeros_like(depth)
        # Only divide where the row total is non-zero; other entries keep 0.
        np.divide(depth.mean(), depth, out=scale, where=depth != 0)
        return sp.sparse.diags(scale) @ counts

    log1p_pf = np.log1p(_proportional_fitting(mtx))
    return _proportional_fitting(log1p_pf)

# Load the RNA modality from the MuData file
# (or read an AnnData .h5ad file directly, see the commented line).
data = md.read_h5mu("iPSC_EC.h5mu")['rna']
#data = ad.read("iPSC_EC_RNA.h5ad")

# Use the helper defined above; the previous call to `normalization`
# referenced a name that is not defined anywhere in this notebook.
data.X = depth_normalization(data.X)
# Round the normalized values to whole numbers before writing the input.
# np.round replaces np.round_, which was removed in NumPy 2.0.
data.X = np.round(data.X)

data.write("iPSC_EC_Topyfic.h5ad")

## Run Topyfic

After preparing the input, you can run Topyfic using snakemake.

To get more information about how to set up your environment and necessary files please look at the [Topyfic Repository](https://github.com/mortazavilab/Topyfic/tree/main/workflow/snakemake).

The config file for this dataset should look like this:
```
names: iPSC_EC

count_adata:
  iPSC_EC:
    resources/iPSC_EC_Topyfic.h5ad

n_topics: [5, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 25, 30, 35, 40, 45, 50]

organism: human

workdir: results

train:
  n_runs: 100
  random_states: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
                  10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
                  20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
                  30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
                  40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
                  50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
                  60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
                  70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
                  80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
                  90, 91, 92, 93, 94, 95, 96, 97, 98, 99]

top_model:
  n_top_genes: None
  resolution: 1
  max_iter_harmony: 10
  min_cell_participation: None

```

Once Topyfic has finished running, you can decide which k (number of topics) is the best one by looking at the plot below and looking for the first k that crosses the `x=y` line.


In [None]:
import yaml
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import Topyfic  # required for Topyfic.read_topModel below

# Read yaml file
with open('config/config.yaml', 'r') as f:
    config = yaml.load(f, Loader=yaml.SafeLoader)

# Collect, for every requested k, the N attribute of the trained top model.
# Rows are gathered in a list and turned into a DataFrame once, instead of
# concatenating onto an empty frame on every iteration.
rows = []
for k in config['n_topics']:
    name = f"{config['workdir']}/{config['names']}/{k}/topmodel/topModel_{config['names']}_{k}.p"
    #print(name)
    top_model = Topyfic.read_topModel(name)
    rows.append((k, top_model.N))

df = pd.DataFrame(rows, columns=['k', 'N'])

x = df['k'].tolist()
y = df['N'].tolist()

plt.plot(x, y, 'o')

# Fit a straight line N ~ k and report its R^2.
x = df['k'].values.reshape(-1, 1)
y = df['N'].values.reshape(-1, 1)
model = LinearRegression().fit(x, y)
r_sq = model.score(x, y)

# Evaluate the fit over the range of k values listed in the config
# (5..50 for the config shown above).
k_min = min(config['n_topics'])
k_max = max(config['n_topics'])
x_pred = np.linspace(k_min, k_max, num=41, endpoint=True)
y_pred = model.predict(x_pred.reshape(-1, 1))
plt.plot(x_pred, y_pred,
         linestyle='dashed',
         label=fr'R^2 = {round(r_sq, 5)}')

# Reference line x=y, starting from the origin.
plt.plot([0] + config['n_topics'], [0] + config['n_topics'], linestyle='solid', label='x=y')
#plt.axvline(x=10, color="red", linestyle="solid", label='k=10')
plt.xlabel('K (#topics in each LDA run)')
plt.ylabel('N (#rLDA topics)')
plt.ylim([0, k_max])
plt.xlim([0, k_max])
plt.title(f"{config['names']}")
plt.legend(loc='best')
plt.savefig('k_n_prediction.pdf')

## Convert outputs for evaluation pipeline

To be able to run the evaluation pipeline, we need to convert the Topyfic results into a suitable format.
This script converts your results to the expected MuData object.

In [None]:
import mudata as md
import anndata as ad
import scanpy as sc
import Topyfic
import pandas as pd

# Load the count matrix referenced in the config and expose X under a named layer.
adata = sc.read_h5ad(config['count_adata'][config['names']])
adata.layers['PFlog1pPF_normalization'] = adata.X

# Write one MuData file per requested number of topics.
for k in config['n_topics']:
    analysis_path = f"{config['workdir']}/{config['names']}/{k}/topmodel/analysis_{config['names']}_{k}.p"
    analysis = Topyfic.read_analysis(analysis_path)

    gene_weights = analysis.top_model.get_gene_weights()

    # Work on a copy so the loaded analysis object itself is left untouched.
    participation = analysis.cell_participation.copy()
    # Columns of the gene-weight table become the var index
    # (presumably the topic names; confirm against Topyfic).
    participation.var.index = gene_weights.columns.tolist()
    # Row labels go to .uns, and the transposed weights are stored as loadings.
    participation.uns['var_names'] = gene_weights.index.to_numpy()
    participation.varm['loadings'] = gene_weights.to_numpy().T

    output_path = f"{config['workdir']}/{config['names']}/{k}/topmodel/Topyfic_{config['names']}_{k}.h5mu"
    md.MuData({"rna": adata, "Topyfic": participation}).write(output_path)

## Downstream analysis

Topyfic provides useful visualization and downstream analysis that can be run directly using Topyfic output.

For more information look at [Topyfic GH](https://github.com/mortazavilab/Topyfic/blob/main/workflow/snakemake/resources/analysing.ipynb).

In [None]:
import os
import scanpy as sc
import Topyfic
import pandas as pd
import numpy as np
from pathlib import Path
import yaml
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
#mpl.rcParams['font.family'] = 'Arial'
mpl.rcParams['pdf.fonttype'] = 42
sns.set_context('paper')

# Get the K (number of topics to start with) from the name of the parent
# directory of the current working directory.
k = int(Path.cwd().parent.name)

# Read yaml file
with open('../../../../config/config.yaml', 'r') as f:
    config = yaml.load(f, Loader=yaml.SafeLoader)
#config

# 'names' may be a single string (as in the config shown above) or a list of
# names; indexing a plain string with [0] would only return its first letter.
dataset = config['names'] if isinstance(config['names'], str) else config['names'][0]

# Create subdirectories (including the parent "figures" folder if missing).
for subdir in ("figures/pieChart", "figures/structurePlot"):
    os.makedirs(subdir, mode=0o777, exist_ok=True)

# Read analysis object
# NOTE(review): this path is built from config['workdir']; if workdir is a
# relative path it is resolved against the current directory - confirm.
name = f"{config['workdir']}/{dataset}/{k}/topmodel/analysis_{dataset}_{k}.p"
analysis = Topyfic.read_analysis(name)

In [None]:
# cell program distribution
# 'names' may be a single string or a list of names; take the string itself
# or the first list entry so the output file name carries the full name.
dataset_name = config['names'] if isinstance(config['names'], str) else config['names'][0]
analysis.cell_participation_distribution(plot_type="violin",
                                         threshold=0.05,
                                         max_topic=True,
                                         color="#CF99F7",
                                         save=True,
                                         show=False,
                                         file_format="pdf",
                                         file_name=f"figures/Dist_cell_topic_{dataset_name}_{k}")

In [None]:
#pie chart

# Options shared by every pie chart produced in this cell.
pie_options = dict(level='subtype', save=True, show=False, file_format='png')

# One chart per cell type, restricted to the subtypes found in that cell type.
celltypes = analysis.cell_participation.obs.celltype.unique().tolist()
for celltype in celltypes:
    in_celltype = analysis.cell_participation.obs.celltype == celltype
    subtypes = analysis.cell_participation.obs[in_celltype].subtype.unique().tolist()
    file_name = f"figures/pieChart/dist_topics_celltypes_RNA_{celltype}"
    analysis.pie_structure_Chart(category=subtypes,
                                 file_name=file_name,
                                 **pie_options)

# One chart covering every subtype.
subtypes = analysis.cell_participation.obs.subtype.unique().tolist()
file_name = f"figures/pieChart/dist_topics_subtypes_RNA"
analysis.pie_structure_Chart(category=subtypes,
                             file_name=file_name,
                             **pie_options)

In [None]:
# structure plot

# Colours for each metadata column shown next to the structure plot.
my_palette_genotype = {'CASTJ': 'red', 'B6J': 'blue'}
my_palette_sex = {'Male': 'yellow', 'Female': 'orange'}
my_palette = {'Sex': my_palette_sex,
              'Genotype': my_palette_genotype}

# Options shared by every structure plot produced in this cell.
structure_options = dict(level='subtype',
                         metaData=["Sex", "Genotype"],
                         metaData_palette=my_palette,
                         order_cells=["Sex", "Genotype", "hierarchy"],
                         save=True,
                         show=False,
                         file_format='png')

# One plot per cell type, restricted to the subtypes found in that cell type.
celltypes = analysis.cell_participation.obs.celltype.unique().tolist()
for celltype in celltypes:
    in_celltype = analysis.cell_participation.obs.celltype == celltype
    subtypes = analysis.cell_participation.obs[in_celltype].subtype.unique().tolist()
    file_name = f"figures/structurePlot/dist_topics_celltypes_RNA_timepoint_{celltype}"
    analysis.structure_plot(category=subtypes,
                            file_name=file_name,
                            **structure_options)

# One plot covering every subtype.
subtypes = analysis.cell_participation.obs.subtype.unique().tolist()
file_name = f"figures/structurePlot/dist_topics_subtypes_RNA_timepoint_hierarchical"
analysis.structure_plot(category=subtypes,
                        file_name=file_name,
                        **structure_options)

In [None]:
# Topic-Trait Relationship Heatmap
# Metadata columns passed to the heatmap call.
metadata = ['Genotype', 'Sex', 'general_celltype']
heatmap_options = dict(save=True,
                       show=True,
                       file_format='png',
                       file_name='figures/topic-traitRelationships_general_poster')
analysis.TopicTraitRelationshipHeatmap(metadata, **heatmap_options)