# Debugging autoreload

In [None]:
# Load IPython's autoreload extension so that edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all modules before executing each cell
# (see the IPython autoreload documentation for the mode semantics).
%autoreload 2

# Load packages

In [None]:
import os
from tqdm import tqdm
import glob
import pandas as pd
import numpy as np
from scipy import stats
import seaborn as sns
import plotly.express as px
import statsmodels.formula.api as smf
import plotly.graph_objects as go
from statsmodels.stats.multitest import multipletests
import plotly.io as pio
pio.kaleido.scope.mathjax = None
from plotly.offline import init_notebook_mode
from matplotlib import patheffects as pe
import matplotlib
import warnings
init_notebook_mode(connected=False)
from scipy.stats import mannwhitneyu, median_test, kruskal, wilcoxon, friedmanchisquare
import matplotlib.pyplot as plt
import pathlib
from sklearn.metrics import mean_absolute_error
from statannotations.Annotator import Annotator
import functools
import matplotlib.lines as mlines
import patchworklib as pw
import pickle
from src.routines.plotly_layout import add_layout, color_tick
from d3blocks import D3Blocks

# Collect data

In [None]:
# Root folder holding the main table and the per-platform (GPL) spreadsheets.
path = "D:/YandexDisk/Work/pydnameth/draft/10_MetaEPIClock/MetaEpiAge"

# Main table; its first column is used as the index.
df = pd.read_excel(f"{path}/table.xlsx", index_col=0)

# Plot color assigned to each GPL platform identifier.
colors_gpls = dict(
    GPL13534='firebrick',
    GPL16304='orangered',
    GPL21145='darkorchid',
    GPL23976='orchid',
)

# One spreadsheet per GPL, keyed by the GPL identifier
# (same key order as `colors_gpls`).
df_gpls = {
    gpl: pd.read_excel(f"{path}/figures/gpls/{gpl}.xlsx", index_col=0)
    for gpl in colors_gpls
}

# Plot per-GPL histograms of sample counts and save summary statistics

In [None]:
# For every GPL platform: draw a histogram of the 'Count' column of its table,
# save it as PNG and PDF, and fill one row of a summary table that is written
# to `stat.xlsx` in the same folder.
path_save = f"{path}/figures/gpls/histplot"
pathlib.Path(path_save).mkdir(parents=True, exist_ok=True)

# 41 edges -> 40 equal-width bins of 100 spanning [0, 4000].
hist_bins = np.linspace(0, 4000, 41)

stat_columns = ['Total', 'Processed', 'Processed %', 'Selected', 'Selected %']
df_stat = pd.DataFrame(index=list(colors_gpls.keys()), columns=stat_columns)

# Apply the theme once, BEFORE any figure is created. Setting it inside the
# loop after `plt.subplots` left the first figure without the whitegrid style.
sns.set_theme(style='whitegrid')

for gpl, color in colors_gpls.items():
    df_gpl = df_gpls[gpl]

    fig, ax = plt.subplots(figsize=(4, 3))
    sns.histplot(
        data=df_gpl,
        x="Count",
        bins=hist_bins,
        edgecolor='k',
        linewidth=1,
        color=color,
        ax=ax
    )
    ax.set_xlabel('Number of samples in GSE')
    ax.set_ylabel('Number of GSEs')
    # Small negative left limit so the first bar's edge is not clipped.
    ax.set_xlim([-15, 4000])
    # Save through the figure object rather than the implicit current figure.
    fig.savefig(f"{path_save}/{gpl}.png", bbox_inches='tight', dpi=200)
    fig.savefig(f"{path_save}/{gpl}.pdf", bbox_inches='tight')
    plt.close(fig)

    # Total: sum of 'Count' over all rows of this GPL table.
    count_total = df_gpl['Count'].sum()
    # Processed: sum of 'Count' over rows with at least 100 samples.
    count_proc = df_gpl.loc[df_gpl['Count'] >= 100, 'Count'].sum()
    # Selected: number of rows of the main table assigned to this GPL.
    count_slctd = int((df['GPL'] == gpl).sum())

    # Column labels must match `stat_columns` exactly: the previous keys had a
    # trailing space ('Processed % ', 'Selected % '), which made `.at` append
    # new columns and left the declared percentage columns empty.
    df_stat.at[gpl, 'Total'] = count_total
    df_stat.at[gpl, 'Processed'] = count_proc
    df_stat.at[gpl, 'Processed %'] = count_proc / count_total * 100
    df_stat.at[gpl, 'Selected'] = count_slctd
    df_stat.at[gpl, 'Selected %'] = count_slctd / count_total * 100

df_stat.to_excel(f"{path_save}/stat.xlsx", index_label="GPL")
        