# In [None]:
import pylab
import pandas as pd
import numpy as np
import os
from scipy import stats
from scipy.stats import ks_2samp
from sklearn.mixture import GaussianMixture
from scipy.optimize import minimize

def dir_path(sPath):
    """Collect the paths of all ``.csv`` files found below a directory.

    Entries are visited in the order returned by ``os.listdir``; a
    sub-directory is descended into at the point where it is encountered, so
    its files appear in the result before the parent's remaining entries.

    Args:
        sPath: Directory to search.

    Returns:
        list: ``os.path.join``-ed paths of every entry that is not a
        directory and whose name ends with ``'.csv'`` (case-sensitive).
    """
    csv_files = []
    for entry in os.listdir(sPath):
        entry_path = os.path.join(sPath, entry)
        if os.path.isdir(entry_path):
            # Splice the sub-tree's matches in at the current position.
            csv_files.extend(dir_path(entry_path))
        elif entry.endswith('.csv'):
            csv_files.append(entry_path)
    return csv_files

def em_dweibull(data, max_iter=100, tol=1e-4):
    """Fit a double Weibull distribution to ``data`` by maximum likelihood.

    Despite the name, no EM iterations are performed: the negative
    log-likelihood of ``scipy.stats.dweibull`` is minimised directly with
    L-BFGS-B.

    Args:
        data: Sample values to fit.
        max_iter: Accepted but not used by the body.
        tol: Accepted but not used by the body.

    Returns:
        tuple: ``(c, loc, scale)`` read from the optimiser's solution. The
        optimiser's success flag is not inspected.
    """
    def negative_log_likelihood(theta, sample):
        shape, center, spread = theta
        return -np.sum(stats.dweibull.logpdf(sample, shape, center, spread))

    # Starting point: shape 1, location/scale from the sample mean and std.
    start = np.array([1.0, np.mean(data), np.std(data)])
    # Shape and scale are bounded below by 0.1; the location is unbounded.
    limits = [(0.1, None), (None, None), (0.1, None)]

    fit = minimize(
        negative_log_likelihood,
        start,
        args=(data,),
        method='L-BFGS-B',
        bounds=limits,
    )

    c, loc, scale = fit.x
    return c, loc, scale

def gaussian_mixture_distribution(csv_path, bins=100):
    """Fit a Gaussian mixture to every non-zero column of a transposed CSV.

    The CSV is read with pandas and transposed; the first row of the
    transposed table (i.e. the CSV's first column) supplies the column labels
    that are reported as ``gene``, and the remaining values are cast to
    ``float64``. Columns consisting only of zeros are skipped.

    For each remaining column a 5-component, full-covariance
    ``GaussianMixture`` is fitted and summarised by:

    * ``param`` - the *fitted* mixture weights, means and covariances
      (flattened to lists),
    * ``sumsquare_error`` - sum of squared differences between the fitted
      mixture density and the normalised histogram of the data, evaluated at
      the histogram bin centres,
    * ``aic`` / ``bic`` - as reported by the fitted model,
    * ``ks`` / ``ks_pvalue`` - two-sample Kolmogorov-Smirnov test between the
      data and a random sample of equal size drawn from the fitted model
      (so these two values vary between runs).

    Args:
        csv_path: Path of the CSV file to read.
        bins: ``bins`` argument forwarded to ``numpy.histogram`` when
            building the reference density for ``sumsquare_error``.

    Returns:
        pandas.DataFrame: One row per fitted model with the columns
        ``gene, Distribution, param, sumsquare_error, aic, bic, ks,
        ks_pvalue``; ``Distribution`` holds the number of components. The
        frame is empty (columns only) when nothing was fitted.
    """
    result_columns = ['gene', 'Distribution', 'param', 'sumsquare_error', 'aic', 'bic', 'ks', 'ks_pvalue']
    component_counts = (5,)

    mydata = pd.read_csv(csv_path).T
    mydata.columns = mydata.iloc[0]
    mydata = mydata.drop(mydata.index[0]).astype('float64')

    gene_frames = []
    for gene in mydata.columns:
        data = mydata[gene]
        if (data == 0).all():
            continue

        values = data.to_numpy()
        data_2d = values.reshape(-1, 1)

        # Reference density for the squared-error score: normalised histogram
        # of the observations, compared with the model PDF at the bin centres.
        hist_density, edges = np.histogram(values, bins=bins, density=True)
        bin_centers = ((edges[:-1] + edges[1:]) / 2.0).reshape(-1, 1)

        results = []
        for n in component_counts:
            # GaussianMixture refuses to fit with fewer samples than
            # components; skip instead of aborting the whole run.
            if len(values) < n:
                continue

            gmm = GaussianMixture(n_components=n, covariance_type='full', max_iter=100, tol=1e-4)
            gmm.fit(data_2d)

            # get_params() only echoes the constructor arguments; the fitted
            # mixture lives in the trailing-underscore attributes.
            param = {
                'weights': gmm.weights_.tolist(),
                'means': gmm.means_.ravel().tolist(),
                'covariances': gmm.covariances_.ravel().tolist(),
            }

            # score_samples returns log-densities, hence the exp.
            pdf_fitted = np.exp(gmm.score_samples(bin_centers))
            sse = np.sum((pdf_fitted - hist_density) ** 2)
            aic = gmm.aic(data_2d)
            bic = gmm.bic(data_2d)

            simulated, _ = gmm.sample(len(values))
            ks_statistic, p_value = ks_2samp(values, simulated[:, 0])
            results.append((gene, n, param, sse, aic, bic, ks_statistic, p_value))

        if results:
            gene_frames.append(pd.DataFrame(results, columns=result_columns).sort_values('bic'))

    # Concatenate once at the end instead of growing a frame inside the loop.
    if not gene_frames:
        return pd.DataFrame(columns=result_columns)
    return pd.concat(gene_frames, axis=0).reset_index(drop=True)

def _fit_summary_row(gene, label, dist, param, values, hist_density, bin_centers):
    """Score one fitted scipy distribution and return it as a result tuple."""
    log_likelihood = np.sum(dist.logpdf(values, *param))

    # Squared error between the fitted PDF and the normalised histogram of
    # the data, both taken at the histogram bin centres.
    pdf_fitted = dist.pdf(bin_centers, *param)
    sse = np.sum(np.power(pdf_fitted - hist_density, 2.0))

    k = len(param)
    aic = -2 * log_likelihood + 2 * k
    bic = -2 * log_likelihood + k * np.log(len(values))
    ks_test, p_value = stats.kstest(values, dist.cdf, args=param)
    return (gene, label, param, sse, aic, bic, ks_test, p_value)


def set_distribution(csv_path, bins=100):
    """Fit a fixed set of scipy distributions to every non-zero column of a CSV.

    The CSV is read with pandas and transposed; the first row of the
    transposed table (i.e. the CSV's first column) supplies the column labels
    that are reported as ``gene``, and the remaining values are cast to
    ``float64``. Columns consisting only of zeros are skipped.

    Each remaining column is fitted with ``dist.fit`` for every name in the
    internal ``dist_names`` list, plus a double Weibull fitted through
    ``em_dweibull`` and reported as ``'dweibull_em'``. Every fit is
    summarised by:

    * ``param`` - the fitted parameter tuple (shapes, loc, scale),
    * ``sumsquare_error`` - sum of squared differences between the fitted
      PDF and the normalised histogram of the data at the bin centres,
    * ``aic`` / ``bic`` - computed from the log-likelihood and the number of
      fitted parameters,
    * ``ks`` / ``ks_pvalue`` - one-sample Kolmogorov-Smirnov test of the data
      against the fitted CDF.

    Args:
        csv_path: Path of the CSV file to read.
        bins: ``bins`` argument forwarded to ``numpy.histogram`` when
            building the reference density for ``sumsquare_error``.

    Returns:
        pandas.DataFrame: One row per (column, distribution) with the columns
        ``gene, Distribution, param, sumsquare_error, aic, bic, ks,
        ks_pvalue``; rows of one column are sorted by ``bic``. The frame is
        empty (columns only) when nothing was fitted.
    """
    result_columns = ['gene', 'Distribution', 'param', 'sumsquare_error', 'aic', 'bic', 'ks', 'ks_pvalue']
    dist_names = ['norm', 't', 'pareto', 'genextreme', 'laplace', 'cauchy', 'chi2', 'expon', 'exponpow', 'gamma', 'beta', 'lognorm', 'loggamma', 'uniform']

    mydata = pd.read_csv(csv_path).T
    mydata.columns = mydata.iloc[0]
    mydata = mydata.drop(mydata.index[0]).astype('float64')

    gene_frames = []
    for gene in mydata.columns:
        data = mydata[gene]
        if (data == 0).all():
            continue

        values = data.to_numpy()
        hist_density, edges = np.histogram(values, bins=bins, density=True)
        bin_centers = (edges[:-1] + edges[1:]) / 2.0

        results = []
        for dist_name in dist_names:
            dist = getattr(stats, dist_name)
            param = dist.fit(values)
            results.append(_fit_summary_row(gene, dist_name, dist, param, values, hist_density, bin_centers))

        # Double Weibull fitted by the dedicated optimiser rather than dist.fit.
        dweibull_param = em_dweibull(values)
        results.append(_fit_summary_row(gene, 'dweibull_em', stats.dweibull, dweibull_param, values, hist_density, bin_centers))

        gene_frames.append(pd.DataFrame(results, columns=result_columns).sort_values('bic'))

    # Concatenate once at the end instead of growing a frame inside the loop.
    if not gene_frames:
        return pd.DataFrame(columns=result_columns)
    return pd.concat(gene_frames, axis=0).reset_index(drop=True)

def _select_best_fit(group, min_ks_pvalue):
    """Return the one-row frame holding the preferred fit of a gene group.

    The rows are ranked by ascending ``bic``; the first one whose
    ``ks_pvalue`` exceeds ``min_ks_pvalue`` wins, and if none does the row
    with the lowest ``bic`` is used.
    """
    ranked = group.sort_values('bic')
    passing = ranked[ranked['ks_pvalue'] > min_ks_pvalue]
    chosen = ranked if passing.empty else passing
    return chosen.iloc[[0]]


def main(sPath, out_path_best='all_best_dists_results.xlsx',
         out_path_all='all_dists_results.xlsx', min_ks_pvalue=0.01):
    """Fit distributions for every CSV below ``sPath`` and export the results.

    For each CSV returned by ``dir_path`` the rows produced by
    ``gaussian_mixture_distribution`` and ``set_distribution`` are combined.
    Per file and per ``gene`` the preferred fit is the lowest-``bic`` row
    whose ``ks_pvalue`` exceeds ``min_ks_pvalue``, falling back to the
    lowest-``bic`` row when no row passes.

    Args:
        sPath: Root directory that is searched for CSV files.
        out_path_best: Excel file receiving the preferred fit per gene.
        out_path_all: Excel file receiving every fitted candidate.
        min_ks_pvalue: Threshold a fit's KS p-value must exceed to be
            accepted ahead of the pure ``bic`` ranking.

    Returns:
        None. Two Excel workbooks are written (without the index); when no
        rows were produced they contain only the header row.
    """
    result_columns = ['gene', 'Distribution', 'param', 'sumsquare_error', 'aic', 'bic', 'ks', 'ks_pvalue']

    # Frames are collected in lists and concatenated once: growing a
    # DataFrame with pd.concat inside the loops copies all previous rows on
    # every iteration.
    all_frames = []
    best_frames = []

    for csv_path in dir_path(sPath):
        per_file = [
            gaussian_mixture_distribution(csv_path),
            set_distribution(csv_path),
        ]
        non_empty = [frame for frame in per_file if not frame.empty]
        if not non_empty:
            continue

        all_dists = pd.concat(non_empty, axis=0)
        all_frames.append(all_dists)
        best_frames.extend(
            _select_best_fit(group, min_ks_pvalue)
            for _, group in all_dists.groupby('gene')
        )

    if best_frames:
        all_files_best_dists = pd.concat(best_frames, axis=0)
    else:
        all_files_best_dists = pd.DataFrame(columns=result_columns)
    all_files_best_dists.to_excel(out_path_best, index=False)

    if all_frames:
        all_files_all_dists = pd.concat(all_frames, axis=0)
    else:
        all_files_all_dists = pd.DataFrame(columns=result_columns)
    all_files_all_dists.to_excel(out_path_all, index=False)

  

if __name__ == '__main__':
    # Script entry point: run the pipeline on the hard-coded data directory.
    main(r"D:\CSJ\WORKSPACE2....")
