From a50ed6f1cbdc54274e0ade69ad567dd249e40ac5 Mon Sep 17 00:00:00 2001 From: marcmaxson Date: Mon, 19 Aug 2019 15:23:59 -0600 Subject: [PATCH] compare_mds returns list of transformed dataframes now --- methQC/__init__.py | 5 ++- methQC/postprocessQC.py | 70 +++++++++++++++++++++++++++-------------- 2 files changed, 51 insertions(+), 24 deletions(-) diff --git a/methQC/__init__.py b/methQC/__init__.py index d712f7d..78d4913 100644 --- a/methQC/__init__.py +++ b/methQC/__init__.py @@ -2,7 +2,9 @@ from logging import NullHandler, getLogger # App from .cli import detect_array -from .postprocessQC import mean_beta_plot, beta_density_plot, beta_mds_plot, cumulative_sum_beta_distribution, mean_beta_compare +from .postprocessQC import ( + mean_beta_plot, beta_density_plot, beta_mds_plot, + cumulative_sum_beta_distribution, mean_beta_compare, combine_mds) from .filters import exclude_sex_control_probes, list_problem_probes, exclude_probes getLogger(__name__).addHandler(NullHandler()) @@ -18,4 +20,5 @@ 'list_problem_probes', 'mean_beta_plot', 'mean_beta_compare', + 'combine_mds' ] diff --git a/methQC/postprocessQC.py b/methQC/postprocessQC.py index fcfa4e5..03b6671 100644 --- a/methQC/postprocessQC.py +++ b/methQC/postprocessQC.py @@ -349,7 +349,7 @@ def beta_mds_plot(df, filter_stdev=1.5, verbose=True, save=False, silent=False, ax.hlines([minY, maxY], minX, maxX, color=COLORSET.get(color_num,'red'), linestyle=':') if multi_params.get('return_plot_obj') == True: - return fig, ax + return fig, ax, df_indexes_to_retain if silent == True: # take the original dataframe (df) and remove samples that are outside the sample thresholds, returning a new dataframe @@ -431,9 +431,9 @@ def mean_beta_compare(df1, df2, save=False, verbose=False, silent=False): data2['mean'] = data2.mean(numeric_only=True, axis=1) fig, ax = plt.subplots(figsize=(12, 9)) - line1 = sns.distplot(data1['mean'], hist=False, rug=False, ax=ax, axlabel='beta') - line2 = sns.distplot(data2['mean'], hist=False, rug=False) - plt.title('Mean Beta Plot (Compare pre vs post filtering)') + line1 = sns.distplot(data1['mean'], hist=False, rug=False, ax=ax, axlabel='beta', color='xkcd:blue') + line2 = sns.distplot(data2['mean'], hist=False, rug=False, color='xkcd:green') + plt.title('Mean Beta Plot (Compare pre (blue) vs post (green) filtering)') plt.grid() plt.xlabel('Mean Beta') #plt.legend([line1, line2], ['pre','post'], bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.) @@ -484,10 +484,11 @@ def combine_mds(*args, **kwargs): returns ------ - nothing returned (currently) + - TODO: one dataframe of the retained samples, cutoff box is avg of datasets + ~~nothing returned (currently)~~ + - TODO: each dataset's results as a transformed file - default: list of samples retained or excluded - option: a list of pyplot subplot objects - - TODO: one dataframe of the retained samples. """ # check if already dataframes, or are they strings? @@ -503,14 +504,16 @@ def combine_mds(*args, **kwargs): silent = kwargs.get('silent', False) verbose = kwargs.get('verbose', True) filter_stdev = kwargs.get('filter_stdev', 1.5) + output_format = kwargs.get('output') PRINT = print if verbose else _noprint - # data to combine dfs = pd.DataFrame() # OPTIONAL OUTPUT: TRACK each source df's samples for plot color coding prior to merging # i.e. was this sample included or excluded at end? + # maybe use a simple class here to track the data as it is being manipulated? sample_source = {} + frame_transposed = {} # subplots: possibly useful - list of subplot objects within the figure, and their metadata. subplots = [] # track x/y ranges to adjust plot area later @@ -520,17 +523,19 @@ def combine_mds(*args, **kwargs): fig = None ax = None for idx, df in enumerate(list_of_dfs): + transposed = False if df.shape[1] > df.shape[0]: # put probes in rows df = df.transpose() + transposed = True for sample in df.columns: + if sample in sample_source: + print("WARNING: sample names are not unique across data sets") sample_source[sample] = idx + frame_transposed[idx] = transposed - # PLOT separate MDS - # PRINT(idx, fig, ax) - #-- only draw last iteration: draw_box = True if idx == len(list_of_dfs)-1 else False - #-- draw after complete, using dimensions provided + # first, PLOT separate MDS, each with its own box. try: - fig,ax = beta_mds_plot(df, filter_stdev=filter_stdev, save=save, verbose=verbose, silent=silent, + fig,ax,df_indexes_to_retain = beta_mds_plot(df, filter_stdev=filter_stdev, save=save, verbose=verbose, silent=silent, multi_params={'return_plot_obj':True, 'fig':fig, 'ax':ax, @@ -568,19 +573,33 @@ def combine_mds(*args, **kwargs): PRINT('Average MDS window coordinate range: (x: {0}, y:{1}'.format(xy_lim[0], xy_lim[1])) fig = None ax = None + transformed_dfs = [] for idx, df in enumerate(list_of_dfs): if df.shape[1] > df.shape[0]: # put probes in rows df = df.transpose() - fig,ax = beta_mds_plot(df, filter_stdev=filter_stdev, save=save, verbose=verbose, silent=silent, - multi_params={ - 'return_plot_obj':True, - 'fig':fig, - 'ax':ax, - 'color_num':idx, - 'draw_box':True, - 'xy_lim':xy_lim, - 'PSF':1.2, - }) + fig,ax,df_indexes_to_retain = beta_mds_plot(df, + filter_stdev=filter_stdev, save=save, verbose=verbose, silent=silent, + multi_params={ + 'return_plot_obj':True, + 'fig':fig, + 'ax':ax, + 'color_num':idx, + 'draw_box':True, + 'xy_lim':xy_lim, + 'PSF':1.2, + }) + try: + if frame_transposed[idx]: + # this data (df) was transposed from file-orientation, so samples are in rows now; probes in columns + df_transformed = df.iloc[df_indexes_to_retain, :] # samples in rows + print(df.shape, df_transformed.shape) + else: + # iloc: first list is row index; 2nd is column index + df_transformed = df.iloc[:, df_indexes_to_retain] # samples in columns + print(df.shape, df_transformed.shape) + transformed_dfs.append(df_transformed) + except: + import pdb;pdb.set_trace() fig.axes[0].set_xlim([x_range_min, x_range_max]) fig.axes[0].set_ylim([y_range_min, y_range_max]) @@ -590,14 +609,16 @@ def combine_mds(*args, **kwargs): all_coords = [] retained = [] excluded = [] + retained_sample_dfs = [] for i in range(0, 4*len(list_of_dfs), 1): DD = fig.axes[0].collections[i] DD.set_offset_position('data') #print(DD.get_offsets()) all_coords.extend(DD.get_offsets().tolist()) #print(i, len(all_coords)) - if i % 4 == 0: # 0, 4, 8 -- this is the first data set applied to plot. + if i % 4 == 0: # 0, 4, 8 -- this is the first data set applied to plot. (x,y plot coords) retained.extend( DD.get_offsets().tolist() ) + # go from plot sample x,y to idx of samples in original dfs. if i % 4 == 1: # 1, 5, 9, etc -- this is the second data set applied to plot. excluded.extend( DD.get_offsets().tolist() ) if verbose: @@ -608,6 +629,9 @@ def combine_mds(*args, **kwargs): plt.show() plt.close('all') + # TODO: output_format + return transformed_dfs + def _load_data(filepaths): dfs = []