plot legend fix; filters on misnamed probes fixed

FoxoTech · Jul 24, 2019 · b8a2b0d · b8a2b0d
1 parent 8fcccc3
commit b8a2b0d
Show file tree

Hide file tree

Showing 3 changed files with 82 additions and 23 deletions.
diff --git a/methQC/filters.py b/methQC/filters.py
@@ -91,8 +91,16 @@ def exclude_sex_control_probes(df, array, no_sex=True, no_control=True, verbose=
     exclude_sex = _import_probe_exclusion_list(array, 'sex') if no_sex == True else []
     exclude_control = _import_probe_exclusion_list(array, 'control') if no_control == True else []
     exclude = list(exclude_sex)+list(exclude_control)
+
+    # first check shape of df; probes are in the longer axis.
+    FLIPPED = False
+    if len(df.columns) > len(df.index):
+        df = df.transpose()
+        FLIPPED = True # use this to reverse at end, so returned in original orientation.
+
     # next, actually remove probes from all samples matching these lists.
     filtered = df.drop(exclude, errors='ignore')
+
     # errors = ignore: if a probe is missing from df, or present multiple times, drop what you can.
     if verbose == True:
         print("""Array {4}: Removed {0} sex linked probes and {1} internal control probes \
@@ -114,6 +122,9 @@ def exclude_sex_control_probes(df, array, no_sex=True, no_control=True, verbose=
             print("It appears that your sample had no control probes, or that the control probe names didn't match the manifest ({0}).".format(array))
         else:
             print("This happens when probes are present multiple times in array, or the manifest doesn't match the array ({0}).".format(array))
+    # reverse the FLIP
+    if FLIPPED:
+        filtered = filtered.transpose()
     return filtered
 
 def exclude_probes(array, probe_list):
@@ -132,10 +143,16 @@ def exclude_probes(array, probe_list):
     It then drops probes from array that match probe_list, at least partially.
 
     ADDED: checking whether array.index is string or int type. Regardless, this should work and not alter the original index."""
-    pre_overlap = len(set(array.index) & set(probe_list))
+
+    # 1 - check shape of array, to ensure probes are the index/values
+    ARRAY = array.index
+    if len(array.index) < len(array.columns):
+        ARRAY = array.columns
+
+    pre_overlap = len(set(ARRAY) & set(probe_list))
 
     # probe_list is always a list of strings. COERCING to strings here for matching.
-    array_probes = [str(probe).split('_')[0] for probe in list(array.index)]
+    array_probes = [str(probe).split('_')[0] for probe in list(ARRAY)]
     post_overlap = len(set(array_probes) & set(probe_list))
 
     if pre_overlap != post_overlap:
@@ -144,7 +161,7 @@ def exclude_probes(array, probe_list):
         print("Of {0} probes, {1} matched, yielding {2} probes after filtering.".format(len(array), post_overlap, len(array)-post_overlap))
     if post_overlap >= pre_overlap:
         # match which probes to drop from array.
-        array_probes_lookup = {str(probe).split('_')[0]: probe for probe in list(array.index)}
+        array_probes_lookup = {str(probe).split('_')[0]: probe for probe in list(ARRAY)}
         # get a list of unmodified array_probe_names to drop, based on matching the overlapping list with the modified probe names.
         exclude = [array_probes_lookup[probe] for probe in (set(array_probes) & set(probe_list))]
         return array.drop(exclude, errors='ignore')

diff --git a/methQC/postprocessQC.py b/methQC/postprocessQC.py
@@ -5,6 +5,7 @@
 import numpy as np
 import seaborn as sns
 import matplotlib.pyplot as plt
+import matplotlib.lines as mlines
 from sklearn.manifold import MDS
 from tqdm import tqdm
 import datetime
@@ -32,7 +33,6 @@ def mean_beta_plot(df, verbose=False, save=False, silent=False):
     plt.title('Mean Beta Plot')
     plt.grid()
     plt.xlabel('Mean Beta')
-    plt.ylabel('Count')
     if save:
         plt.savefig('mean_beta.png')
     if not silent:
@@ -59,20 +59,27 @@ def beta_density_plot(df, verbose=False, save=False, silent=False):
         df = df.copy().transpose() # don't overwrite the original
 
     fig, ax = plt.subplots(figsize=(12, 9))
-    for col in df.columns:
-        if col != 'Name':
-            sns.distplot(
-                df[col], hist=False, rug=False,
-                label=col, ax=ax, axlabel='beta')
     print(len(df.columns))
+
+    for col in df.columns:
+        if col != 'Name': # probe name
+            if len(df.columns) <= 30:
+                sns.distplot(
+                    df[col], hist=False, rug=False,
+                    label=col, ax=ax, axlabel='beta')
+            else:
+                sns.distplot(
+                    df[col], hist=False, rug=False, axlabel='beta')
+
     if len(df.columns) <= 30:
         plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
-    else:
-        ax.get_legend().set_visible(False)
+    #else:
+    #    ax.get_legend().set_visible(False)
+    #    print('suppressing legend')
+
     plt.title('Beta Density Plot')
     plt.grid()
     plt.xlabel('Beta values')
-    plt.ylabel('Count')
     if save:
         plt.savefig('beta.png')
     if not silent:
@@ -179,15 +186,40 @@ def beta_mds_plot(df, filter_stdev=1.5, verbose=True, save=False, silent=False):
         print("Starting MDS fit_transform. this may take a while.")
         LOGGER.info("Starting MDS fit_transform. this may take a while.")
 
+    # CHECK for missing probe values NaN
+    missing_probe_counts = df.isna().sum()
+    total_missing_probes = sum([i for i in missing_probe_counts])/len(df) # sum / columns
+    if sum([i for i in missing_probe_counts]) > 0:
+        df = df.dropna()
+        if verbose == True:
+            print("We found {0} probe(s) were missing values and removed them from MDS calculations.".format(total_missing_probes))
+        if silent == False:
+            LOGGER.info("{0} probe(s) were missing values removed from MDS calculations.".format(total_missing_probes))
+
     mds = MDS(n_jobs=-1, random_state=1, verbose=1)
     #n_jobs=-1 means "use all processors"
     mds_transformed = mds.fit_transform(df.values)
     # pass in df.values (a np.array) instead of a dataframe, as it processes much faster.
     # old range is used for plotting, with some margin on outside for chart readability
+    PSF = 2 # plot_scale_fator -- an empirical number to stretch the plot out and show clusters more easily.
+    if df.shape[0] < 40:
+        DOTSIZE = 16
+    elif 40 < df.shape[0] < 60:
+        DOTSIZE = 14
+    elif 40 < df.shape[0] < 60:
+        DOTSIZE = 12
+    elif 60 < df.shape[0] < 80:
+        DOTSIZE = 10
+    elif 80 < df.shape[0] < 100:
+        DOTSIZE = 8
+    elif 100 < df.shape[0] < 300:
+        DOTSIZE = 7
+    else:
+        DOTSIZE = 5
     old_X_range = [min(mds_transformed[:, 0]), max(mds_transformed[:, 0])]
     old_Y_range = [min(mds_transformed[:, 1]), max(mds_transformed[:, 1])]
-    old_X_range = [old_X_range[0] - 0.5*old_X_range[0], old_X_range[1] + 0.5*old_X_range[1]]
-    old_Y_range = [old_Y_range[0] - 0.5*old_Y_range[0], old_Y_range[1] + 0.5*old_Y_range[1]]
+    #old_X_range = [old_X_range[0] - PSF*old_X_range[0], old_X_range[1] + PSF*old_X_range[1]]
+    #old_Y_range = [old_Y_range[0] - PSF*old_Y_range[0], old_Y_range[1] + PSF*old_Y_range[1]]
     x_std, y_std = np.std(mds_transformed,axis=0)
     x_avg, y_avg = np.mean(mds_transformed,axis=0)
 
@@ -215,15 +247,25 @@ def beta_mds_plot(df, filter_stdev=1.5, verbose=True, save=False, silent=False):
                 df_indexes_to_exclude.append(idx)
             #pandas style: mds2 = mds_transformed[mds_transformed[:, 0] == class_number[:, :2]
         md2 = np.array(md2)
-        plt.figure(figsize=(12, 9))
+        fig = plt.figure(figsize=(12, 9))
         plt.title('MDS Plot of betas from methylation data')
         plt.grid()
-        plt.scatter(mds_transformed[:, 0], mds_transformed[:, 1], s=7)
+        plt.scatter(mds_transformed[:, 0], mds_transformed[:, 1], s=DOTSIZE)
         plt.scatter(md2[:, 0], md2[:, 1], s=5, c='red')
-        plt.xlim(old_X_range)
-        plt.ylim(old_Y_range)
+
+        x_range_min = PSF*old_X_range[0] if PSF*old_X_range[0] < minX else PSF*minX
+        x_range_max = PSF*old_X_range[1] if PSF*old_X_range[1] > maxX else PSF*maxX
+        y_range_min = PSF*old_Y_range[0] if PSF*old_Y_range[0] < minY else PSF*minY
+        y_range_max = PSF*old_Y_range[1] if PSF*old_Y_range[1] > maxY else PSF*maxY
+        plt.xlim([x_range_min, x_range_max])
+        plt.ylim([y_range_min, y_range_max])
         plt.xlabel('X')
         plt.ylabel('Y')
+        plt.vlines([minX, maxX], minY, maxY, color='red', linestyle=':')
+        plt.hlines([minY, maxY], minX, maxX, color='red', linestyle=':')
+        #line = mlines.Line2D([minX, minX], [minY, minY], color='red', linestyle='--')
+        #ax = fig.add_subplot(111)
+        #ax.add_line(line)
 
         if silent == True:
             # take the original dataframe (df) and remove samples that are outside the sample thresholds, returning a new dataframe
@@ -308,7 +350,6 @@ def mean_beta_compare(df1, df2, save=False, verbose=False, silent=False):
     plt.title('Mean Beta Plot (Compare pre vs post filtering)')
     plt.grid()
     plt.xlabel('Mean Beta')
-    plt.ylabel('Count')
     #plt.legend([line1, line2], ['pre','post'], bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
     if save:
         plt.savefig('mean_beta_compare.png')

diff --git a/setup.py b/setup.py
@@ -3,10 +3,11 @@
 
 setup(
     name='methQC',
-    version='0.2.1',
-    description='Quality Control (QC), Visualization/plotting, and postprocessing software for Illumina methylation array data. See https://life-epigenetics-methqc.readthedocs-hosted.com/en/latest/ for full documentation and examples.',
+    version='0.2.2',
+    description="""Quality Control (QC), Visualization/plotting, and postprocessing software for Illumina methylation array data.
+See https://life-epigenetics-methqc.readthedocs-hosted.com/en/latest/ for full documentation and examples.""",
     long_description=open('README.md').read(),
-    long_description_content_type='text/markdown',        
+    long_description_content_type='text/markdown',
     classifiers=[
         'Development Status :: 4 - Beta',
         'Environment :: Console',
@@ -34,7 +35,7 @@
         'seaborn',
         'matplotlib',
         'tqdm',
-        'scikit-learn',      
+        'scikit-learn',
     ],
     extras_require={
         'dev': [