In [2]:
%run Dataset_cleanup/cleanup_clinical.py

Clinical data set imported


 The following features do not provide any information: 
 ['Composite.Element.REF' 'ethnicity' 'gender' 'pathologicstage'
 'pathologyMstage' 'tumortissuesite'] 



 Variables that are not known at initial diagnosis: 
 ['daystodeath' 'daystolastfollowup' 'daystopsa' 'gleasonscore'
 'histologicaltype' 'numberoflymphnodes' 'pathologyTstage'
 'radiationtherapy' 'residualtumor' 'vitalstatus'] 


Variables that are known at the time of diagnosis:
 ['dateofinitialpathologicdiagnosis' 'psavalue' 'race' 'yearstobirth']

Dimensions of clinical dataframe: (499, 7)


In [3]:
%run Dataset_cleanup/cleanup_gene_counts.py

Gene Counts data set imported

Dimension of DataFrame: (497, 20501)


## Benchmark Analysis of initial clinical presentation for  predictors of metastasis

In [4]:
"""The Clinical data (including metastasis label) have a multitude of missing items."""
print("Total observations in original dataset:",clinical.shape[0])

not_labeled = y[y.isnull()]
y_labels = y[y.notnull()]

print("\nLabeled observations:",y_labels.shape[0],"\nUnlabeled observations removed:",not_labeled.shape[0])


Total observations in original dataset: 499

Labeled observations: 426 
Unlabeled observations removed: 73


In [5]:
clinical = clinical.loc[y_labels.index]  #filter only observations where metastasis state is known
print("Filtering out missing metastasis state labels left",clinical.shape[0],"observations")

clinical.dropna(inplace = True) # Remove all observations where clinical data is missing
print("Removing NaN rows leaves ",clinical.shape[0]," observations in the feature set.") 

y_labels_NAdrop = y_labels.loc[clinical.index] #remove metastasis labels where no clinical data now exists due to NaN removal.

Filtering out missing metastasis state labels left 426 observations
Removing NaN rows leaves  359  observations in the feature set.


In [6]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import recall_score
from sklearn.metrics import make_scorer, accuracy_score

In [7]:
recaller = make_scorer(recall_score, greater_is_better=True, needs_proba=False, needs_threshold=False, pos_label='n1')
accuracy = make_scorer(accuracy_score, greater_is_better=True, needs_proba=False, needs_threshold=False)
clf = DecisionTreeClassifier(criterion='gini',
                            splitter='best',
                            max_depth=None,
                            min_samples_split=30,
                            min_samples_leaf=1,
                            min_weight_fraction_leaf=0.0,
                            max_features=None,
                            random_state=None,
                            max_leaf_nodes=None,
                            class_weight='balanced', #avoids the classifier being rewarded for choosing the most prevelant class each instance (n0)
                            presort=False)
recall_vals = cross_val_score(clf, clinical, y_labels_NAdrop, scoring=recaller, cv=5)
accuracy_vals = cross_val_score(clf, clinical, y_labels_NAdrop, scoring=accuracy, cv=5)
print('Recall_n1:  ',recall_vals.mean())

print('\nModel Accuracy score:  ',accuracy_vals.mean())
print("\nNull error rate for metastasis:  ",(sum(y_labels == 'n1') / len(y_labels)))

Recall_n1:   0.465384615385

Model Accuracy score:   0.533585761142

Null error rate for metastasis:   0.18544600939


## Exploratory Analysis using Gene TPM counts as predictors of Metastasis

### Wrangle TPM counts DF

In [35]:
print(gene_counts.shape)
print(len(y_labels.index))
X = gene_counts.loc[list(y_labels.index)]  #Only observations that also have a known metastasis state are kept.  
print(X.shape)
print(sum(np.isfinite(X.iloc[:,1])))

(497, 20501)
426
(426, 20501)
424


In [36]:
X.dropna(axis=0, inplace=True)
print(X.shape[0])
y_labels_NA_gc_dropped = y_labels[X.index]
print(len(y_labels_NA_gc_dropped.index))

424
424


In [37]:
def duplicate_TCGA_ID_test(X) :
    seen = set()
    uniq = []
    duplicates = []
    for x in X.index:
        if x not in seen:
            uniq.append(x)
            seen.add(x)
        else:
            duplicates.append(x)
    if len(duplicates) > 0 :
        print("Error in data set arrangement")
    else :
        print("Data set contains",X.shape[0],"uniquely barcoded observations.")

duplicate_TCGA_ID_test(X)

Data set contains 424 uniquely barcoded observations.


### Feature Reduction

In [38]:
from sklearn.feature_selection import SelectKBest, f_classif

In [44]:
selector = SelectKBest(f_classif, k = 423)
X_new = selector.fit_transform(X, y_labels_NA_gc_dropped)

  4780  4781  4782  4787  4790  4791  4794  4795  4801  4803  5259  6032
  6776  6780  6783  7445  7551  7632  7633  7634  7635  7636  7732  9275
  9277  9287  9291  9292  9419  9421  9423  9495 10092 11101 11929 12028
 12488 12527 12584 12593 12599 12601 12606 12635 12692 13749 13963 14015
 14129 14130 14131 14132 14726 14727 14729 15109 15111 15112 16536 16538
 16539 16541 16542 16543 16545 16546 16547 16548 16549 16550 16576 16605
 16608 16648 16668 16669 16670 16671 16672 16673 16674 16675 16676 16677
 16678 16679 16680 16681 16682 16683 16684 16685 16686 16687 16688 16689
 16690 16691 16692 16693 16694 16695 16696 16697 16698 16699 16700 16701
 16702 16703 16704 16705 16706 16707 16708 16709 16710 16711 16712 16713
 16714 16715 16716 16717 16718 16720 16721 16722 16723 16724 16725 16726
 16728 16729 16730 16731 16732 16733 16734 16735 16736 16737 16738 16739
 16740 16741 16742 16743 16745 16746 16747 16748 16749 16750 16751 16752
 16753 16754 16755 16756 16757 16759 16760 16761 16

In [48]:
print(X.shape)
print(X_new.shape)
print(X_new)

(424, 20501)
(424, 423)
[[ 112.2389366   784.47000993   27.12668753 ...,    6.39007634
     6.74376762  118.51111244]
 [  29.12621676  351.46732641   37.42225034 ...,    4.79947168
    12.98681113   77.82675851]
 [  46.43687813  485.68998888   34.63025745 ...,    5.42064553
     4.06548692   74.39838341]
 ..., 
 [  37.90975887  362.85053754   15.06284997 ...,    4.52237243
     3.96405492   71.91129512]
 [  47.60272866  306.40959445   36.60941898 ...,    4.71453724
     3.83495792   44.64735007]
 [  48.39539623  318.60920416   30.52207881 ...,    6.44777729
     3.26094535   46.80198272]]
