# Somatic vs non-somatic classification

**Goal:** a first round of work on the classifier: compare feature selections for predicting the `is_artefact` class.
  
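This notebook is divided into the following parts:
* **1. Choose the model and get the data**
* **2. Evaluate successive feature selections**
* **3. Study feature importance and summarize the results**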

In [1]:
%run ../setup_environment.ipy
%run ../Selene_Job.ipy

Setup environment... done!


<span style="color:green">✅ Working on **impact-annotator_env** conda environment.</span>

In [2]:
# one row per experiment (indexed by method name), one column per mean test metric
summary = pd.DataFrame(
    columns=['test_accuracy', 'test_roc_auc', 'test_f1', 'test_average_precision'],
).rename_axis('method_name')

## Choose the model

In [3]:
from sklearn.model_selection import StratifiedKFold
# 5-fold stratified cross-validation, shared by every experiment below
cv_strategy = StratifiedKFold(n_splits=5)

from sklearn.ensemble import GradientBoostingClassifier
# `loss` is left at its default: the default is the logistic loss that used to be
# spelled 'deviance'; that spelling was deprecated in scikit-learn 1.1 and removed in 1.3,
# so omitting it keeps the same model while running on old and new versions alike.
# random_state is fixed so that repeated runs give the same model.
model = GradientBoostingClassifier(random_state=42, max_depth=6, n_estimators=500)

## Get the data

In [4]:
# load the raw annotated mutations table (tab-separated file)
impact = pd.read_csv(
    '../../../data/annotated_final_IMPACT_mutations_180508.txt',
    sep='\t',
    low_memory=False,
)

In [5]:
# target class: a call is an artefact when its confidence class is "UNLIKELY"
impact['is_artefact'] = impact['confidence_class'].eq("UNLIKELY")
get_table(impact['is_artefact'])

Unnamed: 0,count_,freq_
False,187012,96.3%
True,7199,3.7%


In [6]:
def get_X_and_y(impact, features, categorical_features):
    """Build a class-balanced, shuffled feature matrix and target from `impact`.

    Parameters
    ----------
    impact : pandas.DataFrame
        Table containing every column named in `features` plus a boolean
        `is_artefact` column. It is not modified.
    features : list of str
        Columns to use as predictors.
    categorical_features : list of str
        Columns among `features` to one-hot encode with `pd.get_dummies`.

    Returns
    -------
    X : pandas.DataFrame
        Predictors cast to float, shape (n_samples, n_features after encoding).
    y : pandas.Series
        The `is_artefact` values, aligned with `X`.

    Notes
    -----
    The dataset is balanced by keeping all `is_artefact` rows and the *first*
    N other rows in their original order (N = number of artefact rows); this is
    not a random sample of the other class. Rows are then shuffled with a fixed
    seed (42), so the output is deterministic.
    """
    # selecting columns already returns a new frame, so no explicit copy of `impact` is needed
    data = impact[features + ['is_artefact']]

    # one-hot encode the categorical features
    data = pd.get_dummies(data, columns=categorical_features, sparse=True)

    # reduced dataset: every positive row plus as many negative rows (taken from the top)
    positives = data[data.is_artefact]
    negatives = data[~data.is_artefact].iloc[:positives.shape[0]]
    data = pd.concat([positives, negatives], ignore_index=True)

    # shuffle rows with a fixed seed
    rng = np.random.RandomState(42)
    permutation = rng.permutation(len(data))
    data = data.iloc[permutation].reset_index(drop=True)

    # features matrix X (n_samples x n_features) and target array y (n_samples)
    X = data.drop('is_artefact', axis=1).astype(float)
    y = data['is_artefact']

    # NOTE(review): the return value is discarded here, so this only shows the class
    # counts if get_table displays them itself — confirm in its definition
    get_table(y)

    return X, y

## No feature processing

In [7]:
# Predictors used for this experiment. Active entries are passed to get_X_and_y;
# commented-out entries are names deliberately left out of this selection
# (presumably the other columns of `impact` — verify against the table).
features = [
#'Hugo_Symbol',
#'Chromosome',
#'Start_Position',
#'End_Position',
#'Consequence',
#'Variant_Type',
#'Reference_Allele',
#'Tumor_Seq_Allele2',
#'Tumor_Sample_Barcode',
#'cDNA_change',
#'HGVSp_Short',
't_depth',
't_vaf',
't_alt_count',
'n_depth',
'n_vaf',
'n_alt_count',
#'t_ref_plus_count',
#'t_ref_neg_count',
#'t_alt_plus_count',
#'t_alt_neg_count',
#'confidence_class',
'sample_coverage',
#'mut_key',
#'VAG_VT',
#'VAG_GENE',
#'VAG_cDNA_CHANGE',
#'VAG_PROTEIN_CHANGE',
#'VAG_EFFECT',
'VEP_Consequence',
#'VEP_SYMBOL',
#'VEP_HGVSc',
#'VEP_HGVSp',
#'VEP_Amino_acids',
'VEP_VARIANT_CLASS',
#'VEP_EXON',
#'VEP_INTRON',
'VEP_IMPACT',
'VEP_CLIN_SIG',
'VEP_COSMIC_CNT',
'VEP_gnomAD_AF',
#'sample_mut_key',
#'patient_key',
'frequency_in_normals',
#'VEP_SIFT_class',
#'VEP_SIFT_score',
#'VEP_PolyPhen_class',
#'VEP_PolyPhen_score',
'VEP_in_dbSNP',
'VEP_gnomAD_total_AF_AFR',
'VEP_gnomAD_total_AF_AMR',
'VEP_gnomAD_total_AF_ASJ',
'VEP_gnomAD_total_AF_EAS',
'VEP_gnomAD_total_AF_FIN',
'VEP_gnomAD_total_AF_NFE',
'VEP_gnomAD_total_AF_OTH',
'VEP_gnomAD_total_AF_max',
'VEP_gnomAD_total_AF',
'Kaviar_AF',
#'is_a_hotspot',
#'is_a_3d_hotspot',
#'oncogenic',
'gene_type',
#'is_artefact'
]

# features that get_X_and_y one-hot encodes (each one also appears in `features`)
categorical_features = ['VEP_Consequence', 'VEP_VARIANT_CLASS', 'VEP_IMPACT', 'VEP_CLIN_SIG', 'VEP_in_dbSNP', 'gene_type']

In [8]:
# display the names that are actually selected (commented-out entries excluded)
features

['t_depth',
 't_vaf',
 't_alt_count',
 'n_depth',
 'n_vaf',
 'n_alt_count',
 'sample_coverage',
 'VEP_Consequence',
 'VEP_VARIANT_CLASS',
 'VEP_IMPACT',
 'VEP_CLIN_SIG',
 'VEP_COSMIC_CNT',
 'VEP_gnomAD_AF',
 'frequency_in_normals',
 'VEP_in_dbSNP',
 'VEP_gnomAD_total_AF_AFR',
 'VEP_gnomAD_total_AF_AMR',
 'VEP_gnomAD_total_AF_ASJ',
 'VEP_gnomAD_total_AF_EAS',
 'VEP_gnomAD_total_AF_FIN',
 'VEP_gnomAD_total_AF_NFE',
 'VEP_gnomAD_total_AF_OTH',
 'VEP_gnomAD_total_AF_max',
 'VEP_gnomAD_total_AF',
 'Kaviar_AF',
 'gene_type']

In [9]:
# build the balanced dataset for the current `features` / `categorical_features` selection
X, y = get_X_and_y(impact, features, categorical_features)
# evaluate `model` under `cv_strategy`
# NOTE(review): n_jobs=5 presumably runs the 5 folds in parallel — confirm in run_model
metrics = run_model(model, X, y, cv_strategy, n_jobs=5)
print_mean_metrics(metrics)

Run model... done! (37.57s)
▴ Mean accuracy    : 0.886 ± 0.004
▴ Mean ROC AUC     : 0.953 ± 0.005
▴ Mean F1-score    : 0.883 ± 0.004
▴ Average precision: 0.960 ± 0.004


In [10]:
# store the mean of each test metric for this experiment, in the column order of `summary`
summary.loc['no_processing'] = [getattr(metrics, metric_name).mean() for metric_name in summary.columns]

## Without GnomAD populations

In [11]:
# same selection as the previous experiment, minus the per-population gnomAD frequencies
features = [
    't_depth',
    't_vaf',
    't_alt_count',
    'n_depth',
    'n_vaf',
    'n_alt_count',
    'sample_coverage',
    'VEP_Consequence',
    'VEP_VARIANT_CLASS',
    'VEP_IMPACT',
    'VEP_CLIN_SIG',
    'VEP_COSMIC_CNT',
    'VEP_gnomAD_AF',
    'frequency_in_normals',
    'VEP_in_dbSNP',
    'VEP_gnomAD_total_AF_max',
    'VEP_gnomAD_total_AF',
    'Kaviar_AF',
    'gene_type',
]

In [12]:
# rebuild the balanced dataset with the reduced `features` list
# (`categorical_features` is unchanged from the previous experiment)
X, y = get_X_and_y(impact, features, categorical_features)
# evaluate `model` under `cv_strategy`
# NOTE(review): n_jobs=5 presumably runs the 5 folds in parallel — confirm in run_model
metrics = run_model(model, X, y, cv_strategy, n_jobs=5)
print_mean_metrics(metrics)

Run model... done! (33.47s)
▴ Mean accuracy    : 0.887 ± 0.008
▴ Mean ROC AUC     : 0.953 ± 0.005
▴ Mean F1-score    : 0.884 ± 0.008
▴ Average precision: 0.960 ± 0.004


In [13]:
# store the mean of each test metric for this experiment, in the column order of `summary`
summary.loc['without_gnomAD_populations'] = [getattr(metrics, metric_name).mean() for metric_name in summary.columns]

## Without `VEP_Consequence`, `VEP_IMPACT`, `VEP_CLIN_SIG`, `VEP_in_dbSNP`, `n_vaf`, `n_alt_count`

In [14]:
# selection without VEP_Consequence, VEP_IMPACT, VEP_CLIN_SIG, VEP_in_dbSNP, n_vaf and n_alt_count
features = [
    't_depth',
    't_vaf',
    't_alt_count',
    'n_depth',
    'sample_coverage',
    'VEP_VARIANT_CLASS',
    'VEP_COSMIC_CNT',
    'VEP_gnomAD_AF',
    'frequency_in_normals',
    'VEP_gnomAD_total_AF_max',
    'VEP_gnomAD_total_AF',
    'Kaviar_AF',
    'gene_type',
]
# only the categorical columns still present in `features`
categorical_features = ['gene_type', 'VEP_VARIANT_CLASS']

In [15]:
# rebuild the balanced dataset for the current `features` / `categorical_features` selection
X, y = get_X_and_y(impact, features, categorical_features)
# evaluate `model` under `cv_strategy`
# NOTE(review): n_jobs=5 presumably runs the 5 folds in parallel — confirm in run_model
metrics = run_model(model, X, y, cv_strategy, n_jobs=5)
print_mean_metrics(metrics)

Run model... done! (21.27s)
▴ Mean accuracy    : 0.884 ± 0.008
▴ Mean ROC AUC     : 0.952 ± 0.004
▴ Mean F1-score    : 0.881 ± 0.010
▴ Average precision: 0.959 ± 0.004


In [16]:
# store the mean of each test metric for this experiment, in the column order of `summary`
summary.loc['without_VEP_additional_info'] = [getattr(metrics, metric_name).mean() for metric_name in summary.columns]

## Without `VEP_gnomAD_AF`, `Kaviar_AF`

In [17]:
# previous selection without VEP_gnomAD_AF and Kaviar_AF
features = [
    't_depth',
    't_vaf',
    't_alt_count',
    'n_depth',
    'sample_coverage',
    'VEP_VARIANT_CLASS',
    'VEP_COSMIC_CNT',
    'frequency_in_normals',
    'VEP_gnomAD_total_AF_max',
    'VEP_gnomAD_total_AF',
    'gene_type',
]
categorical_features = ['VEP_VARIANT_CLASS', 'gene_type']

In [18]:
# rebuild the balanced dataset for the current `features` / `categorical_features` selection
X, y = get_X_and_y(impact, features, categorical_features)
# evaluate `model` under `cv_strategy`
# NOTE(review): n_jobs=5 presumably runs the 5 folds in parallel — confirm in run_model
metrics = run_model(model, X, y, cv_strategy, n_jobs=5)
print_mean_metrics(metrics)

Run model... done! (22.37s)
▴ Mean accuracy    : 0.880 ± 0.006
▴ Mean ROC AUC     : 0.951 ± 0.005
▴ Mean F1-score    : 0.876 ± 0.007
▴ Average precision: 0.958 ± 0.003


In [20]:
# store the mean of each test metric for this experiment, in the column order of `summary`
summary.loc['without_VEP_additional_info_2'] = [getattr(metrics, metric_name).mean() for metric_name in summary.columns]

## Simplify `VEP_VARIANT_CLASS`

In [21]:
# collapse the variant class into a single boolean: SNV or anything else
impact['is_SNV'] = impact['VEP_VARIANT_CLASS'].eq('SNV')

In [22]:
# previous selection with VEP_VARIANT_CLASS replaced by the boolean is_SNV
features = [
    't_depth',
    't_vaf',
    't_alt_count',
    'n_depth',
    'sample_coverage',
    'is_SNV',
    'VEP_COSMIC_CNT',
    'frequency_in_normals',
    'VEP_gnomAD_total_AF_max',
    'VEP_gnomAD_total_AF',
    'gene_type',
]
categorical_features = ['gene_type']

In [23]:
# rebuild the balanced dataset for the current `features` / `categorical_features` selection
X, y = get_X_and_y(impact, features, categorical_features)
# evaluate `model` under `cv_strategy`
# NOTE(review): n_jobs=5 presumably runs the 5 folds in parallel — confirm in run_model
metrics = run_model(model, X, y, cv_strategy, n_jobs=5)
print_mean_metrics(metrics)

Run model... done! (22.17s)
▴ Mean accuracy    : 0.883 ± 0.008
▴ Mean ROC AUC     : 0.950 ± 0.006
▴ Mean F1-score    : 0.880 ± 0.010
▴ Average precision: 0.957 ± 0.004


In [24]:
# store the mean of each test metric for this experiment, in the column order of `summary`
summary.loc['simplified_VEP_VARIANT_CLASS'] = [getattr(metrics, metric_name).mean() for metric_name in summary.columns]

## Tried

In [None]:
populations = [
    'VEP_gnomAD_total_AF_AFR',
    'VEP_gnomAD_total_AF_AMR',
    'VEP_gnomAD_total_AF_ASJ',
    'VEP_gnomAD_total_AF_EAS',
    'VEP_gnomAD_total_AF_FIN',
    'VEP_gnomAD_total_AF_NFE',
    'VEP_gnomAD_total_AF_OTH',
]
# for each row, count the population columns whose value differs from 0.
# The comparison and the row sum are done on whole columns instead of calling a
# Python lambda once per row with `apply(axis=1)`; the counts are the same
# (missing values compare as "!= 0" in both forms, so they are still counted).
impact['population_number'] = (impact[populations] != 0).sum(axis=1)

In [None]:
# distribution of the per-row count of populations with a non-zero frequency
get_table(impact['population_number'])

In [None]:
# flag rows with a non-zero frequency in at least 3 of the populations
impact['is_frequent_in_population'] = impact['population_number'].ge(3)

Every unselected feature was tried and turned out to be useless.

In [None]:
# ratio of the smaller to the larger of the two tumor alt-allele strand counts
alt_strand_counts = impact[['t_alt_neg_count', 't_alt_plus_count']]
impact['strand_ratio'] = alt_strand_counts.min(axis=1) / alt_strand_counts.max(axis=1)

## Add `oncogenic` and `Chromosome`

In [None]:
# simplified selection plus the Chromosome and oncogenic columns
features = [
    't_depth',
    't_vaf',
    't_alt_count',
    'n_depth',
    'sample_coverage',
    'is_SNV',
    'VEP_COSMIC_CNT',
    'frequency_in_normals',
    'VEP_gnomAD_total_AF_max',
    'VEP_gnomAD_total_AF',
    'Chromosome',
    'oncogenic',
    'gene_type',
]

# the two added columns are one-hot encoded along with gene_type
categorical_features = ['Chromosome', 'oncogenic', 'gene_type']

In [None]:
# rebuild the balanced dataset for the current `features` / `categorical_features` selection
X, y = get_X_and_y(impact, features, categorical_features)
# evaluate `model` under `cv_strategy`
# NOTE(review): n_jobs=5 presumably runs the 5 folds in parallel — confirm in run_model
metrics = run_model(model, X, y, cv_strategy, n_jobs=5)
print_mean_metrics(metrics)

In [None]:
# store the mean of each test metric for this experiment, in the column order of `summary`
summary.loc['add_oncogenic_and_Chromosome'] = [getattr(metrics, metric_name).mean() for metric_name in summary.columns]

## Study feature importance

In [None]:
# (GradientBoostingClassifier is already imported in the "Choose the model" cell)
from sklearn.ensemble import GradientBoostingClassifier

# fit on the X, y built by the most recent get_X_and_y call, so that
# model.feature_importances_ can be read in the next cell
model.fit(X, y);

In [None]:
# one row per column of X holding the importance given by the fitted model,
# sorted in ascending order so that the largest importance comes last
importance_by_feature = pd.DataFrame(
    {'value': model.feature_importances_.tolist()},
    index=X.columns.tolist(),
)
feature_importance = importance_by_feature.sort_values(by='value', axis=0)

plt.figure(figsize=(10, 8))
feature_importance['value'].plot.barh(width=0.85)

## Summary

In [None]:
# highlight the best value of each metric column and show 3 decimals.
# Styler.set_precision was deprecated in pandas 1.3 and removed in 2.0;
# .format with an explicit format string works on old and new versions.
summary.style.highlight_max(axis=0, color='yellow').format('{:.3f}')

In [None]:
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 12))

# metrics as rows and methods as columns, both in reversed order
s = summary.iloc[::-1].T.iloc[::-1]
colors = ['darkblue', 'purple', 'grey', 'maroon', 'crimson', 'salmon', 'darkgoldenrod', 'seagreen', 'mediumseagreen']
s.plot.barh(ax=ax, width=0.85, color=colors)

# write each bar's value just to the right of the bar, in the bar's own color
for bar in ax.patches:
    bar_length = bar.get_width()
    bar_center = bar.get_y() + bar.get_height() / 2
    ax.text(bar_length + 0.01, bar_center, '%.3f' % bar_length,
            ha='left', va='center', color=bar.get_facecolor(), fontsize=13)

# reverse the legend entries and place the legend outside the axes, on the right
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles[::-1], labels[::-1], loc='center left', bbox_to_anchor=(1, 0.5), prop={'size':18})
ax.set_xlim(right=1.05);