In [1]:
import os

In [2]:
import numpy as np
import seaborn as sns
import pandas as pd
from sklearn import tree

In [3]:
from sklearn import preprocessing
from sklearn import decomposition
from sklearn import feature_selection
from sklearn import svm
from sklearn import metrics
from sklearn import neighbors
from sklearn.svm import SVC

In [4]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.externals import joblib
from sklearn.linear_model import RANSACRegressor

In [5]:
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

In [6]:
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import chi2
from sklearn.feature_selection import VarianceThreshold

In [7]:
from astropy.stats import sigma_clipped_stats
import matplotlib.pyplot as plt

%matplotlib inline

In [8]:
import graphviz

In [9]:
pwd

'/global/u2/b/bos0109/run2_diaproc/notebooks'

In [6]:
cd ../run2_diaproc

/global/u2/b/bos0109/run2_diaproc


In [11]:
#from ml_tools import custom_funs_ml as cf

In [10]:
def experiment(clf, x, y, nfolds=10, printing=False, probs=False,
               train_final=False):
    # import ipdb; ipdb.set_trace()
    skf = StratifiedKFold(n_splits=nfolds)
    probabilities = None # np.array([])
    predictions = np.array([])
    y_testing = np.array([])

    results = {}
    for train, test in skf.split(x, y):

        x_train = x[train]
        y_train = y[train]
        clf.fit(x_train, y_train)

        x_test = x[test]
        y_test = y[test]
        pr = clf.predict(x_test)
        if probs:
            probas = clf.predict_proba(x_test)  #[:, 0]

            probabilities = (
                probas if probabilities is None else
                np.vstack([probabilities, probas]))
        predictions = np.hstack([predictions, pr])
        y_testing = np.hstack([y_testing, y_test])

    results['y_test'] = y_testing
    results['predictions'] = predictions
    if probs:
        results['probabilities'] = probabilities

    if printing:
        print(metrics.classification_report(y_testing, predictions))

    if probs:
        fpr, tpr, thresholds = metrics.roc_curve(
            y_testing, 1.-probabilities[:, 0], drop_intermediate=True)
        prec_rec_curve = metrics.precision_recall_curve(
            y_testing, 1.- probabilities[:, 0])
        roc_auc = metrics.auc(fpr, tpr)

        results['fpr'] = fpr
        results['tpr'] = tpr
        results['thresh'] = thresholds
        results['roc_auc'] = roc_auc
        results['prec_rec_curve'] = prec_rec_curve

    if train_final:
        clf.fit(x, y)

    results['model'] = clf
    results['confusion_matrix'] = metrics.confusion_matrix(y_testing, predictions)
    results['bacc'] = metrics.balanced_accuracy_score(y_testing, predictions)
    results['acc'] = metrics.accuracy_score(y_testing, predictions)
    results['aprec'] = metrics.average_precision_score(y_testing, predictions)
    results['prec'] = metrics.precision_score(y_testing, predictions)
    results['reca'] = metrics.recall_score(y_testing, predictions)
    results['f1'] = metrics.f1_score(y_testing, predictions)

    return results

In [153]:
reals = pd.read_csv('../results/reals_table.csv')
bogus = pd.read_csv('../results/bogus_table.csv').sample(frac=0.05)

In [154]:
reals['REAL'] = True
bogus['REAL'] = False

In [155]:
len(reals), len(bogus)

(1432, 31749)

In [87]:
flags = [acol for acol in reals.columns if 'flag' in acol]

In [156]:
cols = list(reals.columns)

In [158]:
for acol in ['id', 'Unnamed: 0', 'REAL', 'cxmatch', 'sn_row', 'match_ang_dist', 
             'sn_id', 'raft', 'sensor', 'filter', 'coord_ra', 'coord_dec']:
    cols.remove(acol)

In [159]:
for acol in ['ip_diffim_NaiveDipoleCentroid_pos_x', 'slot_Centroid_pos_x', 'ip_diffim_NaiveDipoleCentroid_pos_y', 'slot_Centroid_pos_y',
             'ip_diffim_NaiveDipoleCentroid_pos_xErr', 'slot_Centroid_pos_xErr', 'ip_diffim_NaiveDipoleCentroid_pos_yErr', 'slot_Centroid_pos_yErr']:
    cols.remove(acol)

In [160]:
X = pd.concat([reals, bogus])[cols]
Y = pd.concat([reals, bogus])['REAL']

In [161]:
len(flags)

103

In [174]:
x = X[flags].values.astype(bool)

In [175]:
clf = tree.DecisionTreeClassifier(criterion='gini',
                                  min_impurity_decrease=10e-9,
                                  class_weight='balanced',
                                  max_depth=None,
                                  presort=False)

In [176]:
rslts_c45 = experiment(clf, x, Y.values.astype(int), printing=True, nfolds=10, train_final=False)
c45 = rslts_c45['model']

              precision    recall  f1-score   support

         0.0       0.99      0.72      0.83     31749
         1.0       0.11      0.76      0.19      1432

   micro avg       0.72      0.72      0.72     33181
   macro avg       0.55      0.74      0.51     33181
weighted avg       0.95      0.72      0.80     33181



In [177]:
dot_data = tree.export_graphviz(c45, out_file='realbogus_c45.out',
                         feature_names=flags,
                         class_names=['REAL', 'BOGUS'],
                         filled=True, rounded=True,
                         special_characters=True)

In [178]:
clf = RandomForestClassifier(n_estimators=10, class_weight='balanced', criterion='gini', max_depth=5, min_impurity_decrease=10e-3)

In [179]:
rslts_rforest = experiment(clf, x, Y.values.astype(int), printing=True, nfolds=10, train_final=True)
rfo = rslts_rforest['model']

              precision    recall  f1-score   support

         0.0       0.98      0.69      0.81     31749
         1.0       0.10      0.75      0.18      1432

   micro avg       0.70      0.70      0.70     33181
   macro avg       0.54      0.72      0.50     33181
weighted avg       0.95      0.70      0.79     33181



In [141]:
graph = graphviz.Source(dot_data)
graph.render('Real-Bogus')

ExecutableNotFound: failed to execute ['dot', '-Tpdf', '-O', 'Real-Bogus'], make sure the Graphviz executables are on your systems' PATH

/usr/bin/sh: dot: command not found
