In [1]:
import pandas as pd
import numpy as np
import math as m
import random as rand
import matplotlib.pyplot as plt
import seaborn as sns
import re
from datetime import datetime
from sklearn import linear_model as lm, metrics, ensemble as ens
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedKFold
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.svm import SVC
from sklearn.feature_selection import RFE, RFECV, SequentialFeatureSelector
import random
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA, KernelPCA
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings("ignore")


# Clinical Data updated_ Alex

In [2]:
#DEFINING A FUNCTION TO UPDATE COLUMN NAMES LATER
def lower_no_space(word): 
    
    word = re.sub(' ', '_', word) 
    
    word = re.sub(r'\'', '', word) 
    
    word = re.sub(r'\(', '', word)
    
    word = re.sub(r'\)', '', word)
    
    word = re.sub('\?', '', word)
    
    word = re.sub('/', '_', word)
    
    word = word.lower()
    
    return word

In [3]:
#READ IN Updated CLINICAL DATA FOR LATER USE (CONVERTED TO .csv IN GOOGLE SHEETS)
df_clin_updated = pd.read_csv("Homebase_new_updated.csv", header = 1)

In [4]:
#RENAMING COLUMNS
df_clin_updated = df_clin_updated.rename(mapper = lower_no_space, axis = 1) 
df_clin_updated.rename(columns={'subject_sample_id':'sample_id'}, inplace=True)

In [5]:
#CHECK CLINICAL DATA BASICS
df_clin_updated.head()

Unnamed: 0,patient_id,data_access_group,survey_identifier,survey_timestamp,sample_id,date_of_birth,current_age_at_time_of_data_entry_-_can_answer_instead_of_dob,country_of_origin,country_of_residence,gender,...,age_at_date_of_relapse_disease_progression_4_can_answer_instead_of_date_of_relapse_disease_progression_4,date_of_relapse_disease_progression.4,age_at_date_of_relapse_disease_progression_5_can_answer_instead_of_date_of_relapse_disease_progression_5,disease_status_at_time_of_sampling,disease_status_at_time_of_data_entry,date_of_death,age_at_death_can_answer_instead_of_date_of_death,death_related_to_disease,cause_of_death,complete
0,795-1,MD Anderson,,,spz-20,1933-03-08,,,United States of America,Female,...,,,,Progressive disease,Deceased,2013-11-06,,Yes,,Complete
1,795-2,MD Anderson,,,spz-3,1930-09-04,,,United States of America,Female,...,,"12-16-2013, 05-19-2014, 09-16-2014, 09-09-2015",,Progressive disease,Deceased,2015-10-26,,Yes,,Complete
2,795-3,MD Anderson,,,spz-4,1926-12-27,,,United States of America,Male,...,,,,Stable disease,Deceased,2014-09-14,,Yes,,Complete
3,795-4,MD Anderson,,,spz-8,1945-08-17,,,United States of America,Male,...,,,,Progressive disease,Deceased,2018-01-05,,No,,Complete
4,795-5,MD Anderson,,,spz-10,1925-04-28,,,United States of America,Male,...,,,,Progressive disease,Deceased,2017-09-25,,No,,Complete


In [6]:
#Compute the age at initial diagnosis from date of birth and date_of_initial_diagnosis
df_clin_updated['date_of_birth'] = pd.to_datetime(df_clin_updated['date_of_birth'])
df_clin_updated['date_of_initial_diagnosis'] = pd.to_datetime(df_clin_updated['date_of_initial_diagnosis'])
df_clin_updated["age_at_initial_diagnosis"] = (pd.DatetimeIndex(df_clin_updated['date_of_initial_diagnosis']).year 
                        - pd.DatetimeIndex(df_clin_updated['date_of_birth']).year)

In [7]:
#Due to the abnormal in date of birth from the Stanford data, 
#Remove the age at initial diagonosis for data from Stanford & the one that has negative age 
df_clin_updated["age_at_initial_diagnosis"] = np.where(df_clin_updated['data_access_group'] == 'Stanford', np.nan, df_clin_updated["age_at_initial_diagnosis"])
df_clin_updated["age_at_initial_diagnosis"] = np.where(df_clin_updated["age_at_initial_diagnosis"] < 0, np.nan, df_clin_updated["age_at_initial_diagnosis"])


In [8]:
#Change the data type: date_of_birth, n, m 
df_clin_updated = df_clin_updated.astype({'t':'object', 'b':'object'})


In [9]:
#TONS OF DATA, PULL WHAT WE WANT
df_clin_updated_lean = df_clin_updated.drop(columns = [x for x in df_clin_updated.columns if x not in ['gender', 'race', \
                                       'country_of_residence', 'sample_id', 'ethnicity',\
                                        'age_at_initial_diagnosis',\
                                        'lymph_node_biopsy_performed',\
                                        'family_history_of_leukemia_lymphoma', \
                                        'has_the_patient_ever_been_exposed_at_work_or_in_the_service_to_a_toxic_chemical',\
                                        'cd4+:cd8+_ratio', 'total_lymphocyte_count', 'absolute_cd4+_count_per_ul',\
                                        '%cd4+cd26-', '%cd4+cd7-', 'tcr_clonality', 'tumor_cell_cd30+',\
                                        'large_cell_transformation', 'ldh_u_l', 'wbc_103_μl', 'rbc_106_μl',\
                                        'hematocrit_%', 'mcv_fl', 'mchc_g_dl', 'rdw_%', 'platelet_count_103_μl',\
                                        'segmented_neutrophil,_absolute_103_μl', 'lymphocyte,_absolute_103_μl',\
                                        'monocytes,_absolute_103_μl',\
                                        'basophils,_absolute_103_μl', 'segmented_neutrophils_%', 'lymphocytes_%',\
                                        'monocytes_%', 'eosinophils_%', 'basophils_%']])

#'predominant_lesion_type_at_diagnosis', 't', 'n', 'm', 'b', 'eosinophils,_absolute_103_μl'

In [10]:
# TURN YES/NO & POSITIVE/NEGATIVE TO DUMMIES
df_clin_updated_lean['lymph_node_biopsy_performed'] = \
df_clin_updated_lean['lymph_node_biopsy_performed'].apply(lambda x: 1 if x == 'Yes' else 0)

df_clin_updated_lean['family_history_of_leukemia_lymphoma'] = \
df_clin_updated_lean['family_history_of_leukemia_lymphoma'].apply(lambda x: 1 if x == 'Yes' else 0)

df_clin_updated_lean['tumor_cell_cd30+'] = \
df_clin_updated_lean['tumor_cell_cd30+'].apply(lambda x: 1 if x == 'Yes' else 0)

df_clin_updated_lean['large_cell_transformation'] = \
df_clin_updated_lean['large_cell_transformation'].apply(lambda x: 1 if x == 'Yes' else 0)

df_clin_updated_lean['tcr_clonality'] = \
df_clin_updated_lean['tcr_clonality'].apply(lambda x: 1 if x == 'Positive' else 0)

df_clin_updated_lean['has_the_patient_ever_been_exposed_at_work_or_in_the_service_to_a_toxic_chemical'] = \
df_clin_updated_lean['has_the_patient_ever_been_exposed_at_work_or_in_the_service_to_a_toxic_chemical'].apply(lambda x: 1 if x == 'Yes' else 0)

### df_lean: Preprocessed Genetic Data

In [11]:
# Read in the Preprocessed Genetic Data
df_lean = pd.read_csv ('stats_by_sample.csv')

In [12]:
#CHECK CLINICAL DATA BASICS
df_clin_updated.head()

Unnamed: 0,patient_id,data_access_group,survey_identifier,survey_timestamp,sample_id,date_of_birth,current_age_at_time_of_data_entry_-_can_answer_instead_of_dob,country_of_origin,country_of_residence,gender,...,date_of_relapse_disease_progression.4,age_at_date_of_relapse_disease_progression_5_can_answer_instead_of_date_of_relapse_disease_progression_5,disease_status_at_time_of_sampling,disease_status_at_time_of_data_entry,date_of_death,age_at_death_can_answer_instead_of_date_of_death,death_related_to_disease,cause_of_death,complete,age_at_initial_diagnosis
0,795-1,MD Anderson,,,spz-20,1933-03-08,,,United States of America,Female,...,,,Progressive disease,Deceased,2013-11-06,,Yes,,Complete,73.0
1,795-2,MD Anderson,,,spz-3,1930-09-04,,,United States of America,Female,...,"12-16-2013, 05-19-2014, 09-16-2014, 09-09-2015",,Progressive disease,Deceased,2015-10-26,,Yes,,Complete,81.0
2,795-3,MD Anderson,,,spz-4,1926-12-27,,,United States of America,Male,...,,,Stable disease,Deceased,2014-09-14,,Yes,,Complete,86.0
3,795-4,MD Anderson,,,spz-8,1945-08-17,,,United States of America,Male,...,,,Progressive disease,Deceased,2018-01-05,,No,,Complete,66.0
4,795-5,MD Anderson,,,spz-10,1925-04-28,,,United States of America,Male,...,,,Progressive disease,Deceased,2017-09-25,,No,,Complete,87.0


In [13]:
#TRANSFORM SAMPLE ID TO JOIN TO CLINICAL DATA
df_lean['sample_id'] = df_lean['sample_id'].apply(lambda x: re.sub('_', '-', x[:5]) if 'WES' in x else\
                                                  (x[:-10] if 'CTCL' in x else \
                                                  (x[:-13] if 'almeida' in x else\
                                                  ((x[-2:]+x[:-2])[:-15] if 'ungewickell' in x else\
                                                  ('-'.join([ele.lstrip('0').lower() for ele in x[:-10].split('-')]) if 'SPZ' in x else x)))))

# Merge (Updated)

In [14]:
#MERGE tbe updated CLINICAL, GENETIC DATA
df_all_updated = pd.merge(df_lean, df_clin_updated_lean, on='sample_id', how='left')

In [15]:
#IMPUTATION; "UNKNOWN" FOR CATEGORICAL, MEAN FILL-IN FOR CONTINUOUS
for col in df_clin_updated_lean.columns:
    if col in ['race', 'gender', 'country_of_residence', 'ethnicity', 'predominant_lesion_type_at_diagnosis', 't', 
              'n', 'm', 'b']:
        df_all_updated[col] = df_all_updated[col].fillna('unknown')
    elif col != 'sample_id':
        df_all_updated[col] = df_all_updated[col].fillna(np.mean(df_all_updated[col]))

In [16]:
#GET DUMMIES FOR CATEGORICALS
df_all_updated = pd.get_dummies(df_all_updated, columns = ['race', 'gender', 'country_of_residence', 'ethnicity'])


# Defining Features and Labels - For updated data

In [22]:
#DEFINE STANDARDSCALER FOR LATER USE
std_scl = StandardScaler()

In [23]:
# Define (Scaled/Normalized) Features and Labels
X_new = df_all_updated.drop(columns = [x for x in df_all_updated.columns if x == 'outcome' or x == 'sample_id'])
X_new_scaled = std_scl.fit_transform(X_new)
X_new_norm = normalize(X_new)

y_new = df_all_updated.drop(columns = [x for x in df_all_updated.columns if x != 'outcome'])

In [24]:
df_all_updated['outcome'].value_counts(normalize = True)

1    0.8
0    0.2
Name: outcome, dtype: float64

# Model Training without Lesion Type

### Repeated Stratified K-fold

In [25]:
# Stratified Version
from sklearn.model_selection import RepeatedStratifiedKFold
rskf = RepeatedStratifiedKFold(n_splits=3, n_repeats=10)

### Logistic Regression

In [26]:
# Log
rskf = RepeatedStratifiedKFold(n_splits=3, n_repeats=10)
steps = [('pca', PCA(n_components=2)), ('m', lm.LogisticRegression(solver = 'liblinear', penalty = 'l1'))]
model_log = Pipeline(steps = steps)

acc_scores = cross_val_score(model_log, X_new_scaled, y_new.values.ravel(), scoring='accuracy', cv=rskf, n_jobs=-1)
prec_scores = cross_val_score(model_log, X_new_scaled, y_new.values.ravel(), scoring='precision', cv=rskf, n_jobs=-1)
print("Logistic Regression")
print('n_components: 2, accuracy: ', np.mean(acc_scores))
print('std for accuracy: ', np.std(acc_scores))
print('n_components: 2, precision: ', np.mean(prec_scores))
print('std for precision: ', np.std(prec_scores))

Logistic Regression
n_components: 2, accuracy:  0.8000308356460067
std for accuracy:  0.009206906337856044
n_components: 2, precision:  0.8015095450340594
std for precision:  0.010322887636825966


### Ramdom Forest

In [27]:
rskf = RepeatedStratifiedKFold(n_splits=3, n_repeats=10)
steps = [('pca', PCA(n_components=12)), ('m', ens.RandomForestClassifier(n_estimators = 500, criterion = "entropy"))]
model_rf = Pipeline(steps = steps)

acc_scores = cross_val_score(model_rf, X_new, y_new.values.ravel(), scoring='accuracy', cv=rskf, n_jobs=-1)
prec_scores = cross_val_score(model_rf, X_new, y_new.values.ravel(), scoring='precision', cv=rskf, n_jobs=-1)
print("Random Forest")
print('n_components: 12, accuracy: ', np.mean(acc_scores))
print('std for accuracy: ', np.std(acc_scores))
print('n_components: 12, precision: ', np.mean(prec_scores))
print('std for precision: ', np.std(prec_scores))

Random Forest
n_components: 12, accuracy:  0.8700585877274128
std for accuracy:  0.03299938575209889
n_components: 12, precision:  0.8928972326642266
std for precision:  0.030852267240559315


### Support Vector Machine

In [28]:
rskf = RepeatedStratifiedKFold(n_splits=3, n_repeats=10)
steps = [('pca', PCA(n_components=5)), ('svc', SVC(C = 50, gamma = 'auto', kernel = 'linear', probability = True))]
model_svc = Pipeline(steps = steps)

acc_scores = cross_val_score(model_svc, X_new_norm, y_new.values.ravel(), scoring='accuracy', cv=rskf, n_jobs=-1)
prec_scores = cross_val_score(model_svc, X_new_norm, y_new.values.ravel(), scoring='precision', cv=rskf, n_jobs=-1)
print("SVC")
print('n_components: 2, accuracy: ', np.mean(acc_scores))
print('std for accuracy: ', np.std(acc_scores))
print('n_components: 2, precision: ', np.mean(prec_scores))
print('std for precision: ', np.std(prec_scores))

SVC
n_components: 2, accuracy:  0.8200123342584027
std for accuracy:  0.03341616717461899
n_components: 2, precision:  0.8306923623460406
std for precision:  0.014007685572255887


### Ridge Classifier

In [29]:
rskf = RepeatedStratifiedKFold(n_splits=3, n_repeats=10)
steps = [('pca', PCA(n_components=5)), ('m', lm.RidgeClassifier(alpha = 0.1))]
model_rdg = Pipeline(steps = steps)

acc_scores = cross_val_score(model_rdg, X_new_norm, y_new.values.ravel(), scoring='accuracy', cv=rskf, n_jobs=-1)
prec_scores = cross_val_score(model_rdg, X_new_norm, y_new.values.ravel(), scoring='precision', cv=rskf, n_jobs=-1)
print("Ridge Classifier")
print('n_components: 2, accuracy: ', np.mean(acc_scores))
print('std for accuracy: ', np.std(acc_scores))
print('n_components: 2, precision: ', np.mean(prec_scores))
print('std for precision: ', np.std(prec_scores))

Ridge Classifier
n_components: 2, accuracy:  0.8279062596361393
std for accuracy:  0.030133211120398833
n_components: 2, precision:  0.8330157344060274
std for precision:  0.01989556419425147


### AdaBoost

In [35]:
from sklearn.ensemble import AdaBoostClassifier

rskf = RepeatedStratifiedKFold(n_splits=3, n_repeats=10)
model_ada = AdaBoostClassifier(n_estimators=400, learning_rate = 1, algorithm = "SAMME.R")

acc_scores = cross_val_score(model_ada, X_new, y_new.values.ravel(), scoring='accuracy', cv=rskf, n_jobs=-1)
prec_scores = cross_val_score(model_ada, X_new, y_new.values.ravel(), scoring='precision', cv=rskf, n_jobs=-1)
print("AdaBoost")
print('n_components: 12, accuracy: ', np.mean(acc_scores))
print('std for accuracy: ', np.std(acc_scores))
print('n_components: 12, precision: ', np.mean(prec_scores))
print('std for precision: ', np.std(prec_scores))

AdaBoost
n_components: 12, accuracy:  0.8984736355226645
std for accuracy:  0.030665910425619128
n_components: 12, precision:  0.9146003395606959
std for precision:  0.03374731769643622


### XGBoost

In [36]:
from numpy import loadtxt
from xgboost import XGBClassifier
warnings.filterwarnings("ignore")

rskf = RepeatedStratifiedKFold(n_splits=3, n_repeats=10)
model_xg = XGBClassifier(eta = 0.1, max_depth = 6, scale_pos_weight = 1, eval_metric = "error", use_label_encoder = False)

acc_scores = cross_val_score(model_xg, X_new, y_new.values.ravel(), scoring='accuracy', cv=rskf, n_jobs=-1)
prec_scores = cross_val_score(model_xg, X_new, y_new.values.ravel(), scoring='precision', cv=rskf, n_jobs=-1)
print("XGBoost")
print('n_components: 13, accuracy: ', np.mean(acc_scores))
print('std for accuracy: ', np.std(acc_scores))
print('n_components: 13, precision: ', np.mean(prec_scores))
print('std for precision: ', np.std(prec_scores))

  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int6

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.co

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


XGBoost
n_components: 13, accuracy:  0.872062904717854
std for accuracy:  0.03589253300491555
n_components: 13, precision:  0.8819573209525036
std for precision:  0.03588150058182582


In [37]:
print("XGBoost")
print('n_components: 13, accuracy: ', np.mean(acc_scores))
print('std for accuracy: ', np.std(acc_scores))
print('n_components: 13, precision: ', np.mean(prec_scores))
print('std for precision: ', np.std(prec_scores))

XGBoost
n_components: 13, accuracy:  0.872062904717854
std for accuracy:  0.03589253300491555
n_components: 13, precision:  0.8819573209525036
std for precision:  0.03588150058182582


### CATBoost

In [40]:
from catboost import CatBoostClassifier
warnings.filterwarnings("ignore")

rskf = RepeatedStratifiedKFold(n_splits=3, n_repeats=10)
steps = [('pca', PCA(n_components=12)), ('cat', CatBoostClassifier(iterations = 250))]
model_cat = Pipeline(steps = steps)

acc_scores = cross_val_score(model_cat, X_new, y_new.values.ravel(), scoring='accuracy', cv=rskf, n_jobs=-1)
prec_scores = cross_val_score(model_cat, X_new, y_new.values.ravel(), scoring='precision', cv=rskf, n_jobs=-1)
print('accuracy: ', np.mean(acc_scores))
print('std for accuracy: ', np.std(acc_scores))
print('precision: ', np.mean(prec_scores))
print('std for precision: ', np.std(prec_scores))

  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index


Learning rate set to 0.013322
0:	learn: 0.6801639	total: 60.2ms	remaining: 15s
1:	learn: 0.6696105	total: 61.6ms	remaining: 7.64s
2:	learn: 0.6574682	total: 62.6ms	remaining: 5.16s
3:	learn: 0.6475609	total: 64.1ms	remaining: 3.94s
4:	learn: 0.6371835	total: 65.4ms	remaining: 3.2s
5:	learn: 0.6276413	total: 66.6ms	remaining: 2.71s
6:	learn: 0.6167122	total: 67.6ms	remaining: 2.35s
7:	learn: 0.6074386	total: 69.1ms	remaining: 2.09s
8:	learn: 0.5966179	total: 70ms	remaining: 1.87s
9:	learn: 0.5868526	total: 71.1ms	remaining: 1.71s
10:	learn: 0.5759593	total: 72.3ms	remaining: 1.57s
11:	learn: 0.5671855	total: 73.6ms	remaining: 1.46s
12:	learn: 0.5592116	total: 74.5ms	remaining: 1.36s
13:	learn: 0.5504493	total: 76.2ms	remaining: 1.28s
14:	learn: 0.5413196	total: 77.2ms	remaining: 1.21s
15:	learn: 0.5339438	total: 79.5ms	remaining: 1.16s
16:	learn: 0.5272352	total: 81.8ms	remaining: 1.12s
17:	learn: 0.5183632	total: 83.4ms	remaining: 1.07s
18:	learn: 0.5112241	total: 84.5ms	remaining: 1.0

Learning rate set to 0.013383
0:	learn: 0.6814438	total: 59.3ms	remaining: 14.8s
1:	learn: 0.6741324	total: 60.6ms	remaining: 7.51s
2:	learn: 0.6637401	total: 61.7ms	remaining: 5.08s
3:	learn: 0.6547586	total: 62.6ms	remaining: 3.85s
4:	learn: 0.6457442	total: 63.7ms	remaining: 3.12s
5:	learn: 0.6377696	total: 65ms	remaining: 2.64s
6:	learn: 0.6295002	total: 65.8ms	remaining: 2.28s
7:	learn: 0.6218434	total: 66.7ms	remaining: 2.02s
8:	learn: 0.6133556	total: 67.6ms	remaining: 1.81s
9:	learn: 0.6060931	total: 69.6ms	remaining: 1.67s
10:	learn: 0.5977168	total: 70.4ms	remaining: 1.53s
11:	learn: 0.5900308	total: 71.3ms	remaining: 1.41s
12:	learn: 0.5841102	total: 72.1ms	remaining: 1.31s
13:	learn: 0.5728037	total: 72.9ms	remaining: 1.23s
14:	learn: 0.5665246	total: 73.8ms	remaining: 1.16s
15:	learn: 0.5592869	total: 74.7ms	remaining: 1.09s
16:	learn: 0.5525318	total: 75.6ms	remaining: 1.03s
17:	learn: 0.5451930	total: 76.5ms	remaining: 986ms
18:	learn: 0.5363881	total: 77.5ms	remaining: 

accuracy:  0.8757323465926613
std for accuracy:  0.038045614358241005
precision:  0.8948350107325539
std for precision:  0.03170870780923739


In [41]:
print('CATBoost:')
print('accuracy: ', np.mean(acc_scores))
print('std for accuracy: ', np.std(acc_scores))
print('precision: ', np.mean(prec_scores))
print('std for precision: ', np.std(prec_scores))

CATBoost:
accuracy:  0.8757323465926613
std for accuracy:  0.038045614358241005
precision:  0.8948350107325539
std for precision:  0.03170870780923739
