In [None]:
import math as m
import random
import random as rand
import re
import warnings
from collections import defaultdict
from datetime import datetime
from itertools import islice

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import linear_model as lm, metrics, ensemble as ens
from sklearn.feature_selection import RFE, RFECV, SequentialFeatureSelector
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedKFold
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.svm import SVC
from statsmodels.discrete.discrete_model import Logit
from statsmodels.tools.tools import add_constant


In [None]:
#DEFINING A FUNCTION TO UPDATE COLUMN NAMES LATER
def lower_no_space(word):
    """Normalize a column label into a lowercase, underscore-separated name.

    Spaces and forward slashes become underscores; apostrophes, parentheses
    and question marks are removed; the result is lower-cased.

    Parameters
    ----------
    word : str
        Original column label.

    Returns
    -------
    str
        The normalized label.
    """
    # Every substitution targets one literal character, so a translation table
    # does the whole job in a single pass. It also removes the non-raw '\?'
    # regex pattern, which is an invalid escape sequence in a normal string.
    table = str.maketrans({
        ' ': '_',
        '/': '_',
        "'": None,
        '(': None,
        ')': None,
        '?': None,
    })
    return word.translate(table).lower()

In [None]:
#READ IN Updated CLINICAL DATA FOR LATER USE (CONVERTED TO .csv IN GOOGLE SHEETS)
# header = 1 takes the column names from the second row of the file
df_clin_updated = pd.read_csv("Homebase_new_updated.csv", header = 1)

In [None]:
#RENAMING COLUMNS
# Normalize every column label with lower_no_space, then shorten the
# sample-id column to the name used as the join key further down.
df_clin_updated = (
    df_clin_updated
    .rename(columns=lower_no_space)
    .rename(columns={'subject_sample_id': 'sample_id'})
)

In [None]:
#Compute the age at initial diagnosis from date of birth and date_of_initial_diagnosis
birth_dates = pd.to_datetime(df_clin_updated['date_of_birth'])
diagnosis_dates = pd.to_datetime(df_clin_updated['date_of_initial_diagnosis'])
df_clin_updated['date_of_birth'] = birth_dates
df_clin_updated['date_of_initial_diagnosis'] = diagnosis_dates

# A plain difference of calendar years overstates the age by one whenever the
# diagnosis happened before that year's birthday, so subtract one in that case.
# Rows with a missing date compare as False here and still end up NaN below.
diagnosed_before_birthday = (diagnosis_dates.dt.month < birth_dates.dt.month) | (
    (diagnosis_dates.dt.month == birth_dates.dt.month)
    & (diagnosis_dates.dt.day < birth_dates.dt.day)
)
df_clin_updated["age_at_initial_diagnosis"] = (
    diagnosis_dates.dt.year - birth_dates.dt.year - diagnosed_before_birthday.astype(int)
)

In [None]:
# Dates of birth in the Stanford data are abnormal, so the computed age is
# blanked for those rows; any remaining negative age is blanked as well.
from_stanford = df_clin_updated['data_access_group'] == 'Stanford'
age_years = df_clin_updated["age_at_initial_diagnosis"].astype(float).mask(from_stanford)
df_clin_updated["age_at_initial_diagnosis"] = age_years.mask(age_years < 0)


In [None]:
# Cast the t and b columns to object dtype (they are later imputed with the
# string 'unknown' and one-hot encoded).
# NOTE(review): the previous comment mentioned date_of_birth, n and m, but only
# t and b are cast here -- confirm n and m do not need the same treatment.
df_clin_updated = df_clin_updated.astype({'t':'object', 'b':'object'})


In [None]:
#TONS OF DATA, PULL WHAT WE WANT
# Clinical columns carried forward; every other column is dropped.
CLINICAL_COLUMNS_TO_KEEP = {
    'gender', 'race', 'country_of_residence', 'sample_id', 'ethnicity',
    'age_at_initial_diagnosis', 't', 'n', 'm', 'b',
    'predominant_lesion_type_at_diagnosis', 'lymph_node_biopsy_performed',
    'family_history_of_leukemia_lymphoma',
    'has_the_patient_ever_been_exposed_at_work_or_in_the_service_to_a_toxic_chemical',
    'cd4+:cd8+_ratio', 'total_lymphocyte_count', 'absolute_cd4+_count_per_ul',
    '%cd4+cd26-', '%cd4+cd7-', 'tcr_clonality', 'tumor_cell_cd30+',
    'large_cell_transformation', 'ldh_u_l', 'wbc_103_μl', 'rbc_106_μl',
    'hematocrit_%', 'mcv_fl', 'mchc_g_dl', 'rdw_%', 'platelet_count_103_μl',
    'segmented_neutrophil,_absolute_103_μl', 'lymphocyte,_absolute_103_μl',
    'monocytes,_absolute_103_μl', 'eosinophils,_absolute_103_μl',
    'basophils,_absolute_103_μl', 'segmented_neutrophils_%', 'lymphocytes_%',
    'monocytes_%', 'eosinophils_%', 'basophils_%',
}
unwanted_columns = [col for col in df_clin_updated.columns
                    if col not in CLINICAL_COLUMNS_TO_KEEP]
df_clin_updated_lean = df_clin_updated.drop(columns=unwanted_columns)

In [None]:
# TURN YES/NO & POSITIVE/NEGATIVE TO DUMMIES
# Each listed column becomes 1 where it holds its affirmative label and 0 for
# every other value.
# NOTE(review): missing answers are therefore coded 0 as well, so these columns
# never reach the mean imputation applied after the merge -- confirm intended.
AFFIRMATIVE_LABELS = {
    'lymph_node_biopsy_performed': 'Yes',
    'family_history_of_leukemia_lymphoma': 'Yes',
    'tumor_cell_cd30+': 'Yes',
    'large_cell_transformation': 'Yes',
    'tcr_clonality': 'Positive',
    'has_the_patient_ever_been_exposed_at_work_or_in_the_service_to_a_toxic_chemical': 'Yes',
}
for flag_col, affirmative in AFFIRMATIVE_LABELS.items():
    df_clin_updated_lean[flag_col] = (
        df_clin_updated_lean[flag_col].eq(affirmative).astype('int64')
    )

In [None]:
# Read in the Preprocessed Genetic Data
# index_col = 0 uses the first CSV column as the row index
df_lean = pd.read_csv ('stats_by_sample.csv', index_col = 0)
# Preview the first rows
df_lean.head()

In [None]:
#TRANSFORM SAMPLE ID TO JOIN TO CLINICAL DATA
def _to_clinical_sample_id(raw_id):
    """Rewrite a genetic-data sample id into the form used for the clinical join.

    The id is classified by the first marker substring it contains (checked in
    the order below) and trimmed with that marker's rule; ids without any known
    marker are returned unchanged.
    """
    if 'WES' in raw_id:
        # first five characters, underscores turned into dashes
        return raw_id[:5].replace('_', '-')
    if 'CTCL' in raw_id:
        return raw_id[:-10]
    if 'almeida' in raw_id:
        return raw_id[:-13]
    if 'ungewickell' in raw_id:
        # move the last two characters to the front, then cut the 15-character tail
        return (raw_id[-2:] + raw_id[:-2])[:-15]
    if 'SPZ' in raw_id:
        # cut the 10-character tail, strip leading zeros from each dash-separated
        # piece and lower-case it
        pieces = raw_id[:-10].split('-')
        return '-'.join(piece.lstrip('0').lower() for piece in pieces)
    return raw_id


df_lean['sample_id'] = df_lean['sample_id'].apply(_to_clinical_sample_id)

In [None]:
#MERGE the updated CLINICAL, GENETIC DATA
# Left join: every genetic sample is kept, with clinical fields where a match exists
df_all_updated = df_lean.merge(df_clin_updated_lean, how='left', on='sample_id')

In [None]:
#IMPUTATION; "UNKNOWN" FOR CATEGORICAL, MEAN FILL-IN FOR CONTINUOUS
categorical_clinical = {'race', 'gender', 'country_of_residence', 'ethnicity',
                        'predominant_lesion_type_at_diagnosis', 't', 'n', 'm', 'b'}
for clin_col in df_clin_updated_lean.columns:
    if clin_col in categorical_clinical:
        fill_value = 'unknown'
    elif clin_col != 'sample_id':
        # column mean computed on the merged table
        fill_value = np.mean(df_all_updated[clin_col])
    else:
        # the join key is never imputed
        continue
    df_all_updated[clin_col] = df_all_updated[clin_col].fillna(fill_value)

In [None]:
#GET DUMMIES FOR CATEGORICALS
# One indicator column per observed level of each categorical clinical field
categorical_fields = ['race', 'gender', 'country_of_residence', 'ethnicity',
                      'predominant_lesion_type_at_diagnosis', 't', 'n', 'm', 'b']
df_all_updated = pd.get_dummies(df_all_updated, columns=categorical_fields)


In [None]:
# # WRITE A PROGRAM TO CHANGE THE MF/SS GENETIC DATA LARRY
# df_ss = pd.read_csv("ss_gen_data.csv", header = 2)
# df_mf = pd.read_csv("mf_gen_data.csv", header = 2)

In [None]:
# #RENAMING COLUMNS
# df_ss = df_ss.rename(mapper = lower_no_space, axis = 1) 

# #AND AGAIN
# df_mf = df_mf.rename(mapper = lower_no_space, axis = 1) 

In [None]:
# df_gen = pd.read_csv("full_gen.csv") #SEE GEN_DF_SETUP

In [None]:
# spz-10, spz-6, spz-5, spz-9, spz-11, spz-17, spz-29, spz-19, spz-27

In [None]:
# df_gen['outcome'] = df_gen['sample_id'].apply(lambda x: 1 if (x in set(df_ss['sample_id'])) & \
#                                               (x not in ['SPZ-010__wang__SS', 'SPZ-006__wang__SS', \
#                                                          'SPZ-005__wang__SS', 'SPZ-009__wang__SS', \
#                                                          'SPZ-011__wang__SS', 'SPZ-017__wang__SS', \
#                                                          'SPZ-029__wang__SS', 'SPZ-019__wang__SS', \
#                                                          'SPZ-027__wang__SS'])
#                                               else 0)



In [None]:
# df_gen = df_gen[df_gen['chrom'] != 'Chromosome']
# df_gen = df_gen[df_gen['gene_symbol'] != '#version 2.4'] #WEIRD OBS IN DATA
# df_gen['chrom'] = df_gen['chrom'].apply(lambda x: '19' if 'gl000209' in str(x) else str(x)) #WEIRD OBS AGAIN
# df_gen = df_gen.drop(columns = ['phred']) #USING ONLY RAWSCORE FOR NOW

In [None]:
# df_gen['pos'] = df_gen['pos'].astype('float')
# df_gen = df_gen[~df_gen['pos'].isna()]


In [None]:
# # num_sections = m.ceil(np.max((df_gen['pos'].values.tolist()))/10000000)
# #^^CAN USE TO DOUBLE CHECK THAT WE HAVE THE RIGHT NUMBER OF SECTIONS

# df_gen['section'] = df_gen['pos'].apply(lambda x: m.ceil(x/1000000))
# df_gen['section'] = df_gen['section'].astype('string')
# df_gen['section'] = df_gen['section'].astype('string') + '_' + 'chrom' + '_' + df_gen['chrom']


In [None]:
# #CHECK GENERAL SHAPE
# df_gen.head()

In [None]:
# df_gen['outcome'].value_counts()

In [None]:
# #CREATING VAR W/ NEG SCORES = 0;
# #SITE SAYS NEG SCORE MEANS VERY UNLIKELY TO BE HARMFUL, SO I COULD SEE THEM NOT "OFFSETTING" HIGH POSITIVE SCORES
# #SO, WANT TO TRY ONE WHERE THEY WON'T WHEN SUMMING OVER A LARGER AREA
# df_gen['non_neg_rawscore'] = df_gen['rawscore'].apply(lambda x: 0 if x <= 0 else x)


In [None]:
# #GOING TO USE A SERIES OF PIVOTS TO MAKE A LEANER DF
# def make_spec_pivot(magnitude, score, function, name):
#     #THESE ARE THE "LEVELS" ON WHICH WE WANT AGGED SCORES
#     if magnitude == 'gene_symbol':
#         prefix = 'gene_'
#     elif magnitude == 'chrom':
#         prefix = 'chromosome_'
#     elif magnitude == 'section':
#         prefix = 'section_'
    
#     #DON'T WANT TO CHANGE ORIGINAL DF
#     df = df_gen.copy()
    
#     #GET JUST PERSON, "LEVEL", SCORE
#     df = df.drop(columns = [x for x in df.columns if x != 'sample_id' and\
#                                 x != magnitude and x != score])
    
#     #RESHAPE DATAFRAME
#     df = df.pivot_table(index = 'sample_id', columns = [magnitude], values = [score], aggfunc = function).reset_index()
#     df.reset_index(inplace = True)
    
#     #RENAME RESULTING COLUMNS
#     df.columns = [' '.join(col).strip() for col in df.columns.values]
#     df = df.drop(columns = ['index'])
    
#     #USING GENERAL LOGIC I FOUND ONLINE, ADJUSTING FOR SPECIFIC OUTPUT OF SUMS VS. PERCENTILES
#     if name == 'sum':
#         if score == 'rawscore':
#             df.columns = [prefix + x[9:] + '_' + x[:8] if 'score' in x else x for x in df.columns]
#         else:
#             df.columns = [prefix + x[17:] + '_' + x[:16] if 'score' in x else x for x in df.columns]
#     elif name == 'nty':
#         if score == 'rawscore':
#             df.columns = ['nty_' + prefix + x[9:] + '_' + x[:8] if 'score' in x else x for x in df.columns]
#         else:
#             df.columns = ['nty_' + prefix + x[17:] + '_' + x[:16] if 'score' in x else x for x in df.columns]
#     elif name == 'med':
#         if score == 'rawscore':
#             df.columns = ['med_' + prefix + x[9:] + '_' + x[:8] if 'score' in x else x for x in df.columns]
#         else:
#             df.columns = ['med_' + prefix + x[17:] + '_' + x[:16] if 'score' in x else x for x in df.columns]
    
#     #IMPUTING 0s FOR NULLS
#     df = df.fillna(0)
#     return df
    

In [None]:
# #GET INITIAL LEAN DF FOR FOUNDATION, USING GENE-LEVEL SCORES SUMMED
# df_lean = pd.merge(make_spec_pivot('gene_symbol', 'rawscore', np.sum, 'sum'),\
#                    make_spec_pivot('gene_symbol', 'non_neg_rawscore', np.sum, 'sum'),\
#                    on=['sample_id'])

# #ADD CHROMSOME/SECTION LEVEL STATS
# for mag in ['chrom', 'section']:
    
#     #BOTH SCORE TYPES
#     for scr in ['rawscore', 'non_neg_rawscore']:
        
#         #SUMS AND MEDIANS
#         for func in [np.sum, lambda x: np.percentile(x, 50)]:
#             if func == np.sum:
#                 nm = 'sum'
#             elif func != np.sum:
#                 nm = 'med'
            
#             #SEE FUNC DEF ABOVE
#             df_latest = make_spec_pivot(mag, scr, func, nm)
            
#             #ADD OUTPUT TO CURRENT LEAN DF
#             df_lean = pd.merge(df_lean, df_latest, on = ['sample_id'])

# #HAD HARD TIME GETTING 90TH PCTL TO WORK IN FOR LOOP SO JUST GAVE IT ITS OWN SECTION
# for scr in ['rawscore', 'non_neg_rawscore']:
#     df_latest = make_spec_pivot('chrom', scr, lambda x: np.percentile(x, 90), 'nty')
#     df_lean = pd.merge(df_lean, df_latest, on = ['sample_id'])

In [None]:
# #ALSO WANT #OF MUTATIONS ON CHROMOSOME; UPDATE CODE FOR EFFICIENCY LATER
# for chrom in set(list(df_gen['chrom'].values)):
#     df_lean['chromosome_' + str(chrom) + '_mutations'] = df_lean['sample_id'].apply(lambda x:\
#                                                                                len(df_gen[(df_gen['sample_id'] == x)\
#                                                                                    & (df_gen['chrom'] == chrom)]))

In [None]:
# #ABOVE TRANSFORMATIONS DROPPED OUTCOME VAR FOR SAMPLE IDs, RE-ADDING HERE
# df_outcome = df_gen.copy()
# df_outcome = df_outcome.drop(columns = [x for x in df_outcome.columns if 'sample_id' not in x and\
#                                        'outcome' not in x])
# df_outcome = df_outcome.drop_duplicates()
# df_lean = pd.merge(df_lean, df_outcome, on = ['sample_id'])

In [None]:
# # DROP ONE EACH OF GENE, SECTION, CHROMOSOME SCORE FOR COLLINEARITY
# df_lean = df_lean.drop(columns = [\
# random.choice([x for x in df_lean.columns if 'gene_' in x and 'non_neg_rawscore' in x]),\
# random.choice([x for x in df_lean.columns if 'gene_' in x and 'rawscore' in x and 'non_neg' not in x]),\
# random.choice([x for x in df_lean.columns if 'section_' in x and 'non_neg_rawscore' in x]),\
# random.choice([x for x in df_lean.columns if 'section_' in x and 'rawscore' in x and 'non_neg' not in x]),\
# random.choice([x for x in df_lean.columns if 'chromosome_' in x and 'non_neg_rawscore' in x]),\
# random.choice([x for x in df_lean.columns if 'chromosome_' in x and 'rawscore' in x and 'non_neg' not in x])                             
# ])

In [None]:
# # USING 0 FILL IN FOR SCORES (NO SCORES MEANS NO MUTATION, PRESUMABLY)
# for col in df_lean.columns:
#     if 'rawscore' in col:
#         df_lean[col] = df_lean[col].fillna(0)

In [None]:
# df_lean = df_lean.drop(columns = [x for x in df_lean.columns if 'chromosome_nan' in x]) #WEIRD ONEOFF, CAN'T EXPLAIN

In [None]:
#SAVE A CSV WITH NEW DF
# df_lean.to_csv("stats_by_sample.csv")

In [None]:
#CHECK DF BASICS
# df_lean

In [None]:
# Class balance of the outcome label (absolute counts per class)
df_lean['outcome'].value_counts()

In [None]:
#PLOT RF FI FUNCTION I FOUND ONLINE
def plot_feature_importance(importance, names, model_type, name, threshold):
    """Draw and save a bar chart of feature importances.

    Parameters
    ----------
    importance : array-like
        Importance score of each feature.
    names : array-like
        Feature names, in the same order as ``importance``.
    model_type : str
        Text prepended to the chart title.
    name : str or path-like
        Destination handed to ``savefig``.
    threshold : float
        Features scoring below this value are left out of the chart.
    """
    # Pair each feature with its score, highest score first
    ranked = pd.DataFrame({
        'feature_names': np.array(names),
        'feature_importance': np.array(importance),
    }).sort_values(by=['feature_importance'], ascending=False)

    # Keep only the features that reach the threshold
    ranked = ranked[ranked['feature_importance'] >= threshold]

    # Fixed 5x6 inch canvas with importance on x and feature names on y
    fig, ax = plt.subplots(figsize=(5, 6))
    sns.barplot(x=ranked['feature_importance'], y=ranked['feature_names'], ax=ax)

    ax.set_title(model_type + ' Feature Importance')
    ax.set_xlabel('Feature Importance')
    ax.set_ylabel('Feature Names')
    fig.savefig(name, bbox_inches='tight')

In [None]:
# #TRANSFORM SAMPLE ID TO JOIN TO CLINICAL DATA
# df_lean['sample_id'] = df_lean['sample_id'].apply(lambda x: re.sub('_', '-', x[:5]) if 'WES' in x else\
#                                                   (x[:-10] if 'CTCL' in x else \
#                                                    ('-'.join([ele.lstrip('0').lower() for ele in x[:-10].split('-')]) if 'SPZ' in x else x)))



In [None]:
# df_clin[df_clin['clinical_subtype_or_variant'] != 'Sezary syndrome']['predominant_lesion_type_at_diagnosis'].value_counts()
# df_clin['clinical_subtype_or_variant'].value_counts()
# SS - Patch = 14, Erythroderma = 12, Plaque = 2, Tumor = 1
# MF - Patch = 6, Erythroderma = 3

In [None]:
# for col in df_clin.columns:
#     print(col)

In [None]:
#CHECK CLINICAL DATA BASICS
# df_clin

In [None]:
# #TONS OF DATA, PULL WHAT WE WANT
# df_clin_lean = df_clin.drop(columns = [x for x in df_clin.columns if x not in ['gender', 'race', \
#                                        'country_of_residence', 'sample_id', 'ethnicity',\
#                                         'age_at_initial_diagnosis', 't', 'n', 'm', 'b',\
#                                         'predominant_lesion_type_at_diagnosis','lymph_node_biopsy_performed',\
#                                         'family_history_of_leukemia_lymphoma', \
#                                         'has_the_patient_ever_been_exposed_at_work_or_in_the_service_to_a_toxic_chemical',\
#                                         'cd4+:cd8+_ratio', 'total_lymphocyte_count', 'absolute_cd4+_count_per_ul',\
#                                         '%cd4+cd26-', '%cd4+cd7-', 'tcr_clonality', 'tumor_cell_cd30+',\
#                                         'large_cell_transformation', 'ldh_u_l', 'wbc_103_μl', 'rbc_106_μl',\
#                                         'hematocrit_%', 'mcv_fl', 'mchc_g_dl', 'rdw_%', 'platelet_count_103_μl',\
#                                         'segmented_neutrophil,_absolute_103_μl', 'lymphocyte,_absolute_103_μl',\
#                                         'monocytes,_absolute_103_μl', 'eosinophils,_absolute_103_μl',\
#                                         'basophils,_absolute_103_μl', 'segmented_neutrophils_%', 'lymphocytes_%',\
#                                         'monocytes_%', 'eosinophils_%', 'basophils_%']])
                                       
                                       
                                       
                                       

In [None]:
# df_clin_lean['lymph_node_biopsy_performed'] = \
# df_clin_lean['lymph_node_biopsy_performed'].apply(lambda x: 1 if x == 'Yes' else 0)

# df_clin_lean['family_history_of_leukemia_lymphoma'] = \
# df_clin_lean['family_history_of_leukemia_lymphoma'].apply(lambda x: 1 if x == 'Yes' else 0)

# df_clin_lean['tumor_cell_cd30+'] = \
# df_clin_lean['tumor_cell_cd30+'].apply(lambda x: 1 if x == 'Yes' else 0)

# df_clin_lean['large_cell_transformation'] = \
# df_clin_lean['large_cell_transformation'].apply(lambda x: 1 if x == 'Yes' else 0)

# df_clin_lean['tcr_clonality'] = \
# df_clin_lean['tcr_clonality'].apply(lambda x: 1 if x == 'Positive' else 0)

# df_clin_lean['has_the_patient_ever_been_exposed_at_work_or_in_the_service_to_a_toxic_chemical'] = \
# df_clin_lean['has_the_patient_ever_been_exposed_at_work_or_in_the_service_to_a_toxic_chemical'].apply(lambda x: 1 if x == 'Yes' else 0)

In [None]:
#MERGE CLINICAL, GENETIC DATA
# df_all = pd.merge(df_lean, df_clin_lean, on='sample_id', how='left')

In [None]:
# for col in df_clin_lean.columns:
#     if col in ['race', 'gender', 'country_of_residence', 'ethnicity', 'predominant_lesion_type_at_diagnosis', 't', 
#               'n', 'm', 'b']:
#         df_all[col] = df_all[col].fillna('unknown')
#     elif col != 'sample_id':
#         df_all[col] = df_all[col].fillna(np.mean(df_all[col]))

In [None]:
# #IMPUTATION; "UNKNOWN" FOR CATEGORICAL, MEAN FILL-IN FOR CONTINUOUS
# df_all['race'] = df_all['race'].fillna('unknown')
# df_all['gender'] = df_all['gender'].fillna('unknown')
# df_all['country_of_residence'] = df_all['country_of_residence'].fillna('unknown')
# df_all['ethnicity'] = df_all['ethnicity'].fillna('unknown')
# df_all['lymph_node_biopsy_performed'] = df_all['lymph_node_biopsy_performed'].fillna('unknown')
# df_all['predominant_lesion_type_at_diagnosis'] = df_all['predominant_lesion_type_at_diagnosis'].fillna('unknown')
# df_all['age_at_initial_diagnosis'] = df_all['age_at_initial_diagnosis'].fillna(np.mean(df_clin_lean['age_at_initial_diagnosis']))
# df_all['t'] = df_all['t'].fillna(np.mean(df_clin_lean['t']))
# df_all['n'] = df_all['n'].fillna(np.mean(df_clin_lean['n']))
# df_all['m'] = df_all['m'].fillna(np.mean(df_clin_lean['m']))
# df_all['b'] = df_all['b'].fillna(np.mean(df_clin_lean['b']))



In [None]:
# df_all[df_all['outcome'] == 0]['predominant_lesion_type_at_diagnosis'].value_counts()

In [None]:
# #GET DUMMIES FOR CATEGORICALS
# df_all = pd.get_dummies(df_all, columns = ['race', 'gender', 'country_of_residence', 'ethnicity',\
#                                            'predominant_lesion_type_at_diagnosis', 't', 'n', 'm', 'b'])
# # df_all.to_csv("stats_by_sample.csv")
# #DEFINE STANDARDSCALER FOR LATER USE
# # std_scl = StandardScaler()

In [None]:
# Dimensions (rows, columns) of the merged, one-hot encoded table
df_all_updated.shape

In [None]:
# Discard the plain raw-score columns (those without 'non_neg' in the name)
# together with every column whose name contains 'med_' or 'nty'.
redundant_score_cols = [
    col for col in df_all_updated.columns
    if ('rawscore' in col and 'non_neg' not in col) or 'med_' in col or 'nty' in col
]
df_all_updated = df_all_updated.drop(columns=redundant_score_cols)

# std_scl = StandardScaler()

In [None]:
#DEFINE STANDARDSCALER FOR LATER USE
std_scl = StandardScaler()

# Feature matrix: every column except the label and the sample identifier
X_new = df_all_updated.drop(columns=['outcome', 'sample_id'], errors='ignore')
# NOTE(review): the scaler is fitted on all rows before cross-validation, so the
# held-out folds influence the scaling statistics -- consider a Pipeline.
X_new_scaled = std_scl.fit_transform(X_new)
# Each sample (row) rescaled to unit norm
X_new_norm = normalize(X_new)

# Label frame holding only the outcome column
y_new = df_all_updated.loc[:, df_all_updated.columns == 'outcome']

In [None]:
# Class proportions of the outcome label in the modelling table
df_all_updated['outcome'].value_counts(normalize = True)

In [None]:
# Stratified Version
from sklearn.model_selection import RepeatedStratifiedKFold
# A fixed random_state makes the 10 x 3-fold partitions reproducible across
# runs and identical for every cross-validation call that reuses this splitter;
# without it each call draws a fresh set of folds.
rskf = RepeatedStratifiedKFold(n_splits=3, n_repeats=10, random_state=42)

In [None]:
# Logistic regression on the standardized features.
# Both metrics are scored in one cross_validate call so they come from the same
# fitted models and the same folds; two separate cross_val_score calls refit
# the model and, with an unseeded splitter, evaluate on different partitions.
log = lm.LogisticRegression()
cv_results = cross_validate(log, X_new_scaled, y_new.values.ravel(),
                            scoring=('accuracy', 'precision'), cv=rskf, n_jobs=-1)
acc_scores = cv_results['test_accuracy']
prec_scores = cv_results['test_precision']
print('accuracy: ', np.mean(acc_scores))
print('std for accuracy: ', np.std(acc_scores))
print('precision: ', np.mean(prec_scores))
print('std for precision: ', np.std(prec_scores))

In [None]:
# RANDOM FOREST (rskf)
rf = ens.RandomForestClassifier()
# One cross_validate call scores accuracy and precision on the same fitted
# forests and folds (separate cross_val_score calls refit and may re-split).
cv_results = cross_validate(rf, X_new, y_new.values.ravel(),
                            scoring=('accuracy', 'precision'), cv=rskf, n_jobs=-1)
acc_scores = cv_results['test_accuracy']
prec_scores = cv_results['test_precision']
print('accuracy: ', np.mean(acc_scores))
print('std for accuracy: ', np.std(acc_scores))
print('precision: ', np.mean(prec_scores))
print('std for precision: ', np.std(prec_scores))

In [None]:
#RIDGE
rdg = lm.RidgeClassifier()
# One cross_validate call scores accuracy and precision on the same fitted
# models and folds (separate cross_val_score calls refit and may re-split).
cv_results = cross_validate(rdg, X_new_norm, y_new.values.ravel(),
                            scoring=('accuracy', 'precision'), cv=rskf, n_jobs=-1)
acc_scores = cv_results['test_accuracy']
prec_scores = cv_results['test_precision']
print('ridge accuracy: ', np.mean(acc_scores))
print('std for accuracy: ', np.std(acc_scores))
print('ridge precision: ', np.mean(prec_scores))
print('std for precision: ', np.std(prec_scores))

In [None]:
#Support Vector Machine
# Sweep the four SVC kernels on the row-normalized features.
for kern in ['linear', 'poly', 'rbf', 'sigmoid']:

    svc = SVC(kernel = kern, probability = True)

    # One cross_validate call scores both metrics on the same fits and folds
    # (separate cross_val_score calls refit and may re-split).
    cv_results = cross_validate(svc, X_new_norm, y_new.values.ravel(),
                                scoring=('accuracy', 'precision'), cv=rskf, n_jobs=-1)
    acc_scores = cv_results['test_accuracy']
    prec_scores = cv_results['test_precision']
    print(kern, ' accuracy: ', np.mean(acc_scores))
    print(kern, ' std for accuracy: ', np.std(acc_scores))
    print(kern, ' precision: ', np.mean(prec_scores))
    print(kern, ' std for precision: ', np.std(prec_scores))

In [None]:
from numpy import loadtxt
from xgboost import XGBClassifier

# NOTE(review): this silences every warning for the rest of the session.
warnings.filterwarnings("ignore")

model = XGBClassifier(eval_metric = "error", use_label_encoder = False)
# One cross_validate call scores accuracy and precision on the same fitted
# models and folds (separate cross_val_score calls refit and may re-split).
cv_results = cross_validate(model, X_new, y_new.values.ravel(),
                            scoring=('accuracy', 'precision'), cv=rskf, n_jobs=-1)
acc_scores = cv_results['test_accuracy']
prec_scores = cv_results['test_precision']
print('accuracy: ', np.mean(acc_scores))
print('std for accuracy: ', np.std(acc_scores))
print('precision: ', np.mean(prec_scores))
print('std for precision: ', np.std(prec_scores))

In [None]:
#Ada Boost
from sklearn.ensemble import AdaBoostClassifier

model = AdaBoostClassifier()
# One cross_validate call scores accuracy and precision on the same fitted
# models and folds (separate cross_val_score calls refit and may re-split).
cv_results = cross_validate(model, X_new, y_new.values.ravel(),
                            scoring=('accuracy', 'precision'), cv=rskf, n_jobs=-1)
acc_scores = cv_results['test_accuracy']
prec_scores = cv_results['test_precision']
print('accuracy: ', np.mean(acc_scores))
print('std for accuracy: ', np.std(acc_scores))
print('precision: ', np.mean(prec_scores))
print('std for precision: ', np.std(prec_scores))

In [None]:
# DISABLED: this cell is an exact duplicate of the SVM kernel sweep that runs
# further down on the gene-free feature set. In this position it read
# `norm_ex`, `why` and `rkf` before any cell defines them, so a fresh
# Restart & Run All stopped here with a NameError. Kept for reference only.
# for kern in ['linear', 'poly', 'rbf', 'sigmoid']:
#
#     svc = SVC(kernel = kern, probability = True)
#
#     acc_scores = cross_val_score(svc, norm_ex, why.values.ravel(), scoring='accuracy', cv=rkf, n_jobs=-1)
#     prec_scores = cross_val_score(svc, norm_ex, why.values.ravel(), scoring='precision', cv=rkf, n_jobs=-1)
#     print(kern, ' accuracy: ', np.mean(acc_scores))
#     print(kern, ' precision: ', np.mean(prec_scores))

In [None]:
#TRY DROPPING GENES FOR RISK OF OVERFITTING
# drop() returns a new frame, so df_all_updated itself is left untouched
gene_level_cols = [col for col in df_all_updated.columns if 'gene_' in col]
df_sect_only = df_all_updated.drop(columns=gene_level_cols)

In [None]:
# Features for the gene-free table: everything except label and sample id
ex = df_sect_only.drop(columns=['outcome', 'sample_id'], errors='ignore')
# std_scl is re-fitted here, replacing whatever it was fitted on before
scale_ex = std_scl.fit_transform(ex)
norm_ex = normalize(ex)

# Label frame holding only the outcome column
why = df_sect_only.loc[:, df_sect_only.columns == 'outcome']


In [None]:
# Class proportions of the outcome label in the gene-free table
df_sect_only['outcome'].value_counts(normalize = True)

In [None]:
# 10X ITERATED 3-FOLD CROSS-VALIDATED ACCURACY AND PRECISION FOR MOST ROBUST EVAL W/SMALL SAMPLE
rkf = RepeatedKFold(n_splits=3, n_repeats=10)
log = lm.LogisticRegression()
# One cross_validate call scores both metrics on the same fits and folds; two
# cross_val_score calls with this unseeded splitter would draw different splits.
cv_results = cross_validate(log, scale_ex, why.values.ravel(),
                            scoring=('accuracy', 'precision'), cv=rkf, n_jobs=-1)
acc_scores = cv_results['test_accuracy']
prec_scores = cv_results['test_precision']
print('accuracy: ', np.mean(acc_scores))
print('precision: ', np.mean(prec_scores))

In [None]:
# NOW RANDOM FOREST
rf = ens.RandomForestClassifier()
# One cross_validate call scores both metrics on the same fits and folds; two
# cross_val_score calls with an unseeded splitter would draw different splits.
cv_results = cross_validate(rf, ex, why.values.ravel(),
                            scoring=('accuracy', 'precision'), cv=rkf, n_jobs=-1)
acc_scores = cv_results['test_accuracy']
prec_scores = cv_results['test_precision']
print('accuracy: ', np.mean(acc_scores))
print('precision: ', np.mean(prec_scores))

In [None]:
# L1-penalized logistic regression on the standardized features
lasso = lm.LogisticRegression(penalty = 'l1', solver = 'liblinear')
# One cross_validate call scores both metrics on the same fits and folds; two
# cross_val_score calls with an unseeded splitter would draw different splits.
cv_results = cross_validate(lasso, scale_ex, why.values.ravel(),
                            scoring=('accuracy', 'precision'), cv=rkf, n_jobs=-1)
acc_scores = cv_results['test_accuracy']
prec_scores = cv_results['test_precision']
print('accuracy: ', np.mean(acc_scores))
print('precision: ', np.mean(prec_scores))

In [None]:
# Sweep the ridge penalty over alpha = 1, 11, ..., 101
for a in np.arange(1, 102, 10):
    rdg = lm.RidgeClassifier(alpha = a)
    # One cross_validate call scores both metrics on the same fits and folds;
    # two cross_val_score calls with an unseeded splitter would re-split.
    cv_results = cross_validate(rdg, norm_ex, why.values.ravel(),
                                scoring=('accuracy', 'precision'), cv=rkf, n_jobs=-1)
    acc_scores = cv_results['test_accuracy']
    prec_scores = cv_results['test_precision']
    print(a, ' ridge accuracy: ', np.mean(acc_scores))
    print(a, ' ridge precision: ', np.mean(prec_scores))
    


In [None]:
# Sweep the four SVC kernels on the row-normalized features
for kern in ['linear', 'poly', 'rbf', 'sigmoid']:

    svc = SVC(kernel = kern, probability = True)

    # One cross_validate call scores both metrics on the same fits and folds;
    # two cross_val_score calls with an unseeded splitter would re-split.
    cv_results = cross_validate(svc, norm_ex, why.values.ravel(),
                                scoring=('accuracy', 'precision'), cv=rkf, n_jobs=-1)
    acc_scores = cv_results['test_accuracy']
    prec_scores = cv_results['test_precision']
    print(kern, ' accuracy: ', np.mean(acc_scores))
    print(kern, ' precision: ', np.mean(prec_scores))

In [None]:
#TRYING WAYYYYY SLIMMED DOWN VERSION
# Section-level features are kept only for the chromosomes listed here. The
# chromosome label is parsed out of the column name and compared exactly:
# substring checks such as 'chrom_1' in x also match chrom_10..chrom_19 (and
# 'chrom_2' matches chrom_20..chrom_22), which kept unintended columns.
# NOTE(review): assumes section columns embed the chromosome as
# 'chrom_<label>' followed by '_' or the end of the name -- confirm against
# stats_by_sample.csv.
SECTION_CHROMS_TO_KEEP = {'1', '2', '6', '11', '16', '17'}


def _is_unwanted_section(col):
    """Return True for a section column whose chromosome is not in the keep set."""
    if 'section' not in col:
        return False
    chrom_match = re.search(r'chrom_([^_]+)', col)
    return chrom_match is None or chrom_match.group(1) not in SECTION_CHROMS_TO_KEEP


# drop() returns a new frame, so df_all_updated itself is left untouched
df_trim = df_all_updated.drop(columns = [x for x in df_all_updated.columns if 'gene_' in x or
                                         'med_' in x or 'nty' in x or
                                         ('rawscore' in x and 'non_neg' not in x) or
                                         _is_unwanted_section(x)])

In [None]:
# df_trim.to_csv("trim_stats_by_sample.csv")
# Dimensions (rows, columns) of the trimmed table
df_trim.shape

In [None]:
# Features for the trimmed table: everything except label and sample id
ex = df_trim.drop(columns=['outcome', 'sample_id'], errors='ignore')
# std_scl is re-fitted here, replacing whatever it was fitted on before
scale_ex = std_scl.fit_transform(ex)
norm_ex = normalize(ex)

# Label frame holding only the outcome column
why = df_trim.loc[:, df_trim.columns == 'outcome']


In [None]:
# Class proportions of the outcome label in the trimmed table
df_trim['outcome'].value_counts(normalize = True)

In [None]:
# 10X ITERATED 3-FOLD CROSS-VALIDATED ACCURACY AND PRECISION FOR MOST ROBUST EVAL W/SMALL SAMPLE
rkf = RepeatedKFold(n_splits=3, n_repeats=10)
log = lm.LogisticRegression()
# One cross_validate call scores both metrics on the same fits and folds; two
# cross_val_score calls with this unseeded splitter would draw different splits.
cv_results = cross_validate(log, scale_ex, why.values.ravel(),
                            scoring=('accuracy', 'precision'), cv=rkf, n_jobs=-1)
acc_scores = cv_results['test_accuracy']
prec_scores = cv_results['test_precision']
print('accuracy: ', np.mean(acc_scores))
print('precision: ', np.mean(prec_scores))

In [None]:
# NOW RANDOM FOREST
rf = ens.RandomForestClassifier()
# One cross_validate call scores both metrics on the same fits and folds; two
# cross_val_score calls with an unseeded splitter would draw different splits.
cv_results = cross_validate(rf, ex, why.values.ravel(),
                            scoring=('accuracy', 'precision'), cv=rkf, n_jobs=-1)
acc_scores = cv_results['test_accuracy']
prec_scores = cv_results['test_precision']
print('accuracy: ', np.mean(acc_scores))
print('precision: ', np.mean(prec_scores))

In [None]:
# L1-penalized logistic regression on the standardized features
lasso = lm.LogisticRegression(penalty = 'l1', solver = 'liblinear')
# One cross_validate call scores both metrics on the same fits and folds; two
# cross_val_score calls with an unseeded splitter would draw different splits.
cv_results = cross_validate(lasso, scale_ex, why.values.ravel(),
                            scoring=('accuracy', 'precision'), cv=rkf, n_jobs=-1)
acc_scores = cv_results['test_accuracy']
prec_scores = cv_results['test_precision']
print('accuracy: ', np.mean(acc_scores))
print('precision: ', np.mean(prec_scores))

In [None]:
# Sweep the ridge penalty over alpha = 1, 11, ..., 101
for a in np.arange(1, 102, 10):
    rdg = lm.RidgeClassifier(alpha = a)
    # One cross_validate call scores both metrics on the same fits and folds;
    # two cross_val_score calls with an unseeded splitter would re-split.
    cv_results = cross_validate(rdg, norm_ex, why.values.ravel(),
                                scoring=('accuracy', 'precision'), cv=rkf, n_jobs=-1)
    acc_scores = cv_results['test_accuracy']
    prec_scores = cv_results['test_precision']
    print(a, ' ridge accuracy: ', np.mean(acc_scores))
    print(a, ' ridge precision: ', np.mean(prec_scores))
    


In [None]:
# Sweep the four SVC kernels on the row-normalized features
for kern in ['linear', 'poly', 'rbf', 'sigmoid']:

    svc = SVC(kernel = kern, probability = True)

    # One cross_validate call scores both metrics on the same fits and folds;
    # two cross_val_score calls with an unseeded splitter would re-split.
    cv_results = cross_validate(svc, norm_ex, why.values.ravel(),
                                scoring=('accuracy', 'precision'), cv=rkf, n_jobs=-1)
    acc_scores = cv_results['test_accuracy']
    prec_scores = cv_results['test_precision']
    print(kern, ' accuracy: ', np.mean(acc_scores))
    print(kern, ' precision: ', np.mean(prec_scores))

In [None]:
#TRYING NOT EVEN SECTION
# Build df_v_trim from df_trim by dropping every column whose name contains 'gene_',
# 'med_', 'nty' or 'section', plus every 'rawscore' column that is not a 'non_neg' one.
# The former `df_v_trim = df_all.copy()` was overwritten by this statement before it was
# ever read, so it has been removed; the resulting frame is unchanged.
# NOTE(review): if the trim was meant to start from df_all instead of df_trim, swap the
# source frame below -- confirm against how df_trim is built.
df_v_trim = df_trim.drop(columns = [x for x in df_trim.columns if 'gene_'  in x or\
                                             'med_' in x or 'nty' in x or\
                                              ('rawscore' in x and 'non_neg' not in x) or\
                                   'section' in x])

In [None]:
# Display the (rows, columns) dimensions of the trimmed frame
df_v_trim.shape

In [None]:
# FEATURES: everything in df_v_trim except the label column and the sample identifier
ex = df_v_trim.drop(columns = df_v_trim.columns[df_v_trim.columns.isin(['outcome', 'sample_id'])])

# Two rescaled copies of the features: standardized via std_scl, and row-normalized
scale_ex = std_scl.fit_transform(ex)
norm_ex = normalize(ex)

# LABELS: a one-column frame holding only 'outcome'
why = df_v_trim.drop(columns = df_v_trim.columns[df_v_trim.columns != 'outcome'])


In [None]:
# Class balance of the label: normalize=True reports proportions rather than raw counts
df_v_trim['outcome'].value_counts(normalize = True)

In [None]:
# 10X ITERATED 3-FOLD CROSS-VALIDATED ACCURACY AND PRECISION FOR MOST ROBUST EVAL W/SMALL SAMPLE
# The splitter is seeded so that every cross_val_score call that receives `rkf` iterates
# over the same 3 x 10 = 30 train/test splits. Without a seed each call reshuffles, which
# means accuracy and precision would be measured on different folds and the printed
# numbers would change on every run.
rkf = RepeatedKFold(n_splits=3, n_repeats=10, random_state=42)

# Plain (default-penalty) logistic regression, scored on the standardized features
log = lm.LogisticRegression()
acc_scores = cross_val_score(log, scale_ex, why.values.ravel(), scoring='accuracy', cv=rkf, n_jobs=-1)
prec_scores = cross_val_score(log, scale_ex, why.values.ravel(), scoring='precision', cv=rkf, n_jobs=-1)

# Report the mean over all 30 folds
print('accuracy: ', np.mean(acc_scores))
print('precision: ', np.mean(prec_scores))

In [None]:
# NOW RANDOM FOREST
# Scored on `ex` (not the scaled/normalized variants), reusing the shared `rkf` splitter.
rf = ens.RandomForestClassifier()

# One cross-validation run per metric: accuracy first, then precision
acc_scores, prec_scores = (
    cross_val_score(rf, ex, why.values.ravel(), scoring=metric, cv=rkf, n_jobs=-1)
    for metric in ('accuracy', 'precision')
)

print('accuracy: ', acc_scores.mean())
print('precision: ', prec_scores.mean())

In [None]:
# L1-PENALIZED (LASSO) LOGISTIC REGRESSION ON THE STANDARDIZED FEATURES
lasso = lm.LogisticRegression(penalty = 'l1', solver = 'liblinear')

# One cross-validation run per metric: accuracy first, then precision
acc_scores, prec_scores = (
    cross_val_score(lasso, scale_ex, why.values.ravel(), scoring=metric, cv=rkf, n_jobs=-1)
    for metric in ('accuracy', 'precision')
)

print('accuracy: ', acc_scores.mean())
print('precision: ', prec_scores.mean())

In [None]:
# RIDGE CLASSIFIER: SWEEP THE REGULARIZATION STRENGTH alpha OVER 1, 11, 21, ..., 101
for a in np.arange(1, 102, 10):
    rdg = lm.RidgeClassifier(alpha = a)

    # One cross-validation run per metric on the normalized features
    acc_scores, prec_scores = (
        cross_val_score(rdg, norm_ex, why.values.ravel(), scoring=metric, cv=rkf, n_jobs=-1)
        for metric in ('accuracy', 'precision')
    )

    print(a, ' ridge accuracy: ', acc_scores.mean())
    print(a, ' ridge precision: ', prec_scores.mean())
    


In [None]:
# SUPPORT VECTOR CLASSIFIER: COMPARE THE FOUR KERNELS ON THE NORMALIZED FEATURES
for kern in ['linear', 'poly', 'rbf', 'sigmoid']:
    svc = SVC(kernel = kern, probability = True)

    # One cross-validation run per metric: accuracy first, then precision
    acc_scores, prec_scores = (
        cross_val_score(svc, norm_ex, why.values.ravel(), scoring=metric, cv=rkf, n_jobs=-1)
        for metric in ('accuracy', 'precision')
    )

    print(kern, ' accuracy: ', acc_scores.mean())
    print(kern, ' precision: ', prec_scores.mean())

In [None]:
# RANDOM FOREST WITH NO SECTION FEATURES IS CONSISTENTLY THE BEST MODEL
# NEED TO FIGURE OUT SOME VERSION OF FEATURE SELECTION THAT DOESN'T RULE OUT CROSS-VALIDATION
# FOR NOW, GOING TO COMPARE 10K-ITERATION FEATURE IMPORTANCE (FI) VS PLAIN FEATURE SELECTION

In [None]:
# FEATURES AND LABELS FROM THE SECTION-ONLY FRAME
# Both the column filter and the label frame now come from df_sect_only itself.
# Previously the labels to drop were looked up in df_all.columns and `why` was taken
# from df_all, so features and labels could come from two different frames (and the
# drop could fail if df_all's id/label columns were not all present in df_sect_only).
ex = df_sect_only.drop(columns = [x for x in df_sect_only.columns if x == 'outcome' or x == 'sample_id'])
norm_ex = normalize(ex)
scale_ex = std_scl.fit_transform(ex)

# One-column frame holding only the 'outcome' label, row-aligned with `ex`
why = df_sect_only.drop(columns = [x for x in df_sect_only.columns if x != 'outcome'])


In [None]:

# ADABOOST FEATURE IMPORTANCE OVER MANY RANDOM 75/25 TRAIN/TEST SPLITS (SECTION-ONLY FRAME)
ex = df_sect_only.drop(columns = [x for x in df_sect_only.columns if x == 'outcome' or x == 'sample_id'])
norm_ex = normalize(ex)
scale_ex = std_scl.fit_transform(ex)

why = df_sect_only.drop(columns = [x for x in df_sect_only.columns if x != 'outcome'])

# Feature labels are the same on every iteration, so compute them once up front
feature_names = [x for x in df_sect_only.columns if x != 'outcome' and x != 'sample_id']

# Number of resampling rounds; the averaging step downstream divides by this same count
n_iterations = 10000

# Collect one small frame per round and concatenate once at the end; growing a
# DataFrame with pd.concat inside the loop re-copies all earlier rows every round.
fi_frames = []
for i in range(n_iterations):
    # Progress marker every 1000 rounds instead of one printed line per round
    if i % 1000 == 0:
        print(i)

    # No random_state is passed, so every round (and every run) draws a fresh split
    ex_train, ex_test, why_train, why_test = train_test_split(ex, why, test_size = .25)

    # To rank features with a random forest instead, fit ens.RandomForestClassifier()
    # here and read its feature_importances_ in place of ada's.
    ada = ens.AdaBoostClassifier().fit(ex_train, why_train.values.ravel())

    importances = ada.feature_importances_
    fi_df = pd.DataFrame({'feature_names': feature_names, 'feature_importance': importances})
    fi_frames.append(fi_df)

# Long table with one row per (round, feature); each round's 0..k-1 row index is kept
final_df = pd.concat(fi_frames)
final_df.to_csv("ada_fi.csv")



In [None]:
# TOTAL THE PER-ROUND IMPORTANCES FOR EACH FEATURE AND KEEP THE 25 LARGEST
keep_df = (
    final_df.copy()
    .groupby('feature_names')
    .sum()
    .reset_index()
    .sort_values(by = ['feature_importance'], ascending = False)
    .head(25)
)

# The CSV receives the summed importances (written before the averaging below)
keep_df.to_csv("ada_fi.csv")

#DEFINE SIZE OF BAR PLOT
plt.figure(figsize = (5, 7))

# Convert the sums into averages over the 10000 rounds
keep_df['feature_importance'] = keep_df['feature_importance'] / 10000


def _variable_type(name):
    """Classify a raw feature name into the legend category used for bar colours."""
    if 'non_neg_rawscore' in name:
        return 'CADD Score'
    if 'mutations' in name:
        return 'Number of Mutations'
    return 'Clinical'


def _display_name(name):
    """Turn a raw feature name into an axis label; the substitutions run in this order."""
    for pattern, replacement in (('_non_neg_rawscore', ''),
                                 ('_mutations', ''),
                                 ('_', ' '),
                                 ('section', 'Mbp'),
                                 ('chromosome', 'Chromosome'),
                                 ('chrom', 'Chrom.')):
        name = re.sub(pattern, replacement, name)
    return name


# The category must be derived from the raw names, so it is assigned before relabelling
keep_df['type'] = keep_df['feature_names'].apply(_variable_type)
keep_df['feature_names'] = keep_df['feature_names'].apply(_display_name)

#PLOT SEABORN BAR CHART
sns.barplot(
    x = keep_df['feature_importance'],
    y = keep_df['feature_names'],
    hue = keep_df['type'],
    ci = None,
)

#ADD CHART LABELS
plt.xlabel('Average Feature Importance (10k Iterations)')
plt.ylabel('Feature Names')
plt.title('Adaboost Feature Importance')
plt.legend(title = "Variable Type")
plt.savefig("adaboost_fi", bbox_inches = "tight")