In [1]:
import pandas as pd
import numpy as np
import pylab as plt
from IPython import display
import seaborn as sns
sns.set()
import matplotlib
from pylab import rcParams
import itertools 
import time 
from random import seed
from random import random
from matplotlib import pyplot
# 
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import roc_auc_score, auc
#from skimage.io import imread
from PIL import Image

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
from plotly.subplots import make_subplots

#Reading data, calendar and sell_prices
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')
sample_submission = pd.read_csv('../data/sample_submission.csv')
train.head(3)

Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,target
0,ISIC_2637011,IP_7279968,male,45.0,head/neck,unknown,benign,0
1,ISIC_0015719,IP_3075186,female,45.0,upper extremity,unknown,benign,0
2,ISIC_0052212,IP_2842074,female,50.0,lower extremity,nevus,benign,0


## Patient Id Counts

### Extract gender per patient

In [2]:
def create_patient_df(train,test):
    #Create a series with patient_id as index and gender values for train and test
    patient_gender_train = train.groupby("patient_id").sex.unique().apply(lambda l: l[0])
    patient_gender_test = test.groupby("patient_id").sex.unique().apply(lambda l: l[0])
    #Convert series into train_patients and test_patients dataframes 
    train_patients = pd.DataFrame(index=patient_gender_train.index.values, 
                                  data=patient_gender_train.values, columns=["sex"])
    test_patients = pd.DataFrame(index=patient_gender_test.index.values, 
                                 data=patient_gender_test.values, columns=["sex"])
    #Extract nun_images with proper groupby
    train_patients["num_images"] = train.groupby("patient_id").size()
    test_patients["num_images"] = test.groupby("patient_id").size()
    #Extract age_span for patient with multiple images
    train_patients["min_age"] = train.groupby("patient_id").age_approx.min()
    train_patients["max_age"] = train.groupby("patient_id").age_approx.max()
    test_patients["min_age"] = test.groupby("patient_id").age_approx.min()
    test_patients["max_age"] = test.groupby("patient_id").age_approx.max()
    train_patients["age_span"] = train_patients["max_age"] - train_patients["min_age"]
    test_patients["age_span"] = test_patients["max_age"] - test_patients["min_age"]
    #Extract benign_cases  and min and max aged for malignant
    train_patients["benign_cases"] = train.groupby(["patient_id", "benign_malignant"]).size().loc[:, "benign"]
    train_patients["malignant_cases"] = train.groupby(["patient_id", "benign_malignant"]).size().loc[:, "malignant"]
    train_patients["min_age_malignant"] = train.groupby(["patient_id", "benign_malignant"]).age_approx.min().loc[:, "malignant"]
    train_patients["max_age_malignant"] = train.groupby(["patient_id", "benign_malignant"]).age_approx.max().loc[:, "malignant"]
    return train_patients, test_patients
train_patients, test_patients = create_patient_df(train,test)

In [3]:
print("train_patients shape:",train_patients.shape)
print('Malignant cases count train:')
print(train_patients.malignant_cases.notnull().value_counts())
train_patients.sort_values(by="malignant_cases", ascending=False).head()

train_patients shape: (2056, 9)
Malignant cases count train:
False    1628
True      428
Name: malignant_cases, dtype: int64


Unnamed: 0,sex,num_images,min_age,max_age,age_span,benign_cases,malignant_cases,min_age_malignant,max_age_malignant
IP_7373371,female,42,55.0,60.0,5.0,34.0,8.0,55.0,60.0
IP_9111321,male,13,60.0,60.0,0.0,7.0,6.0,60.0,60.0
IP_2412574,male,23,65.0,70.0,5.0,18.0,5.0,65.0,70.0
IP_3237442,male,8,65.0,85.0,20.0,3.0,5.0,65.0,80.0
IP_9037179,male,7,70.0,75.0,5.0,3.0,4.0,70.0,70.0


### Categorical variables

### Preprocess categorical variables 
The `anatom_site_general_challenge` in the train set has a lot of  NaNs, we will treat the as as Unknown category since in addition this happens as weel in the test set. For the `sex` category we only have a small number of NaNs and only two categories, for that reason we will replace it with the mode. Is this the best approach ??

Use the next code cell to label encode the data in `X_train` and `X_valid`.  Set the preprocessed DataFrames to `label_X_train` and `label_X_valid`, respectively.  
- We have provided code below to drop the categorical columns in `bad_label_cols` from the dataset. 
- You should label encode the categorical columns in `good_label_cols`.

In [25]:
X_train.index

Int64Index([22826, 20292, 29431, 15395,  2333, 23787, 15895, 17271, 29285,
            17599,
            ...
             2433,   769,  1685, 16023, 21962, 16850,  6265, 11284,   860,
            15795],
           dtype='int64', length=26500)

In [27]:
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

from sklearn.model_selection import train_test_split
cols_to_drop = ['image_name','diagnosis', 'benign_malignant','target']
y = train.target
X = train.drop(cols_to_drop, axis=1)

#Missing values Categorcial values
# Fill NaN with Unknown on anatom_site_general_challenge
X['anatom_site_general_challenge'].fillna("Unknown", inplace=True)
# Fill NaN with mode on 'sex'
mode = X['sex'].mode()[0]
X['sex'].fillna(mode, inplace=True)

#Add patient df to train set
train_patients_cols_add=["num_images","min_age","max_age","age_span"]
X=X.set_index('patient_id').join(train_patients[train_patients_cols_add]).reset_index()
X = X.rename(columns={'index': 'patient_id'})
#Add patient test set
X_test=test.set_index('patient_id').join(test_patients[train_patients_cols_add]).reset_index()
X_test = X_test.rename(columns={'index': 'patient_id'})


# Divide data into training and validation subsets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                      random_state=42)
# All categorical columns
object_cols = ['patient_id', 'sex', 'anatom_site_general_challenge']

# Imputation Numerical features 
# Get names of columns with missing values
cols_with_missing = ['age_approx', 'min_age','max_age','age_span']
my_imputer = SimpleImputer()
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train[cols_with_missing]), index=X_train.index)
imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid[cols_with_missing]), index=X_valid.index)
# Imputation removed column names; put them back
imputed_X_train.columns = X_train[cols_with_missing].columns
imputed_X_valid.columns = X_valid[cols_with_missing].columns

# Label encode
# Columns that can be safely label encoded and Problematic columns
good_label_cols = ['sex', 'anatom_site_general_challenge']
bad_label_cols = ['patient_id', 'age_approx', 'min_age','max_age','age_span','num_images']
# Drop categorical columns that will not be encoded
label_X_train = X_train.drop(bad_label_cols, axis=1)
label_X_valid = X_valid.drop(bad_label_cols, axis=1)
# Apply label encoder to bad_label_cols 
label_encoder = LabelEncoder()
for col in good_label_cols:
    label_X_train[col] = label_encoder.fit_transform(label_X_train[col])
    label_X_valid[col] = label_encoder.transform(label_X_valid[col])
    
label_X_train.head(3)

Unnamed: 0,sex,anatom_site_general_challenge
22826,0,5
20292,1,2
29431,1,1


In [28]:
imputed_X_train.head(3)

Unnamed: 0,age_approx,min_age,max_age,age_span
22826,35.0,35.0,35.0,0.0
20292,35.0,30.0,35.0,5.0
29431,45.0,45.0,45.0,0.0


In [29]:
X_train.head(3)

Unnamed: 0,patient_id,sex,age_approx,anatom_site_general_challenge,num_images,min_age,max_age,age_span
22826,IP_7121757,female,35.0,torso,54,35.0,35.0,0.0
20292,IP_6323321,male,35.0,lower extremity,30,30.0,35.0,5.0
29431,IP_8988837,male,45.0,head/neck,26,45.0,45.0,0.0


## Generate Baseline data

In [42]:
cols_to_keep = ['num_images']

#add imputed
baseline_train = X_train[cols_to_keep].join(imputed_X_train)
baseline_valid = X_valid[cols_to_keep].join(imputed_X_valid)
#add categorical label
baseline_train = baseline_train.join(label_X_train)
baseline_valid = baseline_valid.join(label_X_valid)
baseline_train.head()

Unnamed: 0,num_images,age_approx,min_age,max_age,age_span,sex,anatom_site_general_challenge
22826,54,35.0,35.0,35.0,0.0,0,5
20292,30,35.0,30.0,35.0,5.0,1,2
29431,26,45.0,45.0,45.0,0.0,1,1
15395,52,30.0,30.0,30.0,0.0,0,5
2333,58,80.0,80.0,80.0,0.0,1,5


## Define MAE metric

In [45]:
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

# Function for comparing different approaches
def score_dataset(X_train, X_valid, y_train, y_valid):
    model = RandomForestRegressor(n_estimators=20, random_state=42)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return metrics.roc_auc_score(y_valid, preds)

## Simple Imputer  
### Numerical Variables

In [46]:
print("MAE from Approach 2 (Imputation):")
print(score_dataset(baseline_train, baseline_valid, y_train, y_valid))
baseline_train.head()

MAE from Approach 2 (Imputation):
0.4822059759819096


Unnamed: 0,num_images,age_approx,min_age,max_age,age_span,sex,anatom_site_general_challenge
22826,54,35.0,35.0,35.0,0.0,0,5
20292,30,35.0,30.0,35.0,5.0,1,2
29431,26,45.0,45.0,45.0,0.0,1,1
15395,52,30.0,30.0,30.0,0.0,0,5
2333,58,80.0,80.0,80.0,0.0,1,5


Note the indexes are not the same after processing, use `df.join(other, lsuffix='_caller', rsuffix='_other')`

Add imputed columns to `X_train` 

In [8]:
col = 'age_approx'
X_train.loc[:, col+'_impute']=imputed_X_train[col].values.copy()
X_train.head(3)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,patient_id,sex,age_approx,anatom_site_general_challenge,age_approx_impute
8762,IP_5205991,male,,lower extremity,48.833951
4043,IP_5205991,male,,upper extremity,48.833951
11963,IP_5205991,male,,head/neck,48.833951
18968,IP_5205991,male,,head/neck,48.833951
17521,IP_5205991,male,,upper extremity,48.833951


Add label encoded images to `X_train`

In [9]:
for col in good_label_cols:
    X_train.loc[:, col+'_label']=label_X_train[col].values.copy()
X_train.head(3)    

Unnamed: 0,patient_id,sex,age_approx,anatom_site_general_challenge,age_approx_impute,sex_label,anatom_site_general_challenge_label
22826,IP_1800426,female,40.0,lower extremity,40.0,0,2
20292,IP_7241377,female,60.0,torso,60.0,0,5
29431,IP_3564160,female,65.0,upper extremity,65.0,0,6
15395,IP_9955501,male,65.0,torso,65.0,1,5
2333,IP_4208266,female,30.0,torso,30.0,0,5


In [10]:
train_patients_cols_add=["num_images","min_age","max_age","age_span"]
X_train=X_train.set_index('patient_id').join(train_patients[train_patients_cols_add])

In [12]:
X_train.reset_index().select_dtypes(exclude=['object'])

Unnamed: 0,age_approx,age_approx_impute,sex_label,anatom_site_general_challenge_label,num_images,min_age,max_age,age_span
0,50.0,50.0,0,3,6,50.0,50.0,0.0
1,50.0,50.0,0,1,6,50.0,50.0,0.0
2,50.0,50.0,0,6,6,50.0,50.0,0.0
3,50.0,50.0,0,5,6,50.0,50.0,0.0
4,50.0,50.0,0,5,6,50.0,50.0,0.0
...,...,...,...,...,...,...,...,...
26495,75.0,75.0,0,5,11,75.0,80.0,5.0
26496,80.0,80.0,0,5,11,75.0,80.0,5.0
26497,75.0,75.0,0,5,11,75.0,80.0,5.0
26498,75.0,75.0,0,6,11,75.0,80.0,5.0


In [42]:
X=X.set_index('patient_id').join(train_patients[train_patients_cols_add]).reset_index()
X = X.rename(columns={'index': 'patient_id'})
X.head()

Unnamed: 0,patient_id,sex,age_approx,anatom_site_general_challenge,num_images,min_age,max_age,age_span
0,IP_0001230,female,50.0,oral/genital,6,50.0,50.0,0.0
1,IP_0001230,female,50.0,lower extremity,6,50.0,50.0,0.0
2,IP_0001230,female,50.0,torso,6,50.0,50.0,0.0
3,IP_0001230,female,50.0,upper extremity,6,50.0,50.0,0.0
4,IP_0001230,female,50.0,torso,6,50.0,50.0,0.0


In [44]:
X.age_span.min()

0.0

In [45]:
X[X['age_approx'].isnull() == True]

Unnamed: 0,patient_id,sex,age_approx,anatom_site_general_challenge,num_images,min_age,max_age,age_span
1377,IP_0550106,female,,torso,3,,,
1378,IP_0550106,female,,head/neck,3,,,
1379,IP_0550106,female,,head/neck,3,,,
16806,IP_5205991,male,,lower extremity,48,,,
16807,IP_5205991,male,,upper extremity,48,,,
...,...,...,...,...,...,...,...,...
32557,IP_9835712,male,,torso,17,,,
32558,IP_9835712,male,,upper extremity,17,,,
32559,IP_9835712,male,,lower extremity,17,,,
32560,IP_9835712,male,,torso,17,,,


## Summary 

1. Too much data, what to do ?? Cloud, Kaggle. Encode images in cloud like env and then use encodings locally. Ignore images for the mean time and include them later on. 
2. Binary classifier
3. `image_name, patient_id, sex, age_approx, anatom_site_general_challenge, diagnosis, benign_malignant, target` Categorical variables ?? 
4. PCA o tSNE 
4. Ideas: Use ResNet to encode images, other options Inception, VGG, CV2 ?? Which??
5. Note that test set DO NOT HAVE: `diagnosis, benign_malignant, target`  and the task is to predict ` target`
6. Is there any other relevant information in the DICOM or tfrecords available in the data set. 
7. What should we use for a cost function
8. Review Notebooks on Kaggle, see below.  
9. How to work with unbalance datasets, 
10. Benchamark XGBoost... 


Reference 
* [Kaggel notebook](https://www.kaggle.com/allunia/don-t-turn-into-a-smoothie-after-the-shake-up)

## Strategy

1. Run model with only the metadata, XGBoost, logistic regresion(benchmark), categorical variables
2. f1 score, note that 
3. Feature importance -- XGBoost, shapely values
66. Encoding de las imagenes --  mole size ?????  
3. Preprocessing (mole shape) CV2 ("normalise"), 
4. Mid July EDA 
5. Agosto Run-Run-Run

## Ideas con Carlos
1. Usar features de smoothie -- Numero de imagenes por paciente, sacar mas features, benign/malign(no esta train set), 
2. Mirar imagenes: Data augmentation (positives), contornos, y color --- standarizar.
3. StratifiesKFold para solucionar low positive stats
4. See categorical variables, oridinal or not. 

# Dependencies

In [3]:
!pip install seaborn

Collecting seaborn
  Downloading seaborn-0.10.1-py3-none-any.whl (215 kB)
[K     |████████████████████████████████| 215 kB 374 kB/s eta 0:00:01
Installing collected packages: seaborn
Successfully installed seaborn-0.10.1


In [59]:

# Python3 code to Check for  
# balanced parentheses in an expression 
open_list = ["[","{","("] 
close_list = ["]","}",")"] 
  
# Function to check parentheses 
def balanced(myStr): 
    stack = [] 
    for i in myStr: 
        if i in open_list: 
            stack.append(i) 
        elif i in close_list: 
            pos = close_list.index(i) 
            if ((len(stack) > 0) and
                (open_list[pos] == stack[len(stack)-1])): 
                stack.pop() 
            else: 
                return "Unbalanced"
    if len(stack) == 0: 
        return "Balanced"
    else: 
        return "Unbalanced"
  
  
# Driver code 
string = "{[]{()}}"
print(string,"-", balanced(string)) 
  
string = "[{}{})(]"
print(string,"-", balanced(string)) 
  
string = "((()"
print(string,"-",balanced(string)) 

string ="(a[0]+b[2c[6]]){24+53}"
print(string,"-",balanced(string)) 

{[]{()}} - Balanced
[{}{})(] - Unbalanced
((() - Unbalanced
(a[0]+b[2c[6]]){24+53} - Balanced


In [61]:
string1 = '(a[0]+b[2c[6]]){24+53}'
print(string1,"-",balanced(string1)) 
string2 = "f(e(d))"
print(string2,"-",balanced(string2))
string3 = "[()]{}([])"
print(string3,"-",balanced(string3)) 
string4 = "((b)"
print(string4,"-",balanced(string4)) 
string5 = "(c]"
print(string5,"-",balanced(string5)) 
string6 = "{(a[])"
print(string6,"-",balanced(string6)) 
string7 = "([)]"
print(string7,"-",balanced(string7)) 
string8 = ")("
print(string8,"-",balanced(string8)) 
string9 = ""
print(string9,"-",balanced(string9)) 

(a[0]+b[2c[6]]){24+53} - Balanced
f(e(d)) - Balanced
[()]{}([]) - Balanced
((b) - Unbalanced
(c] - Unbalanced
{(a[]) - Unbalanced
([)] - Unbalanced
)( - Unbalanced
 - Balanced
