In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import scale, StandardScaler,FunctionTransformer
from sklearn.decomposition import PCA
from sklearn.metrics import roc_curve, auc,confusion_matrix,roc_auc_score, precision_recall_curve
from sklearn.cross_decomposition import PLSRegression

import random
from tqdm import tqdm

SEED = 520

In [2]:
df = pd.read_csv('raw_data.csv', sep=';')

In [3]:
col_spectre = df.columns[4:-1]
other_col = ["patient_name",'cell_name','cell_type','patient_state','spectre']

# Preprocess the data
df['cell_type'] = df['cell_type'].replace('B',1)
df['cell_type'] = df['cell_type'].replace('TNK',0)

# One hot the specter number feature
for i in df['spectre'].unique():
    df['spectre_'+str(int(i))] = np.where(df['spectre'] == i, 1, 0)

In [69]:
def apply_dimensionality_reduction(x_train, x_test):
    pls = PLSRegression(n_components=20)

    spectre_transformed_train = pd.DataFrame(pls.fit_transform(x_train[col_spectre], y_train)[0])
    spectre_transformed_test = pd.DataFrame(pls.transform(x_test[col_spectre]))

    col_spectre_pls = spectre_transformed_train.columns

    x_train = pd.concat([x_train.iloc[:,-3:].reset_index(), spectre_transformed_train], axis=1).drop("index", axis=1)
    x_test = pd.concat([x_test.iloc[:,-3:].reset_index(), spectre_transformed_test], axis=1).drop("index", axis=1)
    
    return x_train, x_test

def scale_feature(x_train, x_test):
    # We scale continous feature
    
    col_spectre_pls = x_train.columns[3:]
    columnTransformer = ColumnTransformer([['scaler',
                                            StandardScaler(),
                                            col_spectre_pls]],
                                          remainder='passthrough',
                                          )

    x_train_transformed = np.array(columnTransformer.fit_transform(x_train))
    x_test_transformed = np.array(columnTransformer.transform(x_test))
    return x_train_transformed, x_test_transformed

def compute_metrics(test_df, verbose=False):
    cell_test = test_df.groupby(['patient_name','cell_name']).agg({'cell_type':"first",
                                                              'patient_state':"first",
                                                              "spectre":'count',
                                                              'prediction':"mean"})
    # We one hot encode the label
    cell_test['TNK'] = np.where(cell_test['cell_type']==0, 1, 0)
    cell_test['B'] = np.where(cell_test['cell_type']==1, 1, 0)

    # Using a threshold of 0.5, we assign label to the mean prediction of all specters for one cell

    cell_test['predicted_type'] = np.where(cell_test['prediction'] < 0.4, 0, 1)

    # We one hot encode the predicted label
    cell_test['TNK_predicted'] = np.where(cell_test['predicted_type'] == 0, 1, 0)
    cell_test['B_predicted'] = np.where(cell_test['predicted_type'] == 1, 1, 0)

    # We aggregate our test set at the patient level to get the prection for each patient

    patient_df = cell_test.groupby('patient_name').agg({'TNK':sum, 
                                                      'B':sum,
                                                      'TNK_predicted':sum,
                                                      'B_predicted':sum,
                                                      'patient_state':"first", 
                                                      'spectre':'sum'})

    patient_df['cell_number'] = patient_df['TNK'] + patient_df['B']
    patient_df['ratio_B'] = patient_df['B']/patient_df['cell_number']
    patient_df['ratio_B_predicted'] = patient_df['B_predicted']/patient_df['cell_number']

    patient_df['diff_ratio'] = abs(patient_df['ratio_B_predicted'] - patient_df['ratio_B'])
    patient_df['predicted_state'] = np.where(patient_df['ratio_B_predicted'] < 0.4, "sain", 'malade')
    patient_df.sort_values("ratio_B", ascending=False)
    
    # Error rate for healthy and sick patient
    errors = [list(patient_df['diff_ratio'][patient_df['patient_state'] == 'malade']), 
              list(patient_df['diff_ratio'][patient_df['patient_state'] == 'sain'])]
    
    accuracy = len(patient_df[patient_df['predicted_state'] == patient_df['patient_state']]) / len(patient_df)
    
    if verbose==True:

        print('Moyenne de l\'erreur', patient_df['diff_ratio'].mean())
        print('Variance de l\'erreur',patient_df['diff_ratio'].var())
        print('Patients malades bien prédit', len(patient_df[(patient_df.predicted_state == patient_df.patient_state) \
                                                             & (patient_df.patient_state == "malade")]),'/',
                                              len(patient_df[patient_df.patient_state == 'malade']))

        print('Patients sains bien prédit', len(patient_df[(patient_df.predicted_state == patient_df.patient_state) \
                                                             & (patient_df.patient_state == "sain")]),'/',
                                              len(patient_df[patient_df.patient_state == 'sain']))
                                          
    return errors, accuracy

In [None]:
patient_malade = list(df[df['patient_state'] == 'malade'].patient_name.unique())
patient_sain = list(df[df['patient_state'] == 'sain'].patient_name.unique())

n_partitions = []

n_models = []

N_PARTITIONS = 20
N_PATIENTS_TEST = 2

for i in range(N_PARTITIONS):
    patient_test = random.sample(patient_malade, N_PATIENTS_TEST) + random.sample(patient_sain, N_PATIENTS_TEST)
    n_partitions.append(patient_test)
    n_models.append(RandomForestClassifier(max_depth=15,n_estimators=50))

mean_errors_sick = []
mean_errors_healthy = []
accuracy = []

# Main loop
for i, partitions in enumerate(n_partitions):
    
    print('Training partition {}...'.format(i))
    
    x_train = df.loc[~df.patient_name.isin(partitions)]
    y_train = df.loc[~df.patient_name.isin(partitions)]['cell_type']
    x_test = df.loc[df.patient_name.isin(partitions)]
    y_test = df.loc[df.patient_name.isin(partitions)]['cell_type']
    x_train = x_train.drop(['cell_type','cell_name','patient_name','patient_state','spectre'], axis=1)
    x_test = x_test.drop(['cell_type','cell_name','patient_name','patient_state','spectre'], axis=1)
    
    # Apply a Partial least squares regression to reduce the dimensionality
    x_train, x_test = apply_dimensionality_reduction(x_train, x_test)
    x_train, x_test = scale_feature(x_train, x_test)
    
    # We retrain our model on the whole train set using the best hyperparameters
    n_models[i].fit(x_train, y_train.values.ravel())
    
    # Show confusion matrix
    y_result = n_models[i].predict_proba(x_test)[:,1]
    y_result = np.where(y_result < 0.4, 0, 1)
    
    # We select all the patients that was not used for training or validation
    test_set = df[df.patient_name.isin(partitions)]
    test_set = test_set.assign(prediction = clf.predict_proba(x_test)[:, 1])
    
    metrics = compute_metrics(test_set)
    mean_errors_sick += metrics[0][0]
    mean_errors_healthy += metrics[0][1]
    accuracy.append(metrics[1])

Training partition 0...
Training partition 1...
Training partition 2...
Training partition 3...
Training partition 4...
Training partition 5...
Training partition 6...
Training partition 7...
Training partition 8...
Training partition 9...
Training partition 10...
Training partition 11...
Training partition 12...


In [75]:
np.mean(mean_errors_sick)

0.1276757586562666

In [76]:
np.mean(mean_errors_healthy)

0.18550728161968896

In [77]:
np.mean(accuracy)

0.8125