#### Librairies et lecture des jeux de données

In [None]:
# Import libraries
import numpy as np
import pandas as pd
import argparse
from torch.serialization import save
from transformers_interpret import SequenceClassificationExplainer
import matplotlib.pyplot as plt

from kmembert.models import HealthBERT
from kmembert.utils import create_session

In [None]:
# Notebook options, declared as a table of
# (short flag, long flag, type, default, help text) and registered in a loop.
import argparse

parser = argparse.ArgumentParser()
for short_flag, long_flag, value_type, default_value, description in (
    ("-d", "--data_folder", str, "data/ehr/test.csv",
     "data path to access to the testing file"),
    ("-p", "--path_dataset", str, "data/ehr/test.csv",
     "data path to access to the testing file"),
    ("-r", "--resume", str, "kmembert-base",
     "result folder in with the saved checkpoint will be reused"),
    ("-nr", "--nrows", int, 10,
     "maximum number of samples for testing"),
    ("-f", "--folder_to_save", str, "graphs",
     "folder to save the figures"),
    ("-ng", "--noigr", int, 2,
     "The Noigr of a patient"),
):
    parser.add_argument(short_flag, long_flag, type=value_type,
                        default=default_value, help=description)

# An empty argument string is parsed, so every option takes its default value.
args = parser.parse_args("")

In [None]:
# Load the test documents and the model predictions, then join them.
# Paths use forward slashes: they resolve on Windows, Linux and macOS alike,
# whereas "\\" separators only work on Windows.
test = pd.read_csv("kmembert/Visualization/test_VM.csv")

# Predictions file
res_pred = pd.read_csv("kmembert/Visualization/results_pred_VM.csv")

# Join `indice` (test file) with `ind` (prediction file) and keep only the
# columns used further down.
resul_df = pd.merge(test, res_pred, left_on="indice", right_on="ind")[
    ['Noigr', 'Date cr', 'Date deces', 'Texte', 'indice', 'pred', 'lab']
]

# Absolute error between prediction and label
resul_df['ecart'] = (resul_df['pred'] - resul_df['lab']).abs()

# Number of whitespace-separated tokens in each text
resul_df['nb_words'] = resul_df['Texte'].apply(lambda x: len(x.split()))

# Number of rows sharing the same Noigr
resul_df['nb_docs'] = resul_df.groupby(["Noigr"])["Noigr"].transform("count")

# Largest errors first
resul_df = resul_df.sort_values("ecart", ascending=False)

#### Modification du jeu de données

In [None]:
# One boolean column per Performance Status value: PS_flag_<n> is True when
# the literal text "PS = <n>" occurs in the document. Columns are created
# from 4 down to 0.
for ps_value in (4, 3, 2, 1, 0):
    ps_marker = "PS = {}".format(ps_value)
    resul_df['PS_flag_{}'.format(ps_value)] = resul_df['Texte'].apply(
        lambda text, marker=ps_marker: marker in text
    )

In [None]:
# Collapse the five PS_flag_<n> booleans into one label column.
# np.select takes the first matching condition, so when several markers
# appear in the same text the lowest PS value wins.
conditions = [
    resul_df['PS_flag_0'],
    resul_df['PS_flag_1'],
    resul_df['PS_flag_2'],
    resul_df['PS_flag_3'],
    resul_df['PS_flag_4'],
]

# Label assigned for each condition, in the same order
values = ['PS=0', 'PS=1', 'PS=2', 'PS=3', 'PS=4']

# The fallback is given explicitly as the string '0': leaving np.select's
# implicit integer default next to string choices is rejected by NumPy
# versions that require a common dtype. '0' is the sentinel filtered out below.
resul_df['PS_flag'] = np.select(conditions, values, default='0')

# Keep only rows with a detected PS. .copy() makes the subset an independent
# frame, so adding a column does not raise pandas' SettingWithCopyWarning.
resul_df_PS = resul_df[resul_df.PS_flag != '0'].copy()

# Number of occurrences of each PS label
resul_df_PS['PS_count'] = resul_df_PS.groupby(["PS_flag"])["PS_flag"].transform("count")

resul_df_PS.head(2)

#### Test statistique : Kruskal

In [None]:
# Statistical tests on the differences between PS groups: observed values (`lab`)
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

# One series of labels per PS group, in PS order
lab_by_ps = [
    resul_df_PS.loc[resul_df_PS.PS_flag == ps_label, 'lab']
    for ps_label in ('PS=0', 'PS=1', 'PS=2', 'PS=3', 'PS=4')
]
a, b, c, d, e = lab_by_ps

# Both tests receive the same five groups. The ANOVA call previously passed
# the PS=4 group twice and left out PS=3.
# Each print shows the full result object (statistic and p-value).
print("Kruskal, pvalue: ", stats.kruskal(*lab_by_ps))
print("Anova, pvalue: ", stats.f_oneway(*lab_by_ps))

In [None]:
# Statistical tests on the differences between PS groups: predicted values (`pred`)
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

# One series of predictions per PS group, in PS order
pred_by_ps = [
    resul_df_PS.loc[resul_df_PS.PS_flag == ps_label, 'pred']
    for ps_label in ('PS=0', 'PS=1', 'PS=2', 'PS=3', 'PS=4')
]
a, b, c, d, e = pred_by_ps

# Both tests receive the same five groups. The ANOVA call previously passed
# the PS=4 group twice and left out PS=3.
# Each print shows the full result object (statistic and p-value).
print("Kruskal, pvalue: ", stats.kruskal(*pred_by_ps))
print("Anova, pvalue: ", stats.f_oneway(*pred_by_ps))

#### BoxPlot

In [None]:
# Box plot of the observed label (`lab`), one box per Performance Status group
ps_order = ['PS=0', 'PS=1', 'PS=2', 'PS=3', 'PS=4']
label_axes = sns.boxplot(data=resul_df_PS, x='PS_flag', y='lab',
                         order=ps_order, palette='rocket_r')
label_axes.set_xlabel("Performance Status Value")
label_axes.set_ylabel("Label")
label_axes.set_title("BoxPlot - Survival label per Performance Status")

In [None]:
# Box plot of the model output (`pred`), one box per Performance Status group
ps_order = ['PS=0', 'PS=1', 'PS=2', 'PS=3', 'PS=4']
pred_axes = sns.boxplot(data=resul_df_PS, x='PS_flag', y='pred',
                        order=ps_order, palette='rocket_r')
pred_axes.set_xlabel("Performance Status Value")
pred_axes.set_ylabel("Predicted probability")
pred_axes.set_title("BoxPlot - Survival prediction per Performance Status")

In [None]:
# Box plot of label and prediction side by side for each Performance Status
fig = plt.figure()
ax = plt.subplot(111)

# Reshape to long format: one row per (PS_flag, variable, value), where
# `variable` is renamed from lab/pred to the legend text.
df_plot = resul_df_PS.melt(id_vars='PS_flag', value_vars=["lab", "pred"])
df_plot['variable'] = df_plot['variable'].replace({'lab': 'Label', 'pred': 'Prediction'})

# Plot on the axes created above
sns.boxplot(x='PS_flag', y='value', hue='variable', data=df_plot,
            order=['PS=0', 'PS=1', 'PS=2', 'PS=3', 'PS=4'],
            palette=['peachpuff', 'lightcoral'], ax=ax)
ax.set_xlabel("Performance Status Value")
ax.set_ylabel("Probability")
ax.set_title("BoxPlot - Survival probability per Performance Status")

# Put the legend below the axes and move the x label down to make room
ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.225),
          fancybox=True, shadow=True, ncol=5)
ax.xaxis.set_label_coords(0.5, -0.19)

# Boxed annotation under each group, as (x shift, text). The x shifts are
# hand-tuned per label.
# NOTE(review): the numbers are hard-coded; presumably the number of documents
# per PS group (cf. the PS_count column) - confirm and derive them from the data.
ps_annotations = ((0, '2549'), (1, '5081'), (2, '1238'), (3.05, '308'), (4.1, '85'))
for x_shift, annotation in ps_annotations:
    # Only one font-size keyword is passed: `size` is an alias of `fontsize`
    # and Matplotlib raises TypeError when both are given. 8 is kept as the
    # value of the later keyword - TODO confirm the intended size.
    ax.text(-0.175 + x_shift, -0.205, annotation, fontsize=8, fontweight='bold',
            color='black', bbox=dict(facecolor='moccasin'))

#### Création du jeu de données

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime

def strDate_to_days(row, date_format = "%Y-%m-%d"):
    '''
        Signed number of days between the two dates of one row:
        'Date deces' minus 'Date cr'.

        row         : mapping (dict, pandas row, ...) whose 'Date deces' and
                      'Date cr' entries are strings written in `date_format`
        date_format : strptime pattern used to parse both entries

        Original note: for use when there is no FLAG_DECES column in the df.
    '''
    # 'Date deces' is parsed first, then 'Date cr'
    parsed = {
        key: datetime.strptime(row[key], date_format)
        for key in ('Date deces', 'Date cr')
    }
    return (parsed['Date deces'] - parsed['Date cr']).days

def changeFormat(df: pd.DataFrame, flag_GR: bool):
    '''
        Rewrite the date columns as dash-separated strings and add a
        'survival_time' column computed row by row with strDate_to_days.

        df      : frame with 'Date cr' and 'Date deces' columns; when flag_GR
                  is true, a frame whose column 0 holds comma-separated
                  records instead
        flag_GR : if true, column 0 is split on ',', the first row is dropped,
                  and fields 0, 2 and 3 are kept and named
                  "Noigr", "Date deces", "Date cr"

        Returns the reformatted frame. With flag_GR false, the frame passed in
        is itself modified in place and returned.
    '''
    def _dashed(value):
        # str(value) cut as [0:4]-[4:6]-[6:], e.g. 20120922 -> '2012-09-22'.
        # Pure slicing: the value is not validated as a date here.
        digits = str(value)
        return '-'.join([digits[:4], digits[4:6], digits[6:]])

    if flag_GR:
        df = df[0].str.split(',', expand=True)
        # .copy() gives an independent frame, so the column assignments below
        # do not write into a selection of the split frame.
        df = df.iloc[1:, [0, 2, 3]].copy()
        df.columns = ["Noigr", "Date deces", "Date cr"]

    for date_column in ('Date cr', 'Date deces'):
        df[date_column] = df[date_column].apply(_dashed)
    df['survival_time'] = df.apply(strDate_to_days, axis=1)

    return df

In [9]:
# Read the ten files test_rs0.csv ... test_rs9.csv and stack their rows.
# Forward slashes keep the paths valid on every OS ("\\" only works on
# Windows), and one concat over the whole list avoids re-copying the
# accumulated frame on every iteration.
result = pd.concat(
    [pd.read_csv("doc_toTest/test_rs{}.csv".format(file_number))
     for file_number in range(10)]
)

# Remove duplicate rows
result = result.drop_duplicates()

In [10]:
# Keep only the documents whose text contains "PS =" or "Karnofsky ="
result['PS_KAR_flag'] = result['Texte'].apply(lambda x: ("PS =" in x) or ("Karnofsky =" in x))
result = result[result.PS_KAR_flag].reset_index(drop=True)

# One boolean column per PS value: PS_flag_<n> is True when the literal text
# "PS = <n>" occurs in the document (columns created from 4 down to 0).
for ps_value in (4, 3, 2, 1, 0):
    ps_marker = "PS = {}".format(ps_value)
    result['PS_flag_{}'.format(ps_value)] = result['Texte'].apply(
        lambda x, marker=ps_marker: marker in x
    )

# Collapse the five booleans into one label. np.select takes the first
# matching condition, so the lowest PS value wins when several are present.
conditions = [
    result['PS_flag_0'],
    result['PS_flag_1'],
    result['PS_flag_2'],
    result['PS_flag_3'],
    result['PS_flag_4'],
]

# Label assigned for each condition, in the same order
values = ['PS=0', 'PS=1', 'PS=2', 'PS=3', 'PS=4']

# The fallback is given explicitly as the string '0': leaving np.select's
# implicit integer default next to string choices is rejected by NumPy
# versions that require a common dtype. '0' is the sentinel filtered out below.
result['PS_flag'] = np.select(conditions, values, default='0')

# .copy() makes the subset an independent frame, so adding the `indice`
# column does not raise pandas' SettingWithCopyWarning.
resul_df_PS = result[result.PS_flag != '0'].copy()

# Optional step, disabled: remove the Karnofsky and PS mentions from the text
# resul_df_PS['Texte'] = resul_df_PS['Texte'].str.replace('Karnofsky.*?PS = \d.', '')
# resul_df_PS['Texte'] = resul_df_PS['Texte'].str.replace('PS = \d', '')

# Running index 0..n-1 over the kept rows
resul_df_PS['indice'] = list(range(len(resul_df_PS)))

# Final column set. A per-label PS_count column is not computed here because
# this selection would discard it.
resul_df_PS = resul_df_PS[['Noigr', 'Date cr', 'Date deces', 'Texte', 'PS_flag', 'indice']]

resul_df_PS.sort_values("indice").head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,Noigr,Date cr,Date deces,Texte,PS_flag,indice
0,6035881,20041004,20120922,"Karnofsky = 100 %, PS = 0. Evolutivité de la m...",PS=0,0
1,6035881,20051026,20120922,"Karnofsky = 100 %, PS = 0. Se plaint depuis la...",PS=0,1
2,6035881,20051121,20120922,"Karnofsky = 100 %, PS = 0. Evolutivité de la m...",PS=0,2


In [11]:
# Write resul_df_PS to "test_T2_withPS.csv" (path relative to the working
# directory); index=False leaves the DataFrame index out of the file.
resul_df_PS.to_csv("test_T2_withPS.csv", index=False)