## Table of Contents
* [1. Importing libraries](#chapter1)
* [2. Importing data](#chapter2)
* [3. Exploring dataset](#chapter3)
* [4. Exploring Y](#chapter4)
* [5. Dummy coding](#chapter5)


## 1. importing libraries <a class="anchor" id="chapter1"></a>


In [46]:
import pandas as pd
import pyarrow.parquet as pa
import fastparquet as fa
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from tableone import TableOne, load_dataset
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

#met onderstaande laat je functies uit andere scripts in (general functions)
#dit moet in dezelfde map opgeslagen worden.
from sklearn import metrics, datasets
# Note, '%matplotlib inline' ensures matplotlib graphs will be included in your notebook.
%matplotlib inline

# Setting Pandas options.
pd.set_option("display.max_rows", None)         # How to display all rows from data frame using pandas. Setting value to None to show all rows.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_info_columns", 100)
pd.set_option("display.max_info_rows", 1000000)
pd.set_option("display.precision", 2)
#pd.set_option("styler.format.precision", 2)

# Setting Matplotlib font sizes.
FONT_SMALL = 8
FONT_MEDIUM = 16
FONT_LARGE = 20

plt.rc('axes',   titlesize      = FONT_LARGE)   # axes title
plt.rc('axes',   labelsize      = FONT_MEDIUM)  # axes x and y labels
plt.rc('xtick',  labelsize      = FONT_MEDIUM)  # x tick labels
plt.rc('ytick',  labelsize      = FONT_MEDIUM)  # y tick labels
plt.rc('legend', fontsize       = FONT_MEDIUM)  # legend items
plt.rc('legend', title_fontsize = FONT_LARGE)   # legend title
plt.rc('figure', titlesize      = FONT_LARGE)   # figure title
plt.rc('font',   size           = FONT_SMALL)   # other texts
plt.rc('figure', figsize        = (20, 10))     # figure size. This replaces 'plt.figure(figsize=(20,10))' in every cell.

# Setting sklearn parameter for Pipeline visualization.
#set_config(display="diagram")

## 2. Importing data <a class="anchor" id="chapter2"></a>

Uitgaan van de provider dataset

In [57]:
dfprov= pd.read_parquet('knee-provider.parquet')

## 3. Exploring dataset <a class="anchor" id="chapter3"></a>

In [None]:
dfprov.head(10)

In [3]:
dfprov.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139236 entries, 0 to 139235
Data columns (total 81 columns):
 #   Column                        Non-Null Count   Dtype   
---  ------                        --------------   -----   
 0   provider_code                 139236 non-null  category
 1   procedure                     139236 non-null  category
 2   revision_flag                 139236 non-null  uint8   
 3   year                          139236 non-null  category
 4   age_band                      129834 non-null  category
 5   gender                        129834 non-null  float32 
 6   t0_assisted                   139236 non-null  uint8   
 7   t0_assisted_by                139236 non-null  uint8   
 8   t0_symptom_period             139236 non-null  uint8   
 9   t0_previous_surgery           139236 non-null  uint8   
 10  t0_living_arrangements        139236 non-null  uint8   
 11  t0_disability                 139236 non-null  uint8   
 12  heart_disease                 

Alle variabelen op t0 houden en t1 sucess, t1 satisfaction en oks_t1_score

In [58]:
df_t0 = dfprov.iloc[:,[0, 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,51,54,55,56,57,58,59,60,61,62,63,64,65,66,79, 40,41]].copy()

In [6]:
df_t0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139236 entries, 0 to 139235
Data columns (total 48 columns):
 #   Column                  Non-Null Count   Dtype   
---  ------                  --------------   -----   
 0   provider_code           139236 non-null  category
 1   procedure               139236 non-null  category
 2   revision_flag           139236 non-null  uint8   
 3   year                    139236 non-null  category
 4   age_band                129834 non-null  category
 5   gender                  129834 non-null  float32 
 6   t0_assisted             139236 non-null  uint8   
 7   t0_assisted_by          139236 non-null  uint8   
 8   t0_symptom_period       139236 non-null  uint8   
 9   t0_previous_surgery     139236 non-null  uint8   
 10  t0_living_arrangements  139236 non-null  uint8   
 11  t0_disability           139236 non-null  uint8   
 12  heart_disease           139236 non-null  uint8   
 13  high_bp                 139236 non-null  uint8   
 14  stro

In [5]:
df_t0.head()

Unnamed: 0,provider_code,procedure,revision_flag,year,age_band,gender,t0_assisted,t0_assisted_by,t0_symptom_period,t0_previous_surgery,t0_living_arrangements,t0_disability,heart_disease,high_bp,stroke,circulation,lung_disease,diabetes,kidney_disease,nervous_system,liver_disease,cancer,depression,arthritis,t0_mobility,t0_self_care,t0_activity,t0_discomfort,t0_anxiety,t0_eq5d_index_profile,t0_eq5d_index,t0_eq_vas,oks_t0_pain,oks_t0_night_pain,oks_t0_washing,oks_t0_transport,oks_t0_walking,oks_t0_standing,oks_t0_limping,oks_t0_kneeling,oks_t0_work,oks_t0_confidence,oks_t0_shopping,oks_t0_stairs,oks_t0_score,oks_t1_score,t1_satisfaction,t1_sucess
0,ADP02,Knee Replacement,0,2018/19,,,2,0,2,2,2,1,1,9,9,9,9,9,9,9,9,9,9,1,2,1,2,2,1,21221,0.69,80,0,1,2,2,2,2,0,1,1,2,2,2,17.0,40.0,2,1
1,ADP02,Knee Replacement,0,2018/19,,,2,0,2,2,2,2,9,9,9,9,9,9,9,9,9,9,9,1,9,9,9,9,9,99999,,999,1,4,4,4,0,4,1,4,4,3,4,4,37.0,44.0,2,1
2,ADP02,Knee Replacement,0,2018/19,,,2,0,2,2,1,2,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,99999,,999,1,4,4,4,2,4,2,3,3,3,3,4,37.0,46.0,1,1
3,ADP02,Knee Replacement,0,2018/19,,,2,0,2,2,2,1,9,1,9,9,9,9,9,9,9,9,9,1,9,9,9,9,9,99999,,999,1,2,2,2,2,2,2,0,1,3,3,2,22.0,36.0,3,2
4,ADP02,Knee Replacement,0,2018/19,,,1,0,3,2,2,1,9,1,9,9,9,9,9,9,9,9,9,1,2,1,2,9,1,21291,,75,0,1,4,2,2,1,0,1,1,0,0,0,12.0,28.0,2,1


Replace 9 with missings

In [59]:
df_t0['t0_mobility']=df_t0['t0_mobility'].replace(9, np.NaN)
df_t0['t0_self_care']=df_t0['t0_self_care'].replace(9, np.NaN)
df_t0['t0_activity']=df_t0['t0_activity'].replace(9, np.NaN)
df_t0['t0_discomfort']=df_t0['t0_discomfort'].replace(9, np.NaN)
df_t0['t0_anxiety']=df_t0['t0_anxiety'].replace(9, np.NaN)
df_t0['t0_eq5d_index_profile']=df_t0['t0_eq5d_index_profile'].replace(99999, np.NaN)
df_t0['t0_eq_vas']=df_t0['t0_eq_vas'].replace(9, np.NaN)
df_t0['t0_assisted']=df_t0['t0_assisted'].replace(9, np.NaN)
df_t0['t0_symptom_period']=df_t0['t0_symptom_period'].replace(9, np.NaN)
df_t0['t0_previous_surgery']=df_t0['t0_previous_surgery'].replace(9, np.NaN)
df_t0['t0_living_arrangements']=df_t0['t0_living_arrangements'].replace(9, np.NaN)
df_t0['t0_disability']=df_t0['t0_disability'].replace(9, np.NaN)
df_t0['oks_t0_pain']=df_t0['oks_t0_pain'].replace(9, np.NaN)
df_t0['oks_t0_night_pain']=df_t0['oks_t0_night_pain'].replace(9, np.NaN)
df_t0['oks_t0_washing']=df_t0['oks_t0_washing'].replace(9, np.NaN)
df_t0['oks_t0_transport']=df_t0['oks_t0_transport'].replace(9, np.NaN)
df_t0['oks_t0_walking']=df_t0['oks_t0_walking'].replace(9, np.NaN)
df_t0['oks_t0_standing']=df_t0['oks_t0_standing'].replace(9, np.NaN)
df_t0['oks_t0_limping']=df_t0['oks_t0_limping'].replace(9, np.NaN)
df_t0['oks_t0_kneeling']=df_t0['oks_t0_kneeling'].replace(9, np.NaN)
df_t0['oks_t0_work']=df_t0['oks_t0_work'].replace(9, np.NaN)
df_t0['oks_t0_confidence']=df_t0['oks_t0_confidence'].replace(9, np.NaN)
df_t0['oks_t0_shopping']=df_t0['oks_t0_shopping'].replace(9, np.NaN)
df_t0['oks_t0_stairs']=df_t0['oks_t0_stairs'].replace(9, np.NaN)
df_t0['t1_satisfaction']=df_t0['t1_satisfaction'].replace(9, np.NaN)
df_t0['t1_sucess']=df_t0['t1_sucess'].replace(9, np.NaN)

In [8]:
df_t0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139236 entries, 0 to 139235
Data columns (total 48 columns):
 #   Column                  Non-Null Count   Dtype   
---  ------                  --------------   -----   
 0   provider_code           139236 non-null  category
 1   procedure               139236 non-null  category
 2   revision_flag           139236 non-null  uint8   
 3   year                    139236 non-null  category
 4   age_band                129834 non-null  category
 5   gender                  129834 non-null  float32 
 6   t0_assisted             137725 non-null  float64 
 7   t0_assisted_by          139236 non-null  uint8   
 8   t0_symptom_period       138027 non-null  float64 
 9   t0_previous_surgery     138194 non-null  float64 
 10  t0_living_arrangements  137149 non-null  float64 
 11  t0_disability           133329 non-null  float64 
 12  heart_disease           139236 non-null  uint8   
 13  high_bp                 139236 non-null  uint8   
 14  stro

In [60]:
df_t0['full_count2']=df_t0.apply(lambda x: x.count(), axis=1)

In [61]:
df_t0.full_count2.value_counts()

48    111066
46     11118
47      8316
41      3144
45      2171
37      1101
44       748
40       316
43       306
39       274
42       221
36       113
35       107
38       105
34        47
31        24
30        12
32         9
33         8
28         8
29         7
24         6
23         4
27         3
26         2
Name: full_count2, dtype: int64

In [11]:
counts = df_t0.full_count2.value_counts()
percs = df_t0.full_count2.value_counts(normalize=True).mul(100).round(1).astype(str) + '%'
pd.concat([counts,percs], axis=1, keys=['count', 'percentage'])

Unnamed: 0,count,percentage
48,111066,79.8%
46,11118,8.0%
47,8316,6.0%
41,3144,2.3%
45,2171,1.6%
37,1101,0.8%
44,748,0.5%
40,316,0.2%
43,306,0.2%
39,274,0.2%


In [62]:
df_t0 = df_t0[df_t0['full_count2'] == 48]

In [63]:
df_t0 = df_t0.drop('full_count2', axis=1)

In [16]:
df_t0.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 111066 entries, 3139 to 139235
Data columns (total 48 columns):
 #   Column                  Non-Null Count   Dtype   
---  ------                  --------------   -----   
 0   provider_code           111066 non-null  category
 1   procedure               111066 non-null  category
 2   revision_flag           111066 non-null  uint8   
 3   year                    111066 non-null  category
 4   age_band                111066 non-null  category
 5   gender                  111066 non-null  float32 
 6   t0_assisted             111066 non-null  float64 
 7   t0_assisted_by          111066 non-null  uint8   
 8   t0_symptom_period       111066 non-null  float64 
 9   t0_previous_surgery     111066 non-null  float64 
 10  t0_living_arrangements  111066 non-null  float64 
 11  t0_disability           111066 non-null  float64 
 12  heart_disease           111066 non-null  uint8   
 13  high_bp                 111066 non-null  uint8   
 14  s

## 4. Defining Y <a class="anchor" id="chapter4"></a>

In [64]:
    df_t0['oks_change_score'] = df_t0.oks_t1_score - df_t0.oks_t0_score
    df_t0['oks_MID_7'] = np.where((df_t0.oks_change_score >= 7), 'CHANGE','NO_CHANGE') 
    df_t0['succesfaction_and'] = np.where((df_t0.t1_sucess > 3) & (df_t0.t1_satisfaction > 4) & (df_t0.oks_MID_7 == 'NO_CHANGE'), 'negatief_advies', 'positief_advies')
    df_t0['succesfaction_or'] = np.where((df_t0.t1_sucess > 3) | (df_t0.t1_satisfaction > 4) | (df_t0.oks_MID_7 == 'NO_CHANGE'), 'negatief_advies', 'positief_advies')
    

In [65]:
df_t0.head()

Unnamed: 0,provider_code,procedure,revision_flag,year,age_band,gender,t0_assisted,t0_assisted_by,t0_symptom_period,t0_previous_surgery,t0_living_arrangements,t0_disability,heart_disease,high_bp,stroke,circulation,lung_disease,diabetes,kidney_disease,nervous_system,liver_disease,cancer,depression,arthritis,t0_mobility,t0_self_care,t0_activity,t0_discomfort,t0_anxiety,t0_eq5d_index_profile,t0_eq5d_index,t0_eq_vas,oks_t0_pain,oks_t0_night_pain,oks_t0_washing,oks_t0_transport,oks_t0_walking,oks_t0_standing,oks_t0_limping,oks_t0_kneeling,oks_t0_work,oks_t0_confidence,oks_t0_shopping,oks_t0_stairs,oks_t0_score,oks_t1_score,t1_satisfaction,t1_sucess,oks_change_score,oks_MID_7,succesfaction_and,succesfaction_or
3139,ADP02,Knee Replacement,0,2018/19,50 to 59,1.0,2.0,0,3.0,2.0,1.0,2.0,9,1,9,9,9,9,9,9,9,9,9,1,2.0,1.0,2.0,2.0,1.0,21221.0,0.69,60.0,0.0,0.0,3.0,2.0,2.0,1.0,1.0,2.0,2.0,2.0,2.0,3.0,20.0,30.0,3.0,3.0,10.0,CHANGE,positief_advies,positief_advies
3140,ADP02,Knee Replacement,0,2018/19,50 to 59,1.0,2.0,0,2.0,2.0,1.0,2.0,9,9,9,9,9,9,9,9,9,9,9,1,2.0,1.0,2.0,3.0,1.0,21231.0,0.16,75.0,1.0,0.0,2.0,3.0,0.0,1.0,0.0,1.0,1.0,2.0,2.0,2.0,15.0,33.0,3.0,2.0,18.0,CHANGE,positief_advies,positief_advies
3142,ADP02,Knee Replacement,0,2018/19,50 to 59,1.0,2.0,0,4.0,2.0,1.0,2.0,9,9,9,9,9,9,9,9,9,9,1,1,2.0,1.0,2.0,3.0,3.0,21233.0,-0.08,80.0,0.0,1.0,3.0,2.0,2.0,1.0,0.0,2.0,2.0,3.0,2.0,2.0,20.0,44.0,2.0,1.0,24.0,CHANGE,positief_advies,positief_advies
3143,ADP02,Knee Replacement,0,2018/19,50 to 59,1.0,2.0,0,4.0,2.0,1.0,2.0,9,9,9,9,9,9,9,9,9,9,1,1,2.0,1.0,2.0,3.0,2.0,21232.0,0.09,70.0,0.0,0.0,2.0,1.0,2.0,1.0,0.0,1.0,1.0,2.0,2.0,2.0,14.0,43.0,2.0,1.0,29.0,CHANGE,positief_advies,positief_advies
3144,ADP02,Knee Replacement,0,2018/19,50 to 59,1.0,2.0,0,4.0,2.0,1.0,1.0,9,1,9,9,9,9,9,9,9,9,9,1,2.0,1.0,2.0,3.0,2.0,21232.0,0.09,35.0,0.0,2.0,2.0,2.0,2.0,2.0,0.0,0.0,1.0,2.0,2.0,2.0,17.0,32.0,2.0,1.0,15.0,CHANGE,positief_advies,positief_advies


In [66]:
df_t0.succesfaction_and.value_counts()

positief_advies    108790
negatief_advies      2276
Name: succesfaction_and, dtype: int64

In [67]:
counts = df_t0.succesfaction_and.value_counts()
percs = df_t0.succesfaction_and.value_counts(normalize=True).mul(100).round(1).astype(str) + '%'
pd.concat([counts,percs], axis=1, keys=['count', 'percentage'])

Unnamed: 0,count,percentage
positief_advies,108790,98.0%
negatief_advies,2276,2.0%


In [68]:
counts = df_t0.succesfaction_or.value_counts()
percs = df_t0.succesfaction_or.value_counts(normalize=True).mul(100).round(1).astype(str) + '%'
pd.concat([counts,percs], axis=1, keys=['count', 'percentage'])

Unnamed: 0,count,percentage
positief_advies,93401,84.1%
negatief_advies,17665,15.9%


In [69]:
counts = df_t0.oks_MID_7.value_counts()
percs = df_t0.oks_MID_7.value_counts(normalize=True).mul(100).round(1).astype(str) + '%'
pd.concat([counts,percs], axis=1, keys=['count', 'percentage'])

Unnamed: 0,count,percentage
CHANGE,94923,85.5%
NO_CHANGE,16143,14.5%


Bij een oks van >=7 kom ik bij de AND variant op 2.0% en OR variant op 15.9%
Bij een oks van >=5 kom ik bij de AND variant op 1.9% en OR variant op 12.7%

## 5. Dummy coding <a class="anchor" id="chapter5"></a>

DUMMY CODING voor features
zie: https://towardsdatascience.com/the-best-methods-for-one-hot-encoding-your-data-c29c78a153fd
voor verschillen tussen pd.dummies, scikit learn ohe etc

In [70]:
df_t0 = df_t0.drop('provider_code', axis=1)

In [71]:
df_t0 = df_t0.drop('t0_eq5d_index_profile', axis=1)

change floats to categories

In [75]:
df_t0cat=df_t0.drop('t0_eq_vas', axis=1)
df_t0cat=df_t0cat.drop('t0_eq5d_index', axis=1)
df_t0cat=df_t0cat.drop('oks_t0_score', axis=1)
df_t0cat=df_t0cat.drop('oks_t1_score', axis=1)
df_t0cat=df_t0cat.drop('oks_MID_7', axis=1)
df_t0cat=df_t0cat.drop('succesfaction_and', axis=1)
df_t0cat=df_t0cat.drop('succesfaction_or', axis=1)
df_t0cat=df_t0cat.drop('t1_satisfaction', axis=1)
df_t0cat=df_t0cat.drop('t1_sucess', axis=1)
df_t0cat=df_t0cat.drop('oks_change_score', axis=1)


In [76]:
float_columns = df_t0cat.select_dtypes(include=['float']).columns

# Convert float columns to categories
df_t0cat[float_columns] = df_t0cat[float_columns].astype('category')

uint_columns = df_t0cat.select_dtypes(include=['uint8']).columns

# Convert uint columns to categories
df_t0cat[uint_columns] = df_t0cat[uint_columns].astype('category')

df_t0cat.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 111066 entries, 3139 to 139235
Data columns (total 40 columns):
 #   Column                  Non-Null Count   Dtype   
---  ------                  --------------   -----   
 0   procedure               111066 non-null  category
 1   revision_flag           111066 non-null  category
 2   year                    111066 non-null  category
 3   age_band                111066 non-null  category
 4   gender                  111066 non-null  category
 5   t0_assisted             111066 non-null  category
 6   t0_assisted_by          111066 non-null  category
 7   t0_symptom_period       111066 non-null  category
 8   t0_previous_surgery     111066 non-null  category
 9   t0_living_arrangements  111066 non-null  category
 10  t0_disability           111066 non-null  category
 11  heart_disease           111066 non-null  category
 12  high_bp                 111066 non-null  category
 13  stroke                  111066 non-null  category
 14  c

In [82]:
# Identify the categorical columns
categorical_columns = df_t0cat.select_dtypes(include=['category']).columns

# Subset the DataFrame with only categorical columns
df_t0_categorical = df_t0cat[categorical_columns]

# Create an instance of the OneHotEncoder
encoder = OneHotEncoder(sparse=False)

# Fit and transform the categorical data
encoded_data = encoder.fit_transform(df_t0_categorical)

# Create a new DataFrame with the encoded data
encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(df_t0_categorical.columns))

# Concatenate the encoded DataFrame with the remaining columns
df_encoded = pd.concat([df_t0cat.drop(columns=categorical_columns), encoded_df], axis=1)



In [78]:
df_encoded.head()

Unnamed: 0,procedure_Knee Replacement,revision_flag_0,revision_flag_1,year_2016/17,year_2017/18,year_2018/19,age_band_40 to 49,age_band_50 to 59,age_band_60 to 69,age_band_70 to 79,age_band_80 to 89,age_band_90 to 120,gender_1.0,gender_2.0,t0_assisted_1.0,t0_assisted_2.0,t0_assisted_by_0,t0_symptom_period_1.0,t0_symptom_period_2.0,t0_symptom_period_3.0,t0_symptom_period_4.0,t0_previous_surgery_1.0,t0_previous_surgery_2.0,t0_living_arrangements_1.0,t0_living_arrangements_2.0,t0_living_arrangements_3.0,t0_living_arrangements_4.0,t0_disability_1.0,t0_disability_2.0,heart_disease_1,heart_disease_9,high_bp_1,high_bp_9,stroke_1,stroke_9,circulation_1,circulation_9,lung_disease_1,lung_disease_9,diabetes_1,diabetes_9,kidney_disease_1,kidney_disease_9,nervous_system_1,nervous_system_9,liver_disease_1,liver_disease_9,cancer_1,cancer_9,depression_1,depression_9,arthritis_1,arthritis_9,t0_mobility_1.0,t0_mobility_2.0,t0_mobility_3.0,t0_self_care_1.0,t0_self_care_2.0,t0_self_care_3.0,t0_activity_1.0,t0_activity_2.0,t0_activity_3.0,t0_discomfort_1.0,t0_discomfort_2.0,t0_discomfort_3.0,t0_anxiety_1.0,t0_anxiety_2.0,t0_anxiety_3.0,oks_t0_pain_0.0,oks_t0_pain_1.0,oks_t0_pain_2.0,oks_t0_pain_3.0,oks_t0_pain_4.0,oks_t0_night_pain_0.0,oks_t0_night_pain_1.0,oks_t0_night_pain_2.0,oks_t0_night_pain_3.0,oks_t0_night_pain_4.0,oks_t0_washing_0.0,oks_t0_washing_1.0,oks_t0_washing_2.0,oks_t0_washing_3.0,oks_t0_washing_4.0,oks_t0_transport_0.0,oks_t0_transport_1.0,oks_t0_transport_2.0,oks_t0_transport_3.0,oks_t0_transport_4.0,oks_t0_walking_0.0,oks_t0_walking_1.0,oks_t0_walking_2.0,oks_t0_walking_3.0,oks_t0_walking_4.0,oks_t0_standing_0.0,oks_t0_standing_1.0,oks_t0_standing_2.0,oks_t0_standing_3.0,oks_t0_standing_4.0,oks_t0_limping_0.0,oks_t0_limping_1.0,oks_t0_limping_2.0,oks_t0_limping_3.0,oks_t0_limping_4.0,oks_t0_kneeling_0.0,oks_t0_kneeling_1.0,oks_t0_kneeling_2.0,oks_t0_kneeling_3.0,oks_t0_kneeling_4.0,oks_t0_work_0.0,oks_t0_work_1.0,oks_t0_work_2.0,oks_t0_work_3.0,oks_t0_work_4.0,oks_t0_confidence_0.0,oks_t0_confidence_1.0,oks_t0_confidence_2.0,oks_t0_confidence_3.0,oks_t0_confidence_4.0,oks_t0_shopping_0.0,oks_t0_shopping_1.0,oks_t0_shopping_2.0,oks_t0_shopping_3.0,oks_t0_shopping_4.0,oks_t0_stairs_0.0,oks_t0_stairs_1.0,oks_t0_stairs_2.0,oks_t0_stairs_3.0,oks_t0_stairs_4.0
3139,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3140,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3142,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3143,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3144,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [83]:
#nu de continue variabelen weer terugzetten
combined_df = pd.merge(df_encoded, df_t0[['t0_eq_vas', 't0_eq5d_index', 'oks_t0_score', 'succesfaction_or']], left_index=True, right_index=True, how='left')
#hierboven de uiteindelijk gekozen uitkomstmaat toevoegen

combined_df.info()
combined_df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 135059 entries, 3139 to 111061
Columns: 132 entries, procedure_Knee Replacement to succesfaction_or
dtypes: float32(2), float64(129), object(1)
memory usage: 140.0+ MB


Unnamed: 0,procedure_Knee Replacement,revision_flag_0,revision_flag_1,year_2016/17,year_2017/18,year_2018/19,age_band_40 to 49,age_band_50 to 59,age_band_60 to 69,age_band_70 to 79,age_band_80 to 89,age_band_90 to 120,gender_1.0,gender_2.0,t0_assisted_1.0,t0_assisted_2.0,t0_assisted_by_0,t0_symptom_period_1.0,t0_symptom_period_2.0,t0_symptom_period_3.0,t0_symptom_period_4.0,t0_previous_surgery_1.0,t0_previous_surgery_2.0,t0_living_arrangements_1.0,t0_living_arrangements_2.0,t0_living_arrangements_3.0,t0_living_arrangements_4.0,t0_disability_1.0,t0_disability_2.0,heart_disease_1,heart_disease_9,high_bp_1,high_bp_9,stroke_1,stroke_9,circulation_1,circulation_9,lung_disease_1,lung_disease_9,diabetes_1,diabetes_9,kidney_disease_1,kidney_disease_9,nervous_system_1,nervous_system_9,liver_disease_1,liver_disease_9,cancer_1,cancer_9,depression_1,depression_9,arthritis_1,arthritis_9,t0_mobility_1.0,t0_mobility_2.0,t0_mobility_3.0,t0_self_care_1.0,t0_self_care_2.0,t0_self_care_3.0,t0_activity_1.0,t0_activity_2.0,t0_activity_3.0,t0_discomfort_1.0,t0_discomfort_2.0,t0_discomfort_3.0,t0_anxiety_1.0,t0_anxiety_2.0,t0_anxiety_3.0,oks_t0_pain_0.0,oks_t0_pain_1.0,oks_t0_pain_2.0,oks_t0_pain_3.0,oks_t0_pain_4.0,oks_t0_night_pain_0.0,oks_t0_night_pain_1.0,oks_t0_night_pain_2.0,oks_t0_night_pain_3.0,oks_t0_night_pain_4.0,oks_t0_washing_0.0,oks_t0_washing_1.0,oks_t0_washing_2.0,oks_t0_washing_3.0,oks_t0_washing_4.0,oks_t0_transport_0.0,oks_t0_transport_1.0,oks_t0_transport_2.0,oks_t0_transport_3.0,oks_t0_transport_4.0,oks_t0_walking_0.0,oks_t0_walking_1.0,oks_t0_walking_2.0,oks_t0_walking_3.0,oks_t0_walking_4.0,oks_t0_standing_0.0,oks_t0_standing_1.0,oks_t0_standing_2.0,oks_t0_standing_3.0,oks_t0_standing_4.0,oks_t0_limping_0.0,oks_t0_limping_1.0,oks_t0_limping_2.0,oks_t0_limping_3.0,oks_t0_limping_4.0,oks_t0_kneeling_0.0,oks_t0_kneeling_1.0,oks_t0_kneeling_2.0,oks_t0_kneeling_3.0,oks_t0_kneeling_4.0,oks_t0_work_0.0,oks_t0_work_1.0,oks_t0_work_2.0,oks_t0_work_3.0,oks_t0_work_4.0,oks_t0_confidence_0.0,oks_t0_confidence_1.0,oks_t0_confidence_2.0,oks_t0_confidence_3.0,oks_t0_confidence_4.0,oks_t0_shopping_0.0,oks_t0_shopping_1.0,oks_t0_shopping_2.0,oks_t0_shopping_3.0,oks_t0_shopping_4.0,oks_t0_stairs_0.0,oks_t0_stairs_1.0,oks_t0_stairs_2.0,oks_t0_stairs_3.0,oks_t0_stairs_4.0,t0_eq_vas,t0_eq5d_index,oks_t0_score,succesfaction_or
3139,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,60.0,0.69,20.0,positief_advies
3140,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,75.0,0.16,15.0,positief_advies
3142,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,80.0,-0.08,20.0,positief_advies
3143,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,70.0,0.09,14.0,positief_advies
3144,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,35.0,0.09,17.0,positief_advies
