In [3]:

# Import the liberaries: 
import pandas as pd
import numpy as np


In [4]:
# read the database from the computer 
ltc_df=pd.read_csv("data/ltcdsl_covid data.csv")

In [5]:
# to expand the number of columns and rows that we can see while dispalying a datafram
pd.set_option('display.max_columns', ltc_df.columns.shape[0]+1)
pd.set_option('display.max_rows', 200)

# 1- Cleaning the data 

In [6]:
# The size of the database
ltc_df.shape

(25586, 84)

In [7]:
# To see the types of each varaible  and whether or not we have null values 
ltc_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25586 entries, 0 to 25585
Data columns (total 84 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   spec_pat_num_age                  25585 non-null  float64
 1   patient_gender                    25586 non-null  object 
 2   chf_2_years_full                  25586 non-null  bool   
 3   card_arrh_2_years_full            25586 non-null  bool   
 4   valv_dis_2_years_full             25586 non-null  bool   
 5   pcd_2_years_full                  25586 non-null  bool   
 6   pvd_2_years_full                  25586 non-null  bool   
 7   htn_unc_2_years_full              25586 non-null  bool   
 8   htn_c_2_years_full                25586 non-null  bool   
 9   paral_2_years_full                25586 non-null  bool   
 10  oth_neur_dis_2_years_full         25586 non-null  bool   
 11  cpd_2_years_full                  25586 non-null  bool   
 12  diab

In [8]:
# number of individuals in the Database 
len(ltc_df)

25586


### 1.1- Dummy variable

We need to convert the type of the following featurs to Bool
* 'in_icu_at_60_days', 
* 'in_cv_icu_at_60_days', 
* 'in_hospital_at_60_days', 
* 'in_ltc_at_60_days' , 
* 'in_dsl_at_60_days'
* "died_within_60_days" 
* "any_hospital_admits_within_60d" with "yes" and "no" values need to be boolean with the True and False values 

In [9]:
# The following features should be bool, but they are objects in the dataframe 
in_at_66=['in_icu_at_60_days', 'in_cv_icu_at_60_days', 'in_hospital_at_60_days', 'in_ltc_at_60_days' , 'in_dsl_at_60_days']              
# so we need to change the type 
for each in in_at_66:
    ltc_df[each] = ltc_df[each].apply(lambda x: str(x).replace('True', "1"))
    ltc_df[each] = ltc_df[each].apply(lambda x: x.replace('False', ""))
    ltc_df[each] = ltc_df[each].apply(lambda x: x.replace('nan', ""))
    ltc_df[each] = ltc_df[each].apply(lambda x: bool(x))
    

ltc_df['died_within_60_days'] = ltc_df.died_within_60_days.apply(lambda x: x.strip().replace('Yes', "1"))
ltc_df['died_within_60_days'] = ltc_df.died_within_60_days.apply(lambda x: x.strip().replace('No', ""))
ltc_df['died_within_60_days'] = ltc_df.died_within_60_days.apply(lambda x: bool(x))

ltc_df['any_hospital_admits_within_60d'] = ltc_df.any_hospital_admits_within_60d.apply(lambda x: x.strip().replace('Yes', "1"))
ltc_df['any_hospital_admits_within_60d'] = ltc_df.any_hospital_admits_within_60d.apply(lambda x: x.strip().replace('No', ""))
ltc_df['any_hospital_admits_within_60d'] = ltc_df.any_hospital_admits_within_60d.apply(lambda x: bool(x))


# To work with the boolean data and visualize them, we changed the boolean to int

# select the columns which have the data type of boolean
col_bool=ltc_df.select_dtypes(include=bool).columns

# change the datatype from bool to int
ltc_df[col_bool]=ltc_df[col_bool].apply(lambda x: x.astype('int32'))


### 1.2- missing values  and Dealing with null values


In [10]:
# to sum the values with null for each column 
ltc_df.isna().sum()[ltc_df.isna().sum()!=0]

spec_pat_num_age                        1
num_procs_nacrs_1_year              10961
days_to_first_unplanned_ambulato    21777
days_to_first_hospital_admit        23078
days_to_first_icu_admit             25478
days_to_first_cv_icu_admit          25574
days_to_first_ventilation           25547
days_to_death                       22996
death_location                      22996
status_at_60_days                   21230
los_for_first_admission             23078
riw_for_first_admission             23078
symptomatic_during_collection        4821
dtype: int64

### Percentage of the null values for each feature with null values 

In [11]:
# calculate the percentage of the null values 
(ltc_df.isna().sum()[ltc_df.isna().sum()!=0])*100/25586

spec_pat_num_age                     0.003908
num_procs_nacrs_1_year              42.839834
days_to_first_unplanned_ambulato    85.112952
days_to_first_hospital_admit        90.197764
days_to_first_icu_admit             99.577894
days_to_first_cv_icu_admit          99.953099
days_to_first_ventilation           99.847573
days_to_death                       89.877277
death_location                      89.877277
status_at_60_days                   82.975064
los_for_first_admission             90.197764
riw_for_first_admission             90.197764
symptomatic_during_collection       18.842336
dtype: float64


#### 1.2.1-  riw_for_first_admission 
 As the lowest score for RIW is "0.105100" and we dont have any zero value, we will fill the null values for RIW with <font color='red'>zero</font>. 

In [12]:
# to see the lowest score for the riw 
ltc_df['riw_for_first_admission'].value_counts().sort_index().head(2)

0.1051    2
0.1379    1
Name: riw_for_first_admission, dtype: int64

In [13]:
# fill the null values with 0
ltc_df['riw_for_first_admission'][ltc_df['riw_for_first_admission'].isna()]=0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ltc_df['riw_for_first_admission'][ltc_df['riw_for_first_admission'].isna()]=0


#### 1.2.2-  los_for_first_admission 
Adding 1 to all the values that are not null. As zero value shows that there are at least one admission, but they did not stay for one day(it may took for a few hours). 


los is for the people who have had at least one admision after the test results. We have already checked that any_hospitial_admits within_60 days will be one for the people with the los != null. If the los is null the any_hopitial_admits within_60 is 0. 

In [14]:
print("## unique values for the any_hospital_admits_within_60d for the ones with los not equal to null is one ")
print(ltc_df['any_hospital_admits_within_60d'][ltc_df['los_for_first_admission'].notna()].unique())

print("## unique values for the any_hospital_admits_within_60d for the ones with los equal to null is zero ")
print(ltc_df['any_hospital_admits_within_60d'][ltc_df['los_for_first_admission'].isna()].unique())

## unique values for the any_hospital_admits_within_60d for the ones with los not equal to null is one 
[1]
## unique values for the any_hospital_admits_within_60d for the ones with los equal to null is zero 
[0]


In [15]:
# add one to the not null values 
ltc_df['los_for_first_admission'][ltc_df['los_for_first_admission'].notna()]=ltc_df['los_for_first_admission'][ltc_df['los_for_first_admission'].notna()]+1

# fill the null values with zero
ltc_df['los_for_first_admission'][ltc_df['los_for_first_admission'].isna()]=0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ltc_df['los_for_first_admission'][ltc_df['los_for_first_admission'].notna()]=ltc_df['los_for_first_admission'][ltc_df['los_for_first_admission'].notna()]+1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ltc_df['los_for_first_admission'][ltc_df['los_for_first_admission'].isna()]=0


#### 1.2.3. days_to_first_hospital_admit
0: means, they had one visit for the hospital for the test covid (took few hours). 

-1: they have admitted to the hospital then tested for covid. 

The nan values are the ones who did not have any visit to the hospital. So it makes sense that we dont have any value for them. In that case as 0 or -1 have meanings, for the nans, we replace the values with the -2. 

-2: means, the individual did not have any visits for the hospital after the test covid.

In [16]:
print("## unique values for the any_hospital_admits_within_60d for the ones with days_to_first_hospital_admit not equal to null is one ")
print(ltc_df['any_hospital_admits_within_60d'][ltc_df['days_to_first_hospital_admit'].notna()].unique())
print("## unique values for the any_hospital_admits_within_60d for the ones with days_to_first_hospital_admit equal to null is zero ")
print(ltc_df['any_hospital_admits_within_60d'][ltc_df['days_to_first_hospital_admit'].isna()].unique())

## unique values for the any_hospital_admits_within_60d for the ones with days_to_first_hospital_admit not equal to null is one 
[1]
## unique values for the any_hospital_admits_within_60d for the ones with days_to_first_hospital_admit equal to null is zero 
[0]


In [17]:
# fill the null values with -2
ltc_df['days_to_first_hospital_admit'][ltc_df['days_to_first_hospital_admit'].isna()]=-2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ltc_df['days_to_first_hospital_admit'][ltc_df['days_to_first_hospital_admit'].isna()]=-2


#### 1.2.4. days_to_first_icu_admit


0: means, they had one visit for the icu for the test covid (took few hours). 

-1: they have admitted to the icu then tested for covid. 

The nan values are the ones who did not have any visit to the icu. So it makes sense that we dont have any value for them. In that case as 0 or -1 have meanings, for the nans, we replace the values with the -2. 

-2: means, the individual did not have any visits for the icu after the test covid.

In [18]:
print("## unique values for the any_icu_admits_within_60d for the ones with days_to_first_icu_admit not equal to null is one ")
print(ltc_df['any_icu_admit_within_60_days'][ltc_df['days_to_first_icu_admit'].notna()].unique())
print("## unique values for the any_icu_admits_within_60d for the ones with days_to_first_icu_admit equal to null is zero ")
print(ltc_df['any_icu_admit_within_60_days'][ltc_df['days_to_first_icu_admit'].isna()].unique())

## unique values for the any_icu_admits_within_60d for the ones with days_to_first_icu_admit not equal to null is one 
[1]
## unique values for the any_icu_admits_within_60d for the ones with days_to_first_icu_admit equal to null is zero 
[0]


In [19]:
# fill the null values with -2
ltc_df['days_to_first_icu_admit'][ltc_df['days_to_first_icu_admit'].isna()]=-2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ltc_df['days_to_first_icu_admit'][ltc_df['days_to_first_icu_admit'].isna()]=-2


#### 1.2.5. days_to_first_cv_icu_admit

0: means, they had one visit for the icu for the test covid (took few hours). 

-1: they have admitted to the icu then tested for covid. 

The nan values are the ones who did not have any visit to the icu. So it makes sense that we dont have any value for them. In that case as 0 or -1 have meanings, for the nans, we replace the values with the -2. 

-2: means, the individual did not have any visits for the icu after the test covid.

In [20]:
print("## unique values for the any_cv_icu_admit_within_60_days for the ones with days_to_first_cv_icu_admit not equal to null is one ")
print(ltc_df['any_cv_icu_admit_within_60_days'][ltc_df['days_to_first_cv_icu_admit'].notna()].unique())
print("## unique values for the any_cv_icu_admit_within_60_days for the ones with days_to_first_cv_icu_admit equal to null is zero ")
print(ltc_df['any_cv_icu_admit_within_60_days'][ltc_df['days_to_first_cv_icu_admit'].isna()].unique())

## unique values for the any_cv_icu_admit_within_60_days for the ones with days_to_first_cv_icu_admit not equal to null is one 
[1]
## unique values for the any_cv_icu_admit_within_60_days for the ones with days_to_first_cv_icu_admit equal to null is zero 
[0]


In [21]:
# fill the null values with -2
ltc_df['days_to_first_cv_icu_admit'][ltc_df['days_to_first_cv_icu_admit'].isna()]=-2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ltc_df['days_to_first_cv_icu_admit'][ltc_df['days_to_first_cv_icu_admit'].isna()]=-2


#### 1.2.6. days_to_first_ventilation

0: means, they had one visit for the ventilation for the test covid (took few hours). 

-1: they have admitted to the ventilation then tested for covid. 

The nan values are the ones who did not have any visit to the ventilation. So it makes sense that we dont have any value for them. In that case as 0 or -1 have meanings, for the nans, we replace the values with the -2. 

-2: means, the individual did not have any visits for the ventilation after the test covid.

In [22]:
print("## unique values for the any_ventilation_admits_within_60d for the ones with days_to_first_ventilation_admit not equal to null is one ")
print(ltc_df['any_ventilations_within_60_days'][ltc_df['days_to_first_ventilation'].notna()].unique())
print("## unique values for the any_ventilation_admits_within_60d for the ones with days_to_first_ventilation_admit equal to null is zero ")
print(ltc_df['any_ventilations_within_60_days'][ltc_df['days_to_first_ventilation'].isna()].unique())

## unique values for the any_ventilation_admits_within_60d for the ones with days_to_first_ventilation_admit not equal to null is one 
[1]
## unique values for the any_ventilation_admits_within_60d for the ones with days_to_first_ventilation_admit equal to null is zero 
[0]


In [23]:
# fill the null values with -2
ltc_df['days_to_first_ventilation'][ltc_df['days_to_first_ventilation'].isna()]=-2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ltc_df['days_to_first_ventilation'][ltc_df['days_to_first_ventilation'].isna()]=-2


#### 1.2.7. days_to_death

0: means, they died the same day as they tested for covid (took few hours). 

-1: they have died. After that they tested for covid. 

The nan values are the ones who are alive. So it makes sense that we dont have any value for them. In that case as 0 or -1 have meanings, for the nans, we replace the values with the -2. 

-2: means, the individual are alive.

In [24]:
print("## unique values for the died_within_60_days for the ones with days_to_death not equal to null is one ")
print(ltc_df['died_within_60_days'][ltc_df['days_to_death'].notna()].unique())
print("## unique values for the died_within_60_days for the ones with days_to_death equal to null is zero ")
print(ltc_df['died_within_60_days'][ltc_df['days_to_death'].isna()].unique())

## unique values for the died_within_60_days for the ones with days_to_death not equal to null is one 
[1]
## unique values for the died_within_60_days for the ones with days_to_death equal to null is zero 
[0]


In [25]:
# fill the null values with -2
ltc_df['days_to_death'][ltc_df['days_to_death'].isna()]=-2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ltc_df['days_to_death'][ltc_df['days_to_death'].isna()]=-2


#### 1.2.8. status_at_60_days


The Nan values replaced with unknown.

In [26]:
# fill the null values with unknown
ltc_df['status_at_60_days'][ltc_df['status_at_60_days'].isna()]='unknown'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ltc_df['status_at_60_days'][ltc_df['status_at_60_days'].isna()]='unknown'


#### 1.2.9. num_procs_nacrs_1_year
As for the nan variables the number of procedure for the dad is equel to the total procedures, we can assume, there is zero procedures for these people in Nacrs. However, as we have zero in the dataset, we will fill the nan values with -1. 

In [27]:
# fill the null values with -1
ltc_df['num_procs_nacrs_1_year'][ltc_df['num_procs_nacrs_1_year'].isna()]=0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ltc_df['num_procs_nacrs_1_year'][ltc_df['num_procs_nacrs_1_year'].isna()]=0


#### 1.2.10. days_to_first_unplanned_ambulato
0: means, they had one visit for the ambulary for the test covid (took few hours).

-1: they have admitted to the ambulatory then tested for covid.

The nan values are the ones who did not have any visit to the ambulatory. So it makes sense that we dont have any value for them. In that case as 0 or -1 have meanings, for the nans, we replace the values with the -2.

-2: means, the individual did not have any visits for the ambulatory after the test covid.


In [28]:
# fill the null values with -2
ltc_df['days_to_first_unplanned_ambulato'][ltc_df['days_to_first_unplanned_ambulato'].isna()]=-2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ltc_df['days_to_first_unplanned_ambulato'][ltc_df['days_to_first_unplanned_ambulato'].isna()]=-2


#### 1.2.11- Dealing with null values in age (spec_pat_num_age)
There is one null value in the age. But we have spec_pat_agecat, which is the category age. We will fill tha null by puting the median of the age in that category. 


In [29]:
# fill the null value with the age of 25.
ltc_df.loc[ltc_df['spec_pat_num_age'].isna(),'spec_pat_num_age']=25

#### 1.2.12- Dealing with null values in symptomatic during collection column 

There are 4821 rows with the nan value, and 4381 has negative results. 
<font color='red'> I have filled them with 'N"? which means negative</font>

and for the ones who have  nan in the symptomatic, and the test result is positive, I have filled them with<font color='red'> 'U' which is undefined  </font>

In [30]:
mask_negative=(ltc_df['symptomatic_during_collection'].isna()) & (ltc_df['interp_result']=='Negative')
ltc_df.loc[mask_negative,['symptomatic_during_collection']]='N'

In [31]:
# select the positive calsese with the null syptomatic.
mask_positive=(ltc_df['symptomatic_during_collection'].isna()) & (ltc_df['interp_result']=='Positive')
# fill the null values with 'U' which means unknown
ltc_df.loc[mask_positive,['symptomatic_during_collection']]='U'
# ltc_df[mask_positive]

#### 1.2.13. death_location

For all of the nan values, we have checked the died location in hospital, ltc, dsl, or others, and all of them are zero.

After checking died_within_60_days feature, for the ones who have the nan values in the death location, this feature is zero for all the people with the nan value for the death location.

The next phase is looking at the status_at_60_days to see what is their statuts at the 60th day for the people with nan values. Based on this feature, non of them are died. So, we assume that these people are alive and that is the reason why there is no value for them. 

so we will fill the nan values with 'A' that means alive. 

In [32]:
# fill the null values with 'A'
ltc_df['death_location'][ltc_df['death_location'].isna()]='A'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ltc_df['death_location'][ltc_df['death_location'].isna()]='A'


### 1.2.13. num_elixhauser_2_years_full_cat
In num_elixhauser_2_years_full_cat, some values are "no contacts" that means, no comorbidities. So we need to change it to Zero

In [33]:
ltc_df['num_elixhauser_2_years_full_cat']=ltc_df['num_elixhauser_2_years_full_cat'].apply(lambda x: '0' if x== 'No contacts' else x)

In [34]:
ltc_df['num_elixhauser_2_years_full_cat'].unique()

array(['1+', '0'], dtype=object)

### There is no Null values in the database anymore


In [35]:
# check for the missing values if it is null, that means we covered them all 
ltc_df.isna().sum()[ltc_df.isna().sum()!=0]

Series([], dtype: int64)

## 1.3- Dealling with Outliers 

In [36]:
# removing one individual with the age of 17
ltc_df.drop(ltc_df[ltc_df['spec_pat_num_age']==17].index, inplace=True)

# removing the ones with the sex of unknown
ltc_df.drop(ltc_df[ltc_df['patient_gender']=='U'].index, inplace=True)

### What are characteristics of LTC and DSL residents?

In [37]:
total_dsl_res=len(ltc_df[ltc_df['dsl_resident_during_collection']==1])
total_ltc_res=len(ltc_df[ltc_df['dsl_resident_during_collection']==0])

In [38]:
# Sex ratio for ltc residents
ltc_df[ltc_df['dsl_resident_during_collection']==1].patient_gender.value_counts()/total_dsl_res

F    0.663976
M    0.336024
Name: patient_gender, dtype: float64

In [39]:
# sex ratio for dsl residents
ltc_df[ltc_df['dsl_resident_during_collection']==0].patient_gender.value_counts()/total_ltc_res

F    0.613105
M    0.386895
Name: patient_gender, dtype: float64

In [40]:
# age category for the ltc residents
ltc_df[ltc_df['dsl_resident_during_collection']==1].spec_pat_agecat.value_counts()/total_dsl_res

80+      0.660558
70-79    0.192271
60-69    0.092290
50-59    0.039119
40-49    0.009590
30-39    0.005507
18-29    0.000665
Name: spec_pat_agecat, dtype: float64

In [41]:
# age category for the dsl residents
ltc_df[ltc_df['dsl_resident_during_collection']==0].spec_pat_agecat.value_counts()/total_ltc_res

80+      0.645335
70-79    0.193780
60-69    0.097222
50-59    0.040537
40-49    0.013690
30-39    0.006778
18-29    0.002658
Name: spec_pat_agecat, dtype: float64

In [42]:
# results for the ltc residents 
ltc_df[ltc_df['dsl_resident_during_collection']==1].interp_result.value_counts()/total_dsl_res

Negative    0.874383
Positive    0.125617
Name: interp_result, dtype: float64

In [43]:
# results for the dsl residents
ltc_df[ltc_df['dsl_resident_during_collection']==0].interp_result.value_counts()/total_ltc_res

Negative    0.825758
Positive    0.174242
Name: interp_result, dtype: float64

In [44]:
# symptomatic_during_collection for the ltc residents
ltc_df[ltc_df['dsl_resident_during_collection']==1].symptomatic_during_collection.value_counts()/total_dsl_res

N    0.772503
U    0.124478
Y    0.103019
Name: symptomatic_during_collection, dtype: float64

In [45]:
# symptomatic_during_collection for the dsl residents
ltc_df[ltc_df['dsl_resident_during_collection']==0].symptomatic_during_collection.value_counts()/total_ltc_res

N    0.739766
U    0.144737
Y    0.115497
Name: symptomatic_during_collection, dtype: float64

In [46]:
# num_elixhauser_2_years_full_cat for the ltc residents
ltc_df[ltc_df['dsl_resident_during_collection']==1].num_elixhauser_2_years_full_cat.value_counts()/total_dsl_res

1+    0.945594
0     0.054406
Name: num_elixhauser_2_years_full_cat, dtype: float64

In [47]:
# num_elixhauser_2_years_full_cat for the dsl residents
ltc_df[ltc_df['dsl_resident_during_collection']==0].num_elixhauser_2_years_full_cat.value_counts()/total_ltc_res

1+    0.922315
0     0.077685
Name: num_elixhauser_2_years_full_cat, dtype: float64

In [48]:
# cancer_mets_2_years_full for the ltc residents
ltc_df[ltc_df['dsl_resident_during_collection']==1].cancer_mets_2_years_full.value_counts()/total_dsl_res

0    0.986992
1    0.013008
Name: cancer_mets_2_years_full, dtype: float64

In [49]:
# cancer_mets_2_years_full for the dsl residents
ltc_df[ltc_df['dsl_resident_during_collection']==0].cancer_mets_2_years_full.value_counts()/total_ltc_res

0    0.983054
1    0.016946
Name: cancer_mets_2_years_full, dtype: float64

In [50]:
# liver_dis_2_years_full for the ltc residents
ltc_df[ltc_df['dsl_resident_during_collection']==1].liver_dis_2_years_full.value_counts()/total_dsl_res

0    0.973414
1    0.026586
Name: liver_dis_2_years_full, dtype: float64

In [51]:
# liver_dis_2_years_full for the dsl residents
ltc_df[ltc_df['dsl_resident_during_collection']==0].liver_dis_2_years_full.value_counts()/total_ltc_res

0    0.972687
1    0.027313
Name: liver_dis_2_years_full, dtype: float64

# Save the pre-processed data file.

In [52]:
ltc_df.to_hdf("ltc_df.h5",key='ltc_df')