# Texas Hospital Discharge - Import

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import pandas_profiling

import seaborn as sns
sns.set_style("darkgrid")

from IPython.display import display, Markdown
pd.set_option('display.max_columns', None)  

import glob, os
SEED = 42
feature_labels = {}

## Download Files

In [None]:
for d in ["src", "data", "doc", "output"]:
    os.makedirs(d, exist_ok=True)

In [None]:
URL = "https://kmurphy.bitbucket.io/modules/Data_Mining_2/topics/05-Assignment/01-Specification/files/"

files = "my_lib.py train.csv.gz grading.csv.gz Facility_type1q2013_tab.zip Facility_type2q2013_tab.zip Facility_type3q2013_tab.zip Facility_type4q2013_tab.zip UserManual1Q2013.pdf"

for filename in files.split(" "):
    
    ext = filename.split(".")[-1]
    dest = {"pdf":"doc", "py":".", "ipynb":".", "gz":"src", "zip":"src"}[ext]
   
    source = f"{URL}/{filename}"
    target = f"{dest}/{filename}"

    if not os.path.isfile(target):
        print (f"Downloading remote file {filename}", sep="")
        import urllib.request
        urllib.request.urlretrieve(source, target)
    else:
        print(f"Using local copy of {filename} in folder {dest}")

In [None]:
df = pd.read_csv(f"src/train.csv.gz", dtype=str)
df_grading = pd.read_csv(f"src/grading.csv.gz", dtype=str)

print(df.shape)
print(df_grading.shape)

## Deleting Columns if half of its rows is empty

In [5]:
null_list = []
for i in df.columns:
    if df[i].isnull().sum() >= 50000 and i != "SEX_CODE":
        null_list.append(i)

In [6]:
# Dropping irrelevant columns to reduce size of dataset
def drop_irrelevant_columns(df_in):
    # Dropping irrelevant columns 
    print(f'\nDropping {len(null_list)} columns ....')
    new_df = df_in.drop(columns=null_list)   
    print(f"New Dataframe: {new_df.shape}")
    
    return new_df

In [7]:
# Delete all columns where sum of null values >= 50,000
df = drop_irrelevant_columns(df)
df_grading = drop_irrelevant_columns(df_grading)


Dropping 152 columns ....
New Dataframe: (1000000, 42)

Dropping 152 columns ....
New Dataframe: (100000, 41)


## Construct Target 

In [8]:
df.dropna(subset=["LENGTH_OF_STAY"], inplace=True)
df.LENGTH_OF_STAY = df.LENGTH_OF_STAY.astype(int)

In [9]:
df["TARGET"] = df.LENGTH_OF_STAY.apply(lambda x: "short" if x<3 else ("medium" if x<=6 else "long"))
df.TARGET.value_counts(dropna=False)

short     414152
medium    381437
long      204109
Name: TARGET, dtype: int64

In [10]:
# Shuffle or randomize the dataframe
df_sample = df.sample(frac=1, random_state=SEED)
print(df_sample.shape)

(999698, 43)


## Split 1M rows in out data into 20 smaller csv files 

In [11]:
parts = 20
nrows = df.shape[0] // parts
for k in range(parts):
    filename = ('data/df_train_sample_%02d_of_%d.csv' % (k, parts))
    print(filename)
    df_sample.iloc[k*nrows:(k+1)*nrows].to_csv(filename, index=False)

# for parts in [40,20,10,5,2,1]:
#     nrows = df.shape[0] // parts
#     filename = ('data/df_train_sample_%02d_of_%d.csv' % (k, parts))
#     print(filename)
#     df_sample.iloc[k*nrows:(k+1)*nrows].to_csv(filename, index=False)

data/df_train_sample_00_of_20.csv
data/df_train_sample_01_of_20.csv
data/df_train_sample_02_of_20.csv
data/df_train_sample_03_of_20.csv
data/df_train_sample_04_of_20.csv
data/df_train_sample_05_of_20.csv
data/df_train_sample_06_of_20.csv
data/df_train_sample_07_of_20.csv
data/df_train_sample_08_of_20.csv
data/df_train_sample_09_of_20.csv
data/df_train_sample_10_of_20.csv
data/df_train_sample_11_of_20.csv
data/df_train_sample_12_of_20.csv
data/df_train_sample_13_of_20.csv
data/df_train_sample_14_of_20.csv
data/df_train_sample_15_of_20.csv
data/df_train_sample_16_of_20.csv
data/df_train_sample_17_of_20.csv
data/df_train_sample_18_of_20.csv
data/df_train_sample_19_of_20.csv


## Saving Grading Datset

In [12]:
df_grading.to_csv("data/grading.csv", index=False)

## Facility Dataset

In [13]:
# Import facility
df_facility_raw = pd.read_csv("src/Facility_type1q2013_tab.zip", sep="\t")
df_facility_quarters = [pd.read_csv("src/Facility_type%sq2013_tab.zip" % k, sep="\t") for k in range(1,4)]

In [14]:
# First clean: drop column and fill in null values
df_facility = pd.concat(df_facility_quarters, ignore_index=True)
df_facility.drop(columns=["Unnamed: 10"], inplace=True)
df_facility.fillna(0, inplace=True)

In [15]:
display(df_facility.head())
display(df_facility.shape)

Unnamed: 0,THCIC_ID,PROVIDER_NAME,FAC_TEACHING_IND,FAC_PSYCH_IND,FAC_REHAB_IND,FAC_ACUTE_CARE_IND,FAC_SNF_IND,FAC_LONG_TERM_AC_IND,FAC_OTHER_LTC_IND,FAC_PEDS_IND
0,100,Austin State Hospital,0,X,0,0,0,0,0,0
1,101,Big Spring State Hospital,0,X,0,0,0,0,0,0
2,102,UT Medical Branch Hospital,A,0,X,X,0,0,0,X
3,104,Rio Grande State Center,0,X,0,0,0,0,0,0
4,105,UT MD Anderson Cancer Center,A,0,0,X,0,0,0,0


(1748, 10)

In [16]:
# Second clean: drop duplicates
df_facility.drop_duplicates(subset="THCIC_ID", keep="last", inplace=True)
display(df_facility.head())
display(df_facility.shape)

Unnamed: 0,THCIC_ID,PROVIDER_NAME,FAC_TEACHING_IND,FAC_PSYCH_IND,FAC_REHAB_IND,FAC_ACUTE_CARE_IND,FAC_SNF_IND,FAC_LONG_TERM_AC_IND,FAC_OTHER_LTC_IND,FAC_PEDS_IND
347,724900,Brownsville Doctors Hospital,0,0,0,X,0,0,0,X
499,854000,Twin Creeks Hospital,0,0,X,0,0,0,0,0
531,907000,Renaissance Hospital-Groves,0,0,0,X,0,0,0,0
544,939000,GlobalRehab Hospital-San Antonio,0,0,X,0,0,0,0,0
983,798500,Austin Surgical Hospital,0,0,0,0,0,0,0,X


(597, 10)

In [17]:
# Third clean: binarize row values
def binarize(txt):
    bin_swap = {"0":0, "A":1, "C":1, "X":1, "x":1}
    return int(txt.replace(txt, str(bin_swap[txt])))
    
for col in df_facility.columns[2:]:
    df_facility[col] = df_facility[col].apply(lambda txt: binarize(str(txt))) 

In [18]:
# save cleaned facility dataset
df_facility.to_csv("data/facility.csv", index=False)
df_facility.head()

Unnamed: 0,THCIC_ID,PROVIDER_NAME,FAC_TEACHING_IND,FAC_PSYCH_IND,FAC_REHAB_IND,FAC_ACUTE_CARE_IND,FAC_SNF_IND,FAC_LONG_TERM_AC_IND,FAC_OTHER_LTC_IND,FAC_PEDS_IND
347,724900,Brownsville Doctors Hospital,0,0,0,1,0,0,0,1
499,854000,Twin Creeks Hospital,0,0,1,0,0,0,0,0
531,907000,Renaissance Hospital-Groves,0,0,0,1,0,0,0,0
544,939000,GlobalRehab Hospital-San Antonio,0,0,1,0,0,0,0,0
983,798500,Austin Surgical Hospital,0,0,0,0,0,0,0,1
