# Texas Hospital Discharge - Import

In [72]:
# Libraries and notebook-wide settings used by the cells below.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import pandas_profiling

import seaborn as sns
sns.set_style("darkgrid")  # seaborn theme for any plots drawn later

from IPython.display import display, Markdown
pd.set_option('display.max_columns', None)  # show every column when a frame is displayed

import glob, os
SEED = 42  # random_state for the row shuffle, so the shuffled order is repeatable

## Download Files

In [2]:
# Make sure the working folders exist; folders that are already present are left as they are.
for folder in ("src", "data", "doc", "output"):
    os.makedirs(folder, exist_ok=True)

In [3]:
import urllib.request

# Base address of the assignment files. It ends with "/", so the slash is stripped
# when a file address is built to avoid a "//" in the request path.
URL = "https://kmurphy.bitbucket.io/modules/Data_Mining_2/topics/05-Assignment/01-Specification/files/"

files = "my_lib.py train.csv.gz grading.csv.gz Facility_type1q2013_tab.zip Facility_type2q2013_tab.zip Facility_type3q2013_tab.zip Facility_type4q2013_tab.zip UserManual1Q2013.pdf"

for filename in files.split():

    # The file extension decides the local folder; an unknown extension raises KeyError.
    ext = filename.split(".")[-1]
    dest = {"pdf":"doc", "py":".", "ipynb":".", "gz":"src", "zip":"src"}[ext]

    source = f"{URL.rstrip('/')}/{filename}"
    target = f"{dest}/{filename}"

    if not os.path.isfile(target):
        print(f"Downloading remote file {filename}")
        # Download to a temporary name and rename once complete, so an interrupted
        # transfer is never mistaken for a finished local copy on the next run.
        partial = f"{target}.part"
        urllib.request.urlretrieve(source, partial)
        os.replace(partial, target)
    else:
        print(f"Using local copy of {filename} in folder {dest}")

Downloading remote file my_lib.py
Downloading remote file train.csv.gz
Downloading remote file grading.csv.gz
Downloading remote file Facility_type1q2013_tab.zip
Downloading remote file Facility_type2q2013_tab.zip
Downloading remote file Facility_type3q2013_tab.zip
Downloading remote file Facility_type4q2013_tab.zip
Downloading remote file UserManual1Q2013.pdf


In [57]:
# Load the training extract with every column read as a string (dtype=str),
# then show its (rows, columns) shape.
df = pd.read_csv(filepath_or_buffer="src/train.csv.gz", dtype=str)
df.shape

(1000000, 194)

## Dropping columns with at least 50,000 missing values (5% of the rows)

In [71]:
# Count the missing values per column once and derive both summaries from it.
# Nothing is printed here; the results are kept in the two variables below.
null_counts = df.isnull().sum()

# names of all columns that contain at least one null value
null_list = null_counts[null_counts > 0].index.tolist()

# how much of our data is missing? (the five columns with the most nulls)
isnull_count = null_counts.sort_values(ascending=False).head()

In [62]:
# Gather the names of the columns with at least 50,000 missing values
# (5% of the 1,000,000 rows loaded); the frame itself is not changed here.
missing_per_column = df.isnull().sum()
nl = missing_per_column[missing_per_column >= 50000].index.tolist()
print(df.shape)

(1000000, 194)


In [64]:
# Remove the columns listed in `nl` and show the shape of the reduced frame.
print(f'Dropping {len(nl)} columns..')
df = df.drop(labels=nl, axis="columns")
display(df.shape)

Dropping 153 columns..


(1000000, 41)

## Construct Target 

In [65]:
# Discard the rows that have no LENGTH_OF_STAY, then convert the remaining
# values (read as strings) to integers.
df = (
    df.dropna(subset=["LENGTH_OF_STAY"])
      .assign(LENGTH_OF_STAY=lambda frame: frame["LENGTH_OF_STAY"].astype(int))
)

In [66]:
# Bucket LENGTH_OF_STAY into three labels:
#   below 3 -> "short", 3 to 6 inclusive -> "medium", above 6 -> "long".
# np.select takes the first condition that holds, so the second test only
# sees values that are already >= 3.
stay_length = df["LENGTH_OF_STAY"]
df["TARGET"] = np.select(
    [stay_length < 3, stay_length <= 6],
    ["short", "medium"],
    default="long",
)
df.TARGET.value_counts(dropna=False)

short     414152
medium    381437
long      204109
Name: TARGET, dtype: int64

In [73]:
# Put all rows in random order; the fixed seed makes the order repeatable.
# `df` itself is left untouched, so its printed shape equals that of `df_sample`.
df_sample = df.sample(frac=1.0, random_state=SEED)
print(df.shape)

(999698, 42)


## Split the ~1M rows of our data into 20 smaller CSV files

In [68]:
# Write the shuffled rows to `parts` CSV files of near-equal size.
# Slice boundaries are k*total//parts, so every row lands in exactly one file.
# Fixed-size slices of total//parts rows would leave total % parts rows unwritten
# (18 rows for 999,698 rows and 20 parts).
parts = 20
total_rows = df_sample.shape[0]
nrows = total_rows // parts  # smallest file size; each file holds nrows or nrows + 1 rows
for k in range(parts):
    filename = 'data/df_train_sample_%02d_of_%d.csv' % (k, parts)
    print(filename)
    start = (k * total_rows) // parts
    stop = ((k + 1) * total_rows) // parts
    df_sample.iloc[start:stop].to_csv(filename, index=False)

# for parts in [40,20,10,5,2,1]:
#     nrows = df.shape[0] // parts
#     filename = ('data/df_train_sample_%02d_of_%d.csv' % (k, parts))
#     print(filename)
#     df_sample.iloc[k*nrows:(k+1)*nrows].to_csv(filename, index=False)

data/df_train_sample_00_of_20.csv
data/df_train_sample_01_of_20.csv
data/df_train_sample_02_of_20.csv
data/df_train_sample_03_of_20.csv
data/df_train_sample_04_of_20.csv
data/df_train_sample_05_of_20.csv
data/df_train_sample_06_of_20.csv
data/df_train_sample_07_of_20.csv
data/df_train_sample_08_of_20.csv
data/df_train_sample_09_of_20.csv
data/df_train_sample_10_of_20.csv
data/df_train_sample_11_of_20.csv
data/df_train_sample_12_of_20.csv
data/df_train_sample_13_of_20.csv
data/df_train_sample_14_of_20.csv
data/df_train_sample_15_of_20.csv
data/df_train_sample_16_of_20.csv
data/df_train_sample_17_of_20.csv
data/df_train_sample_18_of_20.csv
data/df_train_sample_19_of_20.csv


## Grading Dataset

In [69]:
# Load the grading extract, reading every column as a string (dtype=str).
df_grading = pd.read_csv(filepath_or_buffer="src/grading.csv.gz", dtype=str)

In [70]:
# Save the grading data as a plain CSV in data/, leaving out the index column.
df_grading.to_csv(path_or_buf="data/grading.csv", index=False)