In [204]:
#----------Python Libs
#  Unlike SAS and SQL, Python (& R) requires you to install and import libraries,
#  which are essentially global/system macros or functions.
#  Some essential Python Libs:https://medium.com/activewizards-machine-learning-company/top-15-python-libraries-for-data-science-in-in-2017-ab61b4f9b4a7

# An import brings a library into scope, optionally under an alias (e.g. "import pandas as pd")
from pandasql import sqldf #!pip install -U pandasql
def pysqldf(q):
    """Run the SQL string `q` with pandasql's `sqldf` against this notebook's globals.

    Passing `globals()` is what lets a query refer to a DataFrame by its
    variable name (e.g. "SELECT * FROM life").

    Parameters
    ----------
    q : str
        The SQL query text.

    Returns
    -------
    Whatever `sqldf` returns for the query.
    """
    return sqldf(q, globals())
from collections import Counter

import pandas as pd #Pandas Lib
import numpy as np  #NumPy Lib
import matplotlib.pyplot as plt

In [None]:
#----------Reading Data into Python
#   There are many data sources and repositories; this section covers only the simplest one, CSV.
#   Each library documents how to connect to an RDBMS, Hadoop, or other environments.
#   SAS uses 'proc import' with various options to accomplish the same things.
#   Hive & other SQL flavors:
#   drop table something; CREATE TABLE something (TRANS_CONTROL_NUM string ,...)
#   ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
#   LOAD DATA INPATH "path" INTO TABLE something;

#a) Create a CSV from an existing Data Frame ('save as' in a working directory)
#   NOTE(review): `df` is not defined anywhere above this cell, so this line needs a DataFrame
#   that already exists in the kernel -- confirm where it is meant to come from.
#   to_csv() also writes the row index as the first column unless index=False is passed.
path = '/data01/readmits.csv' #replace '/data01/readmits.csv' with a path in your own working directory
df.to_csv(path)

#b) Read a CSV (by default the first row supplies the column names)
df = pd.read_csv(path)

#c) Read a CSV with no column names/headers (columns get integer labels 0..n-1)
df = pd.read_csv(path , header=None)

#d) Read a CSV whilst defining column names
#   When names= is given without header=, pandas treats the file as having no header row,
#   so a header line already in the file would be read as a data record.
df = pd.read_csv(path, names=['mem_id', 'admit_dt', 'discharge_dt', 'Age', 'ICD1_DX', 'dis_disp'])

#e) Read while also treating "." as missing (there is a whole section on missing values below)
df = pd.read_csv(path, na_values=['.'])

In [225]:
# Load the working data set (`life`) used by the following cells from a public download.
#   Data owned and sourced from: https://www.data.gov/
url = "https://data.medicare.gov/api/views/9n3s-kdb3/rows.csv?accessType=DOWNLOAD"
life = pd.read_csv(filepath_or_buffer=url)

In [182]:
#----------Contents of data set(s)
#SAS & SQL equivalents: 'proc contents' (SAS) and "describe 'table'" (Hadoop)
# info() prints the index range, each column's non-null count and dtype, and the memory usage.
life.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19878 entries, 0 to 19877
Data columns (total 12 columns):
Hospital Name                 19878 non-null object
Provider Number               19878 non-null int64
State                         19878 non-null object
Measure Name                  19878 non-null object
Number of Discharges          19878 non-null object
Footnote                      5563 non-null float64
Excess Readmission Ratio      19878 non-null object
Predicted Readmission Rate    19878 non-null object
Expected Readmission Rate     19878 non-null object
Number of Readmissions        19878 non-null object
Start Date                    19878 non-null object
End Date                      19878 non-null object
dtypes: float64(1), int64(1), object(10)
memory usage: 1.8+ MB


In [273]:
#----------First 'n' data records
# NOTE: a notebook cell renders only the value of its LAST expression, so of the views below
#       only the final df.head(10) is displayed; run the others in their own cell to see them.
#Show the first 10 records
#SQL: select * from life limit 10 (Hive) OR 'proc print' in SAS
life.head(10)

#Select specific columns to look at
life[['Measure Name', 'Hospital Name']].head(10)

#View the first two records in a data set or matrix
life[:2]

#Same idea as the first example, but using SQL instead
# NOTE: the query itself limits the result to 5 rows, so head(10) below shows only those 5.
q ="""SELECT * FROM life limit 5;"""  
df = pysqldf(q) 
df.head(10) 

Unnamed: 0,Hospital Name,Provider Number,State,Measure Name,Number of Discharges,Footnote,Excess Readmission Ratio,Predicted Readmission Rate,Expected Readmission Rate,Number of Readmissions,Start Date,End Date
0,SOUTHEAST ALABAMA MEDICAL CENTER,10001,AL,READM-30-AMI-HRRP,781.0,,0.9837,15.358,15.6121,119,2012-07-01 00:00:00.000000,2015-06-30 00:00:00.000000
1,SOUTHEAST ALABAMA MEDICAL CENTER,10001,AL,READM-30-CABG-HRRP,273.0,,1.0618,13.8887,13.0809,40,2012-07-01 00:00:00.000000,2015-06-30 00:00:00.000000
2,SOUTHEAST ALABAMA MEDICAL CENTER,10001,AL,READM-30-COPD-HRRP,709.0,,1.0455,19.7525,18.8932,143,2012-07-01 00:00:00.000000,2015-06-30 00:00:00.000000
3,SOUTHEAST ALABAMA MEDICAL CENTER,10001,AL,READM-30-HF-HRRP,983.0,,0.9509,20.2502,21.2964,196,2012-07-01 00:00:00.000000,2015-06-30 00:00:00.000000
4,SOUTHEAST ALABAMA MEDICAL CENTER,10001,AL,READM-30-HIP-KNEE-HRRP,335.0,,1.1198,5.6025,5.0034,21,2012-07-01 00:00:00.000000,2015-06-30 00:00:00.000000


In [274]:
#----------Column & Row views
# `df = life` is an alias, not a copy: both names refer to the same DataFrame.
df = life

#View a column (returns a Series)
df['State']

#View 2+ columns (pass a list of column names; returns a DataFrame)
df[['State', 'Provider Number']]

#View a row by its index label... not the best way but a demonstration.
# .loc replaces the .ix indexer, which was deprecated in pandas 0.20 and removed in 1.0.
# On an integer index .ix[1] was a label lookup, so .loc[1] returns the same row.
df.loc[1]

Hospital Name                 SOUTHEAST ALABAMA MEDICAL CENTER
Provider Number                                          10001
State                                                       AL
Measure Name                                READM-30-CABG-HRRP
Number of Discharges                                     273.0
Footnote                                                   NaN
Excess Readmission Ratio                                1.0618
Predicted Readmission Rate                             13.8887
Expected Readmission Rate                              13.0809
Number of Readmissions                                      40
Start Date                                 2012-07-01 00:00:00
End Date                                   2015-06-30 00:00:00
Name: 1, dtype: object

In [242]:
#----------Converting variables (strings, numbers, etc.)
# The CSV read stored most variables as objects, so they need converting before use.
# NOTE: `df = life` is an alias, not a copy -- every conversion below also changes `life`.
df = life
cols_to_convert = ['Number of Discharges', 'Expected Readmission Rate']

#--TO NUMERIC
#  Iterate over the columns named in cols_to_convert and convert each one in 'df'.
#  errors='coerce' turns values that cannot be parsed as numbers into NaN instead of raising.
for col in cols_to_convert:
    df[col] = pd.to_numeric(df[col], errors='coerce')
#You can do ALL columns to numeric: df.apply(pd.to_numeric)
## Did it work? (printed explicitly: a bare expression in the middle of a cell is not displayed)
print(df[cols_to_convert].dtypes)

#--TO CHARACTER
# Convert the same columns to strings. This does not restore the original text:
# the numeric values (including any NaN produced above) are simply rendered as strings.
for col in cols_to_convert:
    df[col] = df[col].astype(str)
## Did it work?
print(df[cols_to_convert].dtypes)

#--TO DATE
cols_to_convert = ['Start Date', 'End Date']
#  Same pattern: iterate over the listed columns; unparseable values become NaT (errors='coerce').
for col in cols_to_convert:
    df[col] = pd.to_datetime(df[col], errors='coerce')
## Did it work? This is the last expression of the cell, so the notebook renders it.
df.dtypes

Hospital Name                         object
Provider Number                        int64
State                                 object
Measure Name                          object
Number of Discharges                  object
Footnote                             float64
Excess Readmission Ratio              object
Predicted Readmission Rate            object
Expected Readmission Rate             object
Number of Readmissions                object
Start Date                    datetime64[ns]
End Date                      datetime64[ns]
dtype: object

In [272]:
#----------Data Dimensions & Indexes
# Size of the data set
len(life.index)    #Number of Rows
len(life.columns)  #Number of Columns

# Per-State count of non-null values in every column (the result is indexed by State)
df = life.groupby(by='State').count()
df.iloc[:3]

# Index label (State) with the largest value in the 'Number of Discharges' count column
df.loc[:, 'Number of Discharges'].idxmax()

# List Unique Values
life.loc[:, 'State'].unique()

# Keep the unique State values for later use (e.g. updates or mappings, as done in SAS/SQL data management)
x = life.loc[:, 'State'].unique()

#What is the data set indexed on?
life.index

#Which index values does the grouped 'df' currently have? (last expression, so this is what renders)
df.index.values

array(['AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA',
       'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME',
       'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM',
       'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX',
       'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY'], dtype=object)

In [275]:
#----------How to handle MISSING values
# isnull()  > generates a boolean mask to indicate missing values
# notnull() > opposite of isnull()
# dropna()  > returns a filtered version of the data
# fillna()  > returns a copy of data with missing values filled or imputed

#--isnull()  > generates a boolean mask to indicate missing values
#Give me the number of missing values by field.
# Index `life` itself: the loop walks life.columns, and `df` may be a different frame
# left over from an earlier cell (e.g. the State-grouped counts, which have no 'State' column).
for col in life.columns:
    print (col, end="~~Number of Missing Values >")
    print (life[col].isnull().sum())
#Want to see how it's done in the background?
df_null = life.isnull()
df_null.head(5)

#--dropna()  > returns a filtered version of the data
#  Drop rows that contain any missing value
df_no_missing = life.dropna()
#Drop rows only where every cell in that row is NA
df_cleaned = life.dropna(how='all')
df_cleaned.head(5)

#--fillna()  > returns a copy of data with missing values filled or imputed
#Create a new column and set it to a constant (number or string) or to missing.
# NOTE: `df=life` is an alias, not a copy, so the 'test_c' column is added to `life` as well.
df=life
#with some value
df['test_c'] = "Over"
df.head(5)
#with NaN (overwrites the value set above)
df['test_c'] = np.nan
df.head(5)
#Fill in missing data with zeros; fillna returns a new frame, so `life` keeps its NaN values
df = df.fillna(0)
df.head(10)

Hospital Name~~Number of Missing Values >0
Provider Number~~Number of Missing Values >0
State~~Number of Missing Values >0
Measure Name~~Number of Missing Values >0
Number of Discharges~~Number of Missing Values >0
Footnote~~Number of Missing Values >14315
Excess Readmission Ratio~~Number of Missing Values >0
Predicted Readmission Rate~~Number of Missing Values >0
Expected Readmission Rate~~Number of Missing Values >0
Number of Readmissions~~Number of Missing Values >0
Start Date~~Number of Missing Values >0
End Date~~Number of Missing Values >0


Unnamed: 0,Hospital Name,Provider Number,State,Measure Name,Number of Discharges,Footnote,Excess Readmission Ratio,Predicted Readmission Rate,Expected Readmission Rate,Number of Readmissions,Start Date,End Date,test_c
0,SOUTHEAST ALABAMA MEDICAL CENTER,10001,AL,READM-30-AMI-HRRP,781.0,0.0,0.9837,15.3580,15.6121,119,2012-07-01,2015-06-30,0.0
1,SOUTHEAST ALABAMA MEDICAL CENTER,10001,AL,READM-30-CABG-HRRP,273.0,0.0,1.0618,13.8887,13.0809,40,2012-07-01,2015-06-30,0.0
2,SOUTHEAST ALABAMA MEDICAL CENTER,10001,AL,READM-30-COPD-HRRP,709.0,0.0,1.0455,19.7525,18.8932,143,2012-07-01,2015-06-30,0.0
3,SOUTHEAST ALABAMA MEDICAL CENTER,10001,AL,READM-30-HF-HRRP,983.0,0.0,0.9509,20.2502,21.2964,196,2012-07-01,2015-06-30,0.0
4,SOUTHEAST ALABAMA MEDICAL CENTER,10001,AL,READM-30-HIP-KNEE-HRRP,335.0,0.0,1.1198,5.6025,5.0034,21,2012-07-01,2015-06-30,0.0
5,SOUTHEAST ALABAMA MEDICAL CENTER,10001,AL,READM-30-PN-HRRP,664.0,0.0,1.0892,17.2654,15.8507,119,2012-07-01,2015-06-30,0.0
6,MARSHALL MEDICAL CENTER SOUTH,10005,AL,READM-30-AMI-HRRP,,0.0,0.9905,16.3958,16.5529,Too Few to Report,2012-07-01,2015-06-30,0.0
7,MARSHALL MEDICAL CENTER SOUTH,10005,AL,READM-30-CABG-HRRP,,5.0,Not Available,Not Available,,Not Available,2012-07-01,2015-06-30,0.0
8,MARSHALL MEDICAL CENTER SOUTH,10005,AL,READM-30-COPD-HRRP,667.0,0.0,0.9014,17.0536,18.9196,107,2012-07-01,2015-06-30,0.0
9,MARSHALL MEDICAL CENTER SOUTH,10005,AL,READM-30-HF-HRRP,389.0,0.0,1.0001,21.5918,21.5891,84,2012-07-01,2015-06-30,0.0


In [289]:
#----------Drop Rows and Columns
#Drop records/rows by index label (drop returns a new frame; 'df' itself is unchanged)
df=life
len(df.index) #Number of Rows 19878
len(df.drop(labels=[1, 2], axis=0).index) #Number of Rows 19878-2 = 19876

#Drop a variable (column) | axis='columns' (same as axis=1) says the label refers to a column
len(df.columns) #Current Number of Columns = 13
len(df.drop(labels='State', axis='columns').columns) #New Number of Columns = 12

#Drop records that contain a value like 'AL' by keeping only the rows where State is not 'AL'
df.loc[df['State'].ne('AL')].head(5)

Unnamed: 0,Hospital Name,Provider Number,State,Measure Name,Number of Discharges,Footnote,Excess Readmission Ratio,Predicted Readmission Rate,Expected Readmission Rate,Number of Readmissions,Start Date,End Date,test_c
498,PROVIDENCE ALASKA MEDICAL CENTER,20001,AK,READM-30-AMI-HRRP,385.0,,0.8867,12.9298,14.5825,43,2012-07-01,2015-06-30,
499,PROVIDENCE ALASKA MEDICAL CENTER,20001,AK,READM-30-CABG-HRRP,,,0.9064,12.044,13.287,Too Few to Report,2012-07-01,2015-06-30,
500,PROVIDENCE ALASKA MEDICAL CENTER,20001,AK,READM-30-COPD-HRRP,261.0,,0.9985,18.4289,18.457,48,2012-07-01,2015-06-30,
501,PROVIDENCE ALASKA MEDICAL CENTER,20001,AK,READM-30-HF-HRRP,385.0,,0.8827,18.8314,21.3343,65,2012-07-01,2015-06-30,
502,PROVIDENCE ALASKA MEDICAL CENTER,20001,AK,READM-30-HIP-KNEE-HRRP,528.0,,0.9606,3.7449,3.8984,19,2012-07-01,2015-06-30,
