In [7]:
#Import Libs: 
!pip install -U pandasql from pandasql import sqldf pysqldf = lambda q: sqldf(q, globals()) #SQL LIBS
import pandas as pd #Pandas Lib
import numpy as np  #NumPy Lib

#Life expectancy at birth in years from: https://www.data.gov/
url = "https://data.medicare.gov/api/views/9n3s-kdb3/rows.csv?accessType=DOWNLOAD" 
life = pd.read_csv(url)

In [85]:
#~~~~~~~~~Understanding your Data~~~~~~~~~~~~~~
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

#1: print first 10 records
#SQL: select * from life limit 10 (Hive)
life.head(10)

#Same example as 1 but I am going to use SQL insteal 
q ="""SELECT * FROM life limit 5;"""  
df = pysqldf(q) 
df.head(5) 

Unnamed: 0,Hospital Name,Provider Number,State,Measure Name,Number of Discharges,Footnote,Excess Readmission Ratio,Predicted Readmission Rate,Expected Readmission Rate,Number of Readmissions,Start Date,End Date
0,SOUTHEAST ALABAMA MEDICAL CENTER,10001,AL,READM-30-AMI-HRRP,781,,0.9837,15.358,15.6121,119,01-JUL-12,30-JUN-15
1,SOUTHEAST ALABAMA MEDICAL CENTER,10001,AL,READM-30-CABG-HRRP,273,,1.0618,13.8887,13.0809,40,01-JUL-12,30-JUN-15
2,SOUTHEAST ALABAMA MEDICAL CENTER,10001,AL,READM-30-COPD-HRRP,709,,1.0455,19.7525,18.8932,143,01-JUL-12,30-JUN-15
3,SOUTHEAST ALABAMA MEDICAL CENTER,10001,AL,READM-30-HF-HRRP,983,,0.9509,20.2502,21.2964,196,01-JUL-12,30-JUN-15
4,SOUTHEAST ALABAMA MEDICAL CENTER,10001,AL,READM-30-HIP-KNEE-HRRP,335,,1.1198,5.6025,5.0034,21,01-JUL-12,30-JUN-15


In [43]:
#2: List variables in table
#SAS & SQL: 'proc contents' & Hadoop - describe 'table'
life.info()
contents = life.info() #assign data set to session table

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19878 entries, 0 to 19877
Data columns (total 12 columns):
Hospital Name                 19878 non-null object
Provider Number               19878 non-null int64
State                         19878 non-null object
Measure Name                  19878 non-null object
Number of Discharges          19878 non-null object
Footnote                      5563 non-null float64
Excess Readmission Ratio      19878 non-null object
Predicted Readmission Rate    19878 non-null object
Expected Readmission Rate     19878 non-null object
Number of Readmissions        19878 non-null object
Start Date                    19878 non-null object
End Date                      19878 non-null object
dtypes: float64(1), int64(1), object(10)
memory usage: 1.8+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19878 entries, 0 to 19877
Data columns (total 12 columns):
Hospital Name                 19878 non-null object
Provider Number               19878 non-null

In [44]:
#3 What is the number of rows & columns in the dataset
#SAS and SQL: 'select count (*) from life' 
life.shape[0] #Number of Rows
life.shape[1] #Number of Columns

12

In [45]:
#4 Give the column names
#SAS 'proc contents w/ "short" option'
life.columns

Index(['Hospital Name', 'Provider Number', 'State', 'Measure Name',
       'Number of Discharges', 'Footnote', 'Excess Readmission Ratio',
       'Predicted Readmission Rate', 'Expected Readmission Rate',
       'Number of Readmissions', 'Start Date', 'End Date'],
      dtype='object')

In [46]:
#5 What is the data set indexed on?
life.index

RangeIndex(start=0, stop=19878, step=1)

In [132]:
#6 Which hospital name has the most discharges
## Notes: Before you do this you'll need to change to a the number of discharges to a numeric variable
##        Recall that 'float64'  is an option but pandas already has a to_numeric function
##        In SAS you can use  new_variable = input(original_variable, informat.) or depending on your SQL flavor to_char
##        courtesy: https://stackoverflow.com/questions/36874246/convert-a-column-in-pandas-dataframe-from-string-to-float

df = life
cols_to_convert = ['Number of Discharges', 'Expected Readmission Rate']
cols_to_convert

#  You'll have to iterate over each column in the newly defined data frame ('df') 
#  where columns that are converted come from cols_to_convert

for col in cols_to_convert:
    df[col] = pd.to_numeric(df[col], errors='coerce')
    
## Did it work?
df.dtypes
df.head(5)

##Note: pandas group by + sum will only agregate based on float 64 variables (note I only converted 2) 
y = df.groupby('Hospital Name').sum()
y = y.sort_values(cols_to_convert, ascending=False) 
y.head(5)

##You can do this using pandas sql lib as well:
#Same example as 1 but I am going to use SQL insteal 
q ="""SELECT  "Hospital Name", 
               sum("Number of Discharges") as num_disch
      FROM df 
      group by "Hospital Name"
      order by  num_disch desc;
  """  
df = pysqldf(q) 
df.head(5) 

Unnamed: 0,Hospital Name,num_disch
0,FLORIDA HOSPITAL,14138.0
1,GOOD SAMARITAN HOSPITAL,12972.0
2,METHODIST HOSPITAL,12076.0
3,"CHRISTIANA CARE HEALTH SERVICES, INC.",11294.0
4,ST JOSEPH MEDICAL CENTER,10493.0
