In [1]:
import findspark
import pyspark as ps
import warnings
from pyspark.sql import SQLContext
from pyspark import SparkContext
from pyspark.sql import SparkSession

# Create (or reuse) the Spark context so that `sc` and `sqlContext` are always
# defined after this cell runs, including when it is re-run in a live kernel.
try:
    # 'local[4]' runs Spark locally with 4 worker threads (the author's laptop has 4 CPUs).
    sc = ps.SparkContext('local[4]')
    print("Just created a SparkContext")
except ValueError:
    # A ValueError here means a SparkContext is already active. Reuse it instead of
    # leaving `sc` unbound, which would make later cells fail with NameError.
    warnings.warn("SparkContext already exists in this scope")
    sc = ps.SparkContext.getOrCreate()

# Built outside the try block so that it exists on both paths.
sqlContext = SQLContext(sc)




Just created a SparkContext


In [2]:
# Configure a CSV reader that takes column names from the header line and
# infers column types, then load the cleaned inpatient-charges file.
csv_reader = (
    sqlContext.read
    .format('com.databricks.spark.csv')
    .option('header', True)
    .option('inferSchema', True)
)
df = csv_reader.load('inpatient_charges_cleaned.csv')
type(df)

pyspark.sql.dataframe.DataFrame

In [3]:
# Display the DataFrame's repr, which lists column names and types only (no rows).
df

DataFrame[drg_definition: string, provider_id: int, provider_name: string, provider_street_address: string, provider_city: string, provider_state: string, provider_zip_code: int, hospital_referral_region_description: string, total_discharges: int, average_covered_charges: double, average_total_payments: double, average_medicare_payments: double]

In [4]:
# Print the first 5 rows as a text table.
df.show(5)

+--------------------+-----------+--------------------+-----------------------+-------------+--------------+-----------------+------------------------------------+----------------+-----------------------+----------------------+-------------------------+
|      drg_definition|provider_id|       provider_name|provider_street_address|provider_city|provider_state|provider_zip_code|hospital_referral_region_description|total_discharges|average_covered_charges|average_total_payments|average_medicare_payments|
+--------------------+-----------+--------------------+-----------------------+-------------+--------------+-----------------+------------------------------------+----------------+-----------------------+----------------------+-------------------------+
|039 - EXTRACRANIA...|      10001|SOUTHEAST ALABAMA...|   1108 ROSS CLARK C...|       DOTHAN|            AL|            36301|                         AL - Dothan|              91|               32963.07|               5777.24|           

In [5]:
# Inspect the schema object: a StructType holding one StructField (name, type, nullable) per column.
df.schema

StructType(List(StructField(drg_definition,StringType,true),StructField(provider_id,IntegerType,true),StructField(provider_name,StringType,true),StructField(provider_street_address,StringType,true),StructField(provider_city,StringType,true),StructField(provider_state,StringType,true),StructField(provider_zip_code,IntegerType,true),StructField(hospital_referral_region_description,StringType,true),StructField(total_discharges,IntegerType,true),StructField(average_covered_charges,DoubleType,true),StructField(average_total_payments,DoubleType,true),StructField(average_medicare_payments,DoubleType,true)))

In [6]:
# dropna() returns a NEW DataFrame without the rows (not columns) that contain a
# null; it does not modify `df`. Bind the result to a name so the call has an
# effect, and leave `df` untouched for the cells that follow.
df_no_nulls = df.dropna()
df_no_nulls.show()

+--------------------+-----------+--------------------+-----------------------+-------------+--------------+-----------------+------------------------------------+----------------+-----------------------+----------------------+-------------------------+
|      drg_definition|provider_id|       provider_name|provider_street_address|provider_city|provider_state|provider_zip_code|hospital_referral_region_description|total_discharges|average_covered_charges|average_total_payments|average_medicare_payments|
+--------------------+-----------+--------------------+-----------------------+-------------+--------------+-----------------+------------------------------------+----------------+-----------------------+----------------------+-------------------------+
|039 - EXTRACRANIA...|      10001|SOUTHEAST ALABAMA...|   1108 ROSS CLARK C...|       DOTHAN|            AL|            36301|                         AL - Dothan|              91|               32963.07|               5777.24|           

In [7]:
# Total number of rows in df.
df.count()

163065

In [8]:
# Summary statistics (count, mean, stddev, min, max) for the provider_id column.
provider_id_summary = df.describe('provider_id')
provider_id_summary.show()

+-------+------------------+
|summary|       provider_id|
+-------+------------------+
|  count|            163065|
|   mean|255569.86542789685|
| stddev|151563.67176686018|
|    min|             10001|
|    max|            670077|
+-------+------------------+



In [9]:
# Project just the provider_id column and print its leading rows.
provider_ids = df.select('provider_id')
provider_ids.show()

+-----------+
|provider_id|
+-----------+
|      10001|
|      10005|
|      10006|
|      10011|
|      10016|
|      10023|
|      10029|
|      10033|
|      10039|
|      10040|
|      10046|
|      10055|
|      10056|
|      10078|
|      10083|
|      10085|
|      10090|
|      10092|
|      10100|
|      10103|
+-----------+
only showing top 20 rows



In [10]:
# Preview df with nulls replaced by -1. The result is only displayed, not assigned,
# so df itself is unchanged.
# NOTE(review): per the PySpark docs an integer fill value is applied only to
# numeric columns; nulls in string columns would remain as null - confirm.
df.fillna(-1).show()

+--------------------+-----------+--------------------+-----------------------+-------------+--------------+-----------------+------------------------------------+----------------+-----------------------+----------------------+-------------------------+
|      drg_definition|provider_id|       provider_name|provider_street_address|provider_city|provider_state|provider_zip_code|hospital_referral_region_description|total_discharges|average_covered_charges|average_total_payments|average_medicare_payments|
+--------------------+-----------+--------------------+-----------------------+-------------+--------------+-----------------+------------------------------------+----------------+-----------------------+----------------------+-------------------------+
|039 - EXTRACRANIA...|      10001|SOUTHEAST ALABAMA...|   1108 ROSS CLARK C...|       DOTHAN|            AL|            36301|                         AL - Dothan|              91|               32963.07|               5777.24|           

In [11]:
# Print the schema as a tree: one line per column with its type and nullability.
df.printSchema()

root
 |-- drg_definition: string (nullable = true)
 |-- provider_id: integer (nullable = true)
 |-- provider_name: string (nullable = true)
 |-- provider_street_address: string (nullable = true)
 |-- provider_city: string (nullable = true)
 |-- provider_state: string (nullable = true)
 |-- provider_zip_code: integer (nullable = true)
 |-- hospital_referral_region_description: string (nullable = true)
 |-- total_discharges: integer (nullable = true)
 |-- average_covered_charges: double (nullable = true)
 |-- average_total_payments: double (nullable = true)
 |-- average_medicare_payments: double (nullable = true)



In [12]:
# List the column names in schema order.
df.columns

['drg_definition',
 'provider_id',
 'provider_name',
 'provider_street_address',
 'provider_city',
 'provider_state',
 'provider_zip_code',
 'hospital_referral_region_description',
 'total_discharges',
 'average_covered_charges',
 'average_total_payments',
 'average_medicare_payments']

In [13]:
# Fetch the two leading rows, then convert the first Row into a plain dict
# keyed by column name.
leading_rows = df.head(2)
dict1 = leading_rows[0].asDict()

In [14]:
# Display the first row as a dict mapping column name to value.
dict1

{'drg_definition': '039 - EXTRACRANIAL PROCEDURES W/O CC/MCC',
 'provider_id': 10001,
 'provider_name': 'SOUTHEAST ALABAMA MEDICAL CENTER',
 'provider_street_address': '1108 ROSS CLARK CIRCLE',
 'provider_city': 'DOTHAN',
 'provider_state': 'AL',
 'provider_zip_code': 36301,
 'hospital_referral_region_description': 'AL - Dothan',
 'total_discharges': 91,
 'average_covered_charges': 32963.07,
 'average_total_payments': 5777.24,
 'average_medicare_payments': 4763.73}

In [15]:
# Obtain a SparkSession handle for running SQL. getOrCreate() returns the session
# that is already active when there is one, rather than building a second one.
# NOTE(review): in that case the 'SQL' app name presumably has no effect - confirm.
spark1 = SparkSession.builder.appName('SQL').getOrCreate()

In [16]:
# Register df as the temporary view 'InpatientCharges' so it can be queried with SQL,
# replacing any existing temp view of that name.
df.createOrReplaceTempView('InpatientCharges')

In [17]:
# Define a SQL query for five rows of the InpatientCharges temp view; the result
# is a DataFrame bound to `query`.
sql_text = 'SELECT * FROM InpatientCharges LIMIT 5'
query = spark1.sql(sql_text)

In [18]:
# Display the query result's repr, which lists column names and types only (no rows).
query

DataFrame[drg_definition: string, provider_id: int, provider_name: string, provider_street_address: string, provider_city: string, provider_state: string, provider_zip_code: int, hospital_referral_region_description: string, total_discharges: int, average_covered_charges: double, average_total_payments: double, average_medicare_payments: double]

In [19]:
# List the column names of the query result.
query.columns

['drg_definition',
 'provider_id',
 'provider_name',
 'provider_street_address',
 'provider_city',
 'provider_state',
 'provider_zip_code',
 'hospital_referral_region_description',
 'total_discharges',
 'average_covered_charges',
 'average_total_payments',
 'average_medicare_payments']

In [20]:
# Redistribute df into 10 partitions and report the partition count of the
# result; df itself is not reassigned.
df_repartitioned = df.repartition(10)
df_repartitioned.rdd.getNumPartitions()

10

In [21]:
# Count the distinct rows in df (duplicates collapsed before counting).
unique_rows = df.distinct()
unique_rows.count()

163065

In [23]:
import pandas
from pyspark.sql.functions import pandas_udf, PandasUDFType

