In [1]:
import findspark
import pyspark as ps
import warnings
from pyspark.sql import SQLContext
from pyspark import SparkContext
from pyspark.sql import SparkSession

# creating the spark context
try:
    # create SparkContext on all CPUs available: in our case we have 4 CPUs on my laptop
    sc = ps.SparkContext('local[4]')
    sqlContext = SQLContext(sc)
    print("Just created a SparkContext")
except ValueError:
    warnings.warn("SparkContext already exists in this scope")


print( "Running Spark Version %s" % (sc.version))

Just created a SparkContext
Running Spark Version 2.4.4


In [2]:
df = sqlContext.read.format('com.databricks.spark.csv').options(header = True, inferSchema = True).load('inpatient_charges_cleaned.csv')
type(df)

pyspark.sql.dataframe.DataFrame

In [5]:
df = ps.read.format("mongo").load()

AttributeError: module 'pyspark' has no attribute 'read'

In [7]:
df.show(5)

+--------------------+-----------+--------------------+-----------------------+-------------+--------------+-----------------+------------------------------------+----------------+-----------------------+----------------------+-------------------------+
|      drg_definition|provider_id|       provider_name|provider_street_address|provider_city|provider_state|provider_zip_code|hospital_referral_region_description|total_discharges|average_covered_charges|average_total_payments|average_medicare_payments|
+--------------------+-----------+--------------------+-----------------------+-------------+--------------+-----------------+------------------------------------+----------------+-----------------------+----------------------+-------------------------+
|039 - EXTRACRANIA...|      10001|SOUTHEAST ALABAMA...|   1108 ROSS CLARK C...|       DOTHAN|            AL|            36301|                         AL - Dothan|              91|               32963.07|               5777.24|           

In [8]:
df.schema

StructType(List(StructField(drg_definition,StringType,true),StructField(provider_id,IntegerType,true),StructField(provider_name,StringType,true),StructField(provider_street_address,StringType,true),StructField(provider_city,StringType,true),StructField(provider_state,StringType,true),StructField(provider_zip_code,IntegerType,true),StructField(hospital_referral_region_description,StringType,true),StructField(total_discharges,IntegerType,true),StructField(average_covered_charges,DoubleType,true),StructField(average_total_payments,DoubleType,true),StructField(average_medicare_payments,DoubleType,true)))

In [9]:
df.dropna() # drop columns with null values

DataFrame[drg_definition: string, provider_id: int, provider_name: string, provider_street_address: string, provider_city: string, provider_state: string, provider_zip_code: int, hospital_referral_region_description: string, total_discharges: int, average_covered_charges: double, average_total_payments: double, average_medicare_payments: double]

In [10]:
df.count()

163065

In [29]:
#df.describe('provider_id').show()

In [30]:
#df.select('provider_id').show()

In [11]:
df.fillna(-1) # fill null values with -1

DataFrame[drg_definition: string, provider_id: int, provider_name: string, provider_street_address: string, provider_city: string, provider_state: string, provider_zip_code: int, hospital_referral_region_description: string, total_discharges: int, average_covered_charges: double, average_total_payments: double, average_medicare_payments: double]

In [12]:
df.printSchema()

root
 |-- drg_definition: string (nullable = true)
 |-- provider_id: integer (nullable = true)
 |-- provider_name: string (nullable = true)
 |-- provider_street_address: string (nullable = true)
 |-- provider_city: string (nullable = true)
 |-- provider_state: string (nullable = true)
 |-- provider_zip_code: integer (nullable = true)
 |-- hospital_referral_region_description: string (nullable = true)
 |-- total_discharges: integer (nullable = true)
 |-- average_covered_charges: double (nullable = true)
 |-- average_total_payments: double (nullable = true)
 |-- average_medicare_payments: double (nullable = true)



In [13]:
df.columns

['drg_definition',
 'provider_id',
 'provider_name',
 'provider_street_address',
 'provider_city',
 'provider_state',
 'provider_zip_code',
 'hospital_referral_region_description',
 'total_discharges',
 'average_covered_charges',
 'average_total_payments',
 'average_medicare_payments']

In [18]:
df.select(['drg_definition']).distinct().show()

+--------------------+
|      drg_definition|
+--------------------+
|811 - RED BLOOD C...|
|329 - MAJOR SMALL...|
|191 - CHRONIC OBS...|
|683 - RENAL FAILU...|
|918 - POISONING &...|
|481 - HIP & FEMUR...|
|249 - PERC CARDIO...|
|390 - G.I. OBSTRU...|
|300 - PERIPHERAL ...|
|057 - DEGENERATIV...|
|190 - CHRONIC OBS...|
|243 - PERMANENT C...|
|252 - OTHER VASCU...|
|314 - OTHER CIRCU...|
|207 - RESPIRATORY...|
|247 - PERC CARDIO...|
|690 - KIDNEY & UR...|
|286 - CIRCULATORY...|
|292 - HEART FAILU...|
|482 - HIP & FEMUR...|
+--------------------+
only showing top 20 rows



In [18]:
dict1=df.head(2)[0].asDict()

In [19]:
dict1

{'drg_definition': '039 - EXTRACRANIAL PROCEDURES W/O CC/MCC',
 'provider_id': 10001,
 'provider_name': 'SOUTHEAST ALABAMA MEDICAL CENTER',
 'provider_street_address': '1108 ROSS CLARK CIRCLE',
 'provider_city': 'DOTHAN',
 'provider_state': 'AL',
 'provider_zip_code': 36301,
 'hospital_referral_region_description': 'AL - Dothan',
 'total_discharges': 91,
 'average_covered_charges': 32963.07,
 'average_total_payments': 5777.24,
 'average_medicare_payments': 4763.73}

In [15]:
spark1 = SparkSession.builder.appName('SQL').getOrCreate()

In [16]:
df.createOrReplaceTempView('InpatientCharges')

In [17]:
query = spark1.sql('SELECT * FROM InpatientCharges LIMIT 5')

In [19]:
query.columns

['drg_definition',
 'provider_id',
 'provider_name',
 'provider_street_address',
 'provider_city',
 'provider_state',
 'provider_zip_code',
 'hospital_referral_region_description',
 'total_discharges',
 'average_covered_charges',
 'average_total_payments',
 'average_medicare_payments']

In [20]:
df.repartition(10).rdd.getNumPartitions()

10

In [21]:
df.distinct().count() 

163065

In [23]:
import pandas
from pyspark.sql.functions import pandas_udf, PandasUDFType



In [32]:
#drop duplicate columns
df.dropDuplicates()
#df.dropDuplicates().show()

DataFrame[drg_definition: string, provider_id: int, provider_name: string, provider_street_address: string, provider_city: string, provider_state: string, provider_zip_code: int, hospital_referral_region_description: string, total_discharges: int, average_covered_charges: double, average_total_payments: double, average_medicare_payments: double]

In [18]:
#  import PyMongo library - a MongoDB connector

import pymongo
from pymongo import MongoClient
print ('mongo version', pymongo.__version__)
client = MongoClient('localhost',27017)
db = client.test1
collection = db.Inpatient_Charges

cursor =collection.find().sort('provider_id',pymongo.ASCENDING).limit(10)
for data in cursor:
    print(data)


mongo version 3.10.1
{'_id': ObjectId('5e2cff5683233d8df63c0a91'), 'drg_definition': '', 'provider_id': None, 'provider_name': None, 'provider_street_address': None, 'provider_city': None, 'provider_state': None, 'provider_zip_code': None, 'hospital_referral_region_description': None, 'total_discharges': None, 'average_covered_charges': None, 'average_total_payments': None, 'average_medicare_payments': None}
{'_id': ObjectId('5e2cff5683233d8df63c3519'), 'drg_definition': '', 'provider_id': None, 'provider_name': None, 'provider_street_address': None, 'provider_city': None, 'provider_state': None, 'provider_zip_code': None, 'hospital_referral_region_description': None, 'total_discharges': None, 'average_covered_charges': None, 'average_total_payments': None, 'average_medicare_payments': None}
{'_id': ObjectId('5e2cff5783233d8df63caa01'), 'drg_definition': '', 'provider_id': None, 'provider_name': None, 'provider_street_address': None, 'provider_city': None, 'provider_state': None, 'prov

SyntaxError: invalid syntax (<ipython-input-15-ae52fd1d09f9>, line 1)

In [4]:
# ./bin/pyspark --conf "spark.mongodb.input.uri=mongodb://127.0.0.1/test1.pos?readPreference=primaryPreferred" \
#               --conf "spark.mongodb.output.uri=mongodb://127.0.0.1/test1.pos" \
#               --packages org.mongodb.spark:mongo-spark-connector_2.10:1.1.0
from pyspark.sql import SparkSession

#import os
#os.environ['PYSPARK_SUBMIT_ARGS'] = 'org.mongodb.spark:mongo-spark-connector_2.11:2.4.1 pyspark-shell'

sp = SparkSession \
.builder \
.appName("myApp") \
.config("spark.mongodb.input.uri", "mongodb://127.0.0.1/test1.pos") \
.config("spark.mongodb.output.uri", "mongodb://127.0.0.1/test1.pos") \
.config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.11:2.4.1") \
.getOrCreate()

df = sp.read.format('com.mongodb.spark.sql.DefaultSource').load()
# df.write.format('com.mongodb.spark.sql.DefaultSource').mode('append').save()
# sc1 = SQLContext(my_spark.sparkContext)

Py4JJavaError: An error occurred while calling o67.load.
: java.lang.ClassNotFoundException: Failed to find data source: com.mongodb.spark.sql.DefaultSource. Please find packages at http://spark.apache.org/third-party-projects.html
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:657)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:194)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:167)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:745)
Caused by: java.lang.ClassNotFoundException: com.mongodb.spark.sql.DefaultSource.DefaultSource
	at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
	at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$20$$anonfun$apply$12.apply(DataSource.scala:634)
	at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$20$$anonfun$apply$12.apply(DataSource.scala:634)
	at scala.util.Try$.apply(Try.scala:192)
	at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$20.apply(DataSource.scala:634)
	at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$20.apply(DataSource.scala:634)
	at scala.util.Try.orElse(Try.scala:84)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:634)
	... 13 more


Py4JJavaError: An error occurred while calling o37.load.
: java.lang.ClassNotFoundException: Failed to find data source: com.mongodb.spark.sql.DefaultSource. Please find packages at http://spark.apache.org/third-party-projects.html
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:657)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:194)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:167)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:745)
Caused by: java.lang.ClassNotFoundException: com.mongodb.spark.sql.DefaultSource.DefaultSource
	at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
	at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$20$$anonfun$apply$12.apply(DataSource.scala:634)
	at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$20$$anonfun$apply$12.apply(DataSource.scala:634)
	at scala.util.Try$.apply(Try.scala:192)
	at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$20.apply(DataSource.scala:634)
	at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$20.apply(DataSource.scala:634)
	at scala.util.Try.orElse(Try.scala:84)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:634)
	... 13 more


In [3]:
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = 'org.mongodb.spark:mongo-spark-connector_2.11:2.4.1'


In [None]:
df.write.format('com.mongodb.spark.sql.DefaultSource').mode('append').save()