# CS 520 Data Curation Project - ETL Pipeline using Spark, Pandas and MongoDB

## Our goal for the project is to Extract data from a CSV file. 
## Transform the original schema of the file into the required schema, which has fewer columns and renamed columns. 
## Explore and Query the Data. 
## Load the data as JSON documents into a MongoDB collection. 
## All this is done using Spark and Pandas in Python.



## We have used the Medicare Open payments data from a CSV file.
## Requirements: Python, Spark, Jupyter Notebook, and a MongoDB connection.

## Easy way to install spark for jupyter notebook if not already present 

In [None]:
import sys
!{sys.executable} -m pip install pyspark

## Importing SparkSession

In [None]:
from pyspark.sql import SparkSession

## Building SparkSession 

In [None]:
# Reuse the active session if there is one; otherwise start one named 'CS 520'.
session_builder = SparkSession.builder.appName('CS 520')
spark = session_builder.getOrCreate()

## reading the csv File 

In [None]:
# Load the raw CSV; the first row supplies the column names.
csv_reader = spark.read.option("header", True)
df = csv_reader.csv("payments.csv")

## Print the original schema 

In [None]:
df.printSchema()

## Changing the data type of Amount from String to Double  

In [None]:
from pyspark.sql.types import DoubleType


In [None]:
# Add a numeric "amount" column derived from the payment total column.
payment_total = df["Total_Amount_of_Payment_USDollars"]
df2 = df.withColumn("amount", payment_total.cast("double"))

## Creating a global temporary view of the payments data 

In [None]:
# createGlobalTempView raises if "payments1" is already registered (for
# example when this cell is re-run in the same Spark application), so
# register the view with the replace-if-exists variant instead.
df2.createOrReplaceGlobalTempView("payments1")

## We can also specify a schema explicitly in the manner below (this schema is defined for illustration and is not applied to the read above) 

In [None]:
from pyspark.sql.types import StructField,StringType,IntegerType,StructType

In [None]:
# (column name, Spark type) pairs for the target schema; every field is nullable.
_target_columns = [
    ("physician_id", StringType()),
    ("date_payment", StringType()),
    ("record_id", StringType()),
    ("payer", StringType()),
    ("amount", DoubleType()),
    ("physician_speciality", StringType()),
    ("nature_of_payment", StringType()),
]
data_schema = [
    StructField(column_name, column_type, True)
    for column_name, column_type in _target_columns
]

In [None]:
# Wrap the list of fields into a single StructType schema object.
final_struc = StructType(data_schema)

## Selecting only the columns we want and also renaming the Columns as we want 

In [None]:
# Keep only the columns of interest, rename them, and drop rows that have no
# physician id. Adjacent literals are concatenated into one SQL string.
select_payments_sql = (
    "select Physician_Profile_ID as physician_id,"
    "Date_of_Payment as date_payment, "
    "Record_ID as record_id, "
    "Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_Name as payer,"
    "amount, "
    "Physician_Specialty, "
    "Nature_of_Payment_or_Transfer_of_Value as Nature_of_payment "
    "from global_temp.payments1 "
    "where Physician_Profile_ID IS NOT NULL"
)
ds = spark.sql(select_payments_sql)

## Required Schema 

In [None]:
ds.printSchema()

In [None]:
ds.first()

## Replacing the temporary view with our new view 

In [None]:
ds.createOrReplaceGlobalTempView("payments")

## Sample data 

In [None]:
ds.show()

## Changing the type of date_payment from string to date: the MM/dd/yyyy text is parsed (via a Unix timestamp) into a date value, which is displayed as yyyy-MM-dd

In [None]:
from pyspark.sql.functions import to_date, unix_timestamp

# Parse the MM/dd/yyyy text into epoch seconds, reinterpret those as a
# timestamp, then reduce the timestamp to a date.
epoch_seconds = unix_timestamp(ds["date_payment"], "MM/dd/yyyy")
payment_date = to_date(epoch_seconds.cast("timestamp"))
ds = ds.withColumn("date_payment", payment_date)


In [None]:
ds.printSchema()

In [None]:
ds.show()

In [None]:
ds.createOrReplaceGlobalTempView("payments")

In [None]:
ds.count()

## Querying and Exploring the data  

## Querying can be done in two ways: by using Spark DataFrame functions, or by writing SQL statements directly. Both ways are used below. 

### Top 10 natures of payment by number of payments  

In [None]:
from pyspark.sql.functions import desc

# Count payments per nature of payment, then list the ten largest counts.
payments_per_nature = ds.groupBy(ds["Nature_of_Payment"]).count()
payments_per_nature.orderBy(desc("count")).show(10)

### Natures of payment, with the number of payments greater than $1000 for each

In [None]:
# Restrict to payments above 1000, then count them per nature of payment.
large_payments = ds.where(ds["amount"] > 1000)
large_payments.groupBy(ds["Nature_of_Payment"]).count().show()

### Top five physicians (by physician_id) by total amount 

In [None]:
from pyspark.sql.functions import sum

In [None]:
# Sum the payment amounts per physician_id and show the five largest totals.
top_physicians_sql = (
    "select physician_id , sum(amount) as revenue "
    "from global_temp.payments "
    "group by physician_id "
    "order by revenue desc "
    "limit 5"
)
spark.sql(top_physicians_sql).show()

### Top 10  nature of payments by total amount

In [None]:
# Sum the payment amounts per nature of payment and show the ten largest totals.
top_natures_sql = (
    "select Nature_of_payment , sum(amount) as total "
    "from global_temp.payments "
    "group by Nature_of_payment "
    "order by total desc "
    "limit 10"
)
spark.sql(top_natures_sql).show()

## Average amount of payment  in each month 

In [None]:
from pyspark.sql.functions import format_number,dayofmonth,hour,dayofyear,month,year,weekofyear,date_format

In [None]:
# Group by the calendar month of the payment date and average the numeric columns.
payment_month = month(ds['date_payment'])
ds.groupBy(payment_month).avg().show()

## Installing pymongo 

In [None]:
import sys
!{sys.executable} -m pip install pymongo

In [None]:
ds.show()

## Converting the PySpark DataFrame to an RDD of JSON strings 

In [None]:

    
import json

results =ds.toJSON()
    

## Sample RDD 

In [None]:
results.first()

## Because the dataset is very large, we ran into memory issues, so we decided to use only 50000 rows. In a bigger environment, the same method can be used for bigger datasets. 

In [None]:
# Take 50000 rows from the global temp view "payments" to keep memory use manageable.
payments_view = spark.table("global_temp.payments")
ds2 = payments_view.limit(50000)

In [None]:
ds2.show()

In [None]:
from pyspark import SparkContext, SparkConf

## Converting pyspark dataframe to pandas dataframe 

In [None]:
import pandas as pd

In [None]:
pdDf = ds2.toPandas()

## Sample pandas data frame 

In [None]:
pdDf.head()

## Creating a new index as physician_id+date_payment+Nature_of_payment so that it is easier to query and find records in the database

In [None]:
# Render the payment date as text so it can be concatenated into the key.
pdDf['date_payment'] = pdDf['date_payment'].astype(str)

# Composite key: physician_id__date_payment__Nature_of_payment.
# (An earlier set_index on physician_id + '_' + Nature_of_payment was removed:
# it was immediately overwritten by this index and had no effect on the result.)
record_key = (
    pdDf['physician_id'] + '__' + pdDf['date_payment'] + '__' + pdDf['Nature_of_payment']
)
pdDf = pdDf.set_index([record_key])

# NOTE(review): to_dict('index') requires a unique index. If one physician can
# have several payments of the same nature on the same date, this key is not
# unique -- confirm against the data, or add record_id to the key.
jsonDict = pdDf.to_dict('index')

## Sample dataframe with the new index 

In [None]:
pdDf.head()

## Json Dictionary in the format we require to store in the Database
#### Format:
#### 'Index': {'physician_id': 'Value',
####          'date_payment': 'Value',
####          'record_id': 'Value',
####          'payer': 'Value',
####          'amount': Value,
####          'Physician_Specialty': 'Value',
####          'Nature_of_payment': 'Value',
####               }
     

In [None]:
jsonDict

## Connecting to MongoDB using PyMongo 

In [None]:
# Import here: this cell runs before any other cell that imports MongoClient,
# so without the import it fails with NameError when cells run in order.
from pymongo import MongoClient

client = MongoClient('localhost', 27017)

In [None]:
from pymongo import MongoClient
client = MongoClient()

## Inserting the JSON records into Mongo DB and printing the id of the insertion

In [None]:
from pymongo import MongoClient

client = MongoClient('mongodb://localhost:27017/')

# Database name: 'test-database-'
mydb = client['test-database-']

import datetime

# Collection.insert() was deprecated in PyMongo 3.0 and removed in 4.0.
# insert_one() stores jsonDict as a single document in the "mytable"
# collection; inserted_id is the _id of that new document.
# NOTE(review): MongoDB caps a single document at 16 MB. With 50000 records
# nested in one document this limit may be hit -- consider insert_many with
# one document per record if the insert fails.
record_id = mydb.mytable.insert_one(jsonDict).inserted_id

print(record_id)
# Database.collection_names() was removed in PyMongo 4.0 as well;
# list_collection_names() is its replacement.
print(mydb.list_collection_names())