# NHSBSA English Prescribing Data (EPD) Analysis

##                            Part 1: Data Extraction and Transformation

API documentation https://docs.ckan.org/en/2.8/api/

In [None]:
#Import the requests module. This module is used to send HTTP requests in python.
import requests  

#The url below is described within the API documentation and is used to view the datasets available.
url = "http://opendata.nhsbsa.net/api/3/action/package_list"

#requests.get sends a GET request to obtain data from a url.
#A timeout (in seconds) is set so that an unresponsive server raises an error
#instead of leaving the notebook waiting indefinitely.
response = requests.get(url, timeout=30)

#The status code below is used to ensure the request has/can be made successfully.
#Status code 200 means the request was successful.

response.status_code

In [None]:
#Decode the JSON body of the response into a Python dictionary and display it.
response.json()

In [None]:
#Print the values in the 'result' key.
#The 'result' key contains the names of datasets available.

#Only three datasets were available at the time of writing.
print(response.json()['result'])

In [None]:
#Using the API documentation, the url containing the medicines data was used and tested using response.status_code
url2 = "http://opendata.nhsbsa.net/api/3/action/package_show?id=english-prescribing-data-epd"
#A timeout (in seconds) stops an unresponsive server from leaving the notebook waiting indefinitely.
response2 = requests.get(url2, timeout=30)
#Fail here with a clear HTTP error rather than later with a confusing JSON/KeyError if the request was rejected.
response2.raise_for_status()
response2_json = response2.json() 
response2.status_code

In [None]:
#The keys within the 'result' entry of the response were investigated to find where the file urls are stored.
response2_json['result'].keys()

In [None]:
#Each dataset for medicines in secondary care is stored within csv files downloadable through a url.
#The url of the first entry in 'resources' is displayed as an example.
response2_json['result']['resources'][0]['url']

In [None]:
#Collect the download url of every resource listed for the dataset, in the order the API returns them.
datasets = [resource['url'] for resource in response2_json['result']['resources']]
#datasets

# Memory

The 2019 prescribing data can now be found in `datasets[60:72]`. However, an attempt to read all of these CSVs into memory using Pandas wastes hours and then ends in a crash.

An option is to read the CSVs in chunks; however, I soon learnt that there simply is not enough memory to complete the task in a manner which is efficient and reproducible. My MacBook possesses a meagre 8GB of memory. "Restart and Run All Cells" became synonymous with "waste the whole weekend".

The solution? Read only the features and records which are required. The data types should also be investigated so that, where possible, features are converted to types which take up less memory whilst they are being read.

In [None]:
#Here, an entire CSV (one month) is read to memory. The size can be seen below.
#Every column and every row of datasets[60] is loaded; nothing is filtered out at this stage.


import pandas as pd
test_df = pd.read_csv(datasets[60], sep =",")

In [None]:
#memory_usage='deep' measures the real size of the string (object) columns.
#show_counts replaces null_counts, which was deprecated in pandas 1.2 and removed in pandas 2.0.
test_df.info(memory_usage='deep', show_counts=True, verbose=True)

In [None]:
#test_df.memory_usage(deep=True)

In [None]:
#Let's free up that memory!
#Deleting the name lets Python reclaim the dataframe once no other references to it remain.

del test_df

As can be seen above, a full reading of only one CSV consumes a very large amount of memory (see the `info()` output above for the exact figure). This is not efficient. Below, a function is defined which reads a list of URLs with Pandas whilst preparing the data to use memory efficiently.

In [None]:
from datetime import datetime

def LargeCSVsChop(listofCSV_URLs):
    """Read monthly prescribing CSVs, keeping only the oral antihistamine records.

    For every location in ``listofCSV_URLs`` only the columns in ``feats`` are
    read, ``YEAR_MONTH`` is converted to a datetime, and the rows are filtered
    down to the oral antihistamines that are the focus of this analysis.
    The filtered frames are then concatenated.

    'CHEMICAL_SUBSTANCE_BNF_DESCR' is used to ensure all forms and brands of
    the drugs are captured; 'BNF_DESCRIPTION' is then used to remove the
    unwanted injectable products of chlorphenamine and promethazine.

    Parameters
    ----------
    listofCSV_URLs : sequence of str
        URLs or file paths accepted by ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The filtered rows of all CSVs. The row labels of each source CSV are
        kept, so the index is not unique when several CSVs are read.

    Raises
    ------
    ValueError
        If ``listofCSV_URLs`` is empty.

    Notes
    -----
    The default dtypes of TOTAL_QUANTITY (int) and ACTUAL_COST (float) are
    already optimal, so no ``dtype`` mapping is passed for them; one such as
    ``{'TOTAL_QUANTITY': int, 'ACTUAL_COST': float}`` could be added in future.
    """
    if len(listofCSV_URLs) == 0:
        raise ValueError("listofCSV_URLs must contain at least one CSV location")

    # The columns/features from the CSV that Pandas should read to memory.
    feats = ['YEAR_MONTH', 'PCO_NAME', 'PRACTICE_NAME', 'CHEMICAL_SUBSTANCE_BNF_DESCR',
             'BNF_DESCRIPTION', 'TOTAL_QUANTITY', 'ACTUAL_COST']

    # Substances for which every product is kept.
    always_kept = ['Cetirizine hydrochloride', 'Loratadine', 'Desloratadine',
                   'Fexofenadine hydrochloride', 'Acrivastine', 'Bilastine',
                   'Levocetirizine', 'Mizolastine']

    # Injectable products excluded for the two remaining substances.
    chlorphenamine_injection = 'Chlorphenamine 10mg/1ml solution for injection ampoules'
    promethazine_injections = ['Promethazine 25mg/1ml solution for injection ampoules',
                               'Phenergan 25mg/1ml solution for injection ampoules']

    dfs = []
    for csv_location in listofCSV_URLs:
        # YEAR_MONTH is read as text so that one vectorised to_datetime call can
        # convert it. This replaces the per-row `date_parser` callback, which is
        # deprecated in pandas 2.0.
        df = pd.read_csv(csv_location, sep=",", usecols=feats,
                         dtype={'YEAR_MONTH': str})
        df['YEAR_MONTH'] = pd.to_datetime(df['YEAR_MONTH'], format="%Y%m")

        substance = df['CHEMICAL_SUBSTANCE_BNF_DESCR']
        product = df['BNF_DESCRIPTION']

        # Explicit parentheses spell out the grouping the original expression
        # obtained from `&` binding tighter than `|`.
        keep = (
            substance.isin(always_kept)
            | ((substance == 'Chlorphenamine maleate')
               & (product != chlorphenamine_injection))
            | ((substance == 'Promethazine hydrochloride')
               & ~product.isin(promethazine_injections))
        )

        # Only the filtered frame is stored; rebinding `df` on the next
        # iteration lets the unfiltered frame be garbage collected.
        dfs.append(df[keep])

    return pd.concat(dfs)

One dataframe read this way uses far less memory (13 MB), but another huge problem remains.

In [None]:
#January 2019: datasets[60:61] is a one-element slice because LargeCSVsChop expects a list of urls.
Jan_2019_df = LargeCSVsChop(datasets[60:61])

In [None]:
#memory_usage='deep' measures the real size of the string (object) columns.
#show_counts replaces null_counts, which was deprecated in pandas 1.2 and removed in pandas 2.0.
Jan_2019_df.info(memory_usage='deep', show_counts=True, verbose=True)

# Time

The issue of time is partially solved by reading only the features required for the dataframe, as demonstrated below. However, bandwidth is another limitation: this work requires Pandas to parse a CSV through a URL, so low speeds will ensure the process is slow regardless of how (in)efficient this process is for Pandas.

In [None]:
import time
import numpy as np

def timefunc(function, arg, repeats = 20):
    """Return the mean time, in seconds, taken to run ``function(arg)``.

    Parameters
    ----------
    function : callable
        Called as ``function(arg)``; its return value is discarded.
    arg : object
        The single argument passed to ``function`` on every run.
    repeats : int, optional
        Number of timed runs. Unless specified, the number of repetitions
        is 20.

    Returns
    -------
    numpy.float64
        Mean elapsed time over all runs. ``nan`` (with a NumPy warning) when
        ``repeats`` is zero or negative, because no run is timed.
    """
    alltime = []

    for _ in range(repeats):
        # perf_counter is a monotonic, high-resolution clock meant for timing
        # intervals; time.time() can jump if the system clock is adjusted.
        starttime = time.perf_counter()
        function(arg)
        alltime.append(time.perf_counter() - starttime)

    return np.mean(alltime)

In [None]:
def FuncReadCSV(listofdataframes):
    """Read every CSV in full and return them combined in one DataFrame.

    No columns are dropped and no rows are filtered; this is the baseline
    against which the column-restricted readers are timed.

    Parameters
    ----------
    listofdataframes : iterable of str
        URLs or file paths accepted by ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        All CSVs concatenated in the order given. The row labels of each
        source CSV are kept.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when ``listofdataframes`` is empty.
    """
    frames = [pd.read_csv(csv) for csv in listofdataframes]

    # Concatenate once, after the loop, and return the combined frame.
    # Previously only the last CSV read was returned.
    return pd.concat(frames)

Without_ncols = timefunc(FuncReadCSV, datasets[60:62], repeats = 1)

#Without_ncols holds the mean time taken (in seconds) to read two CSVs without removing columns or filtering records.
#With repeats = 1 the mean is simply the single measured time.

In [None]:
def LargeCSVsChop_MINI(listofCSV_URLs):
    """Read only the required columns of each CSV, without filtering any rows.

    The same as LargeCSVsChop without the filtering element. This is to enable
    like for like comparisons as much as possible (with the exception of date
    parsing) as it pertains to time.

    Parameters
    ----------
    listofCSV_URLs : sequence of str
        URLs or file paths accepted by ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The ``feats`` columns of all CSVs, concatenated in the order given,
        with ``YEAR_MONTH`` converted to datetime. The row labels of each
        source CSV are kept.

    Raises
    ------
    ValueError
        If ``listofCSV_URLs`` is empty.
    """
    if len(listofCSV_URLs) == 0:
        raise ValueError("listofCSV_URLs must contain at least one CSV location")

    # The columns/features from the CSV that Pandas should read to memory.
    feats = ['YEAR_MONTH', 'PCO_NAME', 'PRACTICE_NAME', 'CHEMICAL_SUBSTANCE_BNF_DESCR',
             'BNF_DESCRIPTION', 'TOTAL_QUANTITY', 'ACTUAL_COST']

    dfs = []
    for csv_location in listofCSV_URLs:
        # YEAR_MONTH is read as text so that one vectorised to_datetime call can
        # convert it. This replaces the per-row `date_parser` callback, which is
        # deprecated in pandas 2.0.
        df = pd.read_csv(csv_location, sep=",", usecols=feats,
                         dtype={'YEAR_MONTH': str})
        df['YEAR_MONTH'] = pd.to_datetime(df['YEAR_MONTH'], format="%Y%m")
        dfs.append(df)

    return pd.concat(dfs)



With_ncols = timefunc(LargeCSVsChop_MINI, datasets[60:62], repeats = 1)

#With_ncols holds the mean time taken (in seconds) to read the same two CSVs using the LargeCSVsChop_MINI function.
#With repeats = 1 the mean is simply the single measured time.

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

#Bar chart comparing the two measured read times; the times are in seconds and are divided by 60 to plot minutes.
fig,ax1=plt.subplots(figsize=(10,6))

plt.xlabel('CSV Reading Methods')
plt.ylabel('Mean Time Taken to Read (minutes)')
plt.title('Comparison of Time Taken to Read Two CSVs ')

#Place a y-axis tick at every whole number from 0 to 99.
plt.yticks(range(100))
            
plt.bar(height=Without_ncols/60,x = 'Without Function', color = 'red')
plt.bar(height=With_ncols/60,x='LargeCSVsChop_MINI', color = 'blue')
#Save the figure to a file named 'Time taken' in the working directory.
plt.savefig('Time taken')

#Bearing in mind that the LargeCSVsChop function doesn't just read to dataframe. It also parses the YEAR_MONTH column to
#datetime (as does LargeCSVsChop_MINI), so the comparison is not strictly like for like.

### Whole year of Data

Each CSV is read in its own cell, rather than all in one call, to allow for flexibility when the kernel needs to be interrupted due to the time taken or any other issue.

In [None]:
#February 2019: datasets[61:62] is a one-element slice because LargeCSVsChop expects a list of urls.
Feb_2019_df = LargeCSVsChop(datasets[61:62])

In [None]:
#March 2019: datasets[62:63] is a one-element slice because LargeCSVsChop expects a list of urls.
Mar_2019_df = LargeCSVsChop(datasets[62:63])

In [None]:
#April 2019: datasets[63:64] is a one-element slice because LargeCSVsChop expects a list of urls.
Apr_2019_df = LargeCSVsChop(datasets[63:64])

In [None]:
#May 2019: datasets[64:65] is a one-element slice because LargeCSVsChop expects a list of urls.
May_2019_df = LargeCSVsChop(datasets[64:65])

In [None]:
#June 2019: datasets[65:66] is a one-element slice because LargeCSVsChop expects a list of urls.
Jun_2019_df = LargeCSVsChop(datasets[65:66])

In [None]:
#July 2019: datasets[66:67] is a one-element slice because LargeCSVsChop expects a list of urls.
Jul_2019_df = LargeCSVsChop(datasets[66:67])

In [None]:
#August 2019: datasets[67:68] is a one-element slice because LargeCSVsChop expects a list of urls.
Aug_2019_df = LargeCSVsChop(datasets[67:68])

In [None]:
#September 2019: datasets[68:69] is a one-element slice because LargeCSVsChop expects a list of urls.
Sep_2019_df = LargeCSVsChop(datasets[68:69])

In [None]:
#October 2019: datasets[69:70] is a one-element slice because LargeCSVsChop expects a list of urls.
Oct_2019_df = LargeCSVsChop(datasets[69:70])

In [None]:
#November 2019: datasets[70:71] is a one-element slice because LargeCSVsChop expects a list of urls.
Nov_2019_df = LargeCSVsChop(datasets[70:71])

In [None]:
#December 2019: datasets[71:72] is a one-element slice because LargeCSVsChop expects a list of urls.
Dec_2019_df = LargeCSVsChop(datasets[71:72])

In [None]:
#Gather the twelve monthly dataframes, in calendar order, and combine them into one dataframe for 2019.
HayFev_df_2019_list = [Jan_2019_df,Feb_2019_df ,Mar_2019_df,Apr_2019_df,May_2019_df,Jun_2019_df,Jul_2019_df,
                       Aug_2019_df,Sep_2019_df,Oct_2019_df,Nov_2019_df,Dec_2019_df ]

HayFev_df_2019 = pd.concat(HayFev_df_2019_list)

#Save the combined data to 'HayFev_df_2019.csv' in the working directory.
#NOTE(review): the index is written too, and it is not unique because each month keeps its own row labels - confirm whether index=False is wanted.
HayFev_df_2019.to_csv('HayFev_df_2019.csv') 

In [None]:
#HayFev_df_2019.info()

Given the cost of hardware, I am more inclined to look for workarounds such as these to facilitate and speed up personal projects. I hope this has been useful to others and I would love comments and feedback on my code. 

