In [1]:
import pandas as pd
from functools import reduce
import numpy as np
import gc

The pandas dataframes we are working with are very large, to make them easier to work with we will do a variety of tricks to shrink their memory footprint.

In [2]:
def mem_usage(pandas_obj):
    if isinstance(pandas_obj,pd.DataFrame):
        usage_b = pandas_obj.memory_usage(deep=True).sum()
    else: # we assume if not a df it's a series
        usage_b = pandas_obj.memory_usage(deep=True)
    usage_mb = usage_b / 1024 ** 2 # convert bytes to megabytes
    return "{:03.2f} MB".format(usage_mb)

In [38]:
def reduce_mem_usage(df, make_sparse=False):
    start_mem_usg = df.memory_usage().sum() / 1024**2 
    print("Memory usage of dataframe is :",start_mem_usg," MB")
    # First drop empty columns
    df.dropna(axis=1,how='all', inplace=True)
    for col in df.columns:
        # Print current column type
        print("******************************")
        print("Column: ",col)
        print("dtype before: ",df[col].dtype)
        if str(df[col].dtype) in ["object", "string"]:
            print(df[col].nunique(), len(df[col]), (df[col].nunique() / len(df[col])))
            if (df[col].nunique() / len(df[col])) < 0.5:
                df.loc[:,col] = df[col].astype('category')
        elif str(df[col].dtype).lower() == "int64": 
            # Make Integer/unsigned Integer datatypes
            mx = df[col].max()
            mn = df[col].min()
            try:
                if mn >= 0:
                    if mx < 255:
                        df[col] = df[col].astype(np.uint8)
                    elif mx < 65535:
                        df[col] = df[col].astype(np.uint16)
                    elif mx < 4294967295:
                        df[col] = df[col].astype(np.uint32)
                    else:
                        df[col] = df[col].astype(np.uint64)
                else:
                    if mn > np.iinfo(np.int8).min and mx < np.iinfo(np.int8).max:
                        df[col] = df[col].astype(np.int8)
                    elif mn > np.iinfo(np.int16).min and mx < np.iinfo(np.int16).max:
                        df[col] = df[col].astype(np.int16)
                    elif mn > np.iinfo(np.int32).min and mx < np.iinfo(np.int32).max:
                        df[col] = df[col].astype(np.int32)
                    elif mn > np.iinfo(np.int64).min and mx < np.iinfo(np.int64).max:
                        df[col] = df[col].astype(np.int64)
            except ValueError:
                df[col] = df[col].astype("Int64")
                # Make float datatypes 32 bit
        elif str(df[col].dtype) == "float64":
            df[col] = df[col].astype(np.float32)
            
        if (len(df[col].dropna()) / len(df[col]) < 0.25) and make_sparse:
            df[col] = pd.arrays.SparseArray(df[col], dtype = df[col].dtype)
        # Print new column type
        print("dtype after: ",df[col].dtype)
        print("******************************")
    
    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = df.memory_usage().sum() / 1024**2 
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100 * mem_usg / start_mem_usg,"% of the initial size")
    return df

## Fact Data
Some previous data exploration, checking for nans, num unique values, etc. has been omitted for brevity.

In [39]:
# Load all the raw data queried from the db
fact_blood_product = pd.read_feather("../data/raw/fact_blood_product.feather")
fact_donation = pd.read_feather("../data/raw/fact_donation.feather")
fact_exception = pd.read_feather("../data/raw/fact_exception.feather")
fact_run = pd.read_feather("../data/raw/fact_run.feather")

As of right now we don't know if some columns that share a name represent the same information.
So for now going to rename the columns to keep the information seperate.

In [40]:
fact_blood_product.rename(columns={
    "number_of_units_processed":"number_of_blood_product_units_processed"}, inplace=True)
fact_donation.rename(columns={
    "number_of_units_processed":"number_of_donation_units_processed",
    "number_of_duplicated_units":"number_of_duplicated_donation_units",
    "number_of_skipped_barcodes":"number_of_skipped_donation_barcodes",
    "number_of_alarms":"number_of_donation_alarms",}, inplace=True)
fact_exception.rename(columns={
    "number_of_units_processed":"number_of_donation_units_processed",
    "number_of_duplicated_units":"number_of_duplicated_exception_units",
    "number_of_skipped_barcodes":"number_of_skipped_exception_barcodes",
    "number_of_alarms":"number_of_exception_alarms",}, inplace=True)
fact_run.rename(columns={
    "number_of_barcodes_skipped":"number_of_skipped_run_barcodes"}, inplace=True)

### fact_blood_product & dim_blood_product

In [41]:
fact_blood_product = fact_blood_product.convert_dtypes()
fact_blood_product.info(verbose=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17971048 entries, 0 to 17971047
Data columns (total 11 columns):
 #   Column                                   Non-Null Count     Dtype 
---  ------                                   --------------     ----- 
 0   dim_run_date                             17971048 non-null  object
 1   dim_device_id                            17971048 non-null  string
 2   dim_donation_id                          17971048 non-null  string
 3   dim_custom_data_id                       0 non-null         object
 4   dim_run_id                               17971048 non-null  string
 5   dim_facility_id                          17971048 non-null  string
 6   dim_configuration_id                     17971048 non-null  string
 7   dim_operator_id                          15730738 non-null  string
 8   dim_blood_product_id                     17971048 non-null  string
 9   product_volume                           17971048 non-null  Int64 
 10  number_of_blood_

In [42]:
# fact_blood_product has a column with all NaNs get rid of it
fact_blood_product.dropna(axis=1,how='all', inplace=True)

Now lets replace the dim_blood_product_id with the actuall blood products, this is essentially just a string replacement.

In [43]:
dim_blood_product = pd.read_feather("../data/raw/dim_blood_product.feather")
blood_products = dim_blood_product.set_index('dim_blood_product_id')['blood_product_type'].to_dict()
fact_blood_product["blood_product"] = fact_blood_product["dim_blood_product_id"].replace(blood_products)
fact_blood_product.drop(columns=['dim_blood_product_id'], inplace = True)
del dim_blood_product
gc.collect()

106

In [44]:
fact_blood_product.head()

Unnamed: 0,dim_run_date,dim_device_id,dim_donation_id,dim_run_id,dim_facility_id,dim_configuration_id,dim_operator_id,product_volume,number_of_blood_product_units_processed,blood_product
0,2015-01-19,9C412C1A-1127-EB11-80C7-005056AF16E2,5726EF30-1127-EB11-80C7-005056AF16E2,7C3DF18D-1127-EB11-80C7-005056AF16E2,290D989C-1127-EB11-80C7-005056AF16E2,DD0F989C-1127-EB11-80C7-005056AF16E2,C9B35BA5-1127-EB11-80C7-005056AF16E2,94,1,Platelet_Yield_Index
1,2015-01-19,9C412C1A-1127-EB11-80C7-005056AF16E2,5726EF30-1127-EB11-80C7-005056AF16E2,7C3DF18D-1127-EB11-80C7-005056AF16E2,290D989C-1127-EB11-80C7-005056AF16E2,DD0F989C-1127-EB11-80C7-005056AF16E2,C9B35BA5-1127-EB11-80C7-005056AF16E2,64,1,Platelet_Volume
2,2015-01-19,9C412C1A-1127-EB11-80C7-005056AF16E2,5726EF30-1127-EB11-80C7-005056AF16E2,7C3DF18D-1127-EB11-80C7-005056AF16E2,290D989C-1127-EB11-80C7-005056AF16E2,DD0F989C-1127-EB11-80C7-005056AF16E2,C9B35BA5-1127-EB11-80C7-005056AF16E2,9,1,Leukocyte_Volume
3,2015-01-19,9C412C1A-1127-EB11-80C7-005056AF16E2,5726EF30-1127-EB11-80C7-005056AF16E2,7C3DF18D-1127-EB11-80C7-005056AF16E2,290D989C-1127-EB11-80C7-005056AF16E2,DD0F989C-1127-EB11-80C7-005056AF16E2,C9B35BA5-1127-EB11-80C7-005056AF16E2,180,1,Plasma_Volume
4,2015-01-22,9C412C1A-1127-EB11-80C7-005056AF16E2,3E272D31-1127-EB11-80C7-005056AF16E2,75347281-1127-EB11-80C7-005056AF16E2,290D989C-1127-EB11-80C7-005056AF16E2,DD0F989C-1127-EB11-80C7-005056AF16E2,BAB75BA5-1127-EB11-80C7-005056AF16E2,9,1,Leukocyte_Volume


Now lets clean up the datatypes

In [45]:
fact_blood_product = reduce_mem_usage(fact_blood_product)

Memory usage of dataframe is : 1405.3593292236328  MB
******************************
Column:  dim_run_date
dtype before:  object
2756 17971048 0.00015335777857807735
dtype after:  category
******************************
******************************
Column:  dim_device_id
dtype before:  string
1595 17971048 8.87538667750484e-05
dtype after:  category
******************************
******************************
Column:  dim_donation_id
dtype before:  string
4907126 17971048 0.2730573086221794
dtype after:  category
******************************
******************************
Column:  dim_run_id
dtype before:  string
1307580 17971048 0.07276036433712714
dtype after:  category
******************************
******************************
Column:  dim_facility_id
dtype before:  string
329 17971048 1.8307223930401832e-05
dtype after:  category
******************************
******************************
Column:  dim_configuration_id
dtype before:  string
4145 17971048 0.0002306487635000

In [46]:
fact_blood_product.info(verbose=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17971048 entries, 0 to 17971047
Data columns (total 10 columns):
 #   Column                                   Non-Null Count     Dtype   
---  ------                                   --------------     -----   
 0   dim_run_date                             17971048 non-null  category
 1   dim_device_id                            17971048 non-null  category
 2   dim_donation_id                          17971048 non-null  category
 3   dim_run_id                               17971048 non-null  category
 4   dim_facility_id                          17971048 non-null  category
 5   dim_configuration_id                     17971048 non-null  category
 6   dim_operator_id                          15730738 non-null  category
 7   product_volume                           17971048 non-null  uint16  
 8   number_of_blood_product_units_processed  17971048 non-null  uint8   
 9   blood_product                            17971048 non-null  catego

In [47]:
fact_blood_product.to_feather("../data/interim/fact_blood_product.feather")

## fact_donation

In [49]:
fact_donation = fact_donation.convert_dtypes()
fact_donation.info(verbose=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5318940 entries, 0 to 5318939
Data columns (total 13 columns):
 #   Column                               Non-Null Count    Dtype 
---  ------                               --------------    ----- 
 0   dim_run_date                         5318940 non-null  object
 1   dim_device_id                        5318940 non-null  string
 2   dim_donation_id                      5318940 non-null  string
 3   dim_custom_data_id                   5283316 non-null  string
 4   dim_run_id                           5318940 non-null  string
 5   dim_facility_id                      5318940 non-null  string
 6   dim_configuration_id                 5318940 non-null  string
 7   dim_operator_id                      4635496 non-null  string
 8   number_of_donation_units_processed   5318940 non-null  Int64 
 9   number_of_duplicated_donation_units  5318940 non-null  Int64 
 10  number_of_skipped_donation_barcodes  5318940 non-null  Int64 
 11  number_of_d

In [50]:
fact_donation = reduce_mem_usage(fact_donation)

Memory usage of dataframe is : 552.9065971374512  MB
******************************
Column:  dim_run_date
dtype before:  object
2779 5318940 0.0005224725227206924
dtype after:  category
******************************
******************************
Column:  dim_device_id
dtype before:  string
2014 5318940 0.00037864687324918125
dtype after:  category
******************************
******************************
Column:  dim_donation_id
dtype before:  string
5318940 5318940 1.0
dtype after:  string
******************************
******************************
Column:  dim_custom_data_id
dtype before:  string
5283316 5318940 0.9933024249192508
dtype after:  string
******************************
******************************
Column:  dim_run_id
dtype before:  string
1329735 5318940 0.25
dtype after:  category
******************************
******************************
Column:  dim_facility_id
dtype before:  string
353 5318940 6.636660688031826e-05
dtype after:  category
****************

In [51]:
fact_donation.info(verbose=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5318940 entries, 0 to 5318939
Data columns (total 13 columns):
 #   Column                               Non-Null Count    Dtype   
---  ------                               --------------    -----   
 0   dim_run_date                         5318940 non-null  category
 1   dim_device_id                        5318940 non-null  category
 2   dim_donation_id                      5318940 non-null  string  
 3   dim_custom_data_id                   5283316 non-null  string  
 4   dim_run_id                           5318940 non-null  category
 5   dim_facility_id                      5318940 non-null  category
 6   dim_configuration_id                 5318940 non-null  category
 7   dim_operator_id                      4635496 non-null  category
 8   number_of_donation_units_processed   5318940 non-null  uint8   
 9   number_of_duplicated_donation_units  5318940 non-null  uint8   
 10  number_of_skipped_donation_barcodes  5318940 non-null 

In [52]:
fact_donation.to_feather("../data/interim/fact_donation.feather")

## fact_exception

In [53]:
fact_exception = fact_exception.convert_dtypes()
fact_exception.info(verbose=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 599267 entries, 0 to 599266
Data columns (total 14 columns):
 #   Column                                  Non-Null Count   Dtype 
---  ------                                  --------------   ----- 
 0   dim_run_date                            599267 non-null  object
 1   dim_device_id                           599267 non-null  string
 2   dim_run_id                              599267 non-null  string
 3   dim_facility_id                         599267 non-null  string
 4   dim_configuration_id                    599267 non-null  string
 5   dim_operator_id                         477562 non-null  string
 6   dim_exception_id                        599267 non-null  string
 7   number_of_duplicated_exception_units    599267 non-null  Int64 
 8   number_of_reprocessed_units             599267 non-null  Int64 
 9   number_of_inconclusive_sealed_units     599267 non-null  Int64 
 10  number_of_unrecognized_barcodes         599267 non-null 

In [54]:
fact_exception = reduce_mem_usage(fact_exception)

Memory usage of dataframe is : 68.00928211212158  MB
******************************
Column:  dim_run_date
dtype before:  object
2775 599267 0.004630657119447591
dtype after:  category
******************************
******************************
Column:  dim_device_id
dtype before:  string
1946 599267 0.003247300452052257
dtype after:  category
******************************
******************************
Column:  dim_run_id
dtype before:  string
314787 599267 0.5252867252827204
dtype after:  string
******************************
******************************
Column:  dim_facility_id
dtype before:  string
348 599267 0.000580709433357752
dtype after:  category
******************************
******************************
Column:  dim_configuration_id
dtype before:  string
4955 599267 0.008268434604274889
dtype after:  category
******************************
******************************
Column:  dim_operator_id
dtype before:  string
2796 599267 0.00466569993008125
dtype after:  catego

In [55]:
fact_exception.info(verbose=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 599267 entries, 0 to 599266
Data columns (total 14 columns):
 #   Column                                  Non-Null Count   Dtype   
---  ------                                  --------------   -----   
 0   dim_run_date                            599267 non-null  category
 1   dim_device_id                           599267 non-null  category
 2   dim_run_id                              599267 non-null  string  
 3   dim_facility_id                         599267 non-null  category
 4   dim_configuration_id                    599267 non-null  category
 5   dim_operator_id                         477562 non-null  category
 6   dim_exception_id                        599267 non-null  string  
 7   number_of_duplicated_exception_units    599267 non-null  uint8   
 8   number_of_reprocessed_units             599267 non-null  uint8   
 9   number_of_inconclusive_sealed_units     599267 non-null  uint8   
 10  number_of_unrecognized_barcodes 

In [56]:
fact_exception.to_feather("../data/interim/fact_exception.feather")

## fact_run

In [57]:
fact_run = fact_run.convert_dtypes()
fact_run.info(verbose=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1329735 entries, 0 to 1329734
Data columns (total 21 columns):
 #   Column                           Non-Null Count    Dtype  
---  ------                           --------------    -----  
 0   dim_run_date                     1329735 non-null  object 
 1   dim_device_id                    1329735 non-null  string 
 2   dim_run_id                       1329735 non-null  string 
 3   dim_facility_id                  1329735 non-null  string 
 4   dim_configuration_id             1329735 non-null  string 
 5   dim_operator_id                  1158874 non-null  string 
 6   procedure_duration_minutes       1329735 non-null  float64
 7   idle_time_duration_minutes       1329735 non-null  float64
 8   load_duration_minutes            1329735 non-null  float64
 9   unload_duration_minutes          1329735 non-null  float64
 10  load_idle_time_duration_minutes  1329735 non-null  float64
 11  run_duration_minutes             1329735 non-null 

In [58]:
fact_run = reduce_mem_usage(fact_run)

Memory usage of dataframe is : 218.11919021606445  MB
******************************
Column:  dim_run_date
dtype before:  object
2779 1329735 0.0020898900908827697
dtype after:  category
******************************
******************************
Column:  dim_device_id
dtype before:  string
2014 1329735 0.001514587492996725
dtype after:  category
******************************
******************************
Column:  dim_run_id
dtype before:  string
1329735 1329735 1.0
dtype after:  string
******************************
******************************
Column:  dim_facility_id
dtype before:  string
353 1329735 0.00026546642752127306
dtype after:  category
******************************
******************************
Column:  dim_configuration_id
dtype before:  string
5747 1329735 0.00432191376477268
dtype after:  category
******************************
******************************
Column:  dim_operator_id
dtype before:  string
3245 1329735 0.0024403358563924392
dtype after:  category


In [59]:
fact_run.info(verbose=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1329735 entries, 0 to 1329734
Data columns (total 21 columns):
 #   Column                           Non-Null Count    Dtype   
---  ------                           --------------    -----   
 0   dim_run_date                     1329735 non-null  category
 1   dim_device_id                    1329735 non-null  category
 2   dim_run_id                       1329735 non-null  string  
 3   dim_facility_id                  1329735 non-null  category
 4   dim_configuration_id             1329735 non-null  category
 5   dim_operator_id                  1158874 non-null  category
 6   procedure_duration_minutes       1329735 non-null  float32 
 7   idle_time_duration_minutes       1329735 non-null  float32 
 8   load_duration_minutes            1329735 non-null  float32 
 9   unload_duration_minutes          1329735 non-null  float32 
 10  load_idle_time_duration_minutes  1329735 non-null  float32 
 11  run_duration_minutes             1329

In [60]:
fact_run.to_feather("../data/interim/fact_run.feather")

## Merging fact data

In [63]:
interim_fact_data_1 = pd.merge(fact_blood_product, fact_donation, how='outer',
                    left_on = ['dim_run_date', 'dim_device_id', 'dim_donation_id', 'dim_run_id', 'dim_facility_id', 'dim_configuration_id', 'dim_operator_id'], 
                    right_on = ['dim_run_date', 'dim_device_id', 'dim_donation_id', 'dim_run_id', 'dim_facility_id', 'dim_configuration_id', 'dim_operator_id']) 

del fact_blood_product
del fact_donation
gc.collect()

90

In [64]:
interim_fact_data_2 = pd.merge(fact_exception, fact_run, how='outer',
                    left_on = ['dim_run_date', 'dim_device_id', 'dim_run_id', 'dim_facility_id', 'dim_configuration_id', 'dim_operator_id'], 
                    right_on = ['dim_run_date', 'dim_device_id', 'dim_run_id', 'dim_facility_id', 'dim_configuration_id', 'dim_operator_id']) 

del fact_exception
del fact_run
gc.collect()

15

In [65]:
fact_data = pd.merge(interim_fact_data_1, interim_fact_data_2, how='outer',
                    left_on = ['dim_run_date', 'dim_device_id', 'dim_run_id', 'dim_facility_id', 'dim_configuration_id', 'dim_operator_id'], 
                    right_on = ['dim_run_date', 'dim_device_id', 'dim_run_id', 'dim_facility_id', 'dim_configuration_id', 'dim_operator_id']) 

del interim_fact_data_1
del interim_fact_data_2
gc.collect()

15

In [67]:
fact_data = fact_data.convert_dtypes()
fact_data.info(verbose=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21732284 entries, 0 to 21732283
Data columns (total 39 columns):
 #   Column                                   Non-Null Count     Dtype         
---  ------                                   --------------     -----         
 0   dim_run_date                             21732284 non-null  datetime64[ns]
 1   dim_device_id                            21732284 non-null  string        
 2   dim_donation_id                          21732284 non-null  string        
 3   dim_run_id                               21732284 non-null  string        
 4   dim_facility_id                          21732284 non-null  string        
 5   dim_configuration_id                     21732284 non-null  string        
 6   dim_operator_id                          18741299 non-null  string        
 7   product_volume                           21027193 non-null  Int64         
 8   number_of_blood_product_units_processed  21027193 non-null  Int64         
 9   

In [68]:
fact_data = reduce_mem_usage(fact_data)
fact_data.to_feather("../data/interim/fact_data.feather")

Memory usage of dataframe is : 4642.5170822143555  MB
******************************
Column:  dim_run_date
dtype before:  datetime64[ns]
dtype after:  datetime64[ns]
******************************
******************************
Column:  dim_device_id
dtype before:  string
2014 21732284 9.267318612254469e-05
dtype after:  category
******************************
******************************
Column:  dim_donation_id
dtype before:  string
5318940 21732284 0.24474832005692546
dtype after:  category
******************************
******************************
Column:  dim_run_id
dtype before:  string
1329735 21732284 0.061187080014231364
dtype after:  category
******************************
******************************
Column:  dim_facility_id
dtype before:  string
353 21732284 1.6243115541836285e-05
dtype after:  category
******************************
******************************
Column:  dim_configuration_id
dtype before:  string
5747 21732284 0.0002644452833397539
dtype after:  ca

In [69]:
fact_data.info(verbose=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21732284 entries, 0 to 21732283
Data columns (total 39 columns):
 #   Column                                   Non-Null Count     Dtype         
---  ------                                   --------------     -----         
 0   dim_run_date                             21732284 non-null  datetime64[ns]
 1   dim_device_id                            21732284 non-null  category      
 2   dim_donation_id                          21732284 non-null  category      
 3   dim_run_id                               21732284 non-null  category      
 4   dim_facility_id                          21732284 non-null  category      
 5   dim_configuration_id                     21732284 non-null  category      
 6   dim_operator_id                          18741299 non-null  category      
 7   product_volume                           21027193 non-null  Int64         
 8   number_of_blood_product_units_processed  21027193 non-null  Int64         
 9   

## Restart Kernal
Do this to free up memory

In [None]:
fact_data = pd.read_feather("../data/interim/fact_data.feather")

## dim_configuration

In [None]:
dim_configuration = pd.read_feather("../data/raw/dim_configuration.feather")
dim_configuration["configuration_status"] = dim_configuration["configuration_status"].astype("category")
dim_configuration.info()

In [None]:
fact_data = fact_data.merge(dim_configuration, how='outer',
                    left_on = ['dim_configuration_id'], 
                    right_on = ['dim_configuration_id']) 
fact_data.drop(columns=['dim_configuration_id'], inplace = True)

In [None]:
fact_data.info()

## dim_device

In [None]:
dim_device = pd.read_feather("../data/raw/dim_device.feather")
# drop device_serial_number, device_name, and device_type_name(which is only 'REVEOS') I don't imagine they contain anything useful.
dim_device.drop(columns=['device_serial_number', 'device_name', 'device_type_name'], inplace = True)
dim_device["device_software_version"] = dim_device["device_software_version"].astype("category")
dim_device["device_language_name"] = dim_device["device_language_name"].astype("category")
dim_device.info()

In [None]:
fact_data = fact_data.merge(dim_device, how='outer',
                    left_on = ['dim_device_id'], 
                    right_on = ['dim_device_id']) 
fact_data.drop(columns=['dim_device_id'], inplace = True)
fact_data.info()

## dim_donation

In [None]:
dim_donation = pd.read_feather("../data/raw/dim_donation.feather")
dim_donation["donation_status"] = dim_donation["donation_status"].astype("category")
#dim_donation["bucket_number"] = dim_donation["bucket_number"].astype("category")
dim_donation["unit_number_lifetime_status"] = dim_donation["unit_number_lifetime_status"].astype("category")
dim_donation["welding_status"] = dim_donation["welding_status"].astype("category")
dim_donation.info()

In [None]:
fact_data = fact_data.merge(dim_donation, how='outer',
                    left_on = ['dim_donation_id'], 
                    right_on = ['dim_donation_id']) 
fact_data.drop(columns=['dim_donation_id'], inplace = True)
fact_data.info()

## dim_exception

In [None]:
dim_exception = pd.read_feather("../data/raw/dim_exception.feather")
#dim_exception["bucket_number"] = dim_exception["bucket_number"].astype("category")
dim_exception["exception_type"] = dim_exception["exception_type"].astype("category")
dim_exception["exception_state"] = dim_exception["exception_state"].astype("category")
dim_exception.drop(columns=['run_data_message_entry_id'], inplace = True)
dim_exception.info()

In [None]:
fact_data = fact_data.merge(dim_exception, how='outer',
                    left_on = ['dim_exception_id', 'bucket_number', 'run_datetime'], 
                    right_on = ['dim_exception_id', 'bucket_number', 'run_datetime']) 
fact_data.drop(columns=['dim_exception_id'], inplace = True)
fact_data.info()

## dim_facility
I am omitting this for now, facility names are almost unique and I don't see them being more helpful than the dim_faciliy_id. Also not sure what the facility time_zone will add.

## dim_operator

In [None]:
dim_operator = pd.read_feather("../data/raw/dim_operator.feather")
dim_operator.info()

In [None]:
fact_data = fact_data.merge(dim_operator, how='outer',
                    left_on = ['dim_operator_id', 'operator_id'], 
                    right_on = ['dim_operator_id', 'operator_id']) 
fact_data.drop(columns=['dim_operator_id'], inplace = True)
fact_data.info()

## dim_run

In [None]:
dim_run = pd.read_feather("../data/raw/dim_run.feather")
# dim_run has a column with all NaNs get rid of it
dim_run.dropna(axis=1,how='all', inplace=True)
dim_run.drop(columns=['file_name'], inplace = True)
dim_run.info()

In [None]:
fact_data = fact_data.merge(dim_run, how='outer',
                    left_on = ['dim_run_id', 'operator_id'], 
                    right_on = ['dim_run_id', 'operator_id']) 
fact_data.drop(columns=['dim_run_id'], inplace = True)
fact_data.info()

Fix some remaining datatypes and general cleanup of the dataframe

In [None]:
fact_data["bucket_number"] = fact_data["bucket_number"].astype('category')

In [None]:
fact_data = reduce_mem_usage(fact_data)

In [None]:
fact_data.to_feather("../data/interim/fact_data.feather")

## Restart Kernal

In [None]:
dim_custom_data_01 = pd.read_feather("../data/raw/dim_custom_data_01.feather")
dim_custom_data_02 = pd.read_feather("../data/raw/dim_custom_data_02.feather")
dim_custom_data_03 = pd.read_feather("../data/raw/dim_custom_data_03.feather")
dim_custom_data_04 = pd.read_feather("../data/raw/dim_custom_data_04.feather")
dim_custom_flag = pd.read_feather("../data/raw/dim_custom_flag.feather")

In [None]:
dim_custom_data_01.iloc[:,2:] = dim_custom_data_01.iloc[:,2:].apply(pd.to_numeric, errors='raise')
dim_custom_data_01 = dim_custom_data_01.convert_dtypes()
dim_custom_data_01 = reduce_mem_usage(dim_custom_data_01)

In [None]:
dim_custom_data_02.iloc[:,2:] = dim_custom_data_02.iloc[:,2:].apply(pd.to_numeric, errors='raise')
dim_custom_data_02 = dim_custom_data_02.convert_dtypes()
dim_custom_data_02 = reduce_mem_usage(dim_custom_data_02)

In [None]:
dim_custom_data_03.iloc[:,2:] = dim_custom_data_03.iloc[:,2:].apply(pd.to_numeric, errors='raise')
dim_custom_data_03 = dim_custom_data_03.convert_dtypes()
dim_custom_data_03 = reduce_mem_usage(dim_custom_data_03)

In [None]:
dim_custom_data_04.iloc[:,2:] = dim_custom_data_04.iloc[:,2:].apply(pd.to_numeric, errors='raise')
dim_custom_data_04 = dim_custom_data_04.convert_dtypes()
dim_custom_data_04 = reduce_mem_usage(dim_custom_data_04)

In [None]:
dim_custom_flag.iloc[:,2:] = dim_custom_flag.iloc[:,2:].apply(pd.to_numeric, errors='raise')
dim_custom_flag = dim_custom_flag.convert_dtypes()
dim_custom_flag = reduce_mem_usage(dim_custom_flag)

In [None]:
custom_data = reduce(
                lambda x, y: pd.merge(
                    x, 
                    y,
                    how='outer',
                    left_on = ['run_data_donation_id', 'dim_custom_data_id'], 
                    right_on = ['run_data_donation_id', 'dim_custom_data_id']), 
                    [dim_custom_data_01, dim_custom_data_02, dim_custom_data_03, dim_custom_data_04, dim_custom_flag]
                )
custom_data.drop(columns=['run_data_donation_id'], inplace = True)
custom_data = reduce_mem_usage(custom_data)
custom_data.to_feather("../data/interim/custom_data.feather")

### Restart Kernel

Next lets merge *all* the data. We need to make the dataframes as small as possible because they will be huge in memory when pandas tries to merge them. To that end we're going to make everything a sparse category data type. We will go back and fix them after the merge.

In [None]:
custom_data = pd.read_feather("../data/interim/custom_data.feather")
fact_data = pd.read_feather("../data/interim/fact_data.feather")

In [None]:
for col in custom_data.columns[1:]:
    custom_data[col] = custom_data[col].astype('category')

In [None]:
for col in fact_data.columns:
    if col != "dim_custom_data_id":
        fact_data[col] = fact_data[col].astype('category')

In [None]:
final_data = pd.merge(fact_data, custom_data, how='outer',
                    left_on = ['dim_custom_data_id'], 
                    right_on = ['dim_custom_data_id']) 
final_data.drop(columns=['dim_custom_data_id'], inplace = True)
fact_data = None
cusomtom_data = None

In [None]:
final_data.to_feather("../data/interim/fact_and_custom_data.feather")

In [None]:
final_data.info(verbose=True)

In [None]:
final_data["product_volume"].describe()

In [None]:
final_data["product_volume"].unique()

In [None]:
np.array_equal(final_data["product_volume"].astype(float), final_data["product_volume"].astype("Int64"))

In [None]:
63314271 / final_data.shape[0]

In [None]:
final_data["run_date"] = pd.to_datetime(final_data["dim_run_date"], infer_datetime_format=True)
final_data.drop(columns=['dim_run_date'], inplace = True)

In [None]:
final_data["run_date"] = pd.arrays.SparseArray(final_data["run_date"], dtype = final_data["run_date"].dtype)

In [None]:
final_data.convert_dtypes()

In [None]:
final_data.head()