# Merging the data from CoverMyMeds 

We have data from CoverMyMeds stored in several .csv files. I am going to merge them into a single data frame in order to analyze the data.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Load the claims dimension table.
claims = pd.read_csv(filepath_or_buffer='../data/initial_data/dim_claims.csv')

In [7]:
# Display the claims frame (pandas shows a truncated head/tail view).
claims

Unnamed: 0,dim_claim_id,bin,drug,reject_code,pharmacy_claim_approved
0,1,417380,A,75.0,0
1,2,999001,A,,1
2,3,417740,A,76.0,0
3,4,999001,A,,1
4,5,417740,A,,1
...,...,...,...,...,...
1335571,1335572,417740,C,75.0,0
1335572,1335573,999001,C,,1
1335573,1335574,417380,C,70.0,0
1335574,1335575,999001,C,,1


In [8]:
# Load the date dimension table.
dates = pd.read_csv(filepath_or_buffer='../data/initial_data/dim_date.csv')

In [9]:
# Display the date dimension frame.
dates

Unnamed: 0,dim_date_id,date_val,calendar_year,calendar_month,calendar_day,day_of_week,is_weekday,is_workday,is_holiday
0,1,2017-01-01,2017,1,1,1,0,0,1
1,2,2017-01-02,2017,1,2,2,1,0,0
2,3,2017-01-03,2017,1,3,3,1,1,0
3,4,2017-01-04,2017,1,4,4,1,1,0
4,5,2017-01-05,2017,1,5,5,1,1,0
...,...,...,...,...,...,...,...,...,...
1515,1516,2021-02-24,2021,2,24,4,1,1,0
1516,1517,2021-02-25,2021,2,25,5,1,1,0
1517,1518,2021-02-26,2021,2,26,6,1,1,0
1518,1519,2021-02-27,2021,2,27,7,0,0,0


In [10]:
# Count the distinct date ids (shape of the array of unique values).
dates['dim_date_id'].unique().shape

(1520,)

In [12]:
# Load the prior-authorization (PA) dimension table.
pas = pd.read_csv(filepath_or_buffer='../data/initial_data/dim_pa.csv')

In [13]:
# Display the PA frame.
pas

Unnamed: 0,dim_pa_id,correct_diagnosis,tried_and_failed,contraindication,pa_approved
0,1,1,1,0,1
1,2,1,0,0,1
2,3,0,0,1,1
3,4,1,1,0,1
4,5,0,1,0,1
...,...,...,...,...,...
555946,555947,1,0,1,0
555947,555948,1,0,1,1
555948,555949,1,1,1,1
555949,555950,1,0,0,1


In [14]:
# Count the distinct PA ids (shape of the array of unique values).
pas['dim_pa_id'].unique().shape

(555951,)

In [15]:
# Load the bridge table that links claim, PA and date ids.
bridge = pd.read_csv(filepath_or_buffer='../data/initial_data/bridge.csv')

In [11]:
# Display the bridge frame; dim_pa_id is NaN for rows without a PA.
bridge

Unnamed: 0,dim_claim_id,dim_pa_id,dim_date_id
0,1,1.0,1
1,2,,1
2,3,2.0,1
3,4,,1
4,5,,1
...,...,...,...
1335571,1335572,555950.0,1095
1335572,1335573,,1095
1335573,1335574,555951.0,1095
1335574,1335575,,1095


These tables have different lengths, so they cannot be concatenated directly. They are linked through the bridge.csv file, so we have to merge them through it.

## The bridge.csv file

This is a bridge table that links the primary keys of all the tables to one another; it is used for joining the tables.

It contains the following columns: 
 1. dim_claim_id: Primary key for dim_claims.  
 2. dim_pa_id: Primary key for dim_pa.  
 3. dim_date_id: Primary key for dim_date. 

The dim_claim_id matches the row indices of bridge.csv (dim_claim_id = index + 1). We have to match dim_pa_id and dim_date_id to the indices of bridge.csv (or dim_claim_id).

In [16]:
# Inspect the date id attached to each bridge row.
bridge['dim_date_id']

0             1
1             1
2             1
3             1
4             1
           ... 
1335571    1095
1335572    1095
1335573    1095
1335574    1095
1335575    1095
Name: dim_date_id, Length: 1335576, dtype: int64

In [17]:
# Expand the date dimension to one row per bridge row by joining on the
# dim_date_id key instead of the positional lookup `iloc[id - 1]`.
# The positional form is only correct while the ids are exactly 1..N in file
# order, and it silently picked wrong rows if this cell was re-run (after the
# first run `dates` is already expanded). De-duplicating on the key first makes
# the cell idempotent. A left merge keeps bridge's row order and produces a
# fresh 0..n-1 index, matching the original `reset_index(drop=True)`.
dates = bridge[['dim_date_id']].merge(
    dates.drop_duplicates(subset='dim_date_id'),
    on='dim_date_id',
    how='left',
)

In [13]:
# Display the expanded dates frame (one row per bridge row at this point).
dates

Unnamed: 0,dim_date_id,date_val,calendar_year,calendar_month,calendar_day,day_of_week,is_weekday,is_workday,is_holiday
0,1,2017-01-01,2017,1,1,1,0,0,1
1,1,2017-01-01,2017,1,1,1,0,0,1
2,1,2017-01-01,2017,1,1,1,0,0,1
3,1,2017-01-01,2017,1,1,1,0,0,1
4,1,2017-01-01,2017,1,1,1,0,0,1
...,...,...,...,...,...,...,...,...,...
1335571,1095,2019-12-31,2019,12,31,3,1,1,0
1335572,1095,2019-12-31,2019,12,31,3,1,1,0
1335573,1095,2019-12-31,2019,12,31,3,1,1,0
1335574,1095,2019-12-31,2019,12,31,3,1,1,0


In [18]:
# `pd.concat(axis=1)` pairs rows purely by index position, so the result is
# only correct if row i of `claims` is the claim referenced by row i of
# `bridge` (the table `dates` was expanded from). Verify that before gluing
# the two frames side by side.
assert len(claims) == len(dates) == len(bridge), \
    "claims/dates/bridge row counts differ"
assert np.array_equal(
    claims['dim_claim_id'].to_numpy(), bridge['dim_claim_id'].to_numpy()
), "claims rows are not in the same order as bridge rows"
data = pd.concat([claims, dates], axis=1)

In [19]:
# Display the combined claims + dates frame.
data

Unnamed: 0,dim_claim_id,bin,drug,reject_code,pharmacy_claim_approved,dim_date_id,date_val,calendar_year,calendar_month,calendar_day,day_of_week,is_weekday,is_workday,is_holiday
0,1,417380,A,75.0,0,1,2017-01-01,2017,1,1,1,0,0,1
1,2,999001,A,,1,1,2017-01-01,2017,1,1,1,0,0,1
2,3,417740,A,76.0,0,1,2017-01-01,2017,1,1,1,0,0,1
3,4,999001,A,,1,1,2017-01-01,2017,1,1,1,0,0,1
4,5,417740,A,,1,1,2017-01-01,2017,1,1,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1335571,1335572,417740,C,75.0,0,1095,2019-12-31,2019,12,31,3,1,1,0
1335572,1335573,999001,C,,1,1095,2019-12-31,2019,12,31,3,1,1,0
1335573,1335574,417380,C,70.0,0,1095,2019-12-31,2019,12,31,3,1,1,0
1335574,1335575,999001,C,,1,1095,2019-12-31,2019,12,31,3,1,1,0


In [20]:
# Distinct PA ids referenced by the bridge table (NaN marks "no PA").
bridge['dim_pa_id'].unique()

array([1.00000e+00,         nan, 2.00000e+00, ..., 5.55949e+05,
       5.55950e+05, 5.55951e+05])

Now I want to merge dim_pa into the data above. However, there are NaN values in the dim_pa_id column of bridge, so the method above doesn't work.

In [17]:
# Boolean mask: True for bridge rows whose dim_pa_id exists in the PA table.
bridge['dim_pa_id'].isin(pas['dim_pa_id'])

0           True
1          False
2           True
3          False
4          False
           ...  
1335571     True
1335572    False
1335573     True
1335574    False
1335575    False
Name: dim_pa_id, Length: 1335576, dtype: bool

In [31]:
# Keep only the bridge rows that reference an existing PA.
has_known_pa = bridge['dim_pa_id'].isin(pas['dim_pa_id'])
bridge.loc[has_known_pa]

Unnamed: 0,dim_claim_id,dim_pa_id,dim_date_id
0,1,1.0,1
2,3,2.0,1
9,10,3.0,1
10,11,4.0,1
14,15,5.0,1
...,...,...,...
1335556,1335557,555947.0,1095
1335557,1335558,555948.0,1095
1335558,1335559,555949.0,1095
1335571,1335572,555950.0,1095


In [20]:
# Record, for every PA row, the index of the bridge row that references it.
# The row is looked up by the dim_pa_id key rather than by pairing the k-th PA
# with the k-th bridge row that has a PA id; the positional pairing is only
# right when both tables happen to be sorted the same way.
bridge_pa_ids = bridge['dim_pa_id'].dropna().astype('int64')
assert bridge_pa_ids.is_unique, \
    "a dim_pa_id is referenced by more than one bridge row"
bridge_index_by_pa_id = pd.Series(bridge_pa_ids.index, index=bridge_pa_ids.to_numpy())
# PA ids that no bridge row references map to NaN.
pas['bride_indexs'] = pas['dim_pa_id'].map(bridge_index_by_pa_id)

In [40]:
# Preview the PA table re-indexed by bridge row index (returns a new frame;
# `pas` itself is not modified by this call).
pas.set_index('bride_indexs')

Unnamed: 0_level_0,dim_pa_id,correct_diagnosis,tried_and_failed,contraindication,pa_approved
bride_indexs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1,1,1,0,1
2,2,1,0,0,1
9,3,0,0,1,1
10,4,1,1,0,1
14,5,0,1,0,1
...,...,...,...,...,...
1335556,555947,1,0,1,0
1335557,555948,1,0,1,1
1335558,555949,1,1,1,1
1335571,555950,1,0,0,1


In [34]:
# Attach the PA columns to the claim rows by aligning on the bridge row index
# stored in `bride_indexs`; rows without a PA get NaN in those columns.
# Any PA columns already present are dropped first so the cell is idempotent:
# re-running it used to append a second copy of every PA column.
pa_by_bridge_index = pas.set_index('bride_indexs')
data = pd.concat(
    [data.drop(columns=pa_by_bridge_index.columns, errors='ignore'), pa_by_bridge_index],
    axis=1,
)

In [35]:
# Display the fully merged frame (claims + dates + PA columns).
data

Unnamed: 0,dim_claim_id,bin,drug,reject_code,pharmacy_claim_approved,dim_date_id,date_val,calendar_year,calendar_month,calendar_day,day_of_week,is_weekday,is_workday,is_holiday,dim_pa_id,correct_diagnosis,tried_and_failed,contraindication,pa_approved
0,1,417380,A,75.0,0,1,2017-01-01,2017,1,1,1,0,0,1,1.0,1.0,1.0,0.0,1.0
1,2,999001,A,,1,1,2017-01-01,2017,1,1,1,0,0,1,,,,,
2,3,417740,A,76.0,0,1,2017-01-01,2017,1,1,1,0,0,1,2.0,1.0,0.0,0.0,1.0
3,4,999001,A,,1,1,2017-01-01,2017,1,1,1,0,0,1,,,,,
4,5,417740,A,,1,1,2017-01-01,2017,1,1,1,0,0,1,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1335571,1335572,417740,C,75.0,0,1095,2019-12-31,2019,12,31,3,1,1,0,555950.0,1.0,0.0,0.0,1.0
1335572,1335573,999001,C,,1,1095,2019-12-31,2019,12,31,3,1,1,0,,,,,
1335573,1335574,417380,C,70.0,0,1095,2019-12-31,2019,12,31,3,1,1,0,555951.0,0.0,0.0,1.0,0.0
1335574,1335575,999001,C,,1,1095,2019-12-31,2019,12,31,3,1,1,0,,,,,


In [39]:
# Persist the merged frame; the row index is written as the first, unnamed column.
data.to_csv(path_or_buf='../data/processed_data/processed_data.csv')

In [24]:
# Distinct BIN values present in the merged data.
data['bin'].unique()

array([417380, 999001, 417740, 417614])

In [25]:
# List the columns of the merged frame.
# NOTE(review): the saved output lists no PA columns, and this cell's execution
# count (25) is lower than the PA concat cell's (34) — it looks like it was last
# run before that merge; re-run the notebook top to bottom to refresh it.
data.columns

Index(['dim_claim_id', 'bin', 'drug', 'reject_code', 'pharmacy_claim_approved',
       'dim_date_id', 'date_val', 'calendar_year', 'calendar_month',
       'calendar_day', 'day_of_week', 'is_weekday', 'is_workday',
       'is_holiday'],
      dtype='object')