In [12]:
import os
import json
import numpy as np
import pandas as pd
import sqlite3
import functools as ft
import matplotlib.pyplot as plt
%matplotlib inline

ETL - Extract, Transform, Load

Extract: Our data is extracted from an Excel file named 'Fact Table.xlsx'. In the ETL process, the data is extracted first so that we can work on it.

In [13]:
# Extract step: read the raw fact table from the Excel workbook that sits
# next to the notebook, then display it.
Fact_df = pd.read_excel(io='Fact Table.xlsx')
Fact_df

Unnamed: 0,date,Patient Number,age,Routine test
0,2005-06-30,1,75,1
1,2005-07-08,1,75,1
2,2005-10-24,1,75,1
3,2006-01-08,1,75,1
4,2006-02-02,1,75,1
...,...,...,...,...
99995,2019-10-09,300,90,0
99996,2019-11-19,300,90,0
99997,2019-11-21,300,90,0
99998,2019-12-17,300,90,0


In [14]:
# Print the column names, dtypes and non-null counts of the loaded table.
Fact_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 4 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   date            100000 non-null  datetime64[ns]
 1   Patient Number  100000 non-null  int64         
 2   age             100000 non-null  int64         
 3   Routine test    100000 non-null  int64         
dtypes: datetime64[ns](1), int64(3)
memory usage: 3.1 MB


In [15]:
# (rows, columns) of the table as loaded, before any cleaning.
Fact_df.shape

(100000, 4)

Transform: This is the central step of the ETL process, in which the original data is modified to fit the project's goals. In this step we also clean the data of empty values and noise so that they do not interfere with drawing conclusions from the data.

In [16]:
# Count missing values per column (`isna` is the same check as `isnull`).
Fact_df.isna().sum()

date              0
Patient Number    0
age               0
Routine test      0
dtype: int64

In [17]:
# Number of rows that exactly repeat an earlier row (the first occurrence
# of each row is not counted).
Fact_df.duplicated(keep='first').sum()

3342

In [18]:
# Inspection copy of the table with a boolean `duplicated` flag marking rows
# that exactly repeat an earlier row; `Fact_df` itself is left untouched.
duplicat_df = Fact_df.assign(duplicated=Fact_df.duplicated())
duplicat_df

Unnamed: 0,date,Patient Number,age,Routine test,duplicated
0,2005-06-30,1,75,1,False
1,2005-07-08,1,75,1,False
2,2005-10-24,1,75,1,False
3,2006-01-08,1,75,1,False
4,2006-02-02,1,75,1,False
...,...,...,...,...,...
99995,2019-10-09,300,90,0,True
99996,2019-11-19,300,90,0,False
99997,2019-11-21,300,90,0,False
99998,2019-12-17,300,90,0,False


In [19]:
# Discard exact duplicate rows, keeping the first occurrence of each, and
# rebind `Fact_df` to the de-duplicated table.
Fact_df = Fact_df.drop_duplicates(keep='first')
Fact_df

Unnamed: 0,date,Patient Number,age,Routine test
0,2005-06-30,1,75,1
1,2005-07-08,1,75,1
2,2005-10-24,1,75,1
3,2006-01-08,1,75,1
4,2006-02-02,1,75,1
...,...,...,...,...
99994,2019-10-09,300,90,0
99996,2019-11-19,300,90,0
99997,2019-11-21,300,90,0
99998,2019-12-17,300,90,0


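Sort the DataFrame by Patient Number and then by date, and compute for every row the time elapsed since the same patient's previous row (`duration`).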

In [20]:
# Order the records by patient and, within each patient, chronologically.
Fact_df = Fact_df.sort_values(by=['Patient Number', 'date'])

# Build a separate frame with a `duration` column: the difference between a
# row's date and the date of the same patient's previous row. The first row
# of every patient has no predecessor, so its duration is NaT.
# NOTE(review): the gap is measured against the previous row of any kind,
# not specifically the previous routine test - confirm this is intended.
Fact_df_sorted = Fact_df.assign(
    duration=Fact_df.groupby('Patient Number')['date'].diff()
)
Fact_df_sorted

Unnamed: 0,date,Patient Number,age,Routine test,duration
0,2005-06-30,1,75,1,NaT
50000,2005-06-30,1,75,0,0 days
1,2005-07-08,1,75,1,8 days
50001,2005-07-08,1,75,0,0 days
2,2005-10-24,1,75,1,108 days
...,...,...,...,...,...
99997,2019-11-21,300,90,0,0 days
49998,2019-12-17,300,90,1,26 days
99998,2019-12-17,300,90,0,0 days
49999,2019-12-26,300,90,1,9 days


In [24]:
# Restrict to routine-test rows, then keep those whose gap since the
# patient's previous row is longer than 900 days. Rows with a NaT duration
# never satisfy the comparison and are therefore excluded.
temp = Fact_df_sorted.loc[Fact_df_sorted['Routine test'].eq(1)]
Outliers = temp.loc[temp['duration'].gt(pd.Timedelta(days=900))]
Outliers

Unnamed: 0,date,Patient Number,age,Routine test,duration
1075,2007-10-25,4,77,1,974 days
1515,2012-05-07,6,77,1,1054 days
15776,2015-10-05,86,97,1,1084 days
17379,2019-07-28,99,75,1,949 days
17583,2011-07-11,105,77,1,1147 days
25035,2011-07-20,148,77,1,919 days
25042,2016-12-12,148,77,1,915 days
27413,2014-07-13,161,79,1,1232 days
35249,2009-01-20,213,78,1,906 days
46044,2017-07-20,280,88,1,1191 days


Drop outliers

In [25]:
# Remove the rows flagged in `Outliers` from the working table.
#
# The drop is done without `inplace=True` and with `errors='ignore'` so the
# cell is idempotent: `DataFrame.drop` raises KeyError for labels that are no
# longer present, which is exactly what happened when this cell was run a
# second time without first recomputing `Outliers`. With `errors='ignore'`
# only the labels that still exist are removed, so a re-run is a no-op.
Fact_df_sorted = Fact_df_sorted.drop(index=Outliers.index, errors='ignore')
Fact_df_sorted

Unnamed: 0,date,Patient Number,age,Routine test,duration
0,2005-06-30,1,75,1,NaT
50000,2005-06-30,1,75,0,0 days
1,2005-07-08,1,75,1,8 days
50001,2005-07-08,1,75,0,0 days
2,2005-10-24,1,75,1,108 days
...,...,...,...,...,...
99997,2019-11-21,300,90,0,0 days
49998,2019-12-17,300,90,1,26 days
99998,2019-12-17,300,90,0,0 days
49999,2019-12-26,300,90,1,9 days


Load: In this step all the tables are loaded and merged into one final table. Because our project works with a single Fact table only, we simply present the final table produced by the Transform step.

In [26]:
# Final table: the cleaned data without the helper `duration` column.
# `drop` returns a new frame, so `Fact_df_sorted` keeps its `duration` column.
final_df = Fact_df_sorted.drop(columns='duration')
final_df

Unnamed: 0,date,Patient Number,age,Routine test
0,2005-06-30,1,75,1
50000,2005-06-30,1,75,0
1,2005-07-08,1,75,1
50001,2005-07-08,1,75,0
2,2005-10-24,1,75,1
...,...,...,...,...
99997,2019-11-21,300,90,0
49998,2019-12-17,300,90,1
99998,2019-12-17,300,90,0
49999,2019-12-26,300,90,1
