In [11]:
import os
import json
import numpy as np
import pandas as pd
import sqlite3
import functools as ft
import matplotlib.pyplot as plt
%matplotlib inline

ETL - Extract, Transform, Load

Extract: Our data is extracted from an Excel file named 'General details table.xlsx' . In the ETL process, the data is first extracted so that we can work on it.

In [12]:
General_details_df = pd.read_excel('General details table.xlsx')
General_details_df

Unnamed: 0,Date,Patient Number,Routine test,age,sex,weight,height,Smoker
0,2005-06-30,1,1,75,1.0,59.306449,151.0,0.0
1,2005-07-08,1,1,75,1.0,59.970350,151.0,0.0
2,2005-10-24,1,1,75,1.0,64.119390,151.0,1.0
3,2006-01-08,1,1,75,1.0,68.149578,151.0,0.0
4,2006-02-02,1,1,75,1.0,66.977856,151.0,0.0
...,...,...,...,...,...,...,...,...
99995,2019-10-09,300,0,90,1.0,73.336318,158.0,0.0
99996,2019-11-19,300,0,90,1.0,75.462573,158.0,0.0
99997,2019-11-21,300,0,90,1.0,82.255023,158.0,1.0
99998,2019-12-17,300,0,90,1.0,83.482713,158.0,0.0


In [13]:
General_details_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 8 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   Date            100000 non-null  datetime64[ns]
 1   Patient Number  100000 non-null  int64         
 2   Routine test    100000 non-null  int64         
 3   age             100000 non-null  int64         
 4   sex             99944 non-null   float64       
 5   weight          100000 non-null  float64       
 6   height          99888 non-null   float64       
 7   Smoker          98710 non-null   float64       
dtypes: datetime64[ns](1), float64(4), int64(3)
memory usage: 6.1 MB


In [14]:
General_details_df.shape

(100000, 8)

Transform: The ETL's central process is in which various changes are made to the original data so that we can adapt the data to the project's goals. In addition, in the process, we clean the data of empty values ​​and noise so that they do not interfere with drawing conclusions from the data.

In [15]:
General_details_df.isnull().sum()

Date                 0
Patient Number       0
Routine test         0
age                  0
sex                 56
weight               0
height             112
Smoker            1290
dtype: int64

In [16]:
General_details_df = General_details_df.dropna()
General_details_df

Unnamed: 0,Date,Patient Number,Routine test,age,sex,weight,height,Smoker
0,2005-06-30,1,1,75,1.0,59.306449,151.0,0.0
1,2005-07-08,1,1,75,1.0,59.970350,151.0,0.0
2,2005-10-24,1,1,75,1.0,64.119390,151.0,1.0
3,2006-01-08,1,1,75,1.0,68.149578,151.0,0.0
4,2006-02-02,1,1,75,1.0,66.977856,151.0,0.0
...,...,...,...,...,...,...,...,...
99995,2019-10-09,300,0,90,1.0,73.336318,158.0,0.0
99996,2019-11-19,300,0,90,1.0,75.462573,158.0,0.0
99997,2019-11-21,300,0,90,1.0,82.255023,158.0,1.0
99998,2019-12-17,300,0,90,1.0,83.482713,158.0,0.0


In [17]:
General_details_df.duplicated().sum()

0

Sort the DF by Patient Number and then by date

In [18]:
General_details_df.sort_values(by=['Patient Number', 'Date'], inplace=True)
General_details_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  General_details_df.sort_values(by=['Patient Number', 'Date'], inplace=True)


Unnamed: 0,Date,Patient Number,Routine test,age,sex,weight,height,Smoker
0,2005-06-30,1,1,75,1.0,59.306449,151.0,0.0
50000,2005-06-30,1,0,75,1.0,59.306449,151.0,0.0
1,2005-07-08,1,1,75,1.0,59.970350,151.0,0.0
50001,2005-07-08,1,0,75,1.0,59.970350,151.0,0.0
2,2005-10-24,1,1,75,1.0,64.119390,151.0,1.0
...,...,...,...,...,...,...,...,...
99997,2019-11-21,300,0,90,1.0,82.255023,158.0,1.0
49998,2019-12-17,300,1,90,1.0,83.482713,158.0,0.0
99998,2019-12-17,300,0,90,1.0,83.482713,158.0,0.0
49999,2019-12-26,300,1,90,1.0,80.337148,158.0,0.0


Load: Loading all the tables and merging them into one final table. Because in our project we worked only on a Fact table, we will present the final table after the Transform.

In [19]:
final_General_details_df = General_details_df.copy()
final_General_details_df

Unnamed: 0,Date,Patient Number,Routine test,age,sex,weight,height,Smoker
0,2005-06-30,1,1,75,1.0,59.306449,151.0,0.0
50000,2005-06-30,1,0,75,1.0,59.306449,151.0,0.0
1,2005-07-08,1,1,75,1.0,59.970350,151.0,0.0
50001,2005-07-08,1,0,75,1.0,59.970350,151.0,0.0
2,2005-10-24,1,1,75,1.0,64.119390,151.0,1.0
...,...,...,...,...,...,...,...,...
99997,2019-11-21,300,0,90,1.0,82.255023,158.0,1.0
49998,2019-12-17,300,1,90,1.0,83.482713,158.0,0.0
99998,2019-12-17,300,0,90,1.0,83.482713,158.0,0.0
49999,2019-12-26,300,1,90,1.0,80.337148,158.0,0.0
