<a href="https://colab.research.google.com/github/Fuenfgeld/DMA2023TeamC/blob/main/Ergebnisse/EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# EDA
Exploratory Data Analysis



Reset the existing variables

In [None]:
# Clear all user-defined names from the interactive namespace; -f skips the confirmation prompt.
%reset -f

In [None]:
# install the newest version 
!pip3 install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip


# Importing Libraries

In [None]:
import pandas as pd
import sqlite3
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from pandas_profiling import ProfileReport
import pandas_profiling
from pandas.util import hash_pandas_object

In [None]:
# Render matplotlib figures inline so that plot outputs are displayed and stored within the notebook.
%matplotlib inline

In [None]:
from google.colab import drive
# Mount Google Drive under /content/drive; the data warehouse file is read from there.
drive.mount("/content/drive")

### Version check

The versions of the packages when working on the project are:  
csv:&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;1.0  
sqlite3: &nbsp;3.31.1  
numpy: &nbsp;1.21.6  
matplotlib: 3.3.2  
seaborn: 0.11.2  
pandas_profiling: 1.4.1  
pandas: 1.3.5

Python: &nbsp;3.8.10

In [None]:
print('The current version of pandas is ' + pd.__version__)
print('The current version of sqlite3 is ' + sqlite3.sqlite_version)
print('The current version of seaborn is ' + sns.__version__)
print('The current version of matplotlib is ' + matplotlib.__version__)
print('The current version of numpy is ' + np.__version__)

print('The current python version is ', end=' ')
!python --version
print('The current version of pandas_profiling is in the infobox below: ')
!pip show pandas_profiling


# Loading Data


In [None]:
# type of patient
# NOTE(review): "metebolic" looks like a misspelling of "metabolic"; left unchanged because
# the value may be matched elsewhere - confirm before correcting.
patient_type = "metebolic_syndrome_disease"

In [None]:
# Location of the SQLite data warehouse on the mounted shared drive.
DB_DWH_PATH = "/content/drive/Shareddrives/TeamC/teamc_dwh.db"
print("Datawarehouse: ", DB_DWH_PATH)

# Open the warehouse read-only through a URI. A plain sqlite3.connect(path) silently
# creates an empty database when the file is missing (e.g. the drive is not mounted),
# which only surfaces later as "no such table". With mode=ro a missing file raises
# sqlite3.OperationalError right here, and the notebook cannot modify the warehouse.
dwh_conn = sqlite3.connect(f"file:{DB_DWH_PATH}?mode=ro", uri=True)

In [None]:
# Show which tables the warehouse contains by querying SQLite's schema table.
if dwh_conn is not None:
  dwh_cursor = dwh_conn.cursor()
  table_rows = dwh_cursor.execute("SELECT name FROM sqlite_master WHERE type='table';").fetchall()
  print("List of Tables", table_rows)


In [None]:
# Load each warehouse table completely into its own pandas DataFrame.

# demographic data
df_patients = pd.read_sql_query(sql="SELECT * FROM patients_info", con=dwh_conn)

# diagnoses data
df_conditions = pd.read_sql_query(sql="select * from conditions_info", con=dwh_conn)

# medications data
df_medications = pd.read_sql_query(sql="SELECT * FROM medications_info", con=dwh_conn)

# lookup table: medication codes
df_med_codes = pd.read_sql_query(sql="SELECT * FROM med_codes", con=dwh_conn)

# lookup table: condition codes
df_conditions_codes = pd.read_sql_query(sql="SELECT * FROM conditions_codes", con=dwh_conn)


## Check the Checksum in all the dataframes

In [None]:
# Row-wise hashes of the patients table, computed without the "Id" column.
check_patients = hash_pandas_object(df_patients.drop(columns=["Id"]))

In [None]:
# Row-wise hashes of the conditions table, computed without the "PATIENT" column.
check_conditions = hash_pandas_object(df_conditions.drop(columns=["PATIENT"]))

In [None]:
# Row-wise hashes of the medications table, computed without the "PATIENT" column.
check_medications = hash_pandas_object(df_medications.drop(columns=["PATIENT"]))

In [None]:
# Row-wise hashes of the medication code table (all columns).
check_medcodes = hash_pandas_object(df_med_codes)

In [None]:
# Row-wise hashes of the condition code table (all columns).
check_concodes = hash_pandas_object(df_conditions_codes)

In [None]:
# Combine the per-table hash sums into one checksum value for the whole dataset.
x = check_patients.sum() + check_conditions.sum() + check_medications.sum() + check_medcodes.sum() + check_concodes.sum()

In [None]:
# Compare the combined checksum with the value recorded for the original analysis and
# report the outcome either way; a silent mismatch would be easy to overlook.
EXPECTED_CHECKSUM = 2633809765930772868
if x == EXPECTED_CHECKSUM:
  print('The dataset is the same as in our analysis')
else:
  print('WARNING: checksum mismatch (got ' + str(x) + ', expected ' + str(EXPECTED_CHECKSUM)
        + '); the dataset differs from the one used in our analysis')

## Close db connection

In [None]:
# The tables have been read into DataFrames above, so the database connection can be closed.
dwh_conn.close()


# Exploratory Data Analysis (EDA)


In [None]:
# Generate the pandas-profiling overview report for the patients table.
ProfileReport(df_patients)

In [None]:
# Preview the first rows of the patients table.
df_patients.head()

In [None]:
# Column names of the patients table.
df_patients.columns

In [None]:
# (rows, columns) of the patients table.
df_patients.shape

In [None]:
# Treat cells that are empty or contain only whitespace as missing values (NaN).
df_patients = df_patients.replace(to_replace=r'^\s*$', value=np.nan, regex=True)

In [None]:
# Number of missing values in each column of the patients table.
df_patients.isnull().sum()

In [None]:
# Number of patients with a missing DEATHDATE entry.
df_patients.DEATHDATE.isnull().sum()

In [None]:
# Missing death dates are filled with today's date, which then serves as the
# reference date for the age calculation.
df_patients["DEATHDATE"] = df_patients.DEATHDATE.fillna(pd.to_datetime("today"))
# convert to datetime
df_patients["DEATHDATE"] = pd.to_datetime(df_patients["DEATHDATE"])
df_patients["BIRTHDATE"] = pd.to_datetime(df_patients["BIRTHDATE"])

# Age in completed years. A plain difference of the calendar years overstates the age by
# one whenever the reference date falls before the birthday within that year, so one year
# is subtracted in that case. Missing dates propagate as NaN.
birth = df_patients["BIRTHDATE"]
reference = df_patients["DEATHDATE"]
before_birthday = (reference.dt.month < birth.dt.month) | (
    (reference.dt.month == birth.dt.month) & (reference.dt.day < birth.dt.day)
)
df_patients["AGE"] = reference.dt.year - birth.dt.year - before_birthday.astype(int)

# keep only the variables used in the analysis
df_patients = df_patients[["Id", "AGE", "ETHNICITY", "RACE"]]



In [None]:
# Preview the reduced patients table (Id, AGE, ETHNICITY, RACE).
df_patients.head()


In [None]:
# Preview the first rows of the conditions table.
df_conditions.head()

In [None]:
# Generate the pandas-profiling overview report for the conditions table.
ProfileReport(df_conditions)

In [None]:
# rename some columns for clarification
df_conditions = df_conditions.rename(
    columns={"CODE": "CODE_CONDITION", "START": "START_CONDITION", "STOP": "STOP_CONDITION"}
)

# replacing blank values (with space) with NAN
df_conditions = df_conditions.replace(r'^\s*$', np.nan, regex=True)

# missing stop dates are filled with today's date
df_conditions["STOP_CONDITION"] = df_conditions.STOP_CONDITION.fillna(pd.to_datetime("today"))

# convert to datetime
df_conditions["START_CONDITION"] = pd.to_datetime(df_conditions["START_CONDITION"])
df_conditions["STOP_CONDITION"] = pd.to_datetime(df_conditions["STOP_CONDITION"])

# Duration of a condition in calendar months. Year/month arithmetic yields the same value as
# the difference of monthly period ordinals, but it does not depend on the platform's default
# integer width and lets missing dates propagate as NaN instead of a sentinel integer.
condition_start = df_conditions["START_CONDITION"]
condition_stop = df_conditions["STOP_CONDITION"]
df_conditions["DURATION_CONDITION"] = (
    (condition_stop.dt.year - condition_start.dt.year) * 12
    + (condition_stop.dt.month - condition_start.dt.month)
)

In [None]:
# (rows, columns) of the prepared conditions table.
df_conditions.shape

In [None]:
# Left-join the conditions onto the patients so that patients without any recorded
# condition are kept; the redundant join key "PATIENT" is removed afterwards.
df = (
    df_patients
    .merge(df_conditions, how="left", left_on="Id", right_on="PATIENT")
    .drop(columns="PATIENT")
)
df.shape

In [None]:
# Preview the merged patients/conditions frame.
df.head()

In [None]:
# Generate the pandas-profiling overview report for the medications table.
ProfileReport(df_medications)

In [None]:
# Preview the first rows of the medications table.
df_medications.head()

In [None]:
# rename variables
df_medications = df_medications.rename(
    columns={"CODE": "CODE_MEDICATION", "START": "START_MEDICATION", "STOP": "STOP_MEDICATION"}
)

# replacing blank values (with space) with NAN
df_medications = df_medications.replace(r'^\s*$', np.nan, regex=True)

# missing stop dates are filled with today's date
df_medications["STOP_MEDICATION"] = df_medications.STOP_MEDICATION.fillna(pd.to_datetime("today"))

# convert to datetime
df_medications["START_MEDICATION"] = pd.to_datetime(df_medications["START_MEDICATION"])
df_medications["STOP_MEDICATION"] = pd.to_datetime(df_medications["STOP_MEDICATION"])

# Duration of a medication in calendar months. Year/month arithmetic yields the same value as
# the difference of monthly period ordinals, but it does not depend on the platform's default
# integer width and lets missing dates propagate as NaN instead of a sentinel integer.
medication_start = df_medications["START_MEDICATION"]
medication_stop = df_medications["STOP_MEDICATION"]
df_medications["DURATION_MEDICATION"] = (
    (medication_stop.dt.year - medication_start.dt.year) * 12
    + (medication_stop.dt.month - medication_start.dt.month)
)

In [None]:
# Left-join the medications onto the merged frame by patient id and drop the redundant key.
# NOTE(review): both joins are by patient, so every condition row of a patient is paired
# with every medication row of that patient; row counts of `df` are not patient counts.
# Re-running this cell merges the medications a second time - run it once per session.
df = (
    df
    .merge(df_medications, how="left", left_on="Id", right_on="PATIENT")
    .drop(columns="PATIENT")
)
df.shape

In [None]:
# Generate the pandas-profiling overview report for the medication code table.
ProfileReport(df_med_codes)

In [None]:
# Column names of the medication code table.
df_med_codes.columns

In [None]:
# (rows, columns) of the medication code table.
df_med_codes.shape

In [None]:
# Give the description column a medication-specific name so it stays distinguishable
# from the condition description after the joins.
df_med_codes = df_med_codes.rename(columns={"DESCRIPTION": "DESCRIPTION_MEDICATION"})

In [None]:
# Look up the medication code table via the medication code, then drop the
# lookup key "CODE", which duplicates CODE_MEDICATION.
df = (
    df
    .merge(df_med_codes, how="left", left_on="CODE_MEDICATION", right_on="CODE")
    .drop(columns="CODE")
)
df.shape

In [None]:
# Generate the pandas-profiling overview report for the condition code table.
ProfileReport(df_conditions_codes)

In [None]:
# Preview the first 5 rows of the condition code table.
df_conditions_codes.head(5)

In [None]:
# (rows, columns) of the condition code table.
df_conditions_codes.shape

In [None]:
# Column names of the condition code table.
df_conditions_codes.columns

In [None]:
# Give the description column a condition-specific name so it stays distinguishable
# from the medication description after the joins.
df_conditions_codes = df_conditions_codes.rename(columns={"DESCRIPTION": "DESCRIPTION_CONDITION"})

In [None]:
# Look up the condition code table via the condition code, then drop the
# lookup key "CODE", which duplicates CODE_CONDITION.
df = (
    df
    .merge(df_conditions_codes, how="left", left_on="CODE_CONDITION", right_on="CODE")
    .drop(columns="CODE")
)
df.shape

## Understanding Data/Basic Data Exploration


In [None]:
# .head() returns the first 5 rows of the merged dataset, which gives example values for each variable.
df.head()

In [None]:
# .shape returns the size of the dataset as (number of rows, number of columns).
df.shape

In [None]:
# .columns returns the names of all columns in the dataset.
df.columns

In [None]:
# .info() lists every column with its non-null count and data type.
df.info()

In [None]:
# .describe() summarizes count, mean, standard deviation, min, quartiles and max of the numerical variables.
df.describe()

In [None]:
# .nunique(axis=0) returns the number of distinct values in each column.
df.nunique(axis=0)


## Cleaning Dataset


### Removing Duplicate Rows


In [None]:
# Count the rows that are exact duplicates of an earlier row (all columns compared).
duplicate_row_count = df.duplicated().sum()
print("Number of Duplicated Rows", duplicate_row_count)

### Removing Missing Values

For the purposes of our research question, the rows with missing values are kept.
A missing value means that the patient has no additional conditions or does not take any medication.

In [None]:
# number of missing (null) values in each column of the merged dataset
df.isnull().sum()

In [None]:
# For the purposes of our research question, the rows with missing values are kept.
# A missing value means that the patient has no additional conditions or does not
# take any medication.


## Univariate Analysis

### Grouping

Get the number of elements per group using .size method

In [None]:
# number of rows in the merged dataset for each patient
df.groupby("Id").size()

In [None]:
# number of rows in the merged dataset for each patient / condition code combination
df.groupby(["Id", "CODE_CONDITION"]).size()

In [None]:
# number of rows in the merged dataset for each patient / medication code combination
df.groupby(["Id", "CODE_MEDICATION"]).size()

Now we'll group the patients and sum up the amount of their different medication and conditions.

In [None]:
# Left-join the medications onto the patients so that patients without medication are kept.
pat_med = (
    df_patients
    .merge(df_medications, how="left", left_on="Id", right_on="PATIENT")
    .drop(columns="PATIENT")
)

# count() ignores missing codes, so patients without any medication get 0.
# NOTE(review): this counts medication records, not distinct codes - confirm that a
# code cannot appear more than once per patient if "different medications" is meant.
moddf_med = (
    pat_med
    .groupby("Id")["CODE_MEDICATION"]
    .count()
    .reset_index(name="medications")
)
moddf_med

In [None]:
# Left-join the conditions onto the patients so that patients without conditions are kept.
pat_con = (
    df_patients
    .merge(df_conditions, how="left", left_on="Id", right_on="PATIENT")
    .drop(columns="PATIENT")
)

# count() ignores missing codes, so patients without any recorded condition get 0.
moddf = (
    pat_con
    .groupby("Id")["CODE_CONDITION"]
    .count()
    .reset_index(name="additional_conditions")
)
moddf

For some patients, there is more than one diagnosis code and medication code. However, for most patients, the number of additional conditions, as well as the number of different drugs, seems to be 0.





# Plotting the data

## Histogram



Now the amount of different additional conditions will be plotted on a histogram. The following bars show the amount of patients having a specified number of additional conditions.

In [None]:
# Histogram (10 bins) of the number of additional conditions per patient.
moddf.plot.hist(figsize=(12,6), facecolor='grey',edgecolor='black', bins = 10)



As can be seen above, the bulk of the patients do not have any accompanying conditions. Below is the same histogram after dropping the patients with 0 conditions, for better clarity.

In [None]:
# Keep only the patients with at least one additional condition, then redraw the histogram.
has_conditions = moddf['additional_conditions'].gt(0)
moddf = moddf.loc[has_conditions]
moddf.plot.hist(figsize=(12, 6), facecolor='grey', edgecolor='black')

Now the amount of different medications will be plotted on a histogram. The following bars show the amount of patients having a specified number of medications.

In [None]:
# Histogram (10 bins) of the number of medications per patient.
moddf_med.plot.hist(figsize=(12,6), facecolor='grey',edgecolor='black', bins = 10)

And again, just for clarity, now the same histogram after dropping the patients with 0 medications

In [None]:
# Keep only the patients with at least one medication, then redraw the histogram.
has_medications = moddf_med['medications'].gt(0)
moddf_med = moddf_med.loc[has_medications]
moddf_med.plot.hist(figsize=(12, 6), facecolor='grey', edgecolor='black')

Now, for the purposes of our research question, the patients will be grouped according to their race and ethnicity. The number of people for each group will be shown.

In [None]:
# Number of patients in every race / ethnicity combination.
rac_eth = (
    df_patients
    .groupby(['RACE', 'ETHNICITY'])['Id']
    .count()
    .reset_index(name='count')
)
rac_eth

The above data will now be shown as a bar chart (count plot) to visualize the group sizes.

In [None]:
# Number of patients per race, split by ethnicity (one bar per race/ethnicity combination).
sns.countplot(data = df_patients, x = 'RACE', hue = 'ETHNICITY')

It is now clear that some groups are much more strongly represented than others. This means that some of the groups may be too small to enable a proper analysis.

## Distribution of the 'race' value

The following graph shows the distribution of the different 'race' values within the dataset.

In [None]:
# distribution of RACE attribute, counted once per patient.
# The merged frame `df` holds one row per patient / condition / medication combination, so
# counting its rows directly would weight every patient by the number of joined records.
sns.countplot(x="variable", hue="value", data=pd.melt(df.drop_duplicates(subset="Id")[["RACE"]]))

## Distribution of the 'Ethnicity' value

The following graph shows the distribution of the different 'ethnicity' values within the dataset.

In [None]:
# distribution of ETHNICITY attribute, counted once per patient.
# The merged frame `df` holds one row per patient / condition / medication combination, so
# counting its rows directly would weight every patient by the number of joined records.
sns.countplot(x="variable", hue="value", data=pd.melt(df.drop_duplicates(subset="Id")[["ETHNICITY"]]))

### Average numbers of conditions and medications, Outliers
Now new tables will be created with average number of conditions for every group of the above.
 
First, it will be done with conditions and then with medications.

Outliers will be dropped for better clarity of the analysis.

In [None]:
# Mean number of recorded conditions per patient for every race / ethnicity group.
# The steps are chained with .assign so that no column is ever written into a slice of
# pat_con (the original column assignment triggered pandas' SettingWithCopyWarning).
meancon = (
    pat_con[['Id', 'RACE', 'ETHNICITY', 'CODE_CONDITION']]
    # number of recorded conditions per patient; count() ignores missing codes
    .assign(con_count=lambda frame: frame.groupby('Id')["CODE_CONDITION"].transform("count"))
    # the individual condition codes are no longer needed: keep one row per patient
    .drop(columns='CODE_CONDITION')
    .drop_duplicates()
    # mean of the per-patient counts within each race / ethnicity group
    .assign(mean_con=lambda frame: frame.groupby(['RACE', 'ETHNICITY'])["con_count"].transform("mean"))
    # reduce to one row per race / ethnicity group, then sort
    .drop(columns=['con_count', 'Id'])
    .drop_duplicates()
    .sort_values(by=['RACE', 'ETHNICITY'])
)
meancon


It is clear that the groups with **0 mean conditions** were simply too small for any additional condition to be recorded. These groups need to be dropped.

In [None]:
# Remove the groups whose mean number of conditions is 0.
has_recorded_conditions = meancon["mean_con"].gt(0)
meancon = meancon.loc[has_recorded_conditions]
meancon

Now repeat the process for the medications to see what is the average number of meds that every group takes.

In [None]:
# Mean number of recorded medications per patient for every race / ethnicity group.
# The steps are chained with .assign so that no column is ever written into a slice of
# pat_med (the original column assignment triggered pandas' SettingWithCopyWarning).
meanmed = (
    pat_med[['Id', 'RACE', 'ETHNICITY', 'CODE_MEDICATION']]
    # number of recorded medications per patient; count() ignores missing codes
    .assign(med_count=lambda frame: frame.groupby('Id')["CODE_MEDICATION"].transform("count"))
    # the individual medication codes are no longer needed: keep one row per patient
    .drop(columns='CODE_MEDICATION')
    .drop_duplicates()
    # mean of the per-patient counts within each race / ethnicity group
    .assign(mean_med=lambda frame: frame.groupby(['RACE', 'ETHNICITY'])["med_count"].transform("mean"))
    # reduce to one row per race / ethnicity group, then sort
    .drop(columns=['med_count', 'Id'])
    .drop_duplicates()
    .sort_values(by=['RACE', 'ETHNICITY'])
)
meanmed

It is clear that **the groups with 0 mean medications** were simply too small for any medication to be recorded. The **native nonhispanic group** is also an outlier, which can likewise be attributed to a patient group that is too small.
These groups need to be dropped.

In [None]:
# Remove the groups without recorded medications and the group whose mean is exactly 1.
# NOTE(review): the "!= 1" condition selects the outlier group by its value rather than
# by its RACE/ETHNICITY labels; confirm that no other group can have a mean of exactly 1.
has_recorded_medications = meanmed["mean_med"].gt(0)
mean_is_not_one = meanmed["mean_med"].ne(1)
meanmed = meanmed.loc[has_recorded_medications & mean_is_not_one]

meanmed

## Mean additional conditions

First, mean additional conditions will be plotted for every group (after dropping the most obvious outliers)

In [None]:
# mean number of conditions per patient, plotted by race and split by ethnicity
sns.catplot(data = meancon, x = 'RACE', y='mean_con', hue = 'ETHNICITY', kind='bar')

From the above graph it is clear that, because the hispanic ethnicity is only sufficiently represented within the 'white' race, it cannot be used to answer the question of whether ethnicity influences morbidity.
The 'native' race has the highest average number of conditions, most likely due to the underrepresentation of this group in the dataset.

## Mean medications

Now, mean medications will be plotted for every group (after dropping the most obvious outliers)

In [None]:
# mean number of medications per patient, plotted by race and split by ethnicity
sns.catplot(data = meanmed, x = 'RACE', y='mean_med', hue = 'ETHNICITY', kind='bar')