# 0. Imports

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# 1. Preliminary analysis of the dataset

In [None]:
# Read the raw dataset from the local data folder and preview its first rows.
df = pd.read_csv(filepath_or_buffer="./data/covid_data.csv")
df.head(n=5)

### @Flo: I took the liberty of commenting this out because, for now, we want int values rather than bool. We'll see later whether bools become useful, but that will come after the data cleanup.

In [None]:
# Disabled for now: the flag columns stay integer-coded until the data cleanup is done.
# boolean_columns = ["INTUBED", "PNEUMONIA", "PREGNANT", "DIABETES", "COPD", "ASTHMA", "INMSUPR", "HIPERTENSION", "OTHER_DISEASE", "CARDIOVASCULAR", "OBESITY", "RENAL_CHRONIC", "TOBACCO", "ICU"]
# 
# # Map the 1/2 codes to True/False and the 97/98/99 codes to None for the boolean columns
# df[boolean_columns] = df[boolean_columns].replace({1: True, 2: False, 97: None, 98: None, 99: None})
# df.head()

In [None]:
# Disabled: boolean variant of the DEAD flag; section 1.2 builds the same flag as 0/1 integers.
# df["DEAD"] = [x != "9999-99-99" for x in df["DATE_DIED"]]
# df.head()

### 1.1 Analysing existing data

In [None]:
# Build a separate frame for the null analysis (replace returns a new object,
# so `df` is left untouched) with the 97/98/99 placeholder codes turned into NaN.
# NOTE(review): the replacement covers every column, so a numeric column that can
# legitimately hold 97-99 would lose those values here as well — confirm.
df_null = df.replace([97, 98, 99], np.nan)

In [None]:
# Number of missing entries per column once the placeholder codes count as NaN.
df_null.isna().sum()

As the following graph shows, there are **a lot** of NA values in the dataset. We will have to handle them.

One approach is to take the mean of the column and replace the NA values with it. However, this is not a good approach, as it will skew the data. We will have to find a better way to handle the NA values.

We will therefore have to go through each problematic column and fix the problematic data it may contain.

Remember that the dataset definition states that 97, 98 and 99 denote missing values; let's deal with that.

In [None]:
# Draw the null mask of the placeholder-free frame, without a colour bar.
null_mask_before = df_null.isnull()
ax = sns.heatmap(null_mask_before, cbar=False)
ax.set_title('Before data cleanup', color='black', fontsize=15)
plt.show()

#### As we can see, that's no bueno...

In [None]:
# Summary statistics, one row per column, without the `count` column.
df.describe().round(3).T.drop(columns='count')

Furthermore, we have some suspiciously skewed data. Is half of the population really pregnant?!

### 1.2 Managing the DEAD people 

People with a DATE_DIED value of 9999-99-99 simply aren't dead, so we'll just create a new DEAD column to represent that

In [None]:
# Check for any strings in the feature "DATE_DIED"
# Show the DATE_DIED entries that are stored as Python strings.
is_string_date = df['DATE_DIED'].map(lambda value: isinstance(value, str))
df.loc[is_string_date, 'DATE_DIED']

In [None]:
# DEAD is 0 when DATE_DIED carries the '9999-99-99' sentinel and 1 otherwise.
df['DEAD'] = df['DATE_DIED'].ne('9999-99-99').astype('int64')

In [None]:
# How many rows fall into each DEAD class.
dead_flag = df['DEAD']
dead_flag.value_counts()

We'll also replace 9999-99-99 with NaN for the time being

In [None]:
# Turn the '9999-99-99' sentinel into NaN. The result is assigned back to the
# column: an in-place replace on the `df['DATE_DIED']` selection is chained
# assignment, which warns on pandas 2.x and leaves `df` unchanged once
# Copy-on-Write is enabled. Re-running this cell is harmless.
df['DATE_DIED'] = df['DATE_DIED'].replace('9999-99-99', np.nan)

In [None]:
# Display the DATE_DIED column as it currently stands.
df.loc[:, 'DATE_DIED']

And we can convert the date into an actual datetime object

In [None]:
# Parse the day/month/year strings into datetimes; entries that do not match
# the format are coerced to NaT instead of raising.
date_died_raw = df['DATE_DIED']
df['DATE_DIED'] = pd.to_datetime(date_died_raw, errors='coerce', format='%d/%m/%Y')

In [None]:
# Count the DATE_DIED entries that are missing after the conversion.
df['DATE_DIED'].isna().sum()

### How does the data look now ?

In [None]:
# Summary statistics again, one row per column, without the `count` column.
df.describe().round(3).T.drop(columns='count')


Hmmm, it looks like we'll have to work on the PREGNANT, ICU, and INTUBED people

### 1.3 Pregnant values

In [None]:
# Distribution of the SEX codes.
df['SEX'].value_counts()

In [None]:
# Total number of rows in the SEX column.
df['SEX'].shape

Number of females ?

In [None]:
# Shape of the sub-frame where SEX is coded 1.
is_female = df['SEX'].eq(1)
df.loc[is_female].shape

Number of males ?

In [None]:
# Shape of the sub-frame where SEX is coded 2.
is_male = df['SEX'].eq(2)
df.loc[is_male].shape

Pregnant females ?

In [None]:
# PREGNANT codes for the rows where SEX is coded 1, selected in a single .loc call.
df.loc[df['SEX'] == 1, 'PREGNANT']

In [None]:
# Distribution of the PREGNANT codes among rows where SEX is coded 1.
df.loc[df['SEX'] == 1, 'PREGNANT'].value_counts()

Pregnant males ?

In [None]:
# PREGNANT codes for the rows where SEX is coded 2, selected in a single .loc call.
df.loc[df['SEX'] == 2, 'PREGNANT']

In [None]:
# Distribution of the PREGNANT codes among rows where SEX is coded 2.
# The filter uses SEX only: PREGNANT holds integer codes, not a boolean
# condition, so AND-ing it into the mask would keep or drop rows depending on
# the bit pattern of each code and hide part of the distribution.
df.loc[df['SEX'] == 2, 'PREGNANT'].value_counts()

In [None]:
# Distribution of the PREGNANT codes over the whole frame, before recoding.
pregnant_codes = df['PREGNANT']
pregnant_codes.value_counts()

It looks like 97 indicates males that aren't pregnant. In other words, for those values we can just input 2 instead of 97

In [None]:
# Recode 97 as 2 in PREGNANT. The result is assigned back to the column: an
# in-place replace on the `df['PREGNANT']` selection is chained assignment,
# which warns on pandas 2.x and leaves `df` unchanged once Copy-on-Write is
# enabled. Re-running this cell is harmless.
df['PREGNANT'] = df['PREGNANT'].replace(97, 2)

Finally, 98 represents females whose pregnancy status is unknown

In [None]:
# Distribution of the PREGNANT codes after recoding.
df.loc[:, 'PREGNANT'].value_counts()

Most people aren't pregnant, this now makes a lot more sense

### 1.4 ICU values

In [None]:
# Distribution of the ICU codes before recoding.
df['ICU'].value_counts()

In [None]:
# Print the shape of every (ICU code, PATIENT_TYPE) slice to see how the
# placeholder codes are distributed across patient types.
for icu_code in (1, 2, 97, 99):
    for patient_type in (1, 2):
        subset = df[(df['PATIENT_TYPE'] == patient_type) & (df['ICU'] == icu_code)]
        print (f"At PATIENT_TYPE = {patient_type} and at ICU = {icu_code} the shape will be:", "\n")
        print (subset.shape, "\n",
               "--------------------------------------------------------------------------------", "\n\n")

From the above we can see that every 97 code corresponds to PATIENT_TYPE = 1, i.e. non-hospitalized patients, while the 99 codes are the missing values among hospitalized patients, which cannot be inferred or predicted.

So we can replace all the values of (97) with (2); since obviously patients who have never been hospitalized couldn't possibly be admitted to the ICU.

In [None]:
# Recode 97 as 2 in ICU. The result is assigned back to the column: an
# in-place replace on the `df['ICU']` selection is chained assignment, which
# warns on pandas 2.x and leaves `df` unchanged once Copy-on-Write is enabled.
# Re-running this cell is harmless.
df['ICU'] = df['ICU'].replace(97, 2)

In [None]:
# Distribution of the ICU codes after recoding.
df['ICU'].value_counts()

### 1.5 INTUBED values

In [None]:
# Distribution of the INTUBED codes before recoding.
df['INTUBED'].value_counts()

In [None]:
# Print the shape of every (INTUBED code, PATIENT_TYPE) slice to see how the
# placeholder codes are distributed across patient types.
for intubed_code in (1, 2, 97, 99):
    for patient_type in (1, 2):
        subset = df[(df['PATIENT_TYPE'] == patient_type) & (df['INTUBED'] == intubed_code)]
        print (f"At PATIENT_TYPE = {patient_type} and at INTUBED = {intubed_code} the shape will be:", "\n")
        print (subset.shape, "\n",
               "--------------------------------------------------------------------------------", "\n\n")

Same logic as for the ICU patients: patients who are intubated are necessarily hospitalized

In [None]:
# Recode 97 as 2 in INTUBED. The result is assigned back to the column: an
# in-place replace on the `df['INTUBED']` selection is chained assignment,
# which warns on pandas 2.x and leaves `df` unchanged once Copy-on-Write is
# enabled. Re-running this cell is harmless.
df['INTUBED'] = df['INTUBED'].replace(97, 2)

In [None]:
# Distribution of the INTUBED codes after recoding.
df['INTUBED'].value_counts()

We'll replace everything that we can't infer with NaN

In [None]:
# Turn the remaining 98/99 placeholder codes into NaN in one pass.
# NOTE(review): this applies to every column of `df`, so a numeric column that
# can legitimately hold 98 or 99 would lose those values as well — confirm.
df = df.replace([98, 99], np.nan)

### How does the data look now ?

In [None]:
# First, let's temporarily reverse the step we did on "DATE_DIED" feature; as they are not really missing:
df_null2 = df.copy()
df_null2.DATE_DIED = df_null2.DATE_DIED.fillna("9999-99-99")

#Let's check again for our missing values:
df_null2.isnull().sum()

In [None]:
# Summary statistics after the cleanup, one row per column, without `count`.
df.describe().round(3).T.drop(columns='count')

Looks much better already !
What about a heatmap ?

In [None]:
# Draw the null mask of the cleaned frame (DATE_DIED sentinel restored).
null_mask_after = df_null2.isnull()
ax = sns.heatmap(null_mask_after)
ax.set_title('After Data cleanup', color='black', fontsize=15)
plt.show()

In [None]:
# Disabled experiment: drop every column in which more than half of the rows
# are null, then plot the null heatmap of what is left.
# columns_to_delete = []
# for column in df.columns:
#     if df[column].isnull().sum() > 0.5 * df.shape[0]:
#         columns_to_delete.append(column)
#         
# df = df.drop(columns_to_delete, axis=1)
# 
# sns.heatmap(df.isnull(), cbar=False)
# plt.show()