# 0. Imports

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import utils as util

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, RocCurveDisplay
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler

# 1. Preamble analysis of the dataset

In [None]:
# Load the raw patient records from the local CSV and preview the first rows.
df = pd.read_csv("./data/covid_data.csv")
df.head()

### 1.1 Analysing existing data

In [None]:
# Work on a copy so the raw frame stays untouched.
# The dataset definition uses 97, 98 and 99 as "missing" markers in the coded
# columns. AGE is a real measurement, so ages of 97-99 must not be blanked:
# the replacement is applied to every column except AGE.
df_null = df.copy()
coded_cols = df_null.columns.drop("AGE")
df_null[coded_cols] = df_null[coded_cols].replace([97, 98, 99], np.nan)

In [None]:
# Number of missing entries per column once the marker codes are treated as NaN.
df_null.isnull().sum()

As we can show with the following graph, there are **a lot** of NA values in the dataset. We will have to handle them.

One approach is to take the mean of the column and replace the NA values with it. However, this is not a good approach, as it will skew the data. We will have to find a better way to handle the NA values.

We will therefore have to go through each problematic column and fix its problematic data.

Remember that the dataset definition says that 97, 98 and 99 are null values; let's deal with that.

In [None]:
# sns.heatmap(df_null.isnull(), cbar=False)
# plt.title('Before data cleanup', color = 'black', fontsize = 15)
# plt.show()

#### As we can see, that's no bueno...

In [None]:
# Per-column summary statistics, one row per feature, without the row count.
df.describe().round(3).T.drop(columns='count')

Furthermore, some of the data looks suspiciously skewed. Is half of the population really pregnant?!

### 1.2 Managing the DEAD people 

People with a DATE_DIED value of 9999-99-99 simply aren't dead, so we'll just create a new DEAD column to represent that

In [None]:
# Check for any strings in the feature "DATE_DIED"
df['DATE_DIED'][df['DATE_DIED'].apply(lambda x: isinstance(x, str))]

In [None]:
# DEAD follows the dataset's yes/no coding: 2 ("no") when DATE_DIED holds the
# '9999-99-99' placeholder, 1 ("yes") for any other value.
df['DEAD'] = df['DATE_DIED'].eq('9999-99-99').map({True: 2, False: 1})

In [None]:
# Share of each DEAD code (1 = died, 2 = did not die) across all rows.
df['DEAD'].value_counts(normalize=True)

The new DEAD column now carries this information, so we'll drop the DATE_DIED column

In [None]:
# DATE_DIED is superseded by the DEAD flag, so remove it.
df = df.drop(columns='DATE_DIED')

#### How does the data look now ?

In [None]:
# Summary statistics again, now that DEAD replaced DATE_DIED.
df.describe().round(3).T.drop(columns='count')


Hmmm, it looks like we'll have to work on the PREGNANT, ICU, and INTUBED people

### 1.3 SEX Values

We'll just set the 1 and 2 values to "Female" and "Male" respectively

### 1.4 Pregnant values

In [None]:
# How many rows carry each SEX code.
df['SEX'].value_counts()

In [None]:
# Total number of SEX entries.
df['SEX'].shape

Pregnant females ?

In [None]:
# PREGNANT codes for the rows where SEX == 1.
df.loc[df['SEX'] == 1, 'PREGNANT']

In [None]:
# Frequency of each PREGNANT code among the rows where SEX == 1.
df.loc[df['SEX'] == 1, 'PREGNANT'].value_counts()

Pregnant males ?

In [None]:
# PREGNANT codes for the rows where SEX == 2.
df.loc[df['SEX'] == 2, 'PREGNANT']

In [None]:
# Frequency of each PREGNANT code among the rows where SEX == 2.
# The selection uses the SEX condition alone: PREGNANT holds integer codes
# (1, 2, 97, 98), so it must not be combined into the boolean row mask.
df.loc[df['SEX'] == 2, 'PREGNANT'].value_counts()

In [None]:
# Frequency of each PREGNANT code over the whole dataset.
df['PREGNANT'].value_counts()

It looks like 97 indicates males that aren't pregnant. In other words, for those values we can just input 2 instead of 97

In [None]:
# Recode 97 as 2 ("not pregnant"). The result is assigned back to the column:
# calling replace(..., inplace=True) on the selected column is a chained
# operation that is not guaranteed to modify `df` itself.
df['PREGNANT'] = df['PREGNANT'].replace(97, 2)

Finally, 98 represents females whose pregnancy status is unknown

In [None]:
# PREGNANT code frequencies after recoding 97.
df['PREGNANT'].value_counts()

We can simply mark them as NA, as the data is logically Not Available

In [None]:
# Mark the unknown code 98 as missing. np.nan is used explicitly: passing None
# as the replacement value is version dependent in pandas (older releases
# forward-fill instead, newer ones store None in an object column). The result
# is assigned back so that `df` is updated.
df['PREGNANT'] = df['PREGNANT'].replace(98, np.nan)
df['PREGNANT'].value_counts(normalize=True)

Most people aren't pregnant, this now makes a lot more sense

### 1.5 ICU values

In [None]:
# Frequency of each ICU code.
df['ICU'].value_counts()

In [None]:
# Cross-tabulate patient type (rows) against the ICU code (columns).
pd.crosstab(index=df['PATIENT_TYPE'], columns=df['ICU'])

From the above we can see that the missing values of 97 are all corresponding to the values of PATIENT_TYPE = 1 which is for non hospitalized patients, while those of 99 are the missing values of the hospitalized patients, which again can not be told or predicted.

So we can replace all the values of (97) with (2); since obviously patients who have never been hospitalized couldn't possibly be admitted to the ICU.

In [None]:
# Recode 97 as 2 ("no"). Assigned back to the column because an inplace
# replace on the selected column is a chained operation that is not guaranteed
# to modify `df` itself.
df['ICU'] = df['ICU'].replace(97, 2)

In [None]:
# ICU code frequencies after the recoding.
df['ICU'].value_counts()

### 1.6 INTUBED values

In [None]:
# Frequency of each INTUBED code.
df['INTUBED'].value_counts()

In [None]:
# Cross-tabulate patient type (rows) against the INTUBED code (columns).
pd.crosstab(index=df['PATIENT_TYPE'], columns=df['INTUBED'])

Same logic as for the ICU patients: patients that are intubated are necessarily hospitalized as well

In [None]:
# Recode 97 as 2 ("no"). Assigned back to the column because an inplace
# replace on the selected column is a chained operation that is not guaranteed
# to modify `df` itself.
df['INTUBED'] = df['INTUBED'].replace(97, 2)

In [None]:
# INTUBED code frequencies after the recoding.
df['INTUBED'].value_counts()

We'll replace everything that we can't infer with NaN

In [None]:
# Turn the remaining "unknown" markers (98 and 99) into NaN.
# AGE is excluded: it is a real measurement, so 98- and 99-year-old patients
# are valid records and must not be turned into missing values.
coded_cols = df.columns.drop("AGE")
df[coded_cols] = df[coded_cols].replace([98, 99], np.nan)

#### How does the data look now ?

In [None]:
# Take a snapshot of the frame in its current, cleaned state.
df_null2 = df.copy()

# Count the missing values per column in that snapshot.
df_null2.isnull().sum()

In [None]:
# Summary statistics after the marker codes were cleaned up.
df.describe().round(3).T.drop(columns='count')

Looks much better already !
What about a heatmap ?

In [None]:
# sns.heatmap(df_null2.isnull(), cbar=False)
# plt.title('After Data cleanup', color = 'black', fontsize = 15)
# plt.show()

In [None]:
# fig, ax = plt.subplots(figsize=(20, 15))
# mask=np.triu(np.ones_like(df.corr()))
# sns.heatmap(df.corr(), mask = mask, annot = True, cmap = "Blues", vmin = -1, vmax = 1)
# plt.title('Data Correlation', color = 'black', fontsize = 30)
# plt.show()

### 1.7 Hospitalization

Hospitalization is described by the `PATIENT_TYPE` column. It has a value of either 1: at home or 2: in hospital. We can change this column to a boolean column which, instead of describing the patient type, will describe if the patient is hospitalized or not. That means that we will have to change the column name to `HOSPITALIZED`, but we'll also have to invert all the values.

In [None]:
# Preview the frame before deriving HOSPITALIZED.
df.head()

In [None]:
# HOSPITALIZED inverts PATIENT_TYPE: 2 becomes 1 ("yes"); anything else becomes 2 ("no").
df["HOSPITALIZED"] = df["PATIENT_TYPE"].eq(2).map({True: 1, False: 2})
df.loc[:, ["HOSPITALIZED", "PATIENT_TYPE"]].head()

We'll drop the column as it is now redundant

In [None]:
# PATIENT_TYPE is now redundant with HOSPITALIZED.
df = df.drop(columns="PATIENT_TYPE")
df.head()

### 1.8 Readability fix

As we can see, the column names are not very readable, so we'll just fix that. In the original datasheet, it is said that the boolean values, 1 and 2, are actually "Yes" and "No" respectively. We'll just change that as well to make them boolean.

In [None]:
# Get columns whose data unique count is equal to 2 and are either 1, 2 or NA
binary_cols = [col for col in df.columns if df[col].nunique() == 2 and df[col].dropna().value_counts().index.isin([1,2]).all()]
binary_cols

The number of boolean columns described in the datasheet is 15, but we have 16. The culprit is the `USMER` column, which does have only two values, but does not describe a boolean value. We'll just remove it from the list.

In [None]:
# Exclude USMER from the yes/no columns. Filtering (rather than list.remove)
# keeps the cell safe to re-run: no ValueError once USMER is already gone.
binary_cols = [col for col in binary_cols if col != "USMER"]
binary_cols

Now that we have a list of boolean value columns, we can change the values to strings of either "Y" or "N" and then change the column type to categorical.

In [None]:
# Change the values of the binary columns to "Y" if 1, "N" if 2
# for col in binary_cols:
#     df[col] = df[col].replace({1: "Y", 2: "N"})
# df.describe().round(3).T.drop('count', axis = 1)

### 1.9 Categorizing the columns

In [None]:
# For every columns, display the number of unique values
df.nunique()

In [None]:
# AGE is the only continuous feature; every other column holds coded values.
non_categorical = ["AGE"]
categorical = df.columns.drop(non_categorical)

# The coded columns are deliberately kept as numeric dtypes rather than being
# converted to pandas "category": the correlation and mean-imputation cells
# further down operate on numeric data. (Calling .astype("category") without
# assigning the result would have no effect anyway.)
df.describe().round(3).T.drop('count', axis = 1)

## Dealing with null data

In [None]:
# Restrict the null audit to the coded (categorical) columns.
categorical_data = df[categorical]
total_rows = len(categorical_data)

# Per-column null tally and its share of all rows, in percent.
null_rows_count = categorical_data.isnull().sum()
percentage_null_values = (null_rows_count / total_rows) * 100

# Assemble the report one column at a time.
null_summary = null_rows_count.to_frame(name='Null Values Count')
null_summary['Total Values Count'] = total_rows
null_summary['Percentage of Null Values'] = percentage_null_values

print("Summary of null values in each categorical column:")
print(null_summary)


In [None]:
# Drop every row that has at least one missing categorical value.
categorical_data_cleaned = categorical_data.dropna(axis=0, how="any")

# Row counts before and after, and the difference between them.
initial_rows = len(categorical_data)
remaining_lines = len(categorical_data_cleaned)
removed_lines = initial_rows - remaining_lines

print(f"Number of removed lines: {removed_lines}")
print(f"Number of lines remaining: {remaining_lines}")

# Optionally, you can assign the cleaned DataFrame to a new variable or overwrite the original one
# categorical_data = categorical_data_cleaned


In [None]:
# Calculate the number of rows to export (10% of the total cleaned data)
# percentage_to_export = 0.1
# num_rows_to_export = int(len(categorical_data_cleaned) * percentage_to_export)

# Set a random seed for reproducibility
# random_seed = 42  # You can use any integer as the seed

# Use the sample method to randomly select the specified number of rows
# random_subset = categorical_data_cleaned.sample(n=num_rows_to_export, random_state=random_seed)

# Specify the path where you want to save the cleaned subset of data to a CSV file
# cleaned_subset_output_file = "cleaned_categorical_data_subset.csv"

# Export the cleaned subset of data to a CSV file
# random_subset.to_csv(cleaned_subset_output_file, index=False)

# Optionally, you can read the cleaned subset data back into a DataFrame if needed
# cleaned_subset_df = pd.read_csv(cleaned_subset_output_file)

# Display the first few rows of the cleaned subset DataFrame
# print("\nFirst few rows of the cleaned subset DataFrame:")
# print(cleaned_subset_df.head())

### Replacing the 1, 2 values with 1, 0 respectively

In [None]:
# Recode "no" from 2 to 0 so the yes/no columns become 1/0.
# Only columns whose observed values are all 1 or 2 are touched. A frame-wide
# replace would also rewrite legitimate 2s in multi-valued columns (for
# example an AGE of 2).
yes_no_cols = [col for col in df.columns if df[col].dropna().isin([1, 2]).all()]
df = df.copy()
df[yes_no_cols] = df[yes_no_cols].replace({2: 0})

# Looking for correlation

In [None]:
# Pairwise correlation of every column with the DEAD target.
df.corr()['DEAD']

In [None]:
# Correlation of each feature with DEAD, strongest positive first.
sorted_corr = df.corr()[['DEAD']].sort_values(by='DEAD', ascending=False)

# Render the single-column correlation table as an annotated heatmap.
fig, ax = plt.subplots(figsize=(20, 15))
sns.heatmap(sorted_corr, annot=True, cmap="Blues", vmin=-1, vmax=1)
plt.title('Death factors correlation', color='black', fontsize=30)
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Correlation of every column with DEAD, computed on a copy of the frame.
sorted_corr_relevant = df.copy()
correlation_matrix = sorted_corr_relevant.corr()[['DEAD']]

# Keep only the rows whose correlation magnitude exceeds 0.1.
is_relevant = correlation_matrix['DEAD'].abs() > 0.1
filtered_corr = correlation_matrix[is_relevant]

fig, ax = plt.subplots(figsize=(20, 15))

ordered = filtered_corr.sort_values(by='DEAD', ascending=False)
sns.heatmap(ordered, annot=True, cmap="Blues", vmin=-1, vmax=1)
plt.title('Death factors correlation', color='black', fontsize=30)
plt.show()


# Decision tree

## Basic example

In [None]:
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

# Target: the DEAD flag. Features: every remaining column.
Y = df['DEAD']
X = df.drop(columns='DEAD')

# Fill the remaining gaps with each column's mean.
# NOTE(review): the means are computed before the train/test split and yield
# fractional values for the 0/1 columns - consider imputing after the split.
X = X.fillna(X.mean())

In [None]:
# Hold out 30% of the rows for evaluation (fixed seed for a stable split).
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=42)

# Seed the tree as well: the split is seeded, so an unseeded estimator would
# be the only remaining source of run-to-run variation in the scores.
dtree = DecisionTreeClassifier(random_state=42)
dtree = dtree.fit(x_train, y_train)

In [None]:
# Report accuracy on the training split and on the held-out split.
for label, features, target in (
    ("Decision Tree Train Accuracy: ", x_train, y_train),
    ("Decision Tree Test Accuracy:", x_test, y_test),
):
    print(label, dtree.score(features, target), "\n")

In [None]:
# Predict the held-out rows and plot the confusion matrix.
y_pred = dtree.predict(x_test)
ConfusionMatrixDisplay.from_predictions(y_true=y_test, y_pred=y_pred)

In [None]:
# Per-class precision/recall/F1 as a table with one row per class or aggregate.
dtree_metrics = classification_report(
    y_test,
    y_pred,
    target_names=["Deadn't", "Dead"],
    output_dict=True,
)
dtree_report = pd.DataFrame.from_dict(dtree_metrics).transpose()
dtree_report

## Correlation based Tree data selection

Here I just select the more relevant columns based on the correlation results.

In [None]:
# Keep the features whose correlation with DEAD exceeds `bound` in magnitude.
bound = 0.4
filtered_corr = sorted_corr[sorted_corr['DEAD'].abs() > bound]

# DEAD correlates perfectly with itself, so drop it from the feature list.
# Filtering (instead of list.remove) cannot raise if it is ever absent.
unique_names = [name for name in filtered_corr.index if name != 'DEAD']
unique_names

In [None]:
# Feature matrix restricted to the selected columns; the target is unchanged.
Y_corr = Y
X_corr = X.loc[:, unique_names].copy()

In [None]:
# Same 70/30 split (same seed) on the correlation-selected features.
x_train_corr, x_test_corr, y_train_corr, y_test_corr = train_test_split(X_corr, Y_corr, test_size=0.30, random_state=42)

# Seed the tree so repeated runs give the same model and scores.
dtree_corr = DecisionTreeClassifier(random_state=42)
dtree_corr = dtree_corr.fit(x_train_corr, y_train_corr)

In [None]:
# Accuracy on the training split, then on the held-out split.
# The second label reads "Test": it reports the score on x_test_corr.
print("Decision Tree Train Accuracy: ", dtree_corr.score(x_train_corr, y_train_corr), "\n")
print("Decision Tree Test Accuracy:", dtree_corr.score(x_test_corr, y_test_corr), "\n")

In [None]:
# Draw the fitted tree. The column labels are passed as a plain list: some
# scikit-learn releases reject a pandas Index for `feature_names`.
tree.plot_tree(dtree_corr, feature_names=X_corr.columns.tolist(), filled=True)

In [None]:
# Predict the held-out rows and plot the confusion matrix.
y_pred_corr = dtree_corr.predict(x_test_corr)
ConfusionMatrixDisplay.from_predictions(y_true=y_test_corr, y_pred=y_pred_corr, cmap="Blues")

In [None]:
# Per-class precision/recall/F1 as a table with one row per class or aggregate.
dtree_corr_metrics = classification_report(
    y_test_corr,
    y_pred_corr,
    target_names=["Deadn't", "Dead"],
    output_dict=True,
)
dtree_corr_report = pd.DataFrame.from_dict(dtree_corr_metrics).transpose()
dtree_corr_report

# Forest

In [None]:
# Fit a random forest on the full feature set. It is seeded like the
# train/test split so repeated runs give the same model and scores.
random_forest = RandomForestClassifier(random_state=42)
random_forest = random_forest.fit(x_train, y_train)

In [None]:
# Accuracy on the training split, then on the held-out split.
forest_train_accuracy = random_forest.score(x_train, y_train)
forest_test_accuracy = random_forest.score(x_test, y_test)
print("Random Forest Train Accuracy: ", forest_train_accuracy, "\n")
print("Random Forest Test Accuracy:", forest_test_accuracy, "\n")

In [None]:
# Predict the held-out rows, keep the raw confusion matrix, and plot it.
y_pred = random_forest.predict(x_test)
c_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
ConfusionMatrixDisplay.from_predictions(y_true=y_test, y_pred=y_pred, cmap="Blues")

In [None]:
# scikit-learn lays a binary confusion matrix out as [[TN, FP], [FN, TP]]
# (rows = true label, columns = predicted label, labels in ascending order),
# so ravel() yields TN, FP, FN, TP in that order.
TN, FP, FN, TP = c_matrix.ravel()
print(TP, FN, FP, TN)

In [None]:
# Count the actual negatives (DEAD == 0) and positives (DEAD == 1) by label.
# value_counts() orders by frequency, not by label, so unpacking its output
# would swap N and P whenever the positive class is the larger one.
N = int((y_test == 0).sum())
P = int((y_test == 1).sum())
print(N, P)

In [None]:
# Share of misclassified test rows: wrong predictions over all rows.
misclassified = FP + FN
evaluated = P + N
error_rate = misclassified / evaluated
error_rate

In [None]:
# Accuracy is the complement of the error rate.
accuracy = 1 - error_rate
accuracy

In [None]:
# Per-class precision/recall/F1 as a table with one row per class or aggregate.
forest_metrics = classification_report(
    y_test,
    y_pred,
    target_names=["Deadn't", "Dead"],
    output_dict=True,
)
forest_report = pd.DataFrame.from_dict(forest_metrics).transpose()
forest_report

# Forest with most correlated

In [None]:
# Fit a random forest on the correlation-selected features. It is seeded like
# the train/test split so repeated runs give the same model and scores.
forest_corr = RandomForestClassifier(random_state=42)
forest_corr = forest_corr.fit(x_train_corr, y_train_corr)

In [None]:
# Accuracy on the training split, then on the held-out split.
forest_corr_train_accuracy = forest_corr.score(x_train_corr, y_train_corr)
forest_corr_test_accuracy = forest_corr.score(x_test_corr, y_test_corr)
print("Random Forest Train Accuracy: ", forest_corr_train_accuracy, "\n")
print("Random Forest Test Accuracy:", forest_corr_test_accuracy, "\n")

In [None]:
# Predict the held-out rows and plot the confusion matrix.
y_pred_corr = forest_corr.predict(x_test_corr)
ConfusionMatrixDisplay.from_predictions(y_true=y_test_corr, y_pred=y_pred_corr, cmap="Blues")

In [None]:
# Per-class precision/recall/F1 as a table with one row per class or aggregate.
forest_corr_metrics = classification_report(
    y_test_corr,
    y_pred_corr,
    target_names=["Deadn't", "Dead"],
    output_dict=True,
)
forest_corr_report = pd.DataFrame.from_dict(forest_corr_metrics).transpose()
forest_corr_report

# Logistic Regression

In [None]:
# The "saga" solver is stochastic; seeding it makes the fitted coefficients
# and the scores below reproducible across runs.
# NOTE(review): the features are fed unscaled (AGE spans a far wider range
# than the 0/1 flags) - confirm convergence, or scale the inputs consistently
# for both training and prediction.
lr = LogisticRegression(solver="saga", random_state=42)

lr.fit(x_train,y_train)
print("Logistic Regression Train Accuracy: ", lr.score(x_train,y_train), "\n")
print("Logistic Regression Test Accuracy: ", lr.score(x_test,y_test))

In [None]:
# Predict the held-out rows and tabulate per-class precision/recall/F1.
y_pred = lr.predict(x_test)
lr_metrics = classification_report(
    y_test,
    y_pred,
    target_names=["Deadn't", "Dead"],
    output_dict=True,
)
lr_report = pd.DataFrame.from_dict(lr_metrics).transpose()
lr_report

In [None]:
# The classification reports to compare, keyed by the label shown on the x axis.
reports = {
    "Decision Tree": dtree_report, 
    "Decision Tree (corr)": dtree_corr_report, 
    "Random Forest": forest_report, 
    "Random Forest (corr)": forest_corr_report, 
    "Logistic Regression": lr_report
}
# Legend label -> (column, row) of the value to read from each report.
attributes = {
    "Precision dead": ("precision", "Dead"), 
    "Recall dead": ("recall", "Dead"), 
    "Precision deadn't": ("precision", "Deadn't"),
    "Recall deadn't": ("recall", "Deadn't"), 
    "Accuracy": ("f1-score", "accuracy")
}

# One list of percentages per metric, ordered like `reports`.
values = {
    attribute: [round(report[column][row] * 100, 2) for report in reports.values()]
    for attribute, (column, row) in attributes.items()
}

x = np.arange(len(reports))  # the label locations
width = 0.1  # the width of the bars

fig, ax = plt.subplots(layout='constrained')
fig.set_figwidth(10)

# Within each model's group, every metric gets its own bar, shifted right by
# one bar width per metric.
for multiplier, (attribute, measurement) in enumerate(values.items()):
    offset = width * multiplier
    rects = ax.bar(x + offset, measurement, width, label=attribute)
    ax.bar_label(rects, padding=1)

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel('Percentage (%)')
ax.set_title('Report score per model')
# Centre each tick under its group: the bars sit at x, x + width, ...,
# x + (n - 1) * width, so the middle is x + width * (n - 1) / 2.
ax.set_xticks(x + width * (len(values) - 1) / 2, list(reports))
ax.legend(loc='upper left', ncols=3)
ax.set_ylim(0, 130)

plt.show()

## Prediction

In [None]:
# List the column names to get the feature order for the hand-built record below.
df.columns

In [None]:
# A hand-built patient record. The values are listed positionally, in the
# order of X.columns (see the trailing comment on each line).
jeannine = pd.Series([
    0,          # USMER
    12,         # MEDICAL_UNIT
    1,          # SEX
    0,          # INTUBED
    1,          # PNEUMONIA
    64,         # AGE
    0,          # PREGNANT
    1,          # DIABETES
    0,          # COPD
    0,          # ASTHMA
    1,          # INMSUPR
    1,          # HIPERTENSION
    1,          # OTHER_DISEASE
    0,          # CARDIOVASCULAR
    1,          # OBESITY
    1,          # RENAL_CHRONIC
    1,          # TOBACCO
    3,          # CLASIFFICATION_FINAL
    0,          # ICU
    0,          # HOSPITALIZED
], index=X.columns)
# Turn the record into a one-row frame, the shape the models' predict() is given.
jeannine = jeannine.to_frame().T

# The values stay numeric, like the training features. (Calling
# .astype("category") without assigning the result would have no effect.)
jeannine

In [None]:
# The same record, restricted to the correlation-selected features.
jeannine_corr = jeannine.loc[:, unique_names]
jeannine_corr

In [None]:
# Preview the training features, to compare their layout with the hand-built record.
x_train.head()

In [None]:
# Pair every fitted model with the version of the record it was trained for
# (full feature set or correlation-selected subset).
model_inputs = {
    "dtree": (dtree, jeannine),
    "dtree_corr": (dtree_corr, jeannine_corr),
    "forest": (random_forest, jeannine),
    "forest_corr": (forest_corr, jeannine_corr),
    "lr": (lr, jeannine),
}

# First (and only) prediction of each model for the single-row input.
j_pred = {
    model_name: model.predict(sample)[0]
    for model_name, (model, sample) in model_inputs.items()
}

print(j_pred)