In [None]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
from feature_engine.imputation import RandomSampleImputer
from sklearn.model_selection import train_test_split
from ydata_profiling import ProfileReport

# Data Cleaning

## Objectives

- Evaluate missing data
- Clean the data

## Inputs

- Location of the heart.csv dataset: `/workspace/Heart_attack_risk/outputs/datasets/collection/heart.csv`

## Outputs

- Generate cleaned Train and Test sets at the path outputs/datasets/cleaned

## Conclusions

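- Data cleaning pipeline: zeros in `Cholesterol` are treated as missing values and imputed by random sampling from the observed values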

---

## Setting working Directory

The following steps allow you to set "heart_attack_risk" as the new working directory:

- Get the current directory and print it


In [None]:
# Capture the directory the notebook was started from; the next cell derives
# the new working directory from this value.
current_dir = os.getcwd()
# Trailing expression so the notebook displays the path, as the text promises.
current_dir

- Set the new working directory as the parent of the previous current directory
- As a result, "heart_attack_risk" becomes the new working directory 

In [None]:
# Step up one level: the parent of the directory captured in `current_dir`
# becomes the working directory, so the relative paths used below resolve
# from there.
parent_dir = os.path.dirname(current_dir)
os.chdir(parent_dir)

## Load dataset

In [None]:
# Read the collected heart dataset (path is relative to the working directory
# set above) and preview the first three rows.
heart_csv_path = "outputs/datasets/collection/heart.csv"
df = pd.read_csv(heart_csv_path)
df.head(3)

## Data 

The following steps identify which variables contain missing values and, if there are any, profile their distribution.

In [None]:
# Count NaNs per column, then keep only the names of columns with at least one.
missing_counts = df.isna().sum()
vars_with_missing_data = missing_counts[missing_counts > 0].index.to_list()
vars_with_missing_data

In [None]:
# Only build a profiling report when there is something to report on;
# otherwise state explicitly that nothing is missing.
if not vars_with_missing_data:
    print("There are no variables with missing data")
else:
    profile = ProfileReport(df=df[vars_with_missing_data], minimal=True)
    profile.to_notebook_iframe()

It appears that there are no variables with missing data in the dataset.

## Data distributions overview

In [None]:
# Profile every column of the dataset (using the library's `minimal`
# configuration) and embed the report in the notebook output.
pandas_report = ProfileReport(
    df=df,
    minimal=True,
)
pandas_report.to_notebook_iframe()

Although the `Cholesterol` feature has no explicitly missing values, 18.7% of its entries are zero. Since a blood cholesterol level cannot be zero, these zeros most likely encode missing measurements.


### Assessing 0 values for cholesterol

In [None]:
def calculate_zero_percentage(df, column="Cholesterol"):
    """
    Summarise how many zero values a DataFrame column contains.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``column``.
    column : str, default "Cholesterol"
        Name of the column to inspect. The default keeps the original
        behaviour of reporting on the 'Cholesterol' column.

    Returns
    -------
    pandas.DataFrame
        A single row, indexed by ``column``, holding the percentage of
        zeros, the number of zero rows, the number of non-null rows and
        the column dtype.

    Raises
    ------
    KeyError
        If ``column`` is not present in ``df``.
    """
    series = df[column]
    # count() skips NaN, so the percentage is relative to non-null rows only.
    non_null_rows = series.count()
    zero_rows = (series == 0).sum()
    # Guard the division: an empty or all-NaN column yields NaN explicitly
    # instead of relying on a divide-by-zero warning.
    if non_null_rows:
        zero_percentage = (zero_rows / non_null_rows) * 100
    else:
        zero_percentage = float("nan")

    return pd.DataFrame(
        {
            "Percentage of 0s": zero_percentage,
            "RowsWithZero": zero_rows,
            "TotalDataset": non_null_rows,
            "DataType": series.dtype,
        },
        index=[column],
    )

In [None]:
# Report the share of zero-valued Cholesterol entries in the loaded dataset.
calculate_zero_percentage(df=df)

About 18.7% of the `Cholesterol` values are zeros. Instead of replacing these zeros with the median, I've opted to replace them with values randomly sampled from the observed (non-zero) cholesterol values, which preserves the shape of the original distribution.

In [None]:
# Mark zero cholesterol readings as missing so the imputer can fill them in.
df["Cholesterol"] = df["Cholesterol"].replace({0: np.nan})

# Fixed random_state keeps the sampled replacement values reproducible.
imputer = RandomSampleImputer(
    variables=['Cholesterol'],
    random_state=42,
)

Fitting the imputer and replacing the missing Cholesterol values with values randomly sampled from the observed ones

In [None]:
# Fit the imputer on the data, then apply it to fill the NaN Cholesterol
# entries.
# NOTE(review): the imputer is fitted on the full dataset before the
# train/test split performed later in this notebook, so test rows influence
# the imputation — consider fitting on the training rows only.
imputer.fit(df)
df = imputer.transform(df)

Plotting the Cholesterol distribution after replacing zeros

In [None]:
# Histogram with a KDE overlay of Cholesterol after imputation, drawn on an
# explicit Axes object rather than through the pyplot state machine.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(data=df, x="Cholesterol", kde=True, element="step", ax=ax)
ax.set_title("Distribution of Cholesterol", fontsize=20, y=1.05)
ax.set_xlabel("Cholesterol", fontsize=15, labelpad=20)
ax.set_ylabel("Frequency", fontsize=15, labelpad=20)
ax.grid(True)
plt.show()

## Split Dataset into Train and Test 

In [None]:
# Hold out 20% of the rows as a test set. Only the full frame is split: the
# target column travels with it, so passing df['HeartDisease'] separately
# just produced two throwaway outputs bound to `_` / `__`, which shadow
# IPython's output-history variables. With the same random_state the row
# partition is unchanged.
TrainSet, TestSet = train_test_split(
    df,
    test_size=0.2,
    random_state=0,
)

print(f"TrainSet shape: {TrainSet.shape} \nTestSet shape: {TestSet.shape}")

### Saving the Train and Test sets to a new output folder

In [None]:
# Create the output folder (and any missing parents). exist_ok=True keeps the
# cell safe to re-run, while genuine failures such as a permission error now
# raise here instead of being printed and ignored until the CSV export fails.
os.makedirs(name='outputs/datasets/cleaned', exist_ok=True)

#### Train Set

In [None]:
# Write the training split to CSV without the index column.
train_output_path = "outputs/datasets/cleaned/TrainSetCleaned.csv"
TrainSet.to_csv(train_output_path, index=False)

#### Test set

In [None]:
# Write the test split to CSV without the index column.
test_output_path = "outputs/datasets/cleaned/TestSetCleaned.csv"
TestSet.to_csv(test_output_path, index=False)