## Import the required libraries

In [None]:
# Seed passed as `random_state` when sampling the test split, so the split is reproducible.
RD_STATE=42
# NOTE(review): Z_ALPHA is not referenced in the visible part of this notebook — confirm its intended use.
Z_ALPHA = 0.5

import kagglehub
import pandas as pd

import warnings 
# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas deprecation / FutureWarning messages.
warnings.filterwarnings("ignore")

## Get the datasets from kaggle

> This way, the dataset can be fetched directly from code, without any manual download step.

In [None]:
# Fetch the dataset through kagglehub; the call returns the local folder
# that holds the downloaded files.
sch_path = kagglehub.dataset_download(
    handle="uom190346a/global-coffee-health-dataset",
)

print(f"Path to dataset files: {sch_path}")

In [None]:
# Load the CSV file from the download folder into a DataFrame
sch_db = pd.read_csv(f"{sch_path}/synthetic_coffee_health_10000.csv")
# Display the frame as the cell output
sch_db

# DataViz

In [None]:
# For each column, print on one line: its name, its number of missing
# values, and its distinct values.
for name in sch_db.columns:
    values = sch_db[name]
    print(name, values.isnull().sum(), values.unique())

# Show the dtype of every column
sch_db.dtypes

## **Data Cleaning**
* Goal: Fix or remove incorrect, corrupted, or incomplete data.
* Typical Tasks:
    * Handling missing values (e.g., imputation or deletion)
        * Done (NaN values in the `Health_Issues` column are filled with 'No')
    * Removing duplicates and irrelevant variables
        * Done (removed ID column)
    * Fixing data entry errors (e.g., inconsistent capitalization or typos)
        * Done (none)
    * Correcting inconsistencies (e.g., "USA" vs. "United States") and incomplete values
        * Done (none)
    * Handling outliers (depending on the use case)
        * TODO


#### 1. Handling missing values

In [None]:
# Count the missing values in each column (axis=0 sums down the rows)
sch_db.isnull().sum(axis=0)

In [None]:
# The 'Health_Issues' column contains missing values.
# Assumption made here: a row with no value in 'Health_Issues' describes a
# person in good health, so it is labelled 'No'.
#
# The filled column is assigned back to the frame. Calling
# fillna(..., inplace=True) on the selected column is chained assignment,
# which under pandas Copy-on-Write leaves sch_db unchanged.
sch_db['Health_Issues'] = sch_db['Health_Issues'].fillna('No')
# Print the distinct values after filling, to confirm no NaN remains
print(sch_db['Health_Issues'].unique())

#### 2. Removing duplicates and irrelevant variables

In [None]:
# Row filtering (author's modelling choice):
#   - rows whose Gender is 'Other' are excluded from the study;
#   - rows whose Health_Issues is 'Severe' are excluded because the author
#     considers that category too niche.
sch_db = sch_db[
    (sch_db['Gender'] != 'Other') & (sch_db['Health_Issues'] != 'Severe')
]

In [None]:
# The 'ID' column is considered irrelevant for the ML algorithm, so drop it
sch_db = sch_db.drop(columns=["ID"])

# Display the resulting frame
sch_db

#### 3, 4, 5. Fixing data entry errors, inconsistencies, and handling outliers

In [None]:
# Inspect every column for data entry errors, inconsistencies or outliers:
# print a header, the value counts (only when there are fewer than 50
# distinct values), and the summary statistics.
for col_name in sch_db.columns:
    series = sch_db[col_name]
    print(f"====={col_name}=====")
    counts = series.value_counts()
    if len(counts) < 50:
        print(counts)
    print(series.describe())
    print("\n")

# Author's observation: Physical_Activity_Hours shows high values.

# Author's conclusion: there appear to be no mistyped values, no
# inconsistencies and no outliers.

## **Data Preprocessing**

* Goal: Prepare raw data for modelling or analysis.
* Includes data cleaning, plus additional transformations, such as:
    * Encoding categorical variables (e.g., one-hot encoding)
    * Feature scaling (e.g., normalization, standardization)
    * Feature selection/extraction
    * Data transformation (e.g., log transformations, binning)
    * Handling imbalanced datasets

In [None]:
# Erase z_alpha case

In [None]:
# Share of each 'Health_Issues' class, shown as a percentage string
(
    sch_db['Health_Issues']
    .value_counts(normalize=True)
    .mul(100)
    .round(2)
    .astype(str)
    + '%'
)

In [None]:
# Hold out a random 10% of the rows as the test set; the seed makes the
# draw reproducible.
test_db = sch_db.sample(random_state=RD_STATE, frac=0.1)
# Save the held-out rows to disk
test_db.to_csv(path_or_buf="./test_data.csv")
# Display the test set
test_db

In [None]:
# Remove the held-out test rows from the main frame so that what is saved as
# training data does not contain them.
try:
    remaining = sch_db.drop(test_db.index)
except KeyError:
    # drop() raises KeyError when the index labels are no longer present,
    # e.g. when this cell is executed a second time.
    print("Rows already deleted")
else:
    sch_db = remaining
# Save the remaining rows as the training set
sch_db.to_csv("./training_data.csv")
# Display the training set
sch_db