In [1]:
import os

import pandas as pd
from matplotlib import pyplot as plt

from source.utils.helper import bar_plot

In [2]:
# Data directory, expressed relative to the notebook's working directory (one level up).
data_path = os.path.join(os.pardir, "data")

In [None]:
# Load the raw cocktails table. The first CSV column becomes the frame's index, and
# every other column is parsed with an explicit dtype (grouped by type below).
df = pd.read_csv(
    os.path.join(data_path, "data_cocktails.csv"),
    index_col=0,
    encoding="utf-8",
    dtype={
        "index": int,
        # Text columns; Garnish_amount is read as text here and converted later.
        **dict.fromkeys(
            [
                "strDrink",
                "strCategory",
                "strGlass",
                "strIngredients",
                "Alc_type",
                "Basic_taste",
                "strInstructions",
                "strMeasures",
                "Garnish_amount",
                "Garnish_type",
            ],
            str,
        ),
        # Numeric measure columns.
        **dict.fromkeys(["Value_ml", "Value_gr"], float),
    },
)

In [None]:
# Preview the first five rows (the default row count, spelled out).
df.head(n=5)

## Categories distribution
### Drink category

In [None]:
# Chart the "strCategory" column with the project's bar_plot helper
# (presumably a per-value count chart — confirm in source.utils.helper).
bar_plot(df, "strCategory")

### Type of glass

In [None]:
# Chart the "strGlass" column with the project's bar_plot helper
# (presumably a per-value count chart — confirm in source.utils.helper).
bar_plot(df, "strGlass")

### Alcohol type

In [None]:
# Chart the "Alc_type" column with the project's bar_plot helper
# (presumably a per-value count chart — confirm in source.utils.helper).
bar_plot(df, "Alc_type")

### Basic taste

In [None]:
# Chart the "Basic_taste" column with the project's bar_plot helper
# (presumably a per-value count chart — confirm in source.utils.helper).
bar_plot(df, "Basic_taste")

### Garnish types

In [None]:
# Chart the "Garnish_type" column with the project's bar_plot helper
# (presumably a per-value count chart — confirm in source.utils.helper).
bar_plot(df, "Garnish_type")

## Clean data

### Clean Garnish_amount column

In [None]:
# List the distinct raw values to see which entries are not plain numbers.
df.loc[:, "Garnish_amount"].unique()

Replace non-numerical measures in `Garnish_amount` and transform the column into float values.

In [None]:
# Convert Garnish_amount from text to float, mapping the fractional entry "1/2" to 0.5.
#
# The result is assigned back to the column rather than calling
# `df["Garnish_amount"].replace(..., inplace=True)`: an in-place method on a selected
# column acts on a temporary object under pandas Copy-on-Write, so `df` would keep
# the "1/2" string and the float conversion below would raise.
df["Garnish_amount"] = (
    df["Garnish_amount"]
    .replace("1/2", "0.5")
    .astype("float")
)

### Treat NaNs

In [None]:
# Bar chart of the number of missing values in each column.
df.isnull().sum().plot(kind="bar")
plt.show()

Fill all NaN values with 0 in the numerical columns.

In [None]:
# Pick out every numeric column, then replace its missing entries with 0.0.
# Non-numeric (text) columns keep their NaNs.
numerical_cols = df.select_dtypes(include="number").columns
df[numerical_cols] = df[numerical_cols].fillna(value=0.0)

## Save to file

In [None]:
# Persist the cleaned frame as a pickle next to the raw data.
df.to_pickle(
    path=os.path.join(data_path, "processed-cocktails-data.pkl"),
)