In [1]:
from ucimlrepo import fetch_ucirepo
import pandas as pd
import ipytest
from pathlib import Path
import logging
from datetime import datetime
import ipynbname
from typing import Optional

In [2]:
# Set up logging once for the whole notebook.
logging.basicConfig(
    level=logging.INFO,  # emit INFO records and anything more severe
    datefmt='%Y-%m-%d %H:%M:%S',  # layout used to render %(asctime)s
    format='%(asctime)s %(levelname)s %(message)s',  # "<time> <level> <text>"
)

# Logger shared by the cells of this notebook, named after the current module.
logger = logging.getLogger(__name__)

logger.info("Logger has been configured successfully.")

2024-11-04 19:36:07 INFO Logger has been configured successfully.


In [3]:
# Apply ipytest's default configuration so that the `%%ipytest` cell magic
# used further down in this notebook can run the tests defined in a cell.
ipytest.autoconfig()

# Константы

In [5]:
# Project root: two directory levels above the current working directory.
ROOT = Path.cwd().parent.parent

# Data folders, located relative to the project root.
DATA = ROOT / "data"
DATA_RAW = ROOT / "data" / "raw"

# Вспомогательный код

In [7]:
def get_timestamp() -> str:
    """
    Render the current local time as a dash-separated timestamp.

    Returns:
        str: The current time formatted as YYYY-MM-DD-HH-MM-SS.
    """
    # Only digits and dashes, so the value can be embedded in a file name.
    layout = "%Y-%m-%d-%H-%M-%S"
    return datetime.now().strftime(layout)

def get_filename(
        fname: Optional[str] = None,
        extension: str = 'parquet',
    ) -> str:
    """
    Generates a timestamped filename of the form ``<base>_<timestamp>.<extension>``.

    Args:
        fname (Optional[str]): The base filename. If None or empty, the name
            returned by ``ipynbname.name()`` is used instead.
        extension (str): The file extension, with or without a leading dot
            ('parquet' and '.parquet' give the same result). Default is 'parquet'.

    Returns:
        str: The generated filename with timestamp and extension.
    """
    base_filename: str = fname if fname else ipynbname.name()
    # Drop leading dots so that '.parquet' does not yield '<base>_<ts>..parquet'.
    suffix = extension.lstrip('.')
    return f"{base_filename}_{get_timestamp()}.{suffix}"

# Загрузка данных

In [9]:
def fetch_wine_quality_data():
    """
    Download the wine quality dataset (UCI Machine Learning Repository,
    dataset id 186) and merge its features and target into one DataFrame.

    Returns:
        A copy of the dataset's feature frame with one extra column,
        'quality', filled from the dataset's target values.
    """
    dataset = fetch_ucirepo(id=186)

    # Work on a copy so the frame held by the fetched dataset object is not modified.
    combined = dataset.data.features.copy()
    combined['quality'] = dataset.data.targets.values
    return combined



In [10]:
df = fetch_wine_quality_data()
# Log through the notebook's configured `logger` rather than the root
# `logging` module, matching the other cells.
logger.info("Data loaded. %s  rows, %s columns", df.shape[0], df.shape[1])

2024-11-04 19:36:13 INFO Data loaded. 6497  rows, 12 columns


In [11]:
# The data contains duplicates: collapse identical rows into one and keep the
# number of occurrences in the `rc` (repeat count) column.
# `rc` is never used as a grouping key and existing counts are reused, so the
# cell can be re-run on an already deduplicated `df` without changing it.
key_columns = [col for col in df.columns if col != "rc"]
repeat_counts = df["rc"] if "rc" in df.columns else 1
# dropna=False keeps rows that contain missing values; by default groupby
# would silently drop them.
df = (
    df.assign(rc=repeat_counts)
    .groupby(key_columns, as_index=False, dropna=False)["rc"]
    .sum()
)

logger.info("Duplicates removed. %s  rows, %s columns", df.shape[0], df.shape[1])

2024-11-04 19:36:13 INFO Duplicates removed. 5318  rows, 13 columns


# Проверки

In [13]:
# Show the first rows of `df` as a quick visual sanity check.
df.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,rc
0,3.8,0.31,0.02,11.1,0.036,20.0,114.0,0.99248,3.75,0.44,12.4,6,1
1,3.9,0.225,0.4,4.2,0.03,29.0,118.0,0.989,3.57,0.36,12.8,8,1
2,4.2,0.17,0.36,1.8,0.029,93.0,161.0,0.98999,3.65,0.89,12.0,7,1
3,4.2,0.215,0.23,5.1,0.041,64.0,157.0,0.99688,3.42,0.44,8.0,3,1
4,4.4,0.32,0.39,4.3,0.03,31.0,127.0,0.98904,3.46,0.36,12.8,8,1


In [14]:
%%ipytest

def test_no_duplicates():
    """No two rows of `df` may carry the same data values."""
    # `rc` is a per-row repeat counter, not data: leave it out so that rows
    # differing only in their count are still reported as duplicates.
    data_only = df.drop(columns="rc", errors="ignore")
    assert not data_only.duplicated().any()

[32m.[0m[32m                                                                                            [100%][0m
[32m[32m[1m1 passed[0m[32m in 0.01s[0m[0m


# Сохранение

In [16]:
fname = get_filename()

logger.info("Saving data to a file: %s", fname)
try:
    df.to_parquet(DATA_RAW / fname)
except Exception:
    # Catch real errors only (a bare `except:` would also swallow
    # KeyboardInterrupt/SystemExit) and record the traceback so the cause of
    # the failure is visible in the log. The cell stays best-effort: no re-raise.
    logger.exception("Error. Data was not saved.")
else:
    # Reported only when the write itself succeeded.
    logger.info("Data saved to %s", DATA_RAW)

2024-11-04 19:36:13 INFO Saving data to a file: wine_data_2024-11-04-19-36-13.parquet
2024-11-04 19:36:14 INFO Data saved to E:\edu\wine_project\data\raw
