In [128]:
import os
import json
import numpy as np
import pandas as pd
from ucimlrepo import fetch_ucirepo
from scipy.stats import skew
import shutil
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler,MinMaxScaler


In [129]:
# Metadata keys describing the target, in the order they are written to the JSON file.
_TARGET_STAT_KEYS = (
    "target_mean",
    "target_median",
    "target_min",
    "target_max",
    "target_std_dev",
    "target_skewness",
    "target_coefficient_of_variation",
    "target_outlier_%",
    "target_IQR",
    "target_unique_values",
)


def _to_json_safe(value):
    """Recursively convert ``value`` into plain Python data that is valid strict JSON.

    NumPy scalars become Python scalars, NumPy arrays become lists, dicts/lists/tuples
    are converted element-wise, and non-finite floats (NaN/inf) become ``None`` because
    standard JSON has no representation for them.
    """
    if isinstance(value, np.ndarray):
        return _to_json_safe(value.tolist())
    if isinstance(value, np.generic):
        value = value.item()
    if isinstance(value, dict):
        return {key: _to_json_safe(item) for key, item in value.items()}
    if isinstance(value, (list, tuple)):
        return [_to_json_safe(item) for item in value]
    if isinstance(value, float) and not np.isfinite(value):
        return None
    return value


def _describe_numeric_target(target_series):
    """Return the rounded target statistics for a non-empty numeric series (NaNs already dropped)."""
    mean = target_series.mean()
    std = target_series.std()
    q1, q3 = np.percentile(target_series, [25, 75])
    # "Outlier" here means further than 3 standard deviations from the mean.
    outlier_count = ((target_series - mean).abs() > 3 * std).sum()
    return {
        "target_mean": round(mean, 3),
        "target_median": round(target_series.median(), 3),
        "target_min": target_series.min(),
        "target_max": target_series.max(),
        "target_std_dev": round(std, 3),
        "target_skewness": round(skew(target_series), 3),
        # The small epsilon avoids a division by zero when the mean is exactly 0.
        "target_coefficient_of_variation": round(std / (mean + 1e-8), 3),
        "target_outlier_%": round(outlier_count / len(target_series) * 100, 2),
        "target_IQR": round(q3 - q1, 3),
        "target_unique_values": target_series.nunique(),
    }


def download_uci_datasets(dataset_ids: list, directory_path: str):
    """
    Download UCI datasets, save each one as CSV and write a JSON metadata file next to it.

    For every id in ``dataset_ids`` the function:
      1. Fetches features and targets with ``fetch_ucirepo``.
      2. Picks one target column: the first target that the dataset's variable table
         declares as Integer/Numeric/Real, otherwise the first numeric target column,
         otherwise the first target column.
      3. Concatenates features and the chosen target and writes
         ``<name>_<id>.csv`` into ``directory_path``.
      4. Writes ``<name>_<id>.meta.json`` with feature lists, the percentage of missing
         cells, target statistics and the Pearson correlation of every numeric feature
         with the target.

    Datasets without any target column are skipped with a message. Correlations that
    cannot be computed or are not finite (e.g. constant features) are left out of the
    metadata, and any remaining NaN/inf statistic is stored as ``null`` so the file is
    valid JSON.

    Args:
        dataset_ids: UCI repository ids to download.
        directory_path: Output directory; created if it does not exist.
    """
    os.makedirs(directory_path, exist_ok=True)

    for dataset_id in dataset_ids:
        dataset = fetch_ucirepo(id=dataset_id)

        X = dataset.data.features
        y = dataset.data.targets

        if isinstance(y, pd.Series):
            y = y.to_frame(name=y.name or 'target')

        if y is None or y.shape[1] == 0:
            print(f"⚠ Dataset {dataset_id} nie zawiera zmiennej target. Pomijam.")
            continue

        # Prefer a target that the metadata explicitly declares as numeric, but only
        # if that column is really present among the downloaded targets.
        variables_metadata = dataset.variables
        declared_numeric = variables_metadata[
            (variables_metadata['role'] == 'Target') &
            (variables_metadata['type'].isin(['Integer', 'Numeric', 'Real']))
        ]['name'].tolist()
        declared_numeric = [name for name in declared_numeric if name in y.columns]

        if declared_numeric:
            target_col = declared_numeric[0]
        else:
            numeric_targets = y.select_dtypes(include=[np.number]).columns
            target_col = numeric_targets[0] if not numeric_targets.empty else y.columns[0]

        y = y[[target_col]]

        # Combine X and y
        df = pd.concat([X, y], axis=1)

        dataset_name = dataset.metadata.name.replace(" ", "_")
        filename_base = f"{dataset_name}_{dataset_id}"
        csv_path = os.path.join(directory_path, f"{filename_base}.csv")
        meta_path = os.path.join(directory_path, f"{filename_base}.meta.json")

        df.to_csv(csv_path, index=False)

        feature_cols = X.columns.tolist()
        numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
        categorical_features = [col for col in feature_cols if col not in numeric_features]

        n_rows = df.shape[0]
        null_pct = df.isnull().sum().sum() / df.size * 100 if df.size else 0.0

        target_series = y[target_col].dropna()
        # An all-missing target is treated like a non-numeric one: no statistics.
        target_is_numeric = (
            not target_series.empty
            and pd.api.types.is_numeric_dtype(target_series)
        )
        if target_is_numeric:
            target_stats = _describe_numeric_target(target_series)
        else:
            target_stats = dict.fromkeys(_TARGET_STAT_KEYS)

        corr_with_target = {}
        if target_is_numeric:
            for col in numeric_features:
                try:
                    corr = df[[col, target_col]].dropna().corr().iloc[0, 1]
                except (TypeError, ValueError, IndexError, KeyError) as exc:
                    print(f"⚠ Nie udało się policzyć korelacji dla kolumny {col!r}: {exc}")
                    continue
                # Constant or fully-missing features give NaN; skip them so the
                # metadata stays valid JSON and "top correlation" stays well defined.
                if pd.notna(corr) and np.isfinite(corr):
                    corr_with_target[col] = round(float(corr), 3)

        metadata = {
            "dataset_name": dataset.metadata.name,
            "id": dataset_id,
            "target": target_col,
            "num_observations": n_rows,
            "num_features": len(feature_cols),
            "numeric_features": numeric_features,
            "categorical_features": categorical_features,
            "%_null_values": round(null_pct, 2),
            **target_stats,
            "feature_target_correlations": corr_with_target,
            "source": "UCI"
        }

        with open(meta_path, "w", encoding="utf-8") as f:
            json.dump(_to_json_safe(metadata), f, indent=2)

        print(f"✔ Zapisano {csv_path}")


In [130]:
# dataset = fetch_ucirepo(id=320)

In [131]:
# dataset['variables']

In [132]:
def generate_dataset_summary_table(input_datasets_directory: str, output_table_directory: str):
    """
    Build one summary CSV from all ``.meta.json`` files found in a directory.

    Steps:
      1. Create ``output_table_directory`` if it does not exist.
      2. Read every ``*.meta.json`` file in ``input_datasets_directory`` (in sorted
         file-name order, so the result is reproducible).
      3. For each file, collect name, source, size, feature counts, missing-value
         percentage and the target statistics stored in the metadata, plus the feature
         whose correlation with the target has the largest absolute value. Correlation
         entries that are missing or not finite are ignored for that choice.
      4. Sort the rows by number of observations, then number of features.
      5. Write ``datasets_summary_table.csv`` into ``output_table_directory``.

    If no metadata file is found, a header-only table is written instead of failing.

    Args:
        input_datasets_directory: Directory containing the ``.meta.json`` files.
        output_table_directory: Directory that receives the summary CSV.
    """
    os.makedirs(output_table_directory, exist_ok=True)

    # Fixing the column list up front keeps the header stable even when there are no rows.
    columns = [
        "Dataset Name", "Source", "ID/Code", "Num Observations", "Num Features",
        "Categorical Features", "Numeric Features", "% Null Values", "Target",
        "Target Mean", "Target Median", "Target Min", "Target Max", "Target Std Dev",
        "Target IQR", "Target Skewness", "Target Outlier %", "Target CV",
        "Target Unique Values", "Top Correlated Feature", "Top Correlation Value",
        "Domain",
    ]

    summary = []

    for file in sorted(os.listdir(input_datasets_directory)):
        if not file.endswith(".meta.json"):
            continue

        meta_path = os.path.join(input_datasets_directory, file)
        with open(meta_path, encoding="utf-8") as f:
            metadata = json.load(f)

        # Only finite numbers can be ranked; null/NaN entries would either raise in
        # abs() or make the max() result depend on iteration order.
        corr_dict = metadata.get("feature_target_correlations") or {}
        usable_corr = {
            name: value
            for name, value in corr_dict.items()
            if isinstance(value, (int, float)) and np.isfinite(value)
        }
        if usable_corr:
            top_corr_name, top_corr_value = max(
                usable_corr.items(), key=lambda item: abs(item[1])
            )
        else:
            top_corr_name = None
            top_corr_value = None

        summary.append({
            "Dataset Name": metadata.get("dataset_name"),
            "Source": metadata.get("source", "OpenML"),
            "ID/Code": metadata.get("code") or metadata.get("id"),
            "Num Observations": metadata.get("num_observations"),
            "Num Features": metadata.get("num_features"),
            "Categorical Features": len(metadata.get("categorical_features") or []),
            "Numeric Features": len(metadata.get("numeric_features") or []),
            "% Null Values": metadata.get("%_null_values"),
            "Target": metadata.get("target"),
            "Target Mean": metadata.get("target_mean"),
            "Target Median": metadata.get("target_median"),
            "Target Min": metadata.get("target_min"),
            "Target Max": metadata.get("target_max"),
            "Target Std Dev": metadata.get("target_std_dev"),
            "Target IQR": metadata.get("target_IQR"),
            "Target Skewness": metadata.get("target_skewness"),
            "Target Outlier %": metadata.get("target_outlier_%"),
            "Target CV": metadata.get("target_coefficient_of_variation"),
            "Target Unique Values": metadata.get("target_unique_values"),
            "Top Correlated Feature": top_corr_name,
            "Top Correlation Value": top_corr_value,
            "Domain": metadata.get("domain", "unknown")
        })

    df_summary = pd.DataFrame(summary, columns=columns).sort_values(
        by=["Num Observations", "Num Features"]
    )

    output_path = os.path.join(output_table_directory, "datasets_summary_table.csv")
    df_summary.to_csv(output_path, index=False)
    print(f"📊 Tabela zbiorcza zapisana w: {output_path}")


In [133]:
def preprocess_dataset_regression(df: pd.DataFrame, target_col: str) -> pd.DataFrame:
    """
    Preprocess a dataset for a regression problem.

    Steps:
      1. Drop duplicated rows.
      2. Drop feature columns that carry no information: at most one distinct
         non-missing value (this includes columns that are entirely missing).
      3. Convert boolean features to 0/1 integers so they are kept as numeric features.
      4. Impute missing values: mean for numeric features, the constant ``'Missing'``
         for categorical features.
      5. One-hot encode categorical features.
      6. Min-max scale all resulting feature columns.

    The target column is passed through unchanged and appended as the last column.

    NOTE: the imputer, encoder and scaler are fitted on the whole frame passed in, so
    if the result is later split into train/test sets the test rows have influenced
    the fitted statistics.

    Args:
        df: Input data containing the features and ``target_col``.
        target_col: Name of the target column; raises ``KeyError`` if absent.

    Returns:
        A new DataFrame with the processed features followed by the target column,
        indexed like the de-duplicated input.
    """
    # Step 1: remove duplicated rows
    df = df.drop_duplicates()

    # Step 2: remove uninformative feature columns (the target is never dropped).
    # nunique() ignores NaN, so `<= 1` also catches all-NaN columns; the mean imputer
    # would otherwise silently discard those and break the column-name bookkeeping below.
    cols_to_drop = [col for col in df.columns if col != target_col and df[col].nunique() <= 1]
    df = df.drop(columns=cols_to_drop)

    # Separate features from the target
    y = df[target_col]
    X = df.drop(columns=[target_col])

    # Step 3: bool columns match neither 'number' nor 'object', so without this cast
    # the ColumnTransformer below would drop them silently.
    bool_cols = X.select_dtypes(include=['bool']).columns.tolist()
    if bool_cols:
        X = X.astype({col: 'int8' for col in bool_cols})

    # Identify numeric and categorical columns
    num_cols = X.select_dtypes(include=['number']).columns.tolist()
    cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

    # Transformer:
    # - numeric features: mean imputation,
    # - categorical features: constant 'Missing' imputation followed by one-hot encoding.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', SimpleImputer(strategy='mean'), num_cols),
            ('cat', Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
                ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
            ]), cat_cols)
        ]
    )

    # Full pipeline: column-wise preprocessing first, then min-max scaling of everything
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('scaler', MinMaxScaler())
    ])

    # Fit and transform the features
    X_processed = pipeline.fit_transform(X)

    # Rebuild column names in the transformer's output order: numeric first, then one-hot
    new_columns = list(num_cols)
    if cat_cols:
        onehot = pipeline.named_steps['preprocessor'].named_transformers_['cat'].named_steps['onehot']
        new_columns.extend(onehot.get_feature_names_out(cat_cols))

    df_processed = pd.DataFrame(X_processed, columns=new_columns, index=X.index)

    # Re-attach the original target column (aligned on the shared index)
    df_processed[target_col] = y

    return df_processed

In [134]:
def preprocess_datasets(input_path: str, output_path: str):
    """
    Preprocess every CSV dataset found in ``input_path`` and store the results in ``output_path``.

    For each ``*.csv`` file (visited in sorted file-name order):
      - Read the sibling ``<name>.meta.json`` file and take the target column name from
        its ``"target"`` entry.
      - Load the CSV.
      - Run ``preprocess_dataset_regression`` on it.
      - Write the processed frame to ``output_path`` under the original file name.
      - Copy the metadata file to ``output_path`` unchanged (so it still describes the
        raw dataset, not the processed columns).

    The function is best-effort: a file whose metadata is missing, unreadable or lacks
    a target, or whose loading/processing/saving fails, is reported with a message and
    skipped; the remaining files are still processed.

    Args:
        input_path: Directory with the raw CSV files and their ``.meta.json`` files.
        output_path: Destination directory; created if it does not exist.
    """
    os.makedirs(output_path, exist_ok=True)

    for file in sorted(os.listdir(input_path)):
        if not file.endswith(".csv"):
            continue

        csv_path = os.path.join(input_path, file)
        base_name = os.path.splitext(file)[0]
        meta_filename = base_name + ".meta.json"
        meta_path = os.path.join(input_path, meta_filename)

        if not os.path.exists(meta_path):
            print(f"Brak pliku metadanych dla {file}. Pomijam plik.")
            continue

        # Load metadata; a corrupt file must not abort the whole batch.
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
        try:
            with open(meta_path, "r", encoding="utf-8") as f:
                metadata = json.load(f)
        except (OSError, ValueError) as e:
            print(f"Nie udało się wczytać metadanych {meta_path}: {e}")
            continue

        target_col = metadata.get("target") if isinstance(metadata, dict) else None
        if not target_col:
            print(f"Nie znaleziono nazwy kolumny target w metadanych dla {file}. Pomijam plik.")
            continue

        # Load data
        try:
            df = pd.read_csv(csv_path)
        except Exception as e:
            print(f"Nie udało się wczytać pliku {csv_path}: {e}")
            continue

        # Process data
        try:
            df_processed = preprocess_dataset_regression(df, target_col)
        except Exception as e:
            print(f"Błąd przy przetwarzaniu {csv_path}: {e}")
            continue

        # Save the processed CSV
        output_csv_path = os.path.join(output_path, file)
        try:
            df_processed.to_csv(output_csv_path, index=False)
            print(f"✔ Przetworzono i zapisano: {output_csv_path}")
        except Exception as e:
            print(f"Nie udało się zapisać pliku {output_csv_path}: {e}")
            continue

        # Copy the metadata file next to the processed CSV
        output_meta_path = os.path.join(output_path, meta_filename)
        try:
            shutil.copy(meta_path, output_meta_path)
            print(f"📁 Skopiowano metadane: {output_meta_path}")
        except Exception as e:
            print(f"Nie udało się skopiować metadanych dla {file}: {e}")


In [135]:
# UCI ML Repository dataset ids, in download order:
# 189 Parkinsons Telemonitoring, 925 Infrared Thermography Temperature,
# 186 Wine Quality, 320 Student Performance, 1 Abalone.
datasets_ids = [189, 925, 186, 320, 1]

In [136]:
# Fetch the raw datasets with their metadata, then summarise that metadata in one table.
download_uci_datasets(dataset_ids=datasets_ids, directory_path="./uci_datasets")
generate_dataset_summary_table(
    input_datasets_directory="./uci_datasets",
    output_table_directory="./datasets_summary",
)

✔ Zapisano ./uci_datasets\Parkinsons_Telemonitoring_189.csv
✔ Zapisano ./uci_datasets\Infrared_Thermography_Temperature_925.csv
✔ Zapisano ./uci_datasets\Wine_Quality_186.csv
✔ Zapisano ./uci_datasets\Student_Performance_320.csv
✔ Zapisano ./uci_datasets\Abalone_1.csv
📊 Tabela zbiorcza zapisana w: ./datasets_summary\datasets_summary_table.csv


In [137]:
# Reload the summary table from disk and preview its first rows.
summary_df = pd.read_csv(filepath_or_buffer="datasets_summary/datasets_summary_table.csv")
summary_df.head(n=5)

Unnamed: 0,Dataset Name,Source,ID/Code,Num Observations,Num Features,Categorical Features,Numeric Features,% Null Values,Target,Target Mean,...,Target Max,Target Std Dev,Target IQR,Target Skewness,Target Outlier %,Target CV,Target Unique Values,Top Correlated Feature,Top Correlation Value,Domain
0,Student Performance,UCI,320,649,30,17,13,0.0,G3,11.906,...,19.0,3.231,4.0,-0.911,2.47,0.271,17,failures,-0.393,unknown
1,Infrared Thermography Temperature,UCI,925,1020,33,3,30,0.01,aveOralF,36.979,...,39.6,0.386,0.3,2.537,2.84,0.01,53,T_Max1,0.753,unknown
2,Abalone,UCI,1,4177,8,1,7,0.0,Rings,9.934,...,29.0,3.224,3.0,1.114,1.48,0.325,28,Shell_weight,0.628,unknown
3,Parkinsons Telemonitoring,UCI,189,5875,19,0,19,0.0,motor_UPDRS,21.296,...,39.511,8.129,12.596,0.075,0.0,0.382,1080,age,0.274,unknown
4,Wine Quality,UCI,186,6497,11,0,11,0.0,quality,5.818,...,9.0,0.873,1.0,0.19,0.54,0.15,7,alcohol,0.444,unknown


In [138]:
# Preprocess every downloaded dataset into a separate output directory.
preprocess_datasets(input_path="./uci_datasets", output_path="./uci_datasets_preprocessed")

✔ Przetworzono i zapisano: ./uci_datasets_preprocessed\Abalone_1.csv
📁 Skopiowano metadane: ./uci_datasets_preprocessed\Abalone_1.meta.json
✔ Przetworzono i zapisano: ./uci_datasets_preprocessed\Infrared_Thermography_Temperature_925.csv
📁 Skopiowano metadane: ./uci_datasets_preprocessed\Infrared_Thermography_Temperature_925.meta.json
✔ Przetworzono i zapisano: ./uci_datasets_preprocessed\Parkinsons_Telemonitoring_189.csv
📁 Skopiowano metadane: ./uci_datasets_preprocessed\Parkinsons_Telemonitoring_189.meta.json
✔ Przetworzono i zapisano: ./uci_datasets_preprocessed\Student_Performance_320.csv
📁 Skopiowano metadane: ./uci_datasets_preprocessed\Student_Performance_320.meta.json
✔ Przetworzono i zapisano: ./uci_datasets_preprocessed\Wine_Quality_186.csv
📁 Skopiowano metadane: ./uci_datasets_preprocessed\Wine_Quality_186.meta.json


In [144]:
# Spot-check one preprocessed dataset.
df = pd.read_csv(
    filepath_or_buffer='./uci_datasets_preprocessed/Parkinsons_Telemonitoring_189.csv',
)

In [149]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,age,test_time,Jitter(%),Jitter(Abs),Jitter:RAP,Jitter:PPQ5,Jitter:DDP,Shimmer,Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,Shimmer:APQ11,Shimmer:DDA,NHR,HNR,RPDE,DFA,PPE,sex,motor_UPDRS
0,0.734694,0.045076,0.05839,0.071164,0.064324,0.039635,0.064433,0.085062,0.09803,0.079287,0.067543,0.051764,0.079267,0.018723,0.551717,0.328638,0.097793,0.194544,0.0,28.199
1,0.734694,0.077034,0.021884,0.032819,0.017305,0.015478,0.017303,0.064691,0.073522,0.05172,0.053186,0.052753,0.051699,0.014474,0.704771,0.34833,0.1443,0.121335,0.0,28.447
2,0.734694,0.108957,0.040137,0.050413,0.030065,0.023868,0.030178,0.051549,0.074483,0.035577,0.039375,0.044291,0.035556,0.026651,0.590568,0.381812,0.085362,0.265104,0.0,28.695
3,0.734694,0.136105,0.044877,0.054924,0.027618,0.031969,0.027673,0.075423,0.144642,0.058674,0.064878,0.062791,0.058632,0.036834,0.629169,0.412583,0.181761,0.437884,0.0,28.905
4,0.734694,0.172487,0.025413,0.040263,0.010488,0.012585,0.010486,0.052604,0.072081,0.032162,0.044524,0.057515,0.032121,0.01516,0.675585,0.393664,0.134202,0.241814,0.0,29.187
