In [2]:
import os
import json
import numpy as np
import pandas as pd
from ucimlrepo import fetch_ucirepo
from scipy.stats import skew
import shutil
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler,MinMaxScaler


In [3]:
def _to_json_safe(value):
    """Return ``value`` with NumPy scalars/arrays replaced by plain Python objects.

    Dicts, lists and tuples are walked recursively so nested NumPy values are
    converted as well; any other value is returned unchanged.
    """
    if isinstance(value, np.ndarray):
        # tolist() works for any shape, whereas item() only accepts one-element arrays.
        return value.tolist()
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, dict):
        return {key: _to_json_safe(item) for key, item in value.items()}
    if isinstance(value, (list, tuple)):
        return [_to_json_safe(item) for item in value]
    return value


def download_uci_datasets(dataset_ids: list, directory_path: str):
    """Download UCI datasets and store each one as a CSV plus a ``.meta.json`` file.

    For every id the dataset is fetched with ``fetch_ucirepo``; its
    ``data.original`` frame is written to ``<name>_<id>.csv`` and a small
    metadata dictionary (name, id, shape, column names, and the variable names
    grouped by their "Feature" / "Target" / "Other" role) is written to
    ``<name>_<id>.meta.json`` in the same directory.

    Args:
        dataset_ids: UCI repository ids of the datasets to download.
        directory_path: Output directory; created when it does not exist.

    Returns:
        None. Files are written to disk and the saved paths are printed.
    """
    os.makedirs(directory_path, exist_ok=True)

    for dataset_id in dataset_ids:
        dataset = fetch_ucirepo(id=dataset_id)

        df = dataset.data.original.copy()

        dataset_name = dataset.metadata.name.replace(" ", "_")
        # A path separator inside the name would silently point into a sub-directory.
        for separator in ("/", "\\"):
            dataset_name = dataset_name.replace(separator, "_")

        filename_base = f"{dataset_name}_{dataset_id}"
        csv_path = os.path.join(directory_path, f"{filename_base}.csv")
        meta_path = os.path.join(directory_path, f"{filename_base}.meta.json")

        # Save the data as .csv
        df.to_csv(csv_path, index=False)

        # Group the variable names by the role recorded in the dataset's variable table.
        variables = dataset.variables
        features = variables[variables["role"] == "Feature"]["name"].tolist()
        targets = variables[variables["role"] == "Target"]["name"].tolist()
        others = variables[variables["role"] == "Other"]["name"].tolist()

        metadata = {
            "dataset_name": dataset.metadata.name,
            "id": int(dataset_id),
            "num_rows": int(df.shape[0]),
            "num_columns": int(df.shape[1]),
            "columns": df.columns.tolist(),
            "features": features,
            "targets": targets,
            "others": others,
            "source": "UCI"
        }

        # json cannot serialise NumPy types, so convert them (including nested ones) first.
        metadata = _to_json_safe(metadata)

        # json.dump escapes non-ASCII by default; the explicit encoding just pins the file encoding.
        with open(meta_path, "w", encoding="utf-8") as f:
            json.dump(metadata, f, indent=2)

        print(f"✔ Saved CSV: {csv_path}")
        print(f"✔ Saved Metadata: {meta_path}")


In [4]:
def generate_dataset_summary_table(input_datasets_directory: str, output_table_directory: str):
    """
    Build a summary table from the ``.meta.json`` files of a directory and save it as CSV.

    Steps:
      1. Create ``output_table_directory`` if it does not exist.
      2. Read every ``*.meta.json`` file found in ``input_datasets_directory``.
      3. For each file, collect the dataset name, source, id/code, number of
         observations and features, counts of categorical/numeric features,
         percentage of nulls and the stored target statistics. If the metadata
         contains ``feature_target_correlations``, the feature with the largest
         absolute (non-missing) correlation is reported as the top correlated
         feature.
      4. Combine the rows into one DataFrame, sorted by number of observations
         and then number of features.
      5. Write the table to ``datasets_summary_table.csv`` in the output directory.

    Metadata that lacks ``num_observations`` / ``num_features`` / ``target`` but
    provides ``num_rows`` / ``features`` / ``targets`` is summarised from those
    keys instead. When no metadata files are found, a header-only CSV is written.

    Args:
        input_datasets_directory: Directory that contains the ``.meta.json`` files.
        output_table_directory: Directory in which the summary CSV is written.

    Returns:
        None. The CSV is written to disk and its path is printed.
    """
    os.makedirs(output_table_directory, exist_ok=True)

    # Fixed column list so that an empty input directory still yields a sortable, well-formed table.
    summary_columns = [
        "Dataset Name", "Source", "ID/Code", "Num Observations", "Num Features",
        "Categorical Features", "Numeric Features", "% Null Values", "Target",
        "Target Mean", "Target Median", "Target Min", "Target Max", "Target Std Dev",
        "Target IQR", "Target Skewness", "Target Outlier %", "Target CV",
        "Target Unique Values", "Top Correlated Feature", "Top Correlation Value", "Domain",
    ]
    summary = []

    # os.listdir order is arbitrary; sorting keeps the processing order deterministic.
    for file in sorted(os.listdir(input_datasets_directory)):
        if not file.endswith(".meta.json"):
            continue

        meta_path = os.path.join(input_datasets_directory, file)
        with open(meta_path, encoding="utf-8") as f:
            metadata = json.load(f)

        # Ignore null / NaN / non-numeric correlations: abs(None) raises and NaN makes max() order-dependent.
        corr_dict = metadata.get("feature_target_correlations") or {}
        valid_corr = {
            name: value
            for name, value in corr_dict.items()
            if isinstance(value, (int, float)) and pd.notna(value)
        }
        if valid_corr:
            top_corr_name, top_corr_value = max(valid_corr.items(), key=lambda item: abs(item[1]))
        else:
            top_corr_name = None
            top_corr_value = None

        # Fall back to the alternative key names only when the primary key is absent.
        num_observations = metadata.get("num_observations")
        if num_observations is None:
            num_observations = metadata.get("num_rows")

        num_features = metadata.get("num_features")
        if num_features is None and metadata.get("features") is not None:
            num_features = len(metadata["features"])

        target = metadata.get("target")
        if target is None and metadata.get("targets"):
            target = ", ".join(map(str, metadata["targets"]))

        summary.append({
            "Dataset Name": metadata.get("dataset_name"),
            "Source": metadata.get("source", "OpenML"),
            "ID/Code": metadata.get("code") or metadata.get("id"),
            "Num Observations": num_observations,
            "Num Features": num_features,
            # `or []` also covers an explicit JSON null.
            "Categorical Features": len(metadata.get("categorical_features") or []),
            "Numeric Features": len(metadata.get("numeric_features") or []),
            "% Null Values": metadata.get("%_null_values"),
            "Target": target,
            "Target Mean": metadata.get("target_mean"),
            "Target Median": metadata.get("target_median"),
            "Target Min": metadata.get("target_min"),
            "Target Max": metadata.get("target_max"),
            "Target Std Dev": metadata.get("target_std_dev"),
            "Target IQR": metadata.get("target_IQR"),
            "Target Skewness": metadata.get("target_skewness"),
            "Target Outlier %": metadata.get("target_outlier_%"),
            "Target CV": metadata.get("target_coefficient_of_variation"),
            "Target Unique Values": metadata.get("target_unique_values"),
            "Top Correlated Feature": top_corr_name,
            "Top Correlation Value": top_corr_value,
            "Domain": metadata.get("domain", "unknown")
        })

    df_summary = pd.DataFrame(summary, columns=summary_columns)
    df_summary = df_summary.sort_values(by=["Num Observations", "Num Features"])

    output_path = os.path.join(output_table_directory, "datasets_summary_table.csv")
    df_summary.to_csv(output_path, index=False)
    print(f"📊 Tabela zbiorcza zapisana w: {output_path}")


In [5]:
# UCI repository ids of the datasets handled by this notebook.
datasets_ids = [1, 374, 320, 294, 186, 242]

In [119]:
# Download every dataset listed in datasets_ids into ./uci_datasets, then build
# the summary CSV in ./datasets_summary from the .meta.json files written there.
download_uci_datasets(
    dataset_ids=datasets_ids,
    directory_path="./uci_datasets"
)
generate_dataset_summary_table(
    input_datasets_directory="./uci_datasets",
    output_table_directory="./datasets_summary"
)

✔ Saved CSV: ./uci_datasets\Wine_Quality_186.csv
✔ Saved Metadata: ./uci_datasets\Wine_Quality_186.meta.json
📊 Tabela zbiorcza zapisana w: ./datasets_summary\datasets_summary_table.csv


In [15]:
def custom_one_hot_encode(df, columns):
    """One-hot encode the given columns on a copy of ``df``.

    A column with exactly two distinct values is encoded as a single 0/1
    indicator (the first category is dropped); every other column keeps one
    indicator per category. Each encoded column is removed and its indicator
    columns are appended at the end of the frame, in the order of ``columns``.

    Args:
        df: Input frame; it is not modified.
        columns: Names of the columns to encode.

    Returns:
        A new DataFrame with the listed columns replaced by integer indicators.
    """
    result = df.copy()
    for name in columns:
        # Binary columns need only one indicator; multi-category columns keep all of them.
        is_binary = result[name].nunique() == 2
        indicators = pd.get_dummies(result[name], prefix=name, drop_first=is_binary, dtype=int)

        # Swap the original column for its indicator columns.
        remaining = result.drop(columns=[name])
        result = pd.concat([remaining, indicators], axis=1)

    return result

In [16]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# === INPUTS ===
# Default values of the module-level settings that preprocess_dataframe reads.
target_col = ""                # name of the target column
columns_to_drop = []           # columns to remove
columns_to_label_fill = []     # columns whose NaNs are replaced with a new label
label_fill_value = "missing"   # the label substituted for those NaNs
columns_to_mean_fill = []      # columns whose NaNs are replaced with the column mean
columns_to_onehot = []         # columns to one-hot encode

def preprocess_dataframe(df: pd.DataFrame, log_path) -> pd.DataFrame:
    """Clean, encode and standardize ``df`` and write a log of the applied steps.

    The behaviour is driven by module-level settings that are read at call time:
    ``target_col``, ``columns_to_drop``, ``columns_to_label_fill``,
    ``label_fill_value``, ``columns_to_mean_fill`` and ``columns_to_onehot``.

    Steps, in order: drop the configured columns, drop duplicate rows, drop
    columns with at most one distinct value, fill NaNs with a label or with the
    column mean, one-hot encode, and standardize every numeric column except
    the target (this includes the indicator columns produced by the encoding).

    Note that the scaler is fitted on the whole frame passed in.

    Args:
        df: Raw input frame; the caller's object is not modified.
        log_path: Path of the text log to write. Missing parent directories
            are created.

    Returns:
        The preprocessed DataFrame.
    """
    # 1. Drop specified columns
    df = df.drop(columns=columns_to_drop, errors='ignore')

    # 2. Drop duplicate rows
    df = df.drop_duplicates()

    # 3. Drop columns with constant value
    nunique = df.nunique()
    constant_cols = nunique[nunique <= 1].index.tolist()
    df = df.drop(columns=constant_cols)

    # 4. Replace NaNs in specified columns with label
    for col in columns_to_label_fill:
        if col in df.columns:
            df[col] = df[col].fillna(label_fill_value)

    # 5. Replace NaNs in other specified columns with mean
    for col in columns_to_mean_fill:
        if col in df.columns and pd.api.types.is_numeric_dtype(df[col]):
            df[col] = df[col].fillna(df[col].mean())

    # 6. One-hot encode specified columns
    # Steps 1 and 3 may already have removed a configured column; encoding it would raise KeyError.
    onehot_cols = [col for col in columns_to_onehot if col in df.columns]
    df = custom_one_hot_encode(df, onehot_cols)

    # 7. Standardize all features
    numeric_cols = df.select_dtypes(include=np.number).columns.difference([target_col])
    # StandardScaler rejects an input with zero columns, so skip scaling when nothing is numeric.
    if len(numeric_cols) > 0:
        scaler = StandardScaler()
        df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

    # open() does not create missing parent directories.
    log_dir = os.path.dirname(log_path)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    # map(str, ...) keeps the joins working when column labels are not strings.
    with open(log_path, "w", encoding="utf-8") as f:
        f.write('0. Target column: {}\n\n'.format(target_col))

        f.write("1. Dropped columns:\n")
        f.write(", ".join(map(str, columns_to_drop)) + "\n\n")

        f.write("2. Columns with constant values (removed after checking):\n")
        f.write(", ".join(map(str, constant_cols)) + "\n\n")

        f.write("3. Columns with missing values filled with label '{}':\n".format(label_fill_value))
        f.write(", ".join(map(str, columns_to_label_fill)) + "\n\n")

        f.write("4. Columns with missing values filled with mean:\n")
        f.write(", ".join(map(str, columns_to_mean_fill)) + "\n\n")

        f.write("5. Columns one-hot encoded:\n")
        # Log the columns that were actually encoded, which may be fewer than configured.
        f.write(", ".join(map(str, onehot_cols)) + "\n\n")

    return df


In [27]:
# Load the raw CSV of one downloaded dataset and preview its first rows.
dataset_name = "Wine_Quality_186"
df = pd.read_csv(f"./uci_datasets/{dataset_name}.csv")
df.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,color
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,red
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red


In [28]:
# Split the column names of df by dtype: object/category vs. numeric.
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = df.select_dtypes(include=['number']).columns.tolist()

# Print both lists in a form that is easy to copy into the configuration cell.
print(
    "categorical_cols :",
    categorical_cols,
    "",
    "numerical_cols :",
    numerical_cols,
    sep="\n",
)

categorical_cols :
['color']

numerical_cols :
['fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar', 'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'quality']


In [29]:
# Preprocessing settings for the Wine Quality dataset.
# preprocess_dataframe reads these module-level names when it is called.

target_col = "quality"   # left out of the standardization step
columns_to_drop = []     # no columns are removed up front

# Missing values in these columns are replaced with the label below.
label_fill_value = "missing"
columns_to_label_fill = ["color"]

# Missing values in these columns are replaced with the column mean.
columns_to_mean_fill = [
    "fixed_acidity",
    "volatile_acidity",
    "citric_acid",
    "residual_sugar",
    "chlorides",
    "free_sulfur_dioxide",
    "total_sulfur_dioxide",
    "density",
    "pH",
    "sulphates",
    "alcohol",
]

# Columns that are one-hot encoded.
columns_to_onehot = ["color"]


In [30]:
# Create both output folders first: neither open(..., "w") nor DataFrame.to_csv
# creates missing parent directories, so this cell would fail on a fresh checkout.
os.makedirs("./uci_datasets_logs", exist_ok=True)
os.makedirs("./uci_datasets_preprocessed", exist_ok=True)

# Preprocess the loaded frame, save the result and preview it.
preprocessed_df = preprocess_dataframe(df, log_path=f"./uci_datasets_logs/{dataset_name}_preprocessing_log.txt")
preprocessed_df.to_csv(f"./uci_datasets_preprocessed/{dataset_name}_preprocessed.csv", index=False)
preprocessed_df.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,color_white
0,0.140064,2.115349,-2.164515,-0.699699,0.52388,-1.069272,-1.411143,1.100996,1.779304,0.177941,-0.969152,5,-1.707233
1,0.443199,3.185297,-2.164515,-0.544135,1.120736,-0.282905,-0.829839,0.763753,-0.153797,0.979389,-0.631833,5,-1.707233
2,0.443199,2.471998,-1.892672,-0.610806,0.957957,-0.844596,-1.058837,0.831202,0.220351,0.779027,-0.631833,5,-1.707233
3,3.019841,-0.381197,1.641293,-0.699699,0.496751,-0.732258,-0.953146,1.168444,-0.403229,0.311515,-0.631833,6,-1.707233
5,0.140064,1.877583,-2.164515,-0.721923,0.496751,-0.956934,-1.305451,1.100996,1.779304,0.177941,-0.969152,5,-1.707233


In [None]:
# Reorder the columns so that the target column comes last.

In [6]:
# Debug helper: show the current working directory, against which the relative
# paths used in this notebook are resolved.
import os
os.getcwd()
# os.chdir('..')

'c:\\Users\\bartekb\\Desktop\\sem6\\inter\\projekt\\pi_br_greater_than_1'

In [36]:
# Re-open an already preprocessed dataset and preview it; target_col names the
# column that the following cells move to the end.
dataset_name = "Student_Performance_320"
target_col = 'G3'

df = pd.read_csv(f'./processed_data/preprocessed_bb_scaled/{dataset_name}_preprocessed.csv')
df.head()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,...,guardian_mother,guardian_other,schoolsup_yes,famsup_yes,paid_yes,activities_yes,nursery_yes,higher_yes,internet_yes,romantic_yes
0,1.031695,1.310216,1.540715,0.576718,0.083653,-0.374305,0.072606,-0.171647,0.693785,-0.543555,...,0.652973,-0.259681,2.923032,-1.259229,-0.252853,-0.97114,0.495663,0.344914,-1.816043,-0.763496
1,0.210137,-1.336039,-1.188832,-0.760032,0.083653,-0.374305,1.119748,-0.171647,-0.15738,-0.543555,...,-1.531457,-0.259681,-0.34211,0.794137,-0.252853,-0.97114,-2.017502,0.344914,0.550648,-0.763496
2,-1.43298,-1.336039,-1.188832,-0.760032,0.083653,-0.374305,0.072606,-0.171647,-1.008546,0.538553,...,0.652973,-0.259681,2.923032,-1.259229,-0.252853,-0.97114,0.495663,0.344914,0.550648,-0.763496
3,-1.43298,1.310216,-0.278983,-0.760032,1.290114,-0.374305,-0.974536,-1.123771,-1.008546,-0.543555,...,0.652973,-0.259681,-0.34211,0.794137,-0.252853,1.029717,0.495663,0.344914,0.550648,1.309764
4,-0.611422,0.428131,0.630866,-0.760032,0.083653,-0.374305,0.072606,-0.171647,-1.008546,-0.543555,...,-1.531457,-0.259681,-0.34211,0.794137,-0.252853,-0.97114,0.495663,0.344914,-1.816043,-0.763496


In [37]:
# Keep the existing column order, but place the target column at the very end.
if target_col in df.columns:
    df = df[[*(name for name in df.columns if name != target_col), target_col]]

In [38]:
# Preview the frame to confirm that the target column is now the last one.
df.head()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,...,guardian_other,schoolsup_yes,famsup_yes,paid_yes,activities_yes,nursery_yes,higher_yes,internet_yes,romantic_yes,G3
0,1.031695,1.310216,1.540715,0.576718,0.083653,-0.374305,0.072606,-0.171647,0.693785,-0.543555,...,-0.259681,2.923032,-1.259229,-0.252853,-0.97114,0.495663,0.344914,-1.816043,-0.763496,11
1,0.210137,-1.336039,-1.188832,-0.760032,0.083653,-0.374305,1.119748,-0.171647,-0.15738,-0.543555,...,-0.259681,-0.34211,0.794137,-0.252853,-0.97114,-2.017502,0.344914,0.550648,-0.763496,11
2,-1.43298,-1.336039,-1.188832,-0.760032,0.083653,-0.374305,0.072606,-0.171647,-1.008546,0.538553,...,-0.259681,2.923032,-1.259229,-0.252853,-0.97114,0.495663,0.344914,0.550648,-0.763496,12
3,-1.43298,1.310216,-0.278983,-0.760032,1.290114,-0.374305,-0.974536,-1.123771,-1.008546,-0.543555,...,-0.259681,-0.34211,0.794137,-0.252853,1.029717,0.495663,0.344914,0.550648,1.309764,14
4,-0.611422,0.428131,0.630866,-0.760032,0.083653,-0.374305,0.072606,-0.171647,-1.008546,-0.543555,...,-0.259681,-0.34211,0.794137,-0.252853,-0.97114,0.495663,0.344914,-1.816043,-0.763496,13


In [39]:
# Write the reordered frame back to the path it was read from, replacing that file.
df.to_csv(f'./processed_data/preprocessed_bb_scaled/{dataset_name}_preprocessed.csv', index=False)