# Preprocessing

Data sources:
- [Taxi price](https://www.kaggle.com/datasets/denkuznetz/taxi-price-prediction/data)
- [Flood Prediction](https://www.kaggle.com/datasets/naiyakhalid/flood-prediction-dataset)
- [Airfoil Self-Noise](https://archive.ics.uci.edu/dataset/291/airfoil+self+noise)
- [Superconductivty Data](https://archive.ics.uci.edu/dataset/464/superconductivty+data)
- [SUPPORT2](https://archive.ics.uci.edu/dataset/880/support2)
- [Metro Interstate Traffic Volume](https://archive.ics.uci.edu/dataset/492/metro+interstate+traffic+volume)
- [BodyFat](https://www.kaggle.com/datasets/fedesoriano/body-fat-prediction-dataset)
- [delta_ailerons](https://www.dcc.fc.up.pt/~ltorgo/Regression/delta_ailerons.html)
- [delta_elevators](https://www.dcc.fc.up.pt/~ltorgo/Regression/delta_elevators.html)

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import os

## Load data

In [None]:
# Airfoil self-noise: whitespace-delimited text file.
airfoil_path = './data/airfoil_self_noise.dat'
df1 = pd.read_csv(airfoil_path, sep='\\s+')
df1.head()

In [None]:
# Flood prediction: comma-separated file.
flood_path = './data/flood.csv'
df2 = pd.read_csv(flood_path)
df2.head()

In [None]:
# Metro interstate traffic volume: comma-separated file.
traffic_path = './data/Metro_Interstate_Traffic_Volume.csv'
df3 = pd.read_csv(traffic_path)
df3.head()

In [None]:
# Superconductivity: comma-separated file.
superconductivity_path = "./data/superconductivity.csv"
df4 = pd.read_csv(superconductivity_path)
df4.head()

In [None]:
# SUPPORT2: comma-separated file.
support2_path = "./data/support2.csv"
df5 = pd.read_csv(support2_path)
df5.head()

In [None]:
# Taxi trip pricing: comma-separated file.
taxi_path = "./data/taxi_trip_pricing.csv"
df6 = pd.read_csv(taxi_path)
df6.head()

In [None]:
# Body fat: comma-separated file.
bodyfat_path = './data/bodyfat.csv'
df7 = pd.read_csv(bodyfat_path)
df7.head()

In [None]:
# Delta ailerons: whitespace-delimited text file.
ailerons_path = './data/delta_ailerons.data'
df8 = pd.read_csv(ailerons_path, sep='\\s+')
df8.head()

In [None]:
# Delta elevators: whitespace-delimited text file.
elevators_path = './data/delta_elevators.data'
df9 = pd.read_csv(elevators_path, sep='\\s+')
df9.head()

## Manipulate special columns

### df3 - expand the datetime column into features and binarize the holiday column

In [None]:
def extract_datetime_features(df, datetime_col):
    """Replace a datetime column with calendar and cyclical features.

    The input frame is not modified. The column is parsed with
    ``pd.to_datetime(..., errors='coerce')`` when it is not already a
    datetime dtype, so unparseable entries become ``NaT``.

    Added columns (all prefixed with ``<datetime_col>_``): ``year``,
    ``month``, ``day``, ``hour``, ``dayofweek``, ``quarter``, ``is_weekend``
    (``dayofweek >= 5``), followed by sine/cosine encodings of month
    (period 12), hour (period 24) and day of week (period 7).

    Args:
        df: Source dataframe containing ``datetime_col``.
        datetime_col: Name of the column to expand.

    Returns:
        A copy of ``df`` with the new columns appended and ``datetime_col``
        removed.
    """
    result = df.copy()
    print(f"Processing datetime column: {datetime_col}")

    stamps = result[datetime_col]
    if not pd.api.types.is_datetime64_any_dtype(stamps):
        stamps = pd.to_datetime(stamps, errors='coerce')

    month = stamps.dt.month
    hour = stamps.dt.hour
    weekday = stamps.dt.dayofweek

    # Plain calendar components, in the order they are appended.
    calendar_parts = {
        'year': stamps.dt.year,
        'month': month,
        'day': stamps.dt.day,
        'hour': hour,
        'dayofweek': weekday,
        'quarter': stamps.dt.quarter,
        'is_weekend': weekday >= 5,
    }
    for suffix, values in calendar_parts.items():
        result[f'{datetime_col}_{suffix}'] = values

    # Cyclical encodings: map each periodic component onto the unit circle.
    cyclical_parts = (
        ('month', month, 12),
        ('hour', hour, 24),
        ('dayofweek', weekday, 7),
    )
    for suffix, values, period in cyclical_parts:
        angle = 2 * np.pi * values / period
        result[f'{datetime_col}_{suffix}_sin'] = np.sin(angle)
        result[f'{datetime_col}_{suffix}_cos'] = np.cos(angle)

    return result.drop(columns=[datetime_col])

In [None]:
# Swap the raw timestamp column for its derived calendar/cyclical features.
df3 = extract_datetime_features(df=df3, datetime_col='date_time')
df3.head()

In [None]:
# Binarize `holiday`: 1 when the row names a holiday, 0 otherwise.
# NOTE(review): the raw file appears to mark non-holidays with the text
# "None". pandas >= 2.0 parses that token as NaN by default, but older
# versions keep it as a string, which would flag every row as a holiday.
# Both spellings are treated as "no holiday" here - confirm against the CSV.
holiday_raw = df3['holiday']
df3['holiday'] = (holiday_raw.notna() & (holiday_raw != 'None')).astype(int)
df3.head()

### df5 - choose target and remove extra cols

In [None]:
# Candidate outcome columns; the first one is used as the prediction target.
df5_valid_targets = ["hospdead", "death", "sfdm2"]
df5_target = df5_valid_targets[0]

# Drop the outcomes that were not chosen, plus 'slos' and 'd.time'.
unused_targets = [name for name in df5_valid_targets if name != df5_target]
df5_columns_to_drop = unused_targets + ['slos', 'd.time']

df5 = df5.drop(df5_columns_to_drop, axis=1)
df5.head()

## Common changes

In [None]:
# Each entry pairs a loaded dataframe with the name of its target column.
# The order here must match `dataset_names` in the save step, which pairs
# the processed frames with output file names by position.
datasets = [
    (df1, 'scaled-sound-pressure'),  # airfoil self-noise
    (df2, 'FloodProbability'),  # flood prediction
    (df3, 'traffic_volume'),  # metro interstate traffic volume
    (df4, 'critical_temp'),  # superconductivity
    (df5, df5_target),  # SUPPORT2 (target chosen above)
    (df6, 'Trip_Price'),  # taxi trip pricing
    (df7, 'BodyFat'),  # body fat
    (df8, 'Sa'),  # delta ailerons
    (df9, 'Se'),  # delta elevators
]

In [None]:
def preprocess_dataset(df, target_column=None):
    """Clean, impute, one-hot encode and standardize a dataset.

    Steps, in order:
      1. Drop duplicate rows.
      2. Drop feature columns with at most one distinct non-null value
         (the target column is never dropped here).
      3. Drop rows whose target is missing.
      4. If the target looks categorical (object/category/bool dtype, or a
         numeric dtype with fewer than 10 distinct values), drop classes
         that occur only once.
      5. Impute numeric features with the column mean and categorical
         features with the literal ``'missing'``.
      6. One-hot encode categorical features (binary ones keep a single
         indicator column) and convert boolean indicators to integers.
      7. Standardize every numeric feature column with ``StandardScaler``;
         this includes the integer indicator columns from step 6.

    The input frame is not modified. Progress is reported via ``print``.

    Args:
        df: Raw dataframe.
        target_column: Label of the target column, or ``None`` when the
            frame has no target. Any label other than ``None`` is accepted,
            including falsy ones such as ``0``.

    Returns:
        The processed dataframe; when a target is given it is the last
        column and is left unscaled and unencoded.

    Raises:
        KeyError: If ``target_column`` is given but not present in ``df``.
    """
    has_target = target_column is not None
    if has_target and target_column not in df.columns:
        raise KeyError(f"Target column {target_column!r} not found in dataframe")

    print(f"Original shape: {df.shape}")

    # Remove duplicates
    df = df.drop_duplicates()
    print(f"After removing duplicates: {df.shape}")

    # Remove feature columns with a single unique value. The target is
    # excluded so that a constant target cannot vanish before it is used.
    single_value_cols = [
        col for col in df.columns
        if col != target_column and df[col].nunique() <= 1
    ]
    df = df.drop(columns=single_value_cols)
    if single_value_cols:
        print(f"Dropped columns with single value: {single_value_cols}")
    print(f"After dropping single-value columns: {df.shape}")

    if has_target:
        # Remove rows with NaN or missing values in the target column
        before_drop = df.shape[0]
        df = df.dropna(subset=[target_column])
        after_drop = df.shape[0]
        print(f"Removed {before_drop - after_drop} rows with missing target values.")

        # Handle target column if it is categorical
        target = df[target_column]
        is_categorical = (
            target.dtype == 'object'
            or target.dtype == 'category'
            or target.dtype == 'bool'
            or (target.dtype.kind in 'ifu' and target.nunique() < 10)
        )

        if is_categorical:
            class_counts = target.value_counts()
            classes_to_keep = class_counts[class_counts > 1].index
            df = df[target.isin(classes_to_keep)]
            print(f"Target is categorical - after removing rare classes: {df.shape}")
        else:
            print("Target is continuous - skipping removal of rare classes")

    # Work on an explicit copy: the frame may be a filtered slice, and the
    # column assignments below must not write through to (or warn about)
    # the caller's data.
    df = df.copy()

    # 'number' covers every numeric width (int32, float32, uint8, ...), not
    # just int64/float64, so narrower columns are imputed and scaled too.
    numeric_cols = [
        col for col in df.select_dtypes(include='number').columns
        if col != target_column
    ]
    categorical_cols = [
        col for col in df.select_dtypes(include=['object', 'category', 'bool']).columns
        if col != target_column
    ]

    binary_categorical_cols = [col for col in categorical_cols if df[col].nunique() == 2]
    non_binary_categorical_cols = [col for col in categorical_cols if col not in binary_categorical_cols]

    print(f"Numeric columns: {len(numeric_cols)}")
    print(f"Binary categorical columns: {len(binary_categorical_cols)}")
    print(f"Non-binary categorical columns: {len(non_binary_categorical_cols)}")

    # Handle NaNs
    for col in numeric_cols:
        if df[col].isna().any():
            # Cast first so the (generally fractional) mean always fits.
            as_float = df[col].astype('float64')
            df[col] = as_float.fillna(as_float.mean())

    for col in categorical_cols:
        if df[col].isna().any():
            # Going through object dtype lets 'missing' be inserted even
            # when the column is a pandas Categorical without that category.
            df[col] = df[col].astype(object).fillna('missing')

    # One-hot encoding
    if categorical_cols:
        if non_binary_categorical_cols:
            df = pd.get_dummies(df, columns=non_binary_categorical_cols, dummy_na=False)

        if binary_categorical_cols:
            df = pd.get_dummies(df, columns=binary_categorical_cols, drop_first=True, dummy_na=False)

        # Every original bool feature has been encoded by now, so the only
        # bool columns left (besides possibly the target) are indicators.
        bool_cols = [
            col for col in df.columns
            if col != target_column and df[col].dtype == bool
        ]
        if bool_cols:
            df[bool_cols] = df[bool_cols].astype(int)

        print(f"Shape after one-hot encoding: {df.shape}")

    if has_target:
        X = df.drop(columns=[target_column])
        y = df[target_column].copy()
    else:
        X = df.copy()
        y = None

    # Standardize numeric features
    scaled_cols = X.select_dtypes(include='number').columns.tolist()
    if scaled_cols:
        scaler = StandardScaler()
        X[scaled_cols] = scaler.fit_transform(X[scaled_cols])

    if has_target:
        processed_df = pd.concat([X, y], axis=1)
    else:
        processed_df = X

    print(f"Final shape after preprocessing: {processed_df.shape}")
    return processed_df

In [None]:
# Run the shared preprocessing over every (dataframe, target) pair, keeping
# the results in the same order as `datasets`.
processed_dfs = []
for position, (raw_df, target_name) in enumerate(datasets, start=1):
    print(f"\nProcessing dataset {position}:")
    cleaned_df = preprocess_dataset(raw_df, target_name)
    processed_dfs.append(cleaned_df)
    print(f"First 5 rows of processed dataset {position}:")
    print(cleaned_df.head())

## Save data

In [None]:
output_dir = './../processed_data/preprocessed_data_jrsh'
os.makedirs(output_dir, exist_ok=True)

# Output file stems, in the same order as the processed dataframes.
dataset_names = [
    'airfoil_self_noise',
    'flood_probability',
    'traffic_volume',
    'critical_temp',
    'support2',
    'taxi_trip',
    'bodyfat',
    'delta_ailerons',
    'delta_elevators',
]

# zip() stops at the shorter sequence, so a mismatch would silently skip
# datasets (or pair them with the wrong file name); fail loudly instead.
if len(processed_dfs) != len(dataset_names):
    raise ValueError(
        f"Expected {len(dataset_names)} processed datasets, got {len(processed_dfs)}"
    )

for i, (df_processed, name) in enumerate(zip(processed_dfs, dataset_names), start=1):
    output_path = os.path.join(output_dir, f"{name}_processed.csv")
    df_processed.to_csv(output_path, index=False)
    print(f"Dataset {i} saved to {output_path}")
    print(f"Shape: {df_processed.shape}")