# Preprocessing

Data sources:
- [Taxi price](https://www.kaggle.com/datasets/denkuznetz/taxi-price-prediction/data?fbclid=IwZXh0bgNhZW0CMTAAAR3EXwfcJrO4pqnY2pId6K4qb2sd01_YLdKmfEKlIZ8bxhvxc2iwQKw01ik_aem_yNsb3V7ymqavEDGZfZZBJA)
- [Flood Prediction](https://www.kaggle.com/datasets/naiyakhalid/flood-prediction-dataset?fbclid=IwZXh0bgNhZW0CMTAAAR31KS5Uu8KLo3FP9L0kwUM1iDdiYWZCD2q3fuxnrhup8UB4D-n5KLOinSg_aem_eziiiOqts_lvLdyq2uftIQ)
- [Airfoil Self-Noise](https://archive.ics.uci.edu/dataset/291/airfoil+self+noise?fbclid=IwZXh0bgNhZW0CMTAAAR1qsj7CYd84jygC5CQnGTuBnMqopmtgGYAFnm3EtJaz0BBv735Dhu2fut8_aem_lBvbJTztOR47e3Vk2nuNOw)
- [Superconductivity Data](https://archive.ics.uci.edu/dataset/464/superconductivty+data?fbclid=IwZXh0bgNhZW0CMTAAAR3dr3Eitf5ZHCekr4hKhaqapt2sgtuSCpSFWogqI5F0o9bnE9edFQro_mg_aem_CYa2w2f-668q3LO-k_DElg)
- [SUPPORT2](https://archive.ics.uci.edu/dataset/880/support2?fbclid=IwZXh0bgNhZW0CMTAAAR3dr3Eitf5ZHCekr4hKhaqapt2sgtuSCpSFWogqI5F0o9bnE9edFQro_mg_aem_CYa2w2f-668q3LO-k_DElg)
- [Metro Interstate Traffic Volume](https://archive.ics.uci.edu/dataset/492/metro+interstate+traffic+volume?fbclid=IwZXh0bgNhZW0CMTAAAR3dr3Eitf5ZHCekr4hKhaqapt2sgtuSCpSFWogqI5F0o9bnE9edFQro_mg_aem_CYa2w2f-668q3LO-k_DElg)

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import os

## Load data

In [2]:
# Airfoil self-noise: columns are separated by runs of whitespace.
df1 = pd.read_csv('./data/airfoil_self_noise.dat', sep=r'\s+')
df1.head(n=5)

Unnamed: 0,frequency,angle_of_attack,chord-length,free-stream-velocity,suction-side-displacement-thickness,scaled-sound-pressure
0,800,0.0,0.3048,71.3,0.002663,126.201
1,1000,0.0,0.3048,71.3,0.002663,125.201
2,1250,0.0,0.3048,71.3,0.002663,125.951
3,1600,0.0,0.3048,71.3,0.002663,127.591
4,2000,0.0,0.3048,71.3,0.002663,127.461


In [3]:
# Flood prediction dataset (comma-separated).
df2 = pd.read_csv(filepath_or_buffer='./data/flood.csv')
df2.head(n=5)

Unnamed: 0,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,Encroachments,...,DrainageSystems,CoastalVulnerability,Landslides,Watersheds,DeterioratingInfrastructure,PopulationScore,WetlandLoss,InadequatePlanning,PoliticalFactors,FloodProbability
0,3,8,6,6,4,4,6,2,3,2,...,10,7,4,2,3,4,3,2,6,0.45
1,8,4,5,7,7,9,1,5,5,4,...,9,2,6,2,1,1,9,1,3,0.475
2,3,10,4,1,7,5,4,7,4,9,...,7,4,4,8,6,1,8,3,6,0.515
3,4,4,2,7,3,4,1,4,6,4,...,4,2,6,6,8,8,6,6,10,0.52
4,3,7,5,2,5,8,5,2,7,5,...,7,6,5,3,3,4,4,3,4,0.475


In [4]:
# Metro interstate traffic volume dataset (comma-separated).
df3 = pd.read_csv(filepath_or_buffer='./data/Metro_Interstate_Traffic_Volume.csv')
df3.head(n=5)

Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,date_time,traffic_volume
0,,288.28,0.0,0.0,40,Clouds,scattered clouds,2012-10-02 09:00:00,5545
1,,289.36,0.0,0.0,75,Clouds,broken clouds,2012-10-02 10:00:00,4516
2,,289.58,0.0,0.0,90,Clouds,overcast clouds,2012-10-02 11:00:00,4767
3,,290.13,0.0,0.0,90,Clouds,overcast clouds,2012-10-02 12:00:00,5026
4,,291.14,0.0,0.0,75,Clouds,broken clouds,2012-10-02 13:00:00,4918


In [5]:
# Superconductivity dataset (comma-separated).
df4 = pd.read_csv(filepath_or_buffer='./data/superconductivity.csv')
df4.head(n=5)

Unnamed: 0,number_of_elements,mean_atomic_mass,wtd_mean_atomic_mass,gmean_atomic_mass,wtd_gmean_atomic_mass,entropy_atomic_mass,wtd_entropy_atomic_mass,range_atomic_mass,wtd_range_atomic_mass,std_atomic_mass,...,wtd_mean_Valence,gmean_Valence,wtd_gmean_Valence,entropy_Valence,wtd_entropy_Valence,range_Valence,wtd_range_Valence,std_Valence,wtd_std_Valence,critical_temp
0,4,88.944468,57.862692,66.361592,36.116612,1.181795,1.062396,122.90607,31.794921,51.968828,...,2.257143,2.213364,2.219783,1.368922,1.066221,1,1.085714,0.433013,0.437059,29.0
1,5,92.729214,58.518416,73.132787,36.396602,1.449309,1.057755,122.90607,36.161939,47.094633,...,2.257143,1.888175,2.210679,1.557113,1.047221,2,1.128571,0.632456,0.468606,26.0
2,4,88.944468,57.885242,66.361592,36.122509,1.181795,0.97598,122.90607,35.741099,51.968828,...,2.271429,2.213364,2.232679,1.368922,1.029175,1,1.114286,0.433013,0.444697,19.0
3,4,88.944468,57.873967,66.361592,36.11956,1.181795,1.022291,122.90607,33.76801,51.968828,...,2.264286,2.213364,2.226222,1.368922,1.048834,1,1.1,0.433013,0.440952,22.0
4,4,88.944468,57.840143,66.361592,36.110716,1.181795,1.129224,122.90607,27.848743,51.968828,...,2.242857,2.213364,2.206963,1.368922,1.096052,1,1.057143,0.433013,0.428809,23.0


In [6]:
# SUPPORT2 dataset (comma-separated).
df5 = pd.read_csv(filepath_or_buffer='./data/support2.csv')
df5.head(n=5)

Unnamed: 0,age,death,sex,hospdead,slos,d.time,dzgroup,dzclass,num.co,edu,...,crea,sod,ph,glucose,bun,urine,adlp,adls,sfdm2,adlsc
1,62.84998,0,male,0,5,2029,Lung Cancer,Cancer,0,11.0,...,1.199951,141.0,7.459961,,,,7.0,7.0,,7.0
2,60.33899,1,female,1,4,4,Cirrhosis,COPD/CHF/Cirrhosis,2,12.0,...,5.5,132.0,7.25,,,,,1.0,<2 mo. follow-up,1.0
3,52.74698,1,female,0,17,47,Cirrhosis,COPD/CHF/Cirrhosis,2,12.0,...,2.0,134.0,7.459961,,,,1.0,0.0,<2 mo. follow-up,0.0
4,42.38498,1,female,0,3,133,Lung Cancer,Cancer,2,11.0,...,0.799927,139.0,,,,,0.0,0.0,no(M2 and SIP pres),0.0
5,79.88495,0,female,0,16,2029,ARF/MOSF w/Sepsis,ARF/MOSF,1,,...,0.799927,143.0,7.509766,,,,,2.0,no(M2 and SIP pres),2.0


In [7]:
# Taxi trip pricing dataset (comma-separated).
df6 = pd.read_csv(filepath_or_buffer='./data/taxi_trip_pricing.csv')
df6.head(n=5)

Unnamed: 0,Trip_Distance_km,Time_of_Day,Day_of_Week,Passenger_Count,Traffic_Conditions,Weather,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Trip_Price
0,19.35,Morning,Weekday,3.0,Low,Clear,3.56,0.8,0.32,53.82,36.2624
1,47.59,Afternoon,Weekday,1.0,High,Clear,,0.62,0.43,40.57,
2,36.87,Evening,Weekend,1.0,High,Clear,2.7,1.21,0.15,37.27,52.9032
3,30.33,Evening,Weekday,4.0,Low,,3.48,0.51,0.15,116.81,36.4698
4,,Evening,Weekday,3.0,High,Clear,2.93,0.63,0.32,22.64,15.618


## Manipulate special columns

### df3 - expand the datetime column and binarize the holiday column

In [8]:
def extract_datetime_features(df, datetime_col):
    """Replace a datetime column with calendar and cyclical features.

    The input frame is not modified; a copy is expanded and returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``datetime_col``.
    datetime_col : str
        Name of the column to expand. If it is not already a datetime64
        dtype it is parsed with ``pd.to_datetime(..., errors='coerce')``,
        so unparseable entries become NaT.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without ``datetime_col`` and with the new columns
        ``<datetime_col>_year``, ``_month``, ``_day``, ``_hour``,
        ``_dayofweek``, ``_quarter``, ``_is_weekend`` (dayofweek >= 5) and
        sine/cosine encodings of month (period 12), hour (period 24) and
        dayofweek (period 7).
    """
    print(f"Processing datetime column: {datetime_col}")

    expanded = df.copy()
    stamps = expanded[datetime_col]
    if not pd.api.types.is_datetime64_any_dtype(stamps):
        stamps = pd.to_datetime(stamps, errors='coerce')
    parts = stamps.dt

    def on_circle(values, period):
        # Map a periodic quantity onto the unit circle.
        angle = 2 * np.pi * values / period
        return np.sin(angle), np.cos(angle)

    month_sin, month_cos = on_circle(parts.month, 12)
    hour_sin, hour_cos = on_circle(parts.hour, 24)
    dow_sin, dow_cos = on_circle(parts.dayofweek, 7)

    # Insertion order of this mapping is the column order of the result.
    derived = {
        f'{datetime_col}_year': parts.year,
        f'{datetime_col}_month': parts.month,
        f'{datetime_col}_day': parts.day,
        f'{datetime_col}_hour': parts.hour,
        f'{datetime_col}_dayofweek': parts.dayofweek,
        f'{datetime_col}_quarter': parts.quarter,
        f'{datetime_col}_is_weekend': parts.dayofweek >= 5,
        f'{datetime_col}_month_sin': month_sin,
        f'{datetime_col}_month_cos': month_cos,
        f'{datetime_col}_hour_sin': hour_sin,
        f'{datetime_col}_hour_cos': hour_cos,
        f'{datetime_col}_dayofweek_sin': dow_sin,
        f'{datetime_col}_dayofweek_cos': dow_cos,
    }
    for column_name, values in derived.items():
        expanded[column_name] = values

    return expanded.drop(columns=[datetime_col])

In [9]:
# Swap the raw `date_time` column for its derived calendar/cyclical features.
df3 = extract_datetime_features(df=df3, datetime_col='date_time')
df3.head(n=5)

Processing datetime column: date_time


Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,traffic_volume,date_time_year,date_time_month,...,date_time_hour,date_time_dayofweek,date_time_quarter,date_time_is_weekend,date_time_month_sin,date_time_month_cos,date_time_hour_sin,date_time_hour_cos,date_time_dayofweek_sin,date_time_dayofweek_cos
0,,288.28,0.0,0.0,40,Clouds,scattered clouds,5545,2012,10,...,9,1,4,False,-0.866025,0.5,0.7071068,-0.707107,0.781831,0.62349
1,,289.36,0.0,0.0,75,Clouds,broken clouds,4516,2012,10,...,10,1,4,False,-0.866025,0.5,0.5,-0.866025,0.781831,0.62349
2,,289.58,0.0,0.0,90,Clouds,overcast clouds,4767,2012,10,...,11,1,4,False,-0.866025,0.5,0.258819,-0.965926,0.781831,0.62349
3,,290.13,0.0,0.0,90,Clouds,overcast clouds,5026,2012,10,...,12,1,4,False,-0.866025,0.5,1.224647e-16,-1.0,0.781831,0.62349
4,,291.14,0.0,0.0,75,Clouds,broken clouds,4918,2012,10,...,13,1,4,False,-0.866025,0.5,-0.258819,-0.965926,0.781831,0.62349


In [10]:
# Binarize `holiday`: 1 when a holiday name is present, 0 otherwise.
# - The literal string 'None' is treated like a missing value, so the result
#   does not depend on whether read_csv parsed that marker as NaN.
# - 0 is treated as "no holiday" too, which makes the cell idempotent:
#   re-running it on the already binarized column leaves the values unchanged
#   instead of turning every row into 1.
holiday_raw = df3['holiday']
is_holiday = holiday_raw.notna() & ~holiday_raw.isin(['None', 0])
# Explicit int64 keeps the dtype identical on every platform.
df3['holiday'] = is_holiday.astype('int64')
df3.head()

Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,traffic_volume,date_time_year,date_time_month,...,date_time_hour,date_time_dayofweek,date_time_quarter,date_time_is_weekend,date_time_month_sin,date_time_month_cos,date_time_hour_sin,date_time_hour_cos,date_time_dayofweek_sin,date_time_dayofweek_cos
0,0,288.28,0.0,0.0,40,Clouds,scattered clouds,5545,2012,10,...,9,1,4,False,-0.866025,0.5,0.7071068,-0.707107,0.781831,0.62349
1,0,289.36,0.0,0.0,75,Clouds,broken clouds,4516,2012,10,...,10,1,4,False,-0.866025,0.5,0.5,-0.866025,0.781831,0.62349
2,0,289.58,0.0,0.0,90,Clouds,overcast clouds,4767,2012,10,...,11,1,4,False,-0.866025,0.5,0.258819,-0.965926,0.781831,0.62349
3,0,290.13,0.0,0.0,90,Clouds,overcast clouds,5026,2012,10,...,12,1,4,False,-0.866025,0.5,1.224647e-16,-1.0,0.781831,0.62349
4,0,291.14,0.0,0.0,75,Clouds,broken clouds,4918,2012,10,...,13,1,4,False,-0.866025,0.5,-0.258819,-0.965926,0.781831,0.62349


### df5 - choose target and remove extra cols

In [11]:
# Candidate target columns for SUPPORT2; the first one is the target used here.
df5_valid_targets = ["hospdead", "death", "sfdm2"]
df5_target = df5_valid_targets[0]

# Drop the candidate targets that were not chosen, plus 'slos' and 'd.time'
# (presumably outcome-related columns that would leak the target - TODO confirm).
df5_columns_to_drop = list(filter(lambda name: name != df5_target, df5_valid_targets))
df5_columns_to_drop.extend(['slos', 'd.time'])

df5 = df5.drop(columns=df5_columns_to_drop)
df5.head(n=5)

Unnamed: 0,age,sex,hospdead,dzgroup,dzclass,num.co,edu,income,scoma,charges,...,bili,crea,sod,ph,glucose,bun,urine,adlp,adls,adlsc
1,62.84998,male,0,Lung Cancer,Cancer,0,11.0,$11-$25k,0.0,9715.0,...,0.199982,1.199951,141.0,7.459961,,,,7.0,7.0,7.0
2,60.33899,female,1,Cirrhosis,COPD/CHF/Cirrhosis,2,12.0,$11-$25k,44.0,34496.0,...,,5.5,132.0,7.25,,,,,1.0,1.0
3,52.74698,female,0,Cirrhosis,COPD/CHF/Cirrhosis,2,12.0,under $11k,0.0,41094.0,...,2.199707,2.0,134.0,7.459961,,,,1.0,0.0,0.0
4,42.38498,female,0,Lung Cancer,Cancer,2,11.0,under $11k,0.0,3075.0,...,,0.799927,139.0,,,,,0.0,0.0,0.0
5,79.88495,female,0,ARF/MOSF w/Sepsis,ARF/MOSF,1,,,26.0,50127.0,...,,0.799927,143.0,7.509766,,,,,2.0,2.0


## Common changes

In [None]:
# (frame, target column) pairs, in the order used by the processing and save cells.
datasets = list(zip(
    (df1, df2, df3, df4, df5, df6),
    (
        'scaled-sound-pressure',
        'FloodProbability',
        'traffic_volume',
        'critical_temp',
        df5_target,
        'Trip_Price',
    ),
))

In [13]:
def preprocess_dataset(df, target_column=None):
    """Clean, impute, one-hot encode and standardize a dataset.

    Steps, in order:

    1. Drop duplicated rows.
    2. Drop feature columns with at most one distinct non-null value
       (the target column is never dropped).
    3. If the target looks categorical (object/category/bool dtype, or a
       numeric dtype with fewer than 10 distinct values), drop the rows
       whose class occurs only once.
    4. Impute numeric features with the column mean and categorical
       features (object/category/bool) with the literal ``'missing'``.
    5. One-hot encode categorical features as 0/1 integer columns; columns
       with exactly two distinct non-null values use ``drop_first=True``.
    6. Standardize the numeric features with ``StandardScaler``. The
       one-hot columns are left as 0/1.

    The target column is never imputed, encoded or scaled.
    NOTE(review): rows whose continuous target is missing are kept as they
    are - confirm whether downstream code expects them to be removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    target_column : hashable, optional
        Label of the target column. Any label other than ``None`` is
        accepted, including falsy ones such as ``0``.

    Returns
    -------
    pandas.DataFrame
        Processed features followed by the target column (when given).

    Raises
    ------
    KeyError
        If ``target_column`` is given but is not a column of ``df``.
    """
    has_target = target_column is not None
    if has_target and target_column not in df.columns:
        raise KeyError(f"Target column {target_column!r} not found in dataframe")

    print(f"Original shape: {df.shape}")

    # duplicates
    df = df.drop_duplicates()
    print(f"After removing duplicates: {df.shape}")

    # columns with a single unique value; the target is exempt so that a
    # constant target does not disappear and break the steps below
    single_value_cols = [
        col for col in df.columns
        if not (has_target and col == target_column) and df[col].nunique() <= 1
    ]
    df = df.drop(columns=single_value_cols)
    if single_value_cols:
        print(f"Dropped columns with single value: {single_value_cols}")
    print(f"After dropping single-value columns: {df.shape}")

    # handle target column if is categorical
    if has_target:
        target = df[target_column]
        is_categorical = (
            pd.api.types.is_object_dtype(target.dtype)
            or isinstance(target.dtype, pd.CategoricalDtype)
            or pd.api.types.is_bool_dtype(target.dtype)
            or (target.dtype.kind in 'ifu' and target.nunique() < 10)
        )

        if is_categorical:
            class_counts = target.value_counts()
            classes_to_keep = class_counts[class_counts > 1].index
            df = df[target.isin(classes_to_keep)]
            print(f"Target is categorical - after removing rare classes: {df.shape}")
        else:
            print(f"Target is continuous - skipping removal of rare classes")

    # Own the data from here on: the column assignments below must neither
    # warn about writing to a slice nor reach back into the caller's frame.
    df = df.copy()

    # Every non-boolean numeric dtype counts as numeric (int32, float32,
    # unsigned, ...), not only int64/float64.
    numeric_cols = [
        col for col, dtype in df.dtypes.items()
        if pd.api.types.is_numeric_dtype(dtype) and not pd.api.types.is_bool_dtype(dtype)
    ]
    categorical_cols = df.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()

    if has_target:
        numeric_cols = [col for col in numeric_cols if col != target_column]
        categorical_cols = [col for col in categorical_cols if col != target_column]

    binary_categorical_cols = [col for col in categorical_cols if df[col].nunique() == 2]
    non_binary_categorical_cols = [col for col in categorical_cols if col not in binary_categorical_cols]

    print(f"Numeric columns: {len(numeric_cols)}")
    print(f"Binary categorical columns: {len(binary_categorical_cols)}")
    print(f"Non-binary categorical columns: {len(non_binary_categorical_cols)}")

    # Handle NaNs
    for col in numeric_cols:
        df[col] = df[col].fillna(df[col].mean())

    for col in categorical_cols:
        df[col] = df[col].fillna('missing')

    # one-hot encoding; dtype=int yields 0/1 columns directly
    if categorical_cols:
        if non_binary_categorical_cols:
            df = pd.get_dummies(df, columns=non_binary_categorical_cols, dummy_na=False, dtype=int)

        if binary_categorical_cols:
            df = pd.get_dummies(df, columns=binary_categorical_cols, drop_first=True,
                                dummy_na=False, dtype=int)

        print(f"Shape after one-hot encoding: {df.shape}")

    if has_target:
        X = df.drop(columns=[target_column])
        y = df[target_column].copy()
    else:
        X = df.copy()
        y = None

    # standardize numeric features: only the columns identified as numeric
    # before encoding, so the one-hot columns stay 0/1 on every platform
    if numeric_cols and not X.empty:
        scaler = StandardScaler()
        X[numeric_cols] = scaler.fit_transform(X[numeric_cols])

    if has_target:
        processed_df = pd.concat([X, y], axis=1)
    else:
        processed_df = X

    print(f"Final shape after preprocessing: {processed_df.shape}")
    return processed_df

In [14]:
# Run the shared preprocessing on every (frame, target) pair and keep the
# results in the same order as `datasets`.
processed_dfs = []
for i, dataset in enumerate(datasets):
    df, target = dataset
    print(f"\nProcessing dataset {i+1}:")
    processed_df = preprocess_dataset(df, target)
    processed_dfs += [processed_df]
    print(f"First 5 rows of processed dataset {i+1}:")
    print(processed_df.head())


Processing dataset 1:
Original shape: (1503, 6)
After removing duplicates: (1503, 6)
After dropping single-value columns: (1503, 6)
Target is continuous - skipping removal of rare classes
Numeric columns: 5
Binary categorical columns: 0
Non-binary categorical columns: 0
Final shape after preprocessing: (1503, 6)
First 5 rows of processed dataset 1:
   frequency  angle_of_attack  chord-length  free-stream-velocity  \
0  -0.662023        -1.146403      1.799299              1.312935   
1  -0.598561        -1.146403      1.799299              1.312935   
2  -0.519235        -1.146403      1.799299              1.312935   
3  -0.408177        -1.146403      1.799299              1.312935   
4  -0.281255        -1.146403      1.799299              1.312935   

   suction-side-displacement-thickness  scaled-sound-pressure  
0                            -0.644805                126.201  
1                            -0.644805                125.201  
2                            -0.644805   

## Save data

In [15]:
output_dir = './../processed_data/preprocessed_data_jrsh'

# File-name stems, in the same order as `processed_dfs`.
dataset_names = [
    'airfoil_self_noise',
    'flood_probability',
    'traffic_volume',
    'critical_temp',
    'support2',
    'taxi_trip'
]

# zip() stops at the shorter input, so a mismatch would silently skip or
# mislabel datasets; fail loudly before anything is written.
if len(dataset_names) != len(processed_dfs):
    raise ValueError(
        f"Expected {len(dataset_names)} processed datasets, got {len(processed_dfs)}"
    )

os.makedirs(output_dir, exist_ok=True)

for i, (df_processed, name) in enumerate(zip(processed_dfs, dataset_names)):
    output_path = os.path.join(output_dir, f"{name}_processed.csv")
    # index=False: the row index is not written to the CSV
    df_processed.to_csv(output_path, index=False)
    print(f"Dataset {i+1} saved to {output_path}")
    print(f"Shape: {df_processed.shape}")

Dataset 1 saved to ./../processed_data/preprocessed_data_jrsh\airfoil_self_noise_processed.csv
Shape: (1503, 6)
Dataset 2 saved to ./../processed_data/preprocessed_data_jrsh\flood_probability_processed.csv
Shape: (50000, 21)
Dataset 3 saved to ./../processed_data/preprocessed_data_jrsh\traffic_volume_processed.csv
Shape: (48187, 68)
Dataset 4 saved to ./../processed_data/preprocessed_data_jrsh\critical_temp_processed.csv
Shape: (21197, 82)
Dataset 5 saved to ./../processed_data/preprocessed_data_jrsh\support2_processed.csv
Shape: (9105, 67)
Dataset 6 saved to ./../processed_data/preprocessed_data_jrsh\taxi_trip_processed.csv
Shape: (1000, 22)
