In [4]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from scipy.stats import zscore

# Load the dataset
file_path = "..\..\data\smoking-drinking\smoking_driking_dataset_Ver01.csv"
df = pd.read_csv(file_path)

# Feature list to process
features = [
    "sex", "age", "height", "weight", "waistline", "sight_left", "sight_right", "hear_left", 
    "hear_right", "SBP", "DBP", "BLDS", "tot_chole", "HDL_chole", "LDL_chole", "triglyceride", 
    "hemoglobin", "urine_protein", "serum_creatinine", "SGOT_AST", "SGOT_ALT", "gamma_GTP", 
    "SMK_stat_type_cd", "DRK_YN"
]

# Filter dataset to include only the relevant features
df = df[features]

# Preprocessing pipeline
def preprocess_data(data):
    # Step 1: Handle missing values
    print("Handling missing values...")
    imputer = SimpleImputer(strategy='mean')  # Use mean for numerical columns
    categorical_imputer = SimpleImputer(strategy='most_frequent')  # Mode for categorical columns

    # Separate numerical and categorical columns
    numerical_cols = data.select_dtypes(include=[np.number]).columns
    categorical_cols = data.select_dtypes(include=['object']).columns

    # Impute missing values
    data[numerical_cols] = imputer.fit_transform(data[numerical_cols])
    data[categorical_cols] = categorical_imputer.fit_transform(data[categorical_cols])

    # Step 2: Handle outliers using Z-score
    print("Handling outliers...")
    z_threshold = 3
    z_scores = data[numerical_cols].apply(zscore)  # Compute Z-scores for all numerical columns
    data = data[(abs(z_scores) < z_threshold).all(axis=1)]  # Retain rows where all Z-scores are within the threshold
    print("Shape after outlier removal:", data.shape)


    # Step 3: Encode categorical variables
    print("Encoding categorical variables...")
    label_encoders = {}
    for col in categorical_cols:
        le = LabelEncoder()
        data[col] = le.fit_transform(data[col])
        label_encoders[col] = le  # Save encoders for potential inverse transforms

    # Step 4: Standardize numerical features
    print("Standardizing numerical features...")
    scaler = StandardScaler()
    data[numerical_cols] = scaler.fit_transform(data[numerical_cols])

    # Step 5: Apply PCA for dimensionality reduction
    print("Applying PCA...")
    pca = PCA(n_components=0.95)  # Retain 95% variance
    pca_features = pca.fit_transform(data[numerical_cols])

    # Replace numerical columns with PCA-transformed features
    pca_columns = [f"PCA_{i+1}" for i in range(pca_features.shape[1])]
    pca_df = pd.DataFrame(pca_features, columns=pca_columns)

    # Combine PCA-transformed features with categorical features
    final_data = pd.concat([pca_df.reset_index(drop=True), data[categorical_cols].reset_index(drop=True)], axis=1)

    return final_data

# Apply preprocessing
df = preprocess_data(df)

# Display the preprocessed DataFrame
print("Preprocessing completed. Final DataFrame:")
print(df.head())

# Save the preprocessed DataFrame to a CSV file (optional)
df.to_csv("preprocessed_data.csv", index=False)

Handling missing values...
Handling outliers...
Shape after outlier removal: (846398, 24)
Encoding categorical variables...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[col] = le.fit_transform(data[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[col] = le.fit_transform(data[col])


Standardizing numerical features...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[numerical_cols] = scaler.fit_transform(data[numerical_cols])


Applying PCA...
Preprocessing completed. Final DataFrame:
      PCA_1     PCA_2     PCA_3     PCA_4     PCA_5     PCA_6     PCA_7  \
0  2.025544 -0.839784  0.093497 -0.391787 -0.352669  0.355572 -0.164468   
1  2.922318 -1.236869  1.051027 -1.166578 -0.399198  0.857755 -0.183687   
2  2.176327 -1.141228 -1.773216  2.290390  0.892830 -0.814453 -0.163571   
3  2.788370 -0.955241  0.200883 -0.923216  1.914884  1.290956 -0.225850   
4 -0.394487  0.184766  0.225146 -1.362446  1.285946 -0.096540 -0.117903   

      PCA_8     PCA_9    PCA_10    PCA_11    PCA_12    PCA_13    PCA_14  \
0 -0.800276 -0.820335 -0.132950 -0.989938 -0.346889  0.523006  1.393215   
1 -0.007633 -0.208420 -0.902020 -0.591838  1.065905 -0.207177 -0.626641   
2 -0.860071 -0.888167  0.363757  0.057846 -0.152175  0.582983  0.951937   
3 -0.473241 -1.610877  0.626448 -0.055571 -0.877178 -1.404855  1.232052   
4  0.100565 -0.316057 -0.071800 -0.052944 -0.232403  0.185553  0.093275   

     PCA_15  sex  DRK_YN  
0 -0.471360  