In [None]:
# Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.decomposition import PCA

# 1. Load the raw dataset from its CSV export.
# NOTE(review): this is a machine-specific absolute path; the script will only
# run as-is on a machine that has the file at this location.
raw_csv_path = "C:\\Users\\NIL_07\\Downloads\\2025-Y2-S1-MLB-B4G2-08\\data\\raw\\Lung Cancer.csv"
df = pd.read_csv(raw_csv_path)

# 2. Handle missing values (owner: danusha)
# Numeric columns are filled with their median; every other column is filled
# with its most frequent value (mode).
# NOTE(review): the fill statistics are computed on the full dataset, before the
# train/test split, and the loop also covers the label column — confirm this is
# acceptable for the downstream evaluation.
for col in df.columns:
    column = df[col]
    if not column.isna().any():
        # Nothing to impute; skipping also avoids computing a median/mode that
        # would never be used (and that may not be defined for this dtype).
        continue

    if pd.api.types.is_numeric_dtype(column):
        df[col] = column.fillna(column.median())
        continue

    # Branching on "not numeric" rather than on dtype == 'object' keeps other
    # non-numeric dtypes (e.g. category/string) away from median().
    modes = column.mode()
    if modes.empty:
        # The column is entirely missing, so there is no mode to fill with.
        # Leave it untouched instead of failing on modes[0].
        continue
    df[col] = column.fillna(modes.iloc[0])

# 3. Outlier removal with the 1.5 * IQR rule on continuous numeric columns (owner: me)
# Bounds are always computed from the imputed frame `df`, so each column is
# judged against the same reference distribution no matter which rows other
# columns have already excluded.
numeric_cols = df.select_dtypes(include=[np.number]).columns

# One boolean flag per row of `df`; a row survives only if it is inside the
# bounds of every filtered column.
keep_mask = np.ones(len(df), dtype=bool)
for col in numeric_cols:
    values = df[col]

    # The label column and two-valued (0/1 indicator) columns are categorical in
    # nature. For an imbalanced binary column Q1 == Q3, so both bounds collapse
    # onto the majority value and the filter would discard every minority-value
    # row (e.g. all positive cases of the label).
    if col == "survived" or values.nunique() <= 2:
        continue

    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1
    if IQR == 0:
        # At least half of the values are identical; the IQR rule would flag
        # every other value as an outlier, which is not meaningful.
        continue

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    in_range = (values >= lower_bound) & (values <= upper_bound)
    keep_mask &= in_range.to_numpy()

# Explicit copy so later column assignments on df_clean operate on an
# independent frame rather than on a slice of df.
df_clean = df.loc[keep_mask].copy()

# 4. Encode categorical variables (owner: chanuthi)
# Strategy:
#   * 'cancer_stage' (if present as text) -> integer codes via LabelEncoder
#   * other text columns with more than `high_cardinality_threshold` distinct
#     values -> integer codes via LabelEncoder
#   * remaining text columns -> one-hot encoded, first level dropped
categorical_cols = df_clean.select_dtypes(include=['object']).columns

# High-cardinality threshold (decides when to switch to LabelEncoder instead of
# one-hot encoding, to avoid creating a very wide frame).
high_cardinality_threshold = 20

# 'cancer_stage' is treated as ordinal. LabelEncoder numbers the labels in
# sorted order.
# NOTE(review): confirm that the sorted label order matches the intended stage
# order for the label spellings used in this dataset.
if 'cancer_stage' in categorical_cols:
    le = LabelEncoder()
    df_clean['cancer_stage'] = le.fit_transform(df_clean['cancer_stage'])
    # The column is numeric now, so remove it from the remaining work list.
    categorical_cols = categorical_cols.drop('cancer_stage')

# Label-encode high-cardinality columns in place and collect the
# low-cardinality ones so they can be one-hot encoded in a single call
# (one get_dummies per column rebuilds the whole frame each time).
low_cardinality_cols = []
for col in categorical_cols:
    if df_clean[col].nunique() > high_cardinality_threshold:
        le = LabelEncoder()
        df_clean[col] = le.fit_transform(df_clean[col])
    else:
        low_cardinality_cols.append(col)

if low_cardinality_cols:
    # drop_first=True drops one indicator per column to avoid a redundant
    # (perfectly collinear) dummy.
    df_clean = pd.get_dummies(df_clean, columns=low_cardinality_cols, drop_first=True)


# 5. Separate features & target (owner: yasith)
# The label column is "survived"; every other column of df_clean is a feature.
target_column = "survived"
y = df_clean[target_column]
X = df_clean.drop(columns=[target_column])

# 6. Train-test split
# Hold out 20% of the rows for testing. The split is stratified on the label and
# seeded so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)




# 8. Feature scaling (owner: abishek)
# The scaler is fitted on the training split only and then applied to both
# splits, so no statistics from the test split are used for scaling.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_test)
)

# 9. PCA for dimensionality reduction (owner: danusha)
# A fractional n_components asks PCA to keep as many components as needed to
# retain that share (95%) of the variance. PCA is fitted on the scaled training
# split and then used to project the scaled test split.
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Report the feature count before and after the projection.
for caption, matrix in (
    ("Original features:", X_train_scaled),
    ("Reduced features:", X_train_pca),
):
    print(caption, matrix.shape[1])

# 10. Save the PCA-projected splits, each with its label column, to CSV (owner: me)
# Component columns are named PC1, PC2, ... in projection order.
pca_columns = [f"PC{idx}" for idx in range(1, X_train_pca.shape[1] + 1)]


def _with_target(components, labels):
    """Return a frame of PCA components plus a trailing "target" label column."""
    frame = pd.DataFrame(components, columns=pca_columns)
    # .values drops the original row index so labels line up positionally with
    # the freshly indexed component rows.
    frame["target"] = labels.values
    return frame


train_pca_df = _with_target(X_train_pca, y_train)  # training split labels
test_pca_df = _with_target(X_test_pca, y_test)  # test split labels

# Write both frames without the index column.
for frame, out_name in (
    (train_pca_df, "processed_train_pca.csv"),
    (test_pca_df, "processed_test_pca.csv"),
):
    frame.to_csv(out_name, index=False)

print("✅ Processed datasets saved as 'processed_train_pca.csv' and 'processed_test_pca.csv'")




Original features: 20
Reduced features: 13
✅ Processed datasets saved as 'processed_train_pca.csv' and 'processed_test_pca.csv'
