In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

In [3]:
# Relative path of the source CSV that the rest of the notebook works on.
data_path = '../data/COPD_Data_Nepal.csv'

# Read the file once into `df`; later cells derive new frames from it.
df = pd.read_csv(filepath_or_buffer=data_path)

In [5]:
# One-hot encoding for categorical variables.
# The dense-output switch is spelled `sparse_output`; the older `sparse`
# keyword is rejected by current scikit-learn releases with
# "TypeError: ... unexpected keyword argument 'sparse'".
encoder = OneHotEncoder(drop='first', sparse_output=False)

# NOTE: the same encoder object is refit for every column, so once this cell
# has run `encoder.categories_` only describes 'Location' (the last fit).
encoded_gender = encoder.fit_transform(df[['Gender']])
encoded_smoking = encoder.fit_transform(df[['Smoking_Status']])
encoded_location = encoder.fit_transform(df[['Location']])

In [31]:
# One-hot encoding for categorical variables.
# One encoder per feature, so each keeps its own fitted `categories_`.
# drop='first' omits the indicator column of the first category of a feature;
# sparse_output=False makes fit_transform return a dense array.
gender_encoder = OneHotEncoder(drop='first', sparse_output=False)
smoking_encoder = OneHotEncoder(drop='first', sparse_output=False)
location_encoder = OneHotEncoder(drop='first', sparse_output=False)

# Encode 'Gender', 'Smoking_Status', and 'Location', each with the dedicated
# encoder created above (not a shared `encoder`, which this cell never defines
# and which would leave the three encoders above unfitted).
encoded_gender = gender_encoder.fit_transform(df[['Gender']])
encoded_smoking = smoking_encoder.fit_transform(df[['Smoking_Status']])
encoded_location = location_encoder.fit_transform(df[['Location']])

In [33]:
# Add encoded columns: one name per category kept for each feature.
# Every list is built from that feature's own encoder. Reading a single shared
# encoder would give all three lists the categories of whichever column was
# fitted last. Each encoder is fitted on its column right here (fit() returns
# the encoder itself), so `categories_` exists and matches the feature.
# The [1:] slice skips the first category, the one drop='first' leaves out.
gender_columns = [
    f"Gender_{cat}"
    for cat in gender_encoder.fit(df[['Gender']]).categories_[0][1:]
]
smoking_columns = [
    f"Smoking_Status_{cat}"
    for cat in smoking_encoder.fit(df[['Smoking_Status']]).categories_[0][1:]
]
location_columns = [
    f"Location_{cat}"
    for cat in location_encoder.fit(df[['Location']]).categories_[0][1:]
]

In [37]:
# Fit every per-feature encoder on its own column and keep the encoded arrays.
# The pairs are processed in order: Gender, Smoking_Status, Location.
encoded_gender, encoded_smoking, encoded_location = (
    feature_encoder.fit_transform(df[[feature]])
    for feature_encoder, feature in (
        (gender_encoder, 'Gender'),
        (smoking_encoder, 'Smoking_Status'),
        (location_encoder, 'Location'),
    )
)

In [39]:
# Step 2: report the shape of each encoded array to see how many indicator
# columns every feature produced.
print(
    f"Encoded Gender Shape: {encoded_gender.shape}",
    f"Encoded Smoking Shape: {encoded_smoking.shape}",
    f"Encoded Location Shape: {encoded_location.shape}",
    sep="\n",
)

# Step 3: list the categories each encoder learned; the column names built
# afterwards are derived from these.
print(
    f"Gender Categories: {gender_encoder.categories_[0]}",
    f"Smoking Categories: {smoking_encoder.categories_[0]}",
    f"Location Categories: {location_encoder.categories_[0]}",
    sep="\n",
)

Encoded Gender Shape: (1800, 1)
Encoded Smoking Shape: (1800, 2)
Encoded Location Shape: (1800, 9)
Gender Categories: ['Female' 'Male']
Smoking Categories: ['Current' 'Former' 'Never']
Location Categories: ['Bhaktapur' 'Biratnagar' 'Butwal' 'Chitwan' 'Dharan' 'Hetauda'
 'Kathmandu' 'Lalitpur' 'Nepalgunj' 'Pokhara']


In [41]:
def _retained_categories(fitted_encoder):
    """Return the learned categories of the single encoded feature, minus the first one."""
    return fitted_encoder.categories_[0][1:]


# Name the encoded columns after the categories that remain once the first
# category of each feature is skipped.
gender_columns = [f"Gender_{level}" for level in _retained_categories(gender_encoder)]
smoking_columns = [f"Smoking_Status_{level}" for level in _retained_categories(smoking_encoder)]
location_columns = [f"Location_{level}" for level in _retained_categories(location_encoder)]

In [43]:
# Drop original categorical columns and concatenate the one-hot encoded columns.
# The encoded arrays are positional (built from df's rows in order), so each
# helper frame is given df's index explicitly. Without it the helper frames get
# a fresh 0..n-1 index, and pd.concat(axis=1) aligns on index labels: rows would
# be mismatched or NaN-padded whenever df does not carry the default index.
df_encoded = pd.concat(
    [
        df.drop(columns=['Gender', 'Smoking_Status', 'Location']),
        pd.DataFrame(encoded_gender, columns=gender_columns, index=df.index),
        pd.DataFrame(encoded_smoking, columns=smoking_columns, index=df.index),
        pd.DataFrame(encoded_location, columns=location_columns, index=df.index),
    ],
    axis=1,
)

In [45]:
# Standardize the numerical features listed below, overwriting them in df_encoded.
# NOTE(review): the scaler is fit on every row of df_encoded, including rows a
# later cell places in the test set; consider fitting on the training split only.
scaled_columns = ['Age', 'BMI', 'Air_Pollution_Level']
scaler = StandardScaler()
numeric_block = df_encoded[scaled_columns]
df_encoded[scaled_columns] = scaler.fit(numeric_block).transform(numeric_block)

In [47]:
# Optional: persist the encoded and scaled frame as CSV, without the index column.
df_encoded.to_csv(
    path_or_buf='../data/copd_preprocessed.csv',
    index=False,
)


In [49]:
# Separate the target column from the features, then hold out 30% of the rows
# as a test set. random_state is fixed so the split is repeatable.
y = df_encoded['COPD_Diagnosis']
X = df_encoded.drop(columns='COPD_Diagnosis')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)


In [53]:
# Show the (rows, columns) of the feature matrices produced by the split.
print(
    f"Training set shape: {X_train.shape}, Test set shape: {X_test.shape}"
)

Training set shape: (1260, 19), Test set shape: (540, 19)
