In [8]:
# Let's load the cleaned healthcare dataset provided by the user and perform detailed feature engineering.
import pandas as pd

# Load the cleaned healthcare dataset
data = pd.read_csv('Cleaned_healthcare_dataset.csv')

# Display the first few rows and column names to understand the structure
data_info = data.info()
data_head = data.head()
data_columns = data.columns

data_info, data_head, data_columns.tolist()  # Display dataset info and columns to guide feature engineering


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51000 entries, 0 to 50999
Data columns (total 26 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Name                51000 non-null  object 
 1   Age                 51000 non-null  int64  
 2   Gender              51000 non-null  int64  
 3   Medical Condition   51000 non-null  int64  
 4   Date of Admission   51000 non-null  object 
 5   Doctor              51000 non-null  int64  
 6   Hospital            51000 non-null  int64  
 7   Insurance Provider  51000 non-null  int64  
 8   Billing Amount      51000 non-null  float64
 9   Room Number         51000 non-null  int64  
 10  Discharge Date      51000 non-null  object 
 11  Medication          51000 non-null  int64  
 12  Blood Type_0        51000 non-null  int64  
 13  Blood Type_1        51000 non-null  int64  
 14  Blood Type_2        51000 non-null  int64  
 15  Blood Type_3        51000 non-null  int64  
 16  Bloo

(None,
         Name  Age  Gender  Medical Condition Date of Admission  Doctor  \
 0  Patient_0   58       1                  3        2023-01-01       3   
 1  Patient_1   71       0                  1        2023-01-02       7   
 2  Patient_2   48       0                  2        2023-01-03       7   
 3  Patient_3   34       1                  0        2023-01-04       7   
 4  Patient_4   62       0                  3        2023-01-05       4   
 
    Hospital  Insurance Provider  Billing Amount  Room Number  ...  \
 0         3                   1       10.668620          229  ...   
 1         0                   4        9.981405          431  ...   
 2         0                   0       10.139826          277  ...   
 3         3                   0       10.146779          372  ...   
 4         3                   0       10.469040          390  ...   
 
   Blood Type_4  Blood Type_5  Blood Type_6  Blood Type_7  Admission Type_0  \
 0            0             0           

In [9]:
# Step 1: Date-Based Features
data['Date of Admission'] = pd.to_datetime(data['Date of Admission'])
data['Discharge Date'] = pd.to_datetime(data['Discharge Date'])

# Calculate Length of Stay
data['Length of Stay'] = (data['Discharge Date'] - data['Date of Admission']).dt.days

# Extract day, month, and weekday from Date of Admission
data['Admission Day'] = data['Date of Admission'].dt.day
data['Admission Month'] = data['Date of Admission'].dt.month
data['Admission Weekday'] = data['Date of Admission'].dt.weekday

# Drop date columns after extracting features
data.drop(columns=['Date of Admission', 'Discharge Date'], inplace=True)

# Step 2: Interaction Terms
# Age and Length of Stay interaction
data['Age x Length of Stay'] = data['Age'] * data['Length of Stay']

# Interaction between Admission Type and Billing Amount
# Assuming Admission Type columns are one-hot encoded, we create an interaction with Billing Amount
data['Admission Type_0 x Billing'] = data['Admission Type_0'] * data['Billing Amount']
data['Admission Type_1 x Billing'] = data['Admission Type_1'] * data['Billing Amount']
data['Admission Type_2 x Billing'] = data['Admission Type_2'] * data['Billing Amount']

# Step 3: Cost-Related Features
# Average Billing per Day (Billing Amount / Length of Stay)
data['Avg Billing per Day'] = data['Billing Amount'] / data['Length of Stay']

# Billing per Age to capture age-related billing patterns
data['Billing per Age'] = data['Billing Amount'] / (data['Age'] + 1)

# Display the first few rows of the dataset with engineered features
data[['Length of Stay', 'Admission Day', 'Admission Month', 'Admission Weekday', 'Age x Length of Stay', 
      'Admission Type_0 x Billing', 'Admission Type_1 x Billing', 'Admission Type_2 x Billing', 
      'Avg Billing per Day', 'Billing per Age']].head()


Unnamed: 0,Length of Stay,Admission Day,Admission Month,Admission Weekday,Age x Length of Stay,Admission Type_0 x Billing,Admission Type_1 x Billing,Admission Type_2 x Billing,Avg Billing per Day,Billing per Age
0,1,1,1,6,58,10.66862,0.0,0.0,10.66862,0.180824
1,1,2,1,0,71,0.0,9.981405,0.0,9.981405,0.138631
2,1,3,1,1,48,10.139826,0.0,0.0,10.139826,0.206935
3,1,4,1,2,34,10.146779,0.0,0.0,10.146779,0.289908
4,1,5,1,3,62,10.46904,0.0,0.0,10.46904,0.166175


In [10]:
# Step 1: Cost per Day of Age
data['Cost per Day of Age'] = data['Billing Amount'] / ((data['Age'] + 1) * data['Length of Stay'])

# Step 2: Doctor and Hospital Cost Averages
doctor_avg_billing = data.groupby('Doctor')['Billing Amount'].transform('mean')
data['Doctor Avg Billing'] = doctor_avg_billing

hospital_avg_billing = data.groupby('Hospital')['Billing Amount'].transform('mean')
data['Hospital Avg Billing'] = hospital_avg_billing

# Step 3: Admission Season
# Map months to seasons
season_map = {12: 'Winter', 1: 'Winter', 2: 'Winter', 
              3: 'Spring', 4: 'Spring', 5: 'Spring', 
              6: 'Summer', 7: 'Summer', 8: 'Summer', 
              9: 'Fall', 10: 'Fall', 11: 'Fall'}
data['Admission Season'] = data['Admission Month'].map(season_map)

# Step 4: Weekend Admission
data['Is Weekend Admission'] = data['Admission Weekday'].apply(lambda x: 1 if x >= 5 else 0)

# Step 5: Senior Citizen Flag
data['Is Senior Citizen'] = data['Age'].apply(lambda x: 1 if x >= 65 else 0)

# Step 6: Gender x Medical Condition Interaction
# Example: Gender interaction with a hypothetical medical condition code (assuming medical condition is encoded)
data['Gender x Diabetes'] = data['Gender'] * data['Medical Condition'].apply(lambda x: 1 if x == 'Diabetes' else 0)

# Step 7: Aggregate Features for Medical Condition
# Average billing and length of stay by medical condition
billing_by_condition = data.groupby('Medical Condition')['Billing Amount'].transform('mean')
data['Avg Billing by Condition'] = billing_by_condition

stay_by_condition = data.groupby('Medical Condition')['Length of Stay'].transform('mean')
data['Avg Stay by Condition'] = stay_by_condition

# Step 8: Frequency-Based Features for High-Cardinality Categorical Variables
# Count frequency of each doctor and hospital
data['Doctor Patient Count'] = data.groupby('Doctor')['Doctor'].transform('count')
data['Hospital Patient Count'] = data.groupby('Hospital')['Hospital'].transform('count')

# Display newly engineered features
data[['Cost per Day of Age', 'Doctor Avg Billing', 'Hospital Avg Billing', 'Admission Season', 
      'Is Weekend Admission', 'Is Senior Citizen', 'Gender x Diabetes', 'Avg Billing by Condition', 
      'Avg Stay by Condition', 'Doctor Patient Count', 'Hospital Patient Count']].head()


Unnamed: 0,Cost per Day of Age,Doctor Avg Billing,Hospital Avg Billing,Admission Season,Is Weekend Admission,Is Senior Citizen,Gender x Diabetes,Avg Billing by Condition,Avg Stay by Condition,Doctor Patient Count,Hospital Patient Count
0,0.180824,9.909797,9.895412,Winter,1,0,0,9.890907,1.0,6369,10201
1,0.138631,9.89575,9.91067,Winter,0,1,0,9.900552,1.0,6555,10349
2,0.206935,9.89575,9.91067,Winter,0,0,0,9.9163,1.0,6555,10349
3,0.289908,9.89575,9.895412,Winter,0,0,0,9.90037,1.0,6555,10201
4,0.166175,9.905918,9.895412,Winter,0,0,0,9.890907,1.0,6404,10201


In [12]:
# Export Feature Engineering data to a new file
data.to_csv('Feature Engineering_dataset.csv', index=False)