In [1]:
import pandas as pd

# Load your dataset
df = pd.read_csv('health.csv')

# Quick view
print(df.shape)
print(df.head())


for col in df.columns:
    print(f"Unique values in column '{col}':")
    print(df[col].unique())
    print("\n")


(1652, 243)
   FPrimary  Person_ID  compound  Mother_ID  surveyor_ID survey_date  consent  \
0       301        102         3        100           36  10/31/2014      1.0   
1       301        101         3        100           36  10/31/2014      1.0   
2       601        101         6        100           36  10/30/2014      1.0   
3       602        105         6        100           36  10/30/2014      1.0   
4       602        310         6        300           36  10/30/2014      1.0   

   child_consent_  TrtOrder  TrtOrder2013  ...  Ill_days_total  \
0             9.0       1.0           1.0  ...               3   
1             9.0       1.0           1.0  ...               0   
2             9.0       1.0           1.0  ...               0   
3             9.0       1.0           1.0  ...               0   
4             9.0       1.0           1.0  ...               0   

   Blood_Stool_last_week  Three_Stool_last_week  MUAC_danger  \
0                      0                

In [3]:
df=df.drop(['Mother_ID', 'surveyor_ID', 'Person_ID','survey_date', 'ORT_water', 'Hospital_day_6_', 'Hospital_day_3_', 
    'child_present_', 'consent', 'RelsFA', 'ageunit', 'Hospital_day_1_',
    'Hospital_day_4_', 'Hospital_day_7_', 'Previous_water_source_other', 'ORT_sugar', 'Private_Clinic_day_3_', 'ORT_salt','Treatment1','Treatment2','Treatment3','Treatment4'], axis='columns')


In [5]:
print(df.isnull().sum())


FPrimary                         0
compound                         0
child_consent_                  79
TrtOrder                         3
TrtOrder2013                    85
                                ..
Months_breastfeeding_correct     0
ORT_ingr_correct                 0
tag_HH                           0
tag_C                            0
tag_M                            0
Length: 221, dtype: int64


In [7]:
# Drop columns with more than 50% missing
df = df.dropna(thresh=0.5*len(df), axis=1)

# Fill categorical columns safely
cat_cols = df.select_dtypes(include=['object']).columns

if len(cat_cols) > 0:
    mode_values = df[cat_cols].mode()
    if not mode_values.empty:
        df[cat_cols] = df[cat_cols].fillna(mode_values.iloc[0])


In [15]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)


In [17]:
import numpy as np
# Correlation matrix
corr_matrix = df.corr().abs()

# Select upper triangle of correlation matrix
upper = corr_matrix.where(
    np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Find features with correlation > 0.95
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]

df_reduced = df.drop(columns=to_drop)


In [19]:
# Remove low variance features
from sklearn.feature_selection import VarianceThreshold

selector = VarianceThreshold(threshold=0.01)
df_selected = selector.fit_transform(df_scaled)

# 5. Variance Threshold - remove low variance features
selector = VarianceThreshold(threshold=0.01)
df_var_selected = selector.fit_transform(df_scaled)
selected_columns_var = df_scaled.columns[selector.get_support()]

df_var_selected = pd.DataFrame(df_var_selected, columns=selected_columns_var)

# 6. Correlation Reduction - remove highly correlated features
corr_matrix = df_var_selected.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop_corr = [column for column in upper.columns if any(upper[column] > 0.95)]

df_final = df_var_selected.drop(columns=to_drop_corr)

# 7. Print the final columns
print(f"Total columns after preprocessing: {len(df_final.columns)}\n")
print("Selected Columns:")
print(df_final.columns.tolist())


Total columns after preprocessing: 119

Selected Columns:
['FPrimary', 'child_consent_', 'TrtOrder', 'AttrStatus', 'same_compound', 'same_mother', 'Stratum', 'HHmembers_12', 'GenderFA', 'LitFA', 'LangFA', 'SalaryFA', 'OOccupier', 'LogAssets', 'OldFA', 'dist1', 'Diarrhea_2013', 'age_2014', 'agegroup_2014', 'gender', 'vaccine_card_available', 'vaccines_taken', 'health_program_', 'health_program_name_', 'health_program_benefit_', 'health_worker_visits_check_', 'health_worker_last_2_weeks_', 'health_worker_provides_', 'free_care_check_', 'MUAC_Unadjusted', 'child_weighing_type_', 'child_weight_1_', 'vaccine_card_', 'BCG_', 'Polio_0_', 'Polio_1_', 'Polio_2_', 'Polio_3_', 'DTCoq_1_', 'DTCoq_2_', 'DTCoq_3_', 'HepatitisB_1_', 'HepatitisB_2_', 'HepatitisB_3_', 'Hib_1_', 'Hib_2_', 'Hib_3_', 'Measles_', 'Yellow_Fever_', 'Vit_A_', 'Ill_last_week_', 'Ill_day_1_', 'Ill_day_2_', 'Ill_day_3_', 'Ill_day_4_', 'Ill_day_5_', 'Ill_day_6_', 'Ill_day_7_', 'Diarrhea_last_week_', 'Diarrhea_day_1_', 'Diarrhea_d

In [21]:
# Drop rows where target column ('free_care_check_') has NaN values
df_cleaned = df.dropna(subset=['free_care_check_'])

# Check new shape
print(f"Shape after dropping NaN target values: {df_cleaned.shape}")


Shape after dropping NaN target values: (1651, 130)


In [25]:
# Loop through each column and fill NaN values with the mode
for col in df_cleaned.columns:
    if df_cleaned[col].dtype == 'object':
        # For categorical columns, fill NaN with the mode
        df_cleaned.loc[:, col] = df_cleaned[col].fillna(df_cleaned[col].mode().iloc[0])
    else:
        # For numeric columns, fill NaN with the mean or another appropriate value
        df_cleaned.loc[:, col] = df_cleaned[col].fillna(df_cleaned[col].mean())


In [27]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

# Split the data
X = df_cleaned.drop(columns=['free_care_check_'])  # Features
y = df_cleaned['free_care_check_']  # Target

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize RandomForestClassifier
rf_model = RandomForestClassifier()

# ------------------- Method 1: RFE -------------------
rfe = RFE(estimator=rf_model, n_features_to_select=20)  # Selecting top 20 features
X_train_rfe = rfe.fit_transform(X_train, y_train)
selected_rfe = X_train.columns[rfe.support_].tolist()

# ------------------- Method 2: Random Forest Feature Importance -------------------
rf_model.fit(X_train, y_train)
importances = rf_model.feature_importances_
indices = np.argsort(importances)[::-1]
top_rf_features = X_train.columns[indices[:20]].tolist()

# ------------------- Method 3: Lasso (L1 Regularization) -------------------
lasso = Lasso(alpha=0.01)
lasso.fit(X_train, y_train)
lasso_selected_features = X_train.columns[lasso.coef_ != 0].tolist()

# ------------------- Results -------------------
print("Features selected by RFE:")
print(selected_rfe)

print("\nFeatures selected by Random Forest Feature Importance:")
print(top_rf_features)

print("\nFeatures selected by Lasso (L1 Regularization):")
print(lasso_selected_features)

# ------------------- Get Common Features -------------------
common_features = list(set(selected_rfe) & set(top_rf_features) & set(lasso_selected_features))
print("\nCommon Features selected by all methods:")
print(common_features)


Features selected by RFE:
['FPrimary', 'compound', 'TrtOrder', 'TrtOrder2013', 'HHmembers_12', 'LogAssets', 'dist1', 'ageyear_2014', 'age_2014_2', 'age_2014_3', 'age_2014_4', 'health_program_', 'health_program_name_', 'health_program_benefit_', 'health_worker_visits_check_', 'health_worker_provides_', 'child_weight_2_', 'weight', 'muac', 'MUAC']

Features selected by Random Forest Feature Importance:
['TrtOrder', 'health_program_', 'TrtOrder2013', 'health_program_name_', 'health_program_benefit_', 'health_worker_visits_check_', 'health_worker_provides_', 'FPrimary', 'LogAssets', 'dist1', 'compound', 'HHmembers_12', 'health_worker_last_2_weeks_', 'weight', 'ageyear_2014', 'age_2014', 'age_2014_2', 'age_2014_4', 'child_weight_2_', 'MUAC']

Features selected by Lasso (L1 Regularization):
['FPrimary', 'TrtOrder', 'TrtOrder2013', 'AttrStatus', 'HHmembers_12', 'LogAssets', 'age_2014', 'age_2014_2', 'age_2014_3', 'age_2014_4', 'vaccines_taken', 'health_program_', 'health_program_name_', 'heal

  model = cd_fast.enet_coordinate_descent(


In [29]:
# Ensure you use the correct variable (x_common)
X_common = ['age_2014_3', 'TrtOrder2013','TrtOrder', 'LogAssets',
            'health_worker_provides_', 'HHmembers_12', 'health_program_',
            'age_2014_2', 'FPrimary', 'weight', 'age_2014']

# Selecting only the common features
common_features_data = df_cleaned[X_common]
X_common = df_cleaned[['age_2014_3', 'TrtOrder2013', 'TrtOrder', 'LogAssets',
                       'health_worker_provides_', 'HHmembers_12', 'health_program_',
                       'age_2014_2', 'FPrimary', 'weight', 'age_2014']]

# Calculate the correlation matrix
correlation_matrix = common_features_data.corr()

# Print the correlation matrix
print(correlation_matrix)


                         age_2014_3  TrtOrder2013  TrtOrder  LogAssets  \
age_2014_3                 1.000000     -0.007673 -0.000861   0.023673   
TrtOrder2013              -0.007673      1.000000  0.978118   0.013851   
TrtOrder                  -0.000861      0.978118  1.000000   0.015805   
LogAssets                  0.023673      0.013851  0.015805   1.000000   
health_worker_provides_   -0.052199      0.073852  0.081438   0.055858   
HHmembers_12               0.089889      0.037460  0.040959   0.301484   
health_program_           -0.061767      0.530129  0.532394   0.053410   
age_2014_2                 0.985726     -0.008646 -0.001128   0.028609   
FPrimary                  -0.010931     -0.021829 -0.022893   0.012796   
weight                     0.717518     -0.007954 -0.003183   0.063557   
age_2014                   0.936421     -0.011003 -0.002254   0.034569   

                         health_worker_provides_  HHmembers_12  \
age_2014_3                             -0.052

In [31]:
X= df_cleaned[['age_2014', 'TrtOrder', 'LogAssets', 'health_worker_provides_', 
 'HHmembers_12', 'health_program_', 'FPrimary', 'weight']]
