In [4]:

import numpy as np
import pandas as pd

from sklearn.datasets import fetch_openml
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest, f_classif, chi2

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [5]:
# Binary classification

def LogisticRegressionClassifier(X, y, test_size=0.3, random_state=42):
    """Fit a logistic regression on a hold-out split and print its test accuracy.

    The function is agnostic to how ``X`` was produced: it can be called on
    the full feature matrix or on a reduced one (e.g. after feature selection).

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample.
    y : array-like
        Class labels aligned with the rows of ``X``.
    test_size : float or int, default=0.3
        Forwarded to ``train_test_split``; the default keeps 30% of the rows
        for testing.
    random_state : int or None, default=42
        Seed forwarded to ``train_test_split`` so that repeated calls on
        inputs with the same number of rows use the same partition.

    Returns
    -------
    None
        The accuracy on the test split is printed, rounded to two decimals.
    """
    # Hold-out split; the defaults reproduce the original fixed 70/30 split.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Train model with scikit-learn's default hyper-parameters.
    model = LogisticRegression()
    model.fit(X_train, y_train)

    print("Accuracy on test set:", np.round(model.score(X_test, y_test), 2))

In [6]:
# Synthetic dataset: 1000 samples, 47 features (10 informative, 10 redundant).
X, y = make_classification(n_samples=1000, n_features=47, n_informative=10, n_redundant=10, random_state=42)
feature_names = [f"F{i}" for i in range(X.shape[1])]
df = pd.DataFrame(X, columns=feature_names)

# Number of features to keep. Named once so the selector and the comments
# describing it cannot drift apart.
N_SELECTED_FEATURES = 10

# Apply ANOVA
# Select the N_SELECTED_FEATURES features of X that have the strongest
# statistical relationship with y according to the ANOVA F-test (f_classif);
# X_new contains only those columns.
# NOTE(review): the selector is fit on every row, including rows that the
# later evaluation cell places in the test split, so the selection step has
# seen test labels. The accuracy reported for the selected features may be
# optimistic; fitting the selector on the training rows only avoids this.
selector = SelectKBest(score_func=f_classif, k=N_SELECTED_FEATURES)
X_new = selector.fit_transform(df, y)

# Boolean mask over df's columns: True where the column was kept.
mask = selector.get_support()

# Map the mask back to column names.
selected_features = df.columns[mask]

print("Selected Features:", list(selected_features))

Selected Features: ['F1', 'F2', 'F11', 'F14', 'F21', 'F28', 'F29', 'F33', 'F41', 'F43']


In [28]:
# Preview the first rows of the synthetic feature frame (head() defaults to 5 rows).
df.head()

Unnamed: 0,F0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,F14,F15,F16,F17,F18,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28,F29,F30,F31,F32,F33,F34,F35,F36,F37,F38,F39,F40,F41,F42,F43,F44,F45,F46
0,0.310183,-1.181494,8.451971,1.536318,-3.497855,-1.848938,0.795528,1.158258,-0.008034,0.607151,-1.67267,3.479025,-4.30157,-0.632001,5.459591,-0.165154,-1.991389,0.797577,-1.166692,2.300031,0.05804,5.252216,2.250834,-4.306026,-4.17799,0.534637,-1.605325,2.586582,0.097168,-0.380079,0.845775,-0.867414,-1.538283,2.479363,4.594765,-0.370596,0.227319,0.883972,-1.458576,-0.492933,0.535121,2.749239,1.140516,-1.090731,0.205078,0.119392,-1.96243
1,-0.133084,-1.820361,1.419805,-1.413178,-3.898386,-0.614741,-0.768566,-1.49514,-0.861664,-1.161647,-1.20032,-1.250796,-0.802254,-1.447462,-3.975316,0.568964,-2.789657,-0.494527,-2.002143,-0.78721,0.123355,-2.667795,1.196681,2.237242,-5.090857,-0.699857,0.486636,2.045996,0.634656,1.365733,-1.138262,-0.134049,-0.822407,-0.76374,-1.788025,0.094427,-0.65284,1.273947,1.777867,0.20194,0.139563,0.725891,-0.459335,1.109015,-1.053523,1.204408,-2.102305
2,-0.654693,-2.701127,2.721052,-0.435622,0.247819,0.477115,1.570913,0.638924,-1.149682,0.600727,-1.874234,0.929413,-1.139257,-0.048996,1.516749,2.156754,-0.532644,-2.678142,1.233051,0.640486,-0.043633,1.563717,1.256876,-1.12803,-0.763606,0.498777,0.213046,2.429807,-1.267781,0.735425,-1.193629,-1.385564,-0.187525,-0.099508,1.881073,0.204864,-0.501733,1.356579,1.514144,1.266255,0.559491,0.426305,-1.667828,1.47502,-0.335583,0.055036,0.41688
3,0.47224,2.37209,-6.50662,-0.533166,5.053146,-1.061501,0.102947,0.10275,-1.345099,0.113081,0.367287,-3.094282,4.243938,-0.349306,-4.03753,2.257657,0.148109,-1.507955,0.273927,0.508505,-0.303572,-2.078394,-3.3525,4.631103,2.270744,0.200023,0.649415,-4.265804,1.947299,1.190073,-2.078918,1.435191,-0.916036,-0.235777,-5.410444,2.500261,-2.549185,-1.297347,-0.270232,0.341468,1.853965,-0.314924,0.428283,-0.189954,0.255016,0.297541,0.172006
4,1.64438,1.817353,0.382008,-0.113275,-2.946047,1.409988,-1.783211,1.017793,0.478591,0.842158,1.700289,1.074616,-4.805785,-0.907379,3.724114,0.35117,1.721559,-0.932131,-0.062231,-0.965492,-1.463819,1.261516,2.139374,-1.689374,-4.108095,-2.017289,0.330853,3.016239,-1.686524,-0.434203,-2.267558,0.30397,0.564117,2.131493,6.368924,-0.722135,-0.087678,0.547875,0.973957,0.422485,-0.327008,-0.898573,1.057185,-2.319969,1.206115,0.112992,0.617621


In [12]:
# Sanity check of the frame dimensions as (rows, columns).
df.shape

(1000, 47)

In [34]:
# Evaluate the same classifier on the ANOVA-selected subset and on the full
# feature matrix, printing a heading before each accuracy line.
comparisons = (
    ("\nLogistic Regression on Selected Features:", X_new),
    ("\nLogistic Regression on All Features:", X),
)
for heading, features in comparisons:
    print(heading)
    LogisticRegressionClassifier(features, y)


Logistic Regression on Selected Features:
Accuracy on test set: 0.83

Logistic Regression on All Features:
Accuracy on test set: 0.81


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Load CSV
data = pd.read_csv("data/health_lifestyle_classification.csv")

# Separate features and target
# NOTE: this rebinds the globals X and y that earlier cells use for the
# synthetic dataset; re-running those earlier cells after this one without
# regenerating the synthetic data would mix the two datasets.
X = data.drop(columns=["target"])   # replace with your actual target column name
y = data["target"]

# One-hot encode all categorical columns automatically
# (drop_first=True drops one level per categorical column).
X = pd.get_dummies(X, drop_first=True)

# Train Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X, y)

# Feature importance, highest first, labelled with the encoded column names.
importances = pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False)

print("Top features by Random Forest importance:")
print(importances.head(5))

# Keep only features whose importance exceeds the threshold.
# NOTE(review): 0.037 is an unexplained cut-off — confirm how it was chosen.
threshold = 0.037
selected_features = importances[importances > threshold].index.tolist()
# Build the message from `threshold` so the printed cut-off can never disagree
# with the value actually applied.
print(f"\nSelected features (importance > {threshold}):", selected_features)


Top features by Random Forest importance:
heart_rate        0.039905
insulin           0.039512
blood_pressure    0.039471
daily_steps       0.038971
income            0.038379
dtype: float64

Selected features (importance > 0.02): ['heart_rate', 'insulin', 'blood_pressure', 'daily_steps', 'income', 'daily_supplement_dosage', 'sugar_intake', 'glucose', 'work_hours', 'cholesterol', 'survey_code', 'sleep_hours', 'waist_size', 'calorie_intake', 'water_intake', 'screen_time', 'height', 'physical_activity', 'weight', 'age', 'bmi_corrected', 'bmi_estimated', 'bmi', 'bmi_scaled']
