In [39]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler


train_data = pd.read_csv('hacktrain.csv')
test_data = pd.read_csv('hacktest.csv')

train_data['age_group'] = train_data['age_group'].map({'Adult': 0, 'Senior': 1})


features = ['RIAGENDR', 'PAQ605', 'BMXBMI', 'LBXGLU', 'DIQ010', 'LBXGLT', 'LBXIN']
target = 'age_group'


train_data = train_data.dropna(subset=[target])


X = train_data[features]
y = train_data[target]
X_test = test_data[features]

print("NaNs in train features:\n", X.isnull().sum())
print("NaNs in test features:\n", X_test.isnull().sum())
print("NaNs in target:\n", y.isnull().sum())


imputer = SimpleImputer(strategy='median')
X_imputed = imputer.fit_transform(X)
X_test_imputed = imputer.transform(X_test)


print("Any NaNs in X_imputed?", np.isnan(X_imputed).any())
print("Any NaNs in X_test_imputed?", np.isnan(X_test_imputed).any())


scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)
X_test_scaled = scaler.transform(X_test_imputed)


X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


model = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42, class_weight='balanced')
model.fit(X_train, y_train)


val_preds = model.predict(X_val)
val_acc = accuracy_score(y_val, val_preds)
print(f'Validation Accuracy: {val_acc:.4f}')


test_preds = model.predict(X_test_scaled)


submission = pd.DataFrame({
    
    'age_group': test_preds
})
submission.to_csv('hackathon_submission.csv', index=False)
print("Submission saved as hackathon_submission.csv")




NaNs in train features:
 RIAGENDR    18
PAQ605      13
BMXBMI      18
LBXGLU      13
DIQ010      18
LBXGLT      11
LBXIN        9
dtype: int64
NaNs in test features:
 RIAGENDR    2
PAQ605      1
BMXBMI      1
LBXGLU      1
DIQ010      1
LBXGLT      2
LBXIN       1
dtype: int64
NaNs in target:
 0
Any NaNs in X_imputed? False
Any NaNs in X_test_imputed? False


Validation Accuracy: 0.8082
Submission saved as hackathon_submission.csv
