<a href="https://colab.research.google.com/github/Ashish-Rawat7/Nutrition-Health-Survey--Age-Prediction/blob/main/Age_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

In [3]:
train = pd.read_csv("/content/Train_Data.csv")
test = pd.read_csv("/content/Test_Data.csv")

In [4]:
train.head()

Unnamed: 0,SEQN,RIAGENDR,PAQ605,BMXBMI,LBXGLU,DIQ010,LBXGLT,LBXIN,age_group
0,73564.0,2.0,2.0,35.7,110.0,2.0,150.0,14.91,Adult
1,73568.0,2.0,2.0,20.3,89.0,2.0,80.0,3.85,Adult
2,73576.0,1.0,2.0,23.2,89.0,2.0,68.0,6.14,Adult
3,73577.0,1.0,2.0,28.9,104.0,,84.0,16.15,Adult
4,73580.0,2.0,1.0,35.9,103.0,2.0,81.0,10.92,Adult


In [5]:
test.head()

Unnamed: 0,SEQN,RIAGENDR,PAQ605,BMXBMI,LBXGLU,DIQ010,LBXGLT,LBXIN
0,77017.0,1.0,1.0,32.2,96.0,2.0,135.0,15.11
1,75580.0,2.0,2.0,26.3,100.0,2.0,141.0,15.26
2,73820.0,1.0,2.0,28.6,107.0,2.0,136.0,8.82
3,80489.0,2.0,1.0,22.1,93.0,2.0,111.0,12.13
4,82047.0,1.0,1.0,24.7,91.0,2.0,105.0,3.12


In [6]:
train = train.dropna(subset=['age_group'])

In [7]:
for col in train.columns:
    if col != 'age_group':
        if train[col].dtype != 'object':
            median_val = train[col].median()
            train.loc[:, col] = train[col].fillna(median_val)
            test.loc[:, col] = test[col].fillna(median_val)
        else:
            mode_val = train[col].mode()[0]
            train.loc[:, col] = train[col].fillna(mode_val)
            test.loc[:, col] = test[col].fillna(mode_val)

In [8]:
train = train[train['age_group'].isin(['Adult', 'Senior'])].copy()
train.loc[:, 'age_group'] = train['age_group'].map({'Adult': 0, 'Senior': 1}).astype(int)

In [9]:
def feature_engineering(df):
    df = df.copy()

    if 'BMXBMI' in df.columns and 'RIDAGEYR' in df.columns:
        df['BMI_per_age'] = df['BMXBMI'] / (df['RIDAGEYR'] + 1)

    if 'LBXGLU' in df.columns and 'LBXIN' in df.columns:
        df['GLU_INS'] = df['LBXGLU'] * df['LBXIN']

    if 'PAQ605' in df.columns:
        df['PAQ605_binary'] = df['PAQ605'].map({1: 1, 2: 0})

    if 'DIQ010' in df.columns:
        df['is_diabetic'] = df['DIQ010'].apply(lambda x: 1 if x == 1 else 0)

    return df

In [10]:
train = feature_engineering(train)
test = feature_engineering(test)

In [11]:
test_ids = test['SEQN']
train = train.drop(columns=['SEQN'])
test = test.drop(columns=['SEQN'])

In [12]:
X = train.drop(columns=['age_group'])
y = train['age_group']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
test_scaled = scaler.transform(test)

In [14]:
from xgboost import XGBClassifier

model = XGBClassifier(
    n_estimators=250,
    max_depth=5,
    learning_rate=0.1,
    eval_metric='logloss',  # still valid
    random_state=42
)

model.fit(X_train_scaled, y_train)


In [15]:
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_val_enc = le.transform(y_val)  # MUST use transform, not fit_transform

model.fit(X_train_scaled, y_train_enc)
y_pred = model.predict(X_val_scaled)

accuracy = accuracy_score(y_val_enc, y_pred)
print(f"Validation Accuracy: {accuracy * 100:.2f}%")

Validation Accuracy: 85.42%


In [16]:
test_predictions = model.predict(test_scaled)

In [17]:
submission = pd.DataFrame({'age_group': test_predictions})
submission.to_csv("submission_xgboost.csv", index=False)
print("Submission saved as 'submission_xgboost.csv'")

Submission saved as 'submission_xgboost.csv'
