In [30]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report
from xgboost import XGBClassifier, plot_importance
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt

In [31]:
train_df = pd.read_csv("Train_Data.csv")
test_df = pd.read_csv("Test_Data.csv")
sample_submission = pd.read_csv("Sample_Submission.csv")

In [32]:
train_df.drop(columns=["SEQN"], inplace=True)
test_df.drop(columns=["SEQN"], inplace=True)
train_df = train_df.dropna(subset=["age_group"])

In [33]:
y = train_df["age_group"].map({"Adult": 0, "Senior": 1})
X = train_df.drop(columns=["age_group"])
X_test = test_df.copy()

In [34]:
cat_cols = ["RIAGENDR", "PAQ605", "DIQ010"]
num_cols = [col for col in X.columns if col not in cat_cols]

numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="mean"))
])
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, num_cols),
    ("cat", categorical_transformer, cat_cols)
])

# Apply preprocessing
X_proc = preprocessor.fit_transform(X)
X_test_proc = preprocessor.transform(X_test)

In [35]:
X_proc = preprocessor.fit_transform(X)
X_test_proc = preprocessor.transform(X_test)

In [36]:
from imblearn.combine import SMOTETomek
smt = SMOTETomek(random_state=42)
X_resampled, y_resampled = smt.fit_resample(X_proc, y)

In [37]:
from collections import Counter
from sklearn.model_selection import StratifiedKFold, cross_val_score
class_counts = Counter(y)
scale_pos_weight = class_counts[0] / class_counts[1]

model = XGBClassifier(
    scale_pos_weight=scale_pos_weight,
    use_label_encoder=False,
    eval_metric="logloss",
    random_state=42,
    n_estimators=300,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8
)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X_resampled, y_resampled, cv=cv, scoring="f1_macro")
print("Cross-Validated F1 Macro Score:", scores.mean())


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Cross-Validated F1 Macro Score: 0.7839777296126319


In [38]:
model.fit(X_resampled, y_resampled)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [40]:
y_test_pred = model.predict(X_test_proc)
submission = pd.DataFrame({"age_group": y_test_pred})
submission.to_csv("submission1.csv", index=False)
print("✅ Final submission1.csv saved.")

✅ Final submission1.csv saved.
