In [1]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Load datasets
train_df = pd.read_csv("Train_Data.csv")
test_df = pd.read_csv("Test_Data.csv")
sample_submission = pd.read_csv("Sample_Submission.csv")

# Map target variable: Adult = 0, Senior = 1
train_df['age_group'] = train_df['age_group'].map({'Adult': 0, 'Senior': 1})

# Drop rows with missing target values
train_df = train_df[train_df['age_group'].isin([0, 1])]

# Split features and target
X = train_df.drop(['SEQN', 'age_group'], axis=1)
y = train_df['age_group']

# Handle missing values using median imputation
imputer = SimpleImputer(strategy='median')
X_imputed = imputer.fit_transform(X)

# Feature scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# Train-test split for validation
X_train, X_val, y_train, y_val = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

# Train Random Forest model
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

# Evaluate on validation set
y_val_pred = model.predict(X_val)
f1 = f1_score(y_val, y_val_pred)
print("Validation F1 Score:", f1)

# --- Predict on Test Set ---
X_test = test_df.drop(['SEQN'], axis=1)
X_test_imputed = imputer.transform(X_test)
X_test_scaled = scaler.transform(X_test_imputed)

test_predictions = model.predict(X_test_scaled)

# --- Prepare submission ---
submission = pd.DataFrame({'age_group': test_predictions})
submission.to_csv("submission.csv", index=False)
print("Submission file saved as submission.csv")


Validation F1 Score: 0.19047619047619047
Submission file saved as submission.csv
