In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load train and test data, encode the target, then split and impute.
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

train_df = pd.read_csv("Train_Data.csv")
test_df = pd.read_csv("Test_Data.csv")

# Save test IDs for submission (must happen before SEQN is dropped below).
# NOTE(review): test_ids is not referenced again in the cells visible here —
# confirm whether the expected submission format needs it.
test_ids = test_df["SEQN"]

# Drop rows where age_group is missing (they cannot be used as training
# labels), then drop SEQN from both frames. Reassigning instead of using
# inplace=True keeps each step a plain expression over its input.
train_df = train_df.dropna(subset=["age_group"]).drop(columns=["SEQN"])
test_df = test_df.drop(columns=["SEQN"])

# Encode target: Adult = 0, Senior = 1.
# Series.map turns any label missing from the dict into NaN, which would only
# surface later as an opaque estimator error, so fail early with a clear message.
label_codes = {"Adult": 0, "Senior": 1}
unexpected_labels = set(train_df["age_group"].unique()) - set(label_codes)
if unexpected_labels:
    raise ValueError(
        "Unexpected age_group labels: " + ", ".join(sorted(map(str, unexpected_labels)))
    )
train_df = train_df.assign(age_group=train_df["age_group"].map(label_codes))

# Separate X and y
X = train_df.drop(columns=["age_group"])
y = train_df["age_group"]

# Train/validation split is done BEFORE imputation so that validation rows do
# not contribute to the imputation means (fitting the imputer on all rows
# leaks validation information into the training features).
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Impute missing feature values with column means learned from the training
# split only; the same fitted imputer is then applied to every other set.
imputer = SimpleImputer(strategy="mean")
X_train = imputer.fit_transform(X_train_raw)
X_val = imputer.transform(X_val_raw)
test_imputed = imputer.transform(test_df)

# Kept so that any code expecting the fully imputed feature matrix still finds it.
X_imputed = imputer.transform(X)

from sklearn.model_selection import GridSearchCV

# Candidate random-forest settings: 2 * 3 * 2 * 2 = 24 combinations,
# each one evaluated with 3-fold cross-validation on the training split.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,  # use every available core
)
grid.fit(X_train, y_train)

# Keep the best configuration found by the search as the working model.
model = grid.best_estimator_

# Score it on the held-out validation split.
y_pred_val = model.predict(X_val)
grid_val_accuracy = accuracy_score(y_val, y_pred_val)
print("GridSearch Accuracy:", grid_val_accuracy)

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Alternative classifiers compared against the random forest on the same
# train/validation split. The gradient-boosting model is seeded like every
# other stochastic step in this notebook so repeated runs report the same score.
# NOTE(review): LogisticRegression, SVC and KNeighborsClassifier are fitted on
# the imputed features without any scaling step — consider standardising the
# features before trusting these accuracies.
models = [
    ("Logistic", LogisticRegression(max_iter=1000)),
    ("GradientBoost", GradientBoostingClassifier(random_state=42)),
    ("SVM", SVC()),
    ("KNN", KNeighborsClassifier()),
]

print("\nTesting Other Models:")
for name, clf in models:
    clf.fit(X_train, y_train)
    # Estimator.score() reports mean accuracy on the validation split.
    acc = clf.score(X_val, y_val)
    print(f"{name} Accuracy: {acc:.4f}")



# Baseline: a random forest with default hyper-parameters (only the seed is set).
# Note that this rebinds `model`, replacing the grid-search estimator assigned earlier.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Accuracy of the baseline on the held-out validation split.
y_pred_val = model.predict(X_val)
baseline_val_accuracy = accuracy_score(y_val, y_pred_val)
print("Validation Accuracy:", baseline_val_accuracy)


# Save predictions from each model into separate CSV files.
# Each estimator is fitted on the training split and used to predict the
# imputed test features; one single-column CSV ("age_group") is written per model.
# NOTE(review): the test IDs saved at load time are not written to these files —
# confirm the expected submission format.
model_variants = [
    ("RandomForest", grid.best_estimator_),
    ("LogisticRegression", LogisticRegression(max_iter=1000)),
    # Seeded so the saved predictions are reproducible across runs.
    ("GradientBoosting", GradientBoostingClassifier(random_state=42)),
    ("SVM", SVC()),
    ("KNN", KNeighborsClassifier()),
]

print("\n📁 Saving submission files for all models:")

# The loop variable is deliberately NOT called `model`: that name already
# refers to the classifier trained above and must not be silently replaced
# by whichever estimator happens to be last in this list.
for name, estimator in model_variants:
    estimator.fit(X_train, y_train)
    preds = estimator.predict(test_imputed)

    submission = pd.DataFrame({"age_group": preds})
    filename = f"submission_{name.replace(' ', '')}.csv"
    submission.to_csv(filename, index=False)

    print(f"✅ {filename} saved")


GridSearch Accuracy: 0.8516624040920716

Testing Other Models:
Logistic Accuracy: 0.8670
GradientBoost Accuracy: 0.8645
SVM Accuracy: 0.8696
KNN Accuracy: 0.8491
Validation Accuracy: 0.8593350383631714

📁 Saving submission files for all models:
✅ submission_RandomForest.csv saved
✅ submission_LogisticRegression.csv saved
✅ submission_GradientBoosting.csv saved
✅ submission_SVM.csv saved
✅ submission_KNN.csv saved


In [None]:
import pandas as pd

# Submission files to inspect, one per model name.
model_names = ("RandomForest", "LogisticRegression", "GradientBoosting", "SVM", "KNN")
files = [f"submission_{model_name}.csv" for model_name in model_names]

print("🔢 Counts of Adults (0) and Seniors (1) in each file:\n")

# For every file, report how many rows were predicted as each class.
for path in files:
    try:
        predictions = pd.read_csv(path)
    except FileNotFoundError:
        # A missing file is reported and skipped rather than aborting the loop.
        print(f"❌ {path} not found.")
        continue

    label_counts = predictions["age_group"].value_counts().sort_index()  # 0 first, then 1
    # .get(..., 0) covers the case where a class was never predicted.
    n_adults = label_counts.get(0, 0)
    n_seniors = label_counts.get(1, 0)
    print(f"{path}: Adults (0) = {n_adults}, Seniors (1) = {n_seniors}")


🔢 Counts of Adults (0) and Seniors (1) in each file:

submission_RandomForest.csv: Adults (0) = 296, Seniors (1) = 16
submission_LogisticRegression.csv: Adults (0) = 305, Seniors (1) = 7
submission_GradientBoosting.csv: Adults (0) = 292, Seniors (1) = 20
submission_SVM.csv: Adults (0) = 312, Seniors (1) = 0
submission_KNN.csv: Adults (0) = 285, Seniors (1) = 27
