In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Load the datasets
train_df = pd.read_csv('Train_Data.csv')
test_df = pd.read_csv('Test_Data.csv')
sample_submission_df = pd.read_csv('Sample_Submission.csv')

# Preview the first few rows of the training data
print("Train Data Head:")
print(train_df.head())

# Schema of the training data.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would append a stray "None" line).
print("\nTrain Data Info:")
train_df.info()

# Preview the first few rows of the test data
print("\nTest Data Head:")
print(test_df.head())

# Schema of the test data (same note as above: call info() directly)
print("\nTest Data Info:")
test_df.info()

Train Data Head:
      SEQN  RIAGENDR  PAQ605  BMXBMI  LBXGLU  DIQ010  LBXGLT  LBXIN age_group
0  73564.0       2.0     2.0    35.7   110.0     2.0   150.0  14.91     Adult
1  73568.0       2.0     2.0    20.3    89.0     2.0    80.0   3.85     Adult
2  73576.0       1.0     2.0    23.2    89.0     2.0    68.0   6.14     Adult
3  73577.0       1.0     2.0    28.9   104.0     NaN    84.0  16.15     Adult
4  73580.0       2.0     1.0    35.9   103.0     2.0    81.0  10.92     Adult

Train Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1966 entries, 0 to 1965
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   SEQN       1954 non-null   float64
 1   RIAGENDR   1948 non-null   float64
 2   PAQ605     1953 non-null   float64
 3   BMXBMI     1948 non-null   float64
 4   LBXGLU     1953 non-null   float64
 5   DIQ010     1948 non-null   float64
 6   LBXGLT     1955 non-null   float64
 7   LBXIN      1957 non-null  

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Load the datasets.
# A missing file is re-raised with a hint instead of calling exit(): inside a
# notebook exit() shuts the kernel down, whereas an exception shows a traceback
# and stops "Run All" at this cell with the kernel state intact.
try:
    train_df = pd.read_csv('Train_Data.csv')
    test_df = pd.read_csv('Test_Data.csv')
    sample_submission_df = pd.read_csv('Sample_Submission.csv')
except FileNotFoundError as e:
    raise FileNotFoundError(
        f"Error loading files: {e}. Please ensure 'Train_Data.csv', 'Test_Data.csv', and 'Sample_Submission.csv' are in the same directory."
    ) from e
else:
    print("Datasets loaded successfully.")

# Print the schema of both raw frames to confirm they loaded as expected
print("\n--- Initial Data Info ---")
for heading, raw_frame in (("Train Data Info:", train_df), ("\nTest Data Info:", test_df)):
    print(heading)
    raw_frame.info()

# Keep the test identifiers for the submission file before SEQN is removed.
# Missing identifiers become -1 so the column can be cast to int.
test_seqn = test_df['SEQN'].fillna(-1).astype(int)

# Work on independent copies of the raw frames, without the SEQN identifier
# (it is a row id, not a feature). The raw frames stay untouched.
train_df_processed = train_df.copy().drop(columns='SEQN', errors='ignore')
test_df_processed = test_df.copy().drop(columns='SEQN', errors='ignore')

print("\n--- After dropping SEQN ---")
print("Train Data Columns:", train_df_processed.columns.tolist())
print("Test Data Columns:", test_df_processed.columns.tolist())

# 1. Handle missing values
# Every feature gets its own imputer, fitted on the training column and then
# applied to the matching test column. Using a fresh imputer per column means a
# test column can never be transformed with statistics learned from a
# different column.

# Numerical features: fill with the training median
numerical_cols = ['BMXBMI', 'LBXGLU', 'LBXGLT', 'LBXIN']
for col in numerical_cols:
    if col not in train_df_processed.columns:
        continue  # nothing to learn the fill value from
    imputer_numerical_train = SimpleImputer(strategy='median')
    # .ravel() flattens the (n, 1) output to the 1-D shape a column expects
    train_df_processed[col] = imputer_numerical_train.fit_transform(train_df_processed[[col]]).ravel()
    if col in test_df_processed.columns:
        test_df_processed[col] = imputer_numerical_train.transform(test_df_processed[[col]]).ravel()

# Categorical features: fill with the most frequent training value (mode)
categorical_cols = ['RIAGENDR', 'PAQ605', 'DIQ010']

# Target variable: rows without a label cannot be used for supervised training.
# They are dropped rather than imputed, because filling in the majority class
# would fabricate labels. .copy() keeps later column assignments warning-free.
if 'age_group' in train_df_processed.columns:
    train_df_processed = train_df_processed.dropna(subset=['age_group']).copy()
    print(f"\nMissing values in 'age_group' handled: {train_df_processed['age_group'].isnull().sum()} remaining.")

for col in categorical_cols:
    if col not in train_df_processed.columns:
        continue
    imputer_categorical = SimpleImputer(strategy='most_frequent')
    train_df_processed[col] = imputer_categorical.fit_transform(train_df_processed[[col]]).ravel()
    if col in test_df_processed.columns:
        test_df_processed[col] = imputer_categorical.transform(test_df_processed[[col]]).ravel()

print("\n--- After Imputation ---")
print("Train Data NaNs after numerical and categorical imputation:\n", train_df_processed.isnull().sum())
print("Test Data NaNs after numerical and categorical imputation:\n", test_df_processed.isnull().sum())


# 2. Encode target variable 'age_group' in training data (0 = Adult, 1 = Senior)
# A missing target column raises instead of calling exit(), which would shut
# the notebook kernel down without a traceback.
if 'age_group' not in train_df_processed.columns:
    raise KeyError("'age_group' column not found in training data. Cannot encode target.")

age_group_codes = train_df_processed['age_group'].map({'Adult': 0, 'Senior': 1})
if age_group_codes.isnull().any():
    # Labels other than 'Adult'/'Senior' (or missing labels) map to NaN.
    if age_group_codes.notna().sum() == 0:
        raise ValueError("No 'age_group' value could be mapped to 'Adult' or 'Senior'.")
    print("Warning: Some 'age_group' values could not be mapped to 0 or 1. Re-imputing if any NaNs exist after mapping.")
    # Fill by assignment: a chained `df[col].fillna(..., inplace=True)` acts on
    # a temporary object and is a no-op under pandas Copy-on-Write.
    age_group_codes = age_group_codes.fillna(age_group_codes.mode()[0])

# Cast back to int: the presence of NaN forces the mapped column to float.
train_df_processed['age_group'] = age_group_codes.astype(int)
print("\n'age_group' mapped to numerical (0=Adult, 1=Senior).")

# 3. Encode categorical features
# RIAGENDR and PAQ605 are integer-coded with a LabelEncoder fitted on the
# training data; DIQ010 is one-hot encoded.


def _label_encode_feature(train_frame, test_frame, column):
    """Integer-encode `column` in place on both frames and return the fitted encoder.

    The encoder is fitted on the training column only. Test values that were
    not present during fitting are coded as -1, because
    ``LabelEncoder.transform`` would raise ``ValueError`` on them.
    If `column` is absent from `test_frame`, only the training frame is encoded.
    """
    encoder = LabelEncoder()
    train_frame[column] = encoder.fit_transform(train_frame[column])
    if column in test_frame.columns:
        # classes_ is sorted; its positions are exactly the codes transform() emits
        code_by_category = {category: code for code, category in enumerate(encoder.classes_)}
        test_frame[column] = test_frame[column].map(code_by_category).fillna(-1).astype(int)
    return encoder


if 'RIAGENDR' in train_df_processed.columns:
    le_gender = _label_encode_feature(train_df_processed, test_df_processed, 'RIAGENDR')
else:
    print("Warning: 'RIAGENDR' not found in data for encoding.")

if 'PAQ605' in train_df_processed.columns:
    le_paq605 = _label_encode_feature(train_df_processed, test_df_processed, 'PAQ605')
else:
    print("Warning: 'PAQ605' not found in data for encoding.")

# For DIQ010, use OneHotEncoder
if 'DIQ010' in train_df_processed.columns:
    # Cast to str so every observed code (e.g. 3 or 9) is treated as its own category
    train_df_processed['DIQ010'] = train_df_processed['DIQ010'].astype(str)

    # handle_unknown='ignore': categories unseen in training encode as all zeros
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    diq010_encoded_train = ohe.fit_transform(train_df_processed[['DIQ010']])
    diq010_feature_names = ohe.get_feature_names_out(['DIQ010'])

    # Reuse the frame's index so the concat below aligns row by row
    diq010_df_train = pd.DataFrame(diq010_encoded_train, columns=diq010_feature_names, index=train_df_processed.index)
    train_df_processed = pd.concat([train_df_processed.drop('DIQ010', axis=1), diq010_df_train], axis=1)

    # Encode the test column only if it exists, mirroring the guard used for training
    if 'DIQ010' in test_df_processed.columns:
        test_df_processed['DIQ010'] = test_df_processed['DIQ010'].astype(str)
        diq010_encoded_test = ohe.transform(test_df_processed[['DIQ010']])
        diq010_df_test = pd.DataFrame(diq010_encoded_test, columns=diq010_feature_names, index=test_df_processed.index)
        test_df_processed = pd.concat([test_df_processed.drop('DIQ010', axis=1), diq010_df_test], axis=1)
    print("\n'DIQ010' one-hot encoded.")
else:
    print("Warning: 'DIQ010' not found in data for encoding.")


# Ensure all columns are numeric after all encoding steps.
# Any column still typed as object is coerced; values that cannot be parsed become NaN.
for frame in (train_df_processed, test_df_processed):
    for col in frame.columns:
        if frame[col].dtype == 'object':
            frame[col] = pd.to_numeric(frame[col], errors='coerce')

# Final fill for NaNs introduced by the coercion above (normally there are none).
# Both frames are filled with the TRAINING medians so the test set is treated
# with the same statistics the model was trained on. fillna() skips median
# entries whose column is absent from the frame being filled.
train_medians = train_df_processed.median()
train_df_processed = train_df_processed.fillna(train_medians)
test_df_processed = test_df_processed.fillna(train_medians)
# Fallback for test-only columns, which have no training median to borrow
test_df_processed = test_df_processed.fillna(test_df_processed.median())

print("\n--- After all preprocessing and final imputation ---")
print("Train Data Columns:", train_df_processed.columns.tolist())
print("Test Data Columns:", test_df_processed.columns.tolist())
print("Train Data NaNs:\n", train_df_processed.isnull().sum())
print("Test Data NaNs:\n", test_df_processed.isnull().sum())


# 4. Separate features and target
# A missing target raises instead of calling exit(), which would shut the
# notebook kernel down without a traceback.
if 'age_group' not in train_df_processed.columns:
    raise KeyError("Error: 'age_group' column not found in processed training data.")

X_train = train_df_processed.drop('age_group', axis=1)
y_train = train_df_processed['age_group']
X_test = test_df_processed

# Align columns so both matrices expose the same features in the same order.
train_features_cols = X_train.columns.tolist()
test_features_cols = X_test.columns.tolist()

# NOTE: despite its name this is the UNION of both column sets, sorted by name.
all_common_features = sorted(set(train_features_cols) | set(test_features_cols))

# Reindexing to the same list fixes both the column set and the column order;
# a column missing from one side is created there and filled with 0.
X_train = X_train.reindex(columns=all_common_features, fill_value=0)
X_test = X_test.reindex(columns=all_common_features, fill_value=0)
assert X_train.columns.tolist() == X_test.columns.tolist(), "train/test feature columns differ"

print("\n--- After Column Alignment ---")
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("X_train columns (first 5):", X_train.columns.tolist()[:5])
print("X_test columns (first 5):", X_test.columns.tolist()[:5])


# 5. Fit the classifier
# class_weight='balanced' reweights classes to compensate for potential imbalance;
# random_state pins the forest so reruns give the same model.
forest_params = {
    'n_estimators': 100,
    'random_state': 42,
    'class_weight': 'balanced',
}
print("\nTraining RandomForestClassifier...")
model = RandomForestClassifier(**forest_params)
model.fit(X_train, y_train)
print("Model training complete.")

# 6. Predict the encoded age group for every test row
predictions = model.predict(X_test)
print("\nPredictions made on the test set.")

# 7. Build the submission table: the stored SEQN identifiers next to the predictions
submission_df = test_seqn.to_frame(name='SEQN').assign(age_group=predictions)

# Preview the submission before writing it out
print("\n--- Submission File Head ---")
print(submission_df.head())

# Write the submission without the index column
submission_df.to_csv('submission.csv', index=False)
print("\nSubmission file 'submission.csv' created successfully.")

Datasets loaded successfully.

--- Initial Data Info ---
Train Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1966 entries, 0 to 1965
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   SEQN       1954 non-null   float64
 1   RIAGENDR   1948 non-null   float64
 2   PAQ605     1953 non-null   float64
 3   BMXBMI     1948 non-null   float64
 4   LBXGLU     1953 non-null   float64
 5   DIQ010     1948 non-null   float64
 6   LBXGLT     1955 non-null   float64
 7   LBXIN      1957 non-null   float64
 8   age_group  1952 non-null   object 
dtypes: float64(8), object(1)
memory usage: 138.4+ KB

Test Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 312 entries, 0 to 311
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   SEQN      310 non-null    float64
 1   RIAGENDR  310 non-null    float64
 2   PAQ605    311 non-null    float64
 3   BMXBM