In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
# Load the training data.
df = pd.read_csv('Train_Data.csv')

# Drop every row that is missing a value in any of these columns
# (a single dropna over the whole subset is equivalent to dropping column by column).
numeric_cols = ['BMXBMI', 'LBXGLU', 'LBXGLT', 'DIQ010', 'SEQN', 'LBXIN']
df = df.dropna(subset=numeric_cols)

# For categorical columns, fill missing values with the mode (most frequent value).
# NOTE(review): 'age_group' is also the prediction target selected below, so this
# imputes labels; confirm that is intended rather than dropping unlabeled rows.
categorical_cols = ['age_group']  # Add further categorical column names here
for col in categorical_cols:
    if col in df.columns:
        # mode() ignores NaN; [0] takes the first mode when several values tie.
        col_mode = df[col].mode()[0]
        # Assign the filled column back instead of calling fillna(inplace=True) on
        # df[col]: the inplace form acts on an intermediate object, raises a pandas
        # FutureWarning, and will no longer modify df from pandas 3.0 onwards.
        df[col] = df[col].fillna(col_mode)

# Encode categorical variables.
# LabelEncoder assigns integer codes following the sorted order of the raw values.
# NOTE(review): this line was previously annotated "Male=1, Female=0"; verify that
# against the raw RIAGENDR coding before relying on it.
le = LabelEncoder()
df['RIAGENDR'] = le.fit_transform(df['RIAGENDR'])
df['PAQ605'] = df['PAQ605'].fillna(df['PAQ605'].mode()[0])  # Fill missing with mode

# Features and target.
# NOTE(review): SEQN looks like a record identifier; confirm it should be a feature.
X = df[['SEQN', 'RIAGENDR', 'PAQ605', 'BMXBMI', 'LBXGLU', 'DIQ010', 'LBXGLT', 'LBXIN']]
y = df['age_group']

# Hold out 30% of the rows for evaluation; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(col_mode, inplace=True)# Encode categorical variables


In [8]:
# Count the remaining missing values in each column of the cleaned frame.
df.isna().sum()

SEQN         0
RIAGENDR     0
PAQ605       0
BMXBMI       0
LBXGLU       0
DIQ010       0
LBXGLT       0
LBXIN        0
age_group    0
dtype: int64

In [9]:
# Display the cleaned training frame (the rendered output is truncated for long frames).
df

Unnamed: 0,SEQN,RIAGENDR,PAQ605,BMXBMI,LBXGLU,DIQ010,LBXGLT,LBXIN,age_group
0,73564.0,1,2.0,35.7,110.0,2.0,150.0,14.91,Adult
1,73568.0,1,2.0,20.3,89.0,2.0,80.0,3.85,Adult
2,73576.0,0,2.0,23.2,89.0,2.0,68.0,6.14,Adult
4,73580.0,1,1.0,35.9,103.0,2.0,81.0,10.92,Adult
5,73581.0,0,2.0,23.6,110.0,2.0,100.0,6.08,Adult
...,...,...,...,...,...,...,...,...,...
1960,83707.0,0,2.0,22.5,98.0,2.0,79.0,3.51,Adult
1961,83711.0,1,2.0,33.5,100.0,2.0,73.0,6.53,Adult
1962,83712.0,0,2.0,30.0,93.0,2.0,208.0,13.02,Adult
1963,83713.0,0,2.0,23.7,103.0,2.0,124.0,21.41,Adult


In [10]:
# Random-forest configuration, with every hyperparameter spelled out explicitly.
rf_params = {
    "n_estimators": 100,
    "max_depth": None,
    "min_samples_split": 2,
    "min_samples_leaf": 1,
    "random_state": 42,  # fixed seed so repeated fits give the same forest
}

# Build the classifier and fit it on the training split.
model = RandomForestClassifier(**rf_params)
model.fit(X_train, y_train)

# Predict labels for the held-out split.
y_pred = model.predict(X_test)

# Report overall accuracy followed by the per-class precision/recall/F1 table.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.84

Classification Report:
              precision    recall  f1-score   support

       Adult       0.85      0.98      0.91       473
      Senior       0.50      0.11      0.18        93

    accuracy                           0.84       566
   macro avg       0.67      0.54      0.54       566
weighted avg       0.79      0.84      0.79       566



In [9]:
# Fit a random forest on the training split and score it on the held-out split.
# NOTE(review): this cell repeats the training/evaluation cell directly above it
# (same hyperparameters, same seed) and carries an out-of-order execution count;
# it looks like a leftover that could be removed.
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
).fit(X_train, y_train)  # fit() returns the fitted estimator itself

# Predictions for the held-out rows.
y_pred = model.predict(X_test)

# Overall accuracy, then the per-class report.
report = classification_report(y_test, y_pred)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print("\nClassification Report:")
print(report)

Accuracy: 0.84

Classification Report:
              precision    recall  f1-score   support

       Adult       0.85      0.98      0.91       473
      Senior       0.50      0.11      0.18        93

    accuracy                           0.84       566
   macro avg       0.67      0.54      0.54       566
weighted avg       0.79      0.84      0.79       566



In [12]:
# Load the test file and display its (rows, columns) shape.
test_data = pd.read_csv("Test_Data.csv")
test_data.shape

(312, 8)

In [13]:
# Predict a class label for every row of the test file.
# NOTE(review): this rebinds y_test, which until now held the hold-out labels from
# train_test_split; re-running the evaluation cells after this one would score
# against these predictions instead of the true labels.
# NOTE(review): test_data is passed exactly as loaded; none of the steps applied to
# the training frame (RIAGENDR encoding, missing-value handling) are repeated here.
# Confirm the test file is already in the same representation as the training features.
y_test = model.predict(test_data)


In [14]:
# Convert the predicted class labels into integer codes.
# The encoder is fitted on model.classes_ (every label the classifier saw during
# training) rather than on the predictions themselves, so the label -> code mapping
# stays fixed even when the predictions happen to contain only some of the classes.
# Note: despite its name, y_decoded holds the *encoded* integer codes.
le = LabelEncoder()
le.fit(model.classes_)
y_decoded = le.transform(y_test)

In [15]:
# Display the integer-coded predictions for the test file.
y_decoded

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [16]:
# Wrap the integer-coded predictions in a single-column frame named 'age_group'.
result = pd.DataFrame(data={'age_group': y_decoded})

In [17]:
# Display the submission frame built from the coded predictions.
result

Unnamed: 0,age_group
0,0
1,0
2,0
3,0
4,0
...,...
307,0
308,0
309,0
310,0


In [21]:
# Write the predictions to disk; index=False keeps the row index out of the file.
submission_path = "submission.csv"
result.to_csv(submission_path, index=False)