In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

In [None]:
# Step 1: Read the training and test sets from CSV files in the working directory
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ("Train_Data.csv", "Test_Data.csv")
)

In [None]:
# Handle missing values in training data: every column is filled with its most
# frequent training value.
# NOTE(review): this also fills missing values in the target column, if any;
# consider dropping rows without a label instead -- confirm against the data.
train_imputer = SimpleImputer(strategy="most_frequent")
train_data_imputed = pd.DataFrame(train_imputer.fit_transform(train_data), columns=train_data.columns)

# Handle missing values in test data using statistics learned from the TRAINING
# data only. Fitting a separate imputer on the test set would fill test rows with
# test-set modes, so train and test would be preprocessed inconsistently.
# Every test column must also exist in the training data (the model needs the
# same features anyway); a KeyError here points at a schema mismatch.
test_imputer = SimpleImputer(strategy="most_frequent")
test_imputer.fit(train_data[test_data.columns])
test_data_imputed = pd.DataFrame(test_imputer.transform(test_data), columns=test_data.columns)


In [None]:
# Step 2: Separate the target column from the feature columns
target_col = "Healthy"
y_train = train_data_imputed[target_col]
X_train = train_data_imputed.drop(columns=[target_col])

# The imputed test frame is used directly as the test feature matrix
# (same object, not a copy).
X_test = test_data_imputed

In [None]:
# Handle categorical variables
categorical_cols = ["Food preference", "Smoker?", "Living in?", "Any heriditary condition?", "Follow Diet"]

# Label encode the target variable. This encoder is reserved for the target so
# that `label_encoder.classes_` / `label_encoder.inverse_transform` can still map
# model predictions back to the original "Healthy" labels later on.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

# Apply label encoding to categorical columns in the input features.
# Work on copies so the un-encoded frames stay available and this cell can be
# re-run safely.
X_train_encoded = X_train.copy()
X_test_encoded = X_test.copy()

# One encoder per feature column: re-fitting a single shared encoder would
# overwrite the target mapping and leave only the last column's categories.
feature_encoders = {}
for col in categorical_cols:
    col_encoder = LabelEncoder()
    X_train_encoded[col] = col_encoder.fit_transform(X_train_encoded[col])
    # transform() raises ValueError if the test set contains a category that
    # never appeared in the training data for this column.
    X_test_encoded[col] = col_encoder.transform(X_test_encoded[col])
    feature_encoders[col] = col_encoder


In [None]:
# Scale the numerical features (every column that is not label-encoded above).
scaler = StandardScaler()
# Preserve the DataFrame's column order. A set difference iterates in an
# arbitrary order that can change between kernel restarts (string hash
# randomization), which makes the fitted scaler's feature order irreproducible.
numerical_cols = [col for col in X_train_encoded.columns if col not in categorical_cols]
# Fit on the training data only, then apply the same scaling to the test data.
X_train_encoded[numerical_cols] = scaler.fit_transform(X_train_encoded[numerical_cols])
X_test_encoded[numerical_cols] = scaler.transform(X_test_encoded[numerical_cols])


In [None]:
# Step 3: Train the model
# A fixed random_state makes the forest (bootstrap samples and feature
# sub-sampling) reproducible, so Restart & Run All yields the same predictions.
model = RandomForestClassifier(random_state=42)
model.fit(X_train_encoded, y_train_encoded)

In [None]:
# Step 4: Evaluate the model
# Accuracy on the rows the model was fit on. This is optimistic by construction
# and says little about generalization.
y_train_pred = model.predict(X_train_encoded)
train_accuracy = accuracy_score(y_train_encoded, y_train_pred)
print("Training accuracy:", train_accuracy)

# Hold-out estimate: fit a fresh model with the same hyper-parameters on 80% of
# the training data and score it on the remaining 20%. `model` itself is left
# untouched and is still trained on the full training set.
# NOTE(review): the imputation and scaling upstream were fit on all training
# rows, so this estimate is still slightly optimistic.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train_encoded, y_train_encoded, test_size=0.2, random_state=42
)
val_model = RandomForestClassifier(**model.get_params())
val_model.fit(X_fit, y_fit)
val_accuracy = accuracy_score(y_val, val_model.predict(X_val))
print("Validation accuracy:", val_accuracy)

Training accuracy: 1.0


In [None]:
# Step 5: Predict on the test features and cast the result to integers.
# The model was fit on y_train_encoded, so the predictions are encoded class
# codes rather than the raw values of the "Healthy" column.
# NOTE(review): confirm the submission expects these codes and not the original labels.
y_test_pred = model.predict(X_test_encoded).astype(int)

In [None]:
# Step 6: Write the predictions to a single-column CSV without the index column
submission_df = pd.DataFrame(y_test_pred, columns=["predictions"])
submission_df.to_csv("submission_random.csv", index=False)