# In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# ================================
# 1. LOAD DATA
# ================================
# Read the training table first, then the test table, from the Kaggle
# input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/ai-201-b-mse-2-aiml-a/train.csv",
        "/kaggle/input/ai-201-b-mse-2-aiml-a/test.csv",
    )
)

# ================================
# 2. BASIC EDA
# ================================
# Quick look at the raw training table: first rows, schema, missing-value
# counts per column, and the class balance of the target.
print(train.head())
# DataFrame.info() writes its report to stdout itself and returns None, so
# it is called directly; wrapping it in print() would emit a stray "None".
train.info()
print(train.isnull().sum())
print(train['NObeyesdad'].value_counts())

# ================================
# 3. SEPARATE FEATURES & TARGET (must be done BEFORE EDA plots)
# ================================
# X is every column except the label; y is the label column on its own.
X = train.drop(columns="NObeyesdad")
y = train["NObeyesdad"]

# Column groups used by the plotting, imputation and encoding steps below:
# 64-bit integer/float columns count as numeric, object columns as
# categorical.
cat_cols = X.select_dtypes(include=["object"]).columns
num_cols = X.select_dtypes(include=["int64", "float64"]).columns

# NOTE: columns of any other dtype end up in neither group. If a new
# dataset has boolean values, category dtype or a date column, widen the
# categorical selection like this:
#
#   cat_cols = X.select_dtypes(include=['object','bool','category']).columns
#
# (a real datetime64 column presumably needs its own handling -- TODO confirm)
#
# NOTE(review): if train.csv carries a numeric "id" column it is part of
# num_cols and will be fed to the model as a feature -- confirm intended.

# ================================
# 4. VISUALIZATION (uses X and num_cols from section 3)
# ================================

# Bar chart of the number of missing values in each training column.
missing_per_column = train.isnull().sum()
_, missing_ax = plt.subplots(figsize=(12, 4))
missing_per_column.plot(kind='bar', ax=missing_ax)
missing_ax.set_title("Missing Values per Column")
plt.show()

# How many training rows fall into each target class.
_, target_ax = plt.subplots(figsize=(6, 4))
sns.countplot(x=train['NObeyesdad'], ax=target_ax)
target_ax.set_title("Target Class Distribution")
plt.show()

# One histogram per numeric feature; hist() creates its own figure, which
# is then the current figure that suptitle() labels.
numeric_features = X[num_cols]
numeric_features.hist(figsize=(14, 10))
plt.suptitle("Numeric Feature Distributions")
plt.show()

# Pairwise correlation between the numeric features.
_, corr_ax = plt.subplots(figsize=(12, 6))
sns.heatmap(numeric_features.corr(), cmap='coolwarm', ax=corr_ax)
corr_ax.set_title("Correlation Heatmap")
plt.show()

# ================================
# 5. HANDLE NULL VALUES
# ================================
# Fill values are computed from the training features only and reused for
# the test set, so both tables go through the same preprocessing. Filling
# the test set from its own statistics would make the imputed values depend
# on which rows happen to be in the test file.

# Numeric columns: fill gaps with the per-column training median.
if len(num_cols) > 0:
    num_fill = X[num_cols].median()
    X[num_cols] = X[num_cols].fillna(num_fill)
    test[num_cols] = test[num_cols].fillna(num_fill)

# Categorical columns: fill gaps with the most frequent training value.
# mode() returns no rows when there are no categorical columns, which would
# make .iloc[0] raise IndexError, so the step is skipped in that case.
if len(cat_cols) > 0:
    cat_fill = X[cat_cols].mode().iloc[0]
    X[cat_cols] = X[cat_cols].fillna(cat_fill)
    test[cat_cols] = test[cat_cols].fillna(cat_fill)

# ================================
# 6. RESET INDEX
# ================================
# Give both tables a fresh default integer index; drop=True discards the
# old index instead of keeping it as a column.
X, test = (frame.reset_index(drop=True) for frame in (X, test))

# ================================
# 7. LABEL ENCODE TARGET
# ================================
from sklearn.preprocessing import LabelEncoder

# Turn the class names in y into integer codes. The fitted encoder `le` is
# kept so that predicted codes can be mapped back to class names later.
le = LabelEncoder()
le.fit(y)
y_encoded = le.transform(y)

# le.classes_[i] is the original label that was encoded as integer i.
print("Label Mapping:", le.classes_)

# ================================
# 8. ONEHOT ENCODE FEATURES + RANDOM FOREST MODEL
# ================================
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Categorical columns are one-hot encoded (handle_unknown="ignore" keeps
# categories unseen during fit from raising at predict time); numeric
# columns are passed through untouched.
one_hot = OneHotEncoder(handle_unknown="ignore")
preprocess = ColumnTransformer(
    transformers=[
        ("cat", one_hot, cat_cols),
        ("num", "passthrough", num_cols),
    ]
)

# Preprocessing and classifier are chained so that fit/predict always
# apply the same column handling. random_state pins the forest's randomness.
forest = RandomForestClassifier(random_state=42)
model = Pipeline(steps=[("preprocess", preprocess), ("clf", forest)])

# ================================
# 9. TRAIN VALIDATION SPLIT
# ================================
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation. The split is stratified on the
# encoded target so both parts keep the class proportions of the full data;
# an unstratified split can leave a rare class out of one side, which skews
# the metrics and can break the multi-class ROC AUC computation.
# Stratification needs at least two rows per class, so fall back to a plain
# random split when some class is rarer than that.
class_counts = np.bincount(y_encoded)
stratify_on = y_encoded if class_counts.min() >= 2 else None

X_train, X_val, y_train, y_val = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=stratify_on,
)

# ================================
# 10. TRAIN MODEL
# ================================
# Fit the whole pipeline (preprocessing + classifier) on the training part.
model.fit(X_train, y_train)

# ================================
# 11. EVALUATION
# ================================
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Hard class predictions feed the threshold metrics; per-class
# probabilities feed ROC AUC.
y_pred = model.predict(X_val)
y_prob = model.predict_proba(X_val)

# (label, metric function, model output it scores, extra keyword options).
# Each metric is computed and printed in turn, in this order.
metric_specs = (
    ("Accuracy :", accuracy_score, y_pred, {}),
    ("Precision:", precision_score, y_pred, {"average": 'macro'}),
    ("Recall   :", recall_score, y_pred, {"average": 'macro'}),
    ("F1 Score :", f1_score, y_pred, {"average": 'macro'}),
    ("ROC AUC  :", roc_auc_score, y_prob, {"multi_class": 'ovr'}),
)
for metric_label, metric_fn, model_output, metric_options in metric_specs:
    print(metric_label, metric_fn(y_val, model_output, **metric_options))

# ================================
# 12. TRAIN FULL MODEL
# ================================
# Refit on every labelled row (training + validation parts together).
# fit() returns the fitted pipeline itself.
fitted_model = model.fit(X, y_encoded)

# ================================
# 13. PREDICT TEST SET
# ================================
# Predict integer class codes, then translate them back to the original
# class names with the label encoder.
test_pred = fitted_model.predict(test)
test_pred_labels = le.inverse_transform(test_pred)
# If the submission needs numeric output (class probabilities) instead of
# class names, use only this:
# test_pred = model.predict_proba(test)

# ================================
# 14. CREATE FINAL SUBMISSION FILE
# ================================
# Two columns: the test row id and the predicted class name.
submission = pd.DataFrame({"id": test["id"]})
submission["NObeyesdad"] = test_pred_labels

# Sanity check: report how many ids occur more than once.
duplicate_id_count = submission["id"].duplicated().sum()
print("Duplicate IDs:", duplicate_id_count)

# index=False keeps the DataFrame index out of the CSV.
submission.to_csv("submission_random_forest.csv", index=False)
print("submission_random_forest.csv CREATED SUCCESSFULLY!")
submission.head()
