In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import confusion_matrix, precision_score, recall_score

# Read the training table first, then the test table, from ../data.
df, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train_df.csv', '../data/test_df.csv')
)

# Preprocessing: integer-encode the categorical columns.
# Each encoder is fitted on the training frame only and then applied to the
# test frame, so both frames share one label -> code mapping per column.
encoders = {}
for col in ['gender', 'primary_diagnosis', 'discharge_to']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

    # LabelEncoder.transform raises ValueError on labels it did not see during
    # fit. Encode only the known test labels and give unseen ones the sentinel
    # code -1, so a new category in the test file cannot abort the run.
    known = df_test[col].isin(le.classes_).to_numpy()
    test_codes = np.full(len(df_test), -1, dtype=np.int64)
    if known.any():
        test_codes[known] = le.transform(df_test.loc[known, col])
    df_test[col] = test_codes
    encoders[col] = le

# Keep the original module-level name for the gender encoder.
le_gender = encoders['gender']

# Separate the feature columns from the 'readmitted' target.
X = df.drop('readmitted', axis=1)
y = df['readmitted']

# Train/val split: 20% held out, stratified on the target.
# The split happens BEFORE scaling so that the scaler's mean and variance are
# estimated from the training fold only; fitting it on every row would leak
# validation statistics into preprocessing.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_val = scaler.transform(X_val_raw)

# Not used below; retained so the name X_scaled stays defined at module level.
X_scaled = scaler.transform(X)

# Select the test columns by name, in training order, so an extra or
# reordered column in the test file cannot silently shift features. A feature
# missing from the test file raises KeyError here.
X_test_scaled = scaler.transform(df_test[X.columns])

# Fit a depth-limited random forest on the training fold.
rf = RandomForestClassifier(random_state=42, max_depth=5, n_estimators=100)
rf.fit(X_train, y_train)

# Score the held-out fold: every metric compares y_val with the predictions
# and is printed under its own label, in the order listed.
y_pred_val = rf.predict(X_val)
for label, metric in (
    ('Precision:', precision_score),
    ('Recall:', recall_score),
    ('Confusion Matrix:\n', confusion_matrix),
):
    print(label, metric(y_val, y_pred_val))

# Final Submission: predict on the scaled test matrix and write the CSV.
test_preds = rf.predict(X_test_scaled)

# NOTE(review): Patient_ID is generated as 1..N in row order rather than read
# from the test file - confirm this matches the expected submission IDs.
patient_ids = range(1, len(test_preds) + 1)
submission = pd.DataFrame({'Patient_ID': patient_ids, 'readmitted': test_preds})
submission.to_csv('../data/final_submission.csv', index=False)
