In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Input CSV locations under /kaggle/input (training data, test data, and the
# sample submission that serves as the output template).
train_path = "/kaggle/input/playground-series-s5e3/train.csv"
test_path = "/kaggle/input/playground-series-s5e3/test.csv"
submission_path = "/kaggle/input/playground-series-s5e3/sample_submission.csv"

# Read all three files in one pass; the tuple order matches the names on the left.
train_df, test_df, submission_df = map(
    pd.read_csv,
    (train_path, test_path, submission_path),
)

# Extract features and target.
# 'rainfall' is the column used as the label by the model below; 'id' is dropped
# from both frames so it is not used as a feature.
X = train_df.drop(columns=['id', 'rainfall'])
y = train_df['rainfall']
X_test = test_df.drop(columns=['id'])

# Train/validation split.
# The split happens BEFORE any preprocessing is fitted so that validation rows
# never contribute to the imputation means or the scaling statistics. Fitting
# the imputer on the full training frame first would leak validation data into
# preprocessing and bias the validation score.
# NOTE: X itself is left un-imputed; only the split frames and X_test are imputed.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Handle missing values: column means are learned from the training split only
# and then applied unchanged to the validation and test sets. The original
# indices are kept so the feature frames stay aligned with y_train / y_valid.
imputer = SimpleImputer(strategy="mean")
X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_valid = pd.DataFrame(imputer.transform(X_valid), columns=X_valid.columns, index=X_valid.index)
X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns, index=X_test.index)

# Standardize the data: the scaler is likewise fitted on the training split only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)
X_test_scaled = scaler.transform(X_test)

# Build and fit the random forest in one expression (fit() returns the estimator).
# random_state is fixed so repeated runs grow the same forest.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_scaled, y_train
)

# Score the held-out validation split. predict_proba returns one column per
# class; column index 1 is used as the score passed to ROC AUC.
valid_proba = model.predict_proba(X_valid_scaled)
valid_preds = valid_proba[:, 1]
auc_score = roc_auc_score(y_true=y_valid, y_score=valid_preds)
print(f'Validation AUC: {auc_score:.4f}')

# Predict on the test features, again taking column index 1 of the
# per-class probability matrix.
test_proba = model.predict_proba(X_test_scaled)
test_preds = test_proba[:, 1]

# Fill the submission template positionally and write it out.
# NOTE(review): this relies on submission_df rows being in the same order as
# test_df rows — confirm against the input files.
submission_df['rainfall'] = test_preds
submission_df.to_csv(
    "/kaggle/working/submission.csv",
    index=False,
)

print("Submission file saved as /kaggle/working/submission.csv")


Validation AUC: 0.8572
Submission file saved as /kaggle/working/submission.csv
