In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s3e26/sample_submission.csv
/kaggle/input/playground-series-s3e26/train.csv
/kaggle/input/playground-series-s3e26/test.csv


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import log_loss





In [3]:
# Load the dataset
df = pd.read_csv('/kaggle/input/playground-series-s3e26/train.csv')

In [4]:
# Separate features (X) and target variable (y)
X = df.drop(['id', 'Status'], axis=1)
y = df['Status']

In [5]:
# Train-Validation Split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)


In [6]:
numerical_features = X.select_dtypes(include=['number']).columns
categorical_features = X.select_dtypes(include=['object']).columns

numerical_transformer = SimpleImputer(strategy='mean')  # You can change the strategy as needed
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # You can change the strategy as needed
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])


In [7]:

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])


In [8]:
 

model = RandomForestClassifier(random_state=42)

# Create and train the pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

param_grid = {
    'preprocessor__num__strategy': ['mean', 'median'],
    'model__n_estimators': [50, 100, 200],
    'model__max_depth': [None, 10, 20],
    
}

# Create GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_log_loss', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best model and parameters
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Model Evaluation
y_pred_proba = best_model.predict_proba(X_valid)
loss = log_loss(y_valid, y_pred_proba)
print(f'Log Loss on Validation Set: {loss}')

Log Loss on Validation Set: 0.4636426637802973


In [9]:
 test_data = pd.read_csv('/kaggle/input/playground-series-s3e26/test.csv')
test_predictions = best_model.predict_proba(test_data.drop(['id'], axis=1))

# Create the submission file
submission = pd.DataFrame({
    'id': test_data['id'],
    'Status_C': test_predictions[:, 0],
    'Status_CL': test_predictions[:, 1],
    'Status_D': test_predictions[:, 2]
})

submission.to_csv('/kaggle/working/submission.csv', index=False)