In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily walk the Kaggle input tree and print the full path of every file,
# in the order os.walk yields them.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/aimprove-contest-week-2/Test-set-AImprove-2- detailed.csv
/kaggle/input/aimprove-contest-week-2/Test-set-AImprove-2..csv
/kaggle/input/aimprove-contest-week-2/Train-set-AImprove2.csv


In [2]:
# Read the data: the training set and the detailed test set of the competition.
train_data = pd.read_csv('/kaggle/input/aimprove-contest-week-2/Train-set-AImprove2.csv')
test_data = pd.read_csv('/kaggle/input/aimprove-contest-week-2/Test-set-AImprove-2- detailed.csv')

# Fill missing values in 'Events' column with 'No Event Occurred'.
# The result is assigned back to the column rather than using
# `train_data['Events'].fillna(..., inplace=True)`: the in-place call on a
# column selection is chained assignment, which raises a FutureWarning in
# pandas 2.x and does not update `train_data` under Copy-on-Write.
train_data['Events'] = train_data['Events'].fillna('No Event Occurred')

# Coerce the target column to numeric; non-numeric entries become NaN.
train_data['PrecipitationSumInches'] = pd.to_numeric(train_data['PrecipitationSumInches'], errors='coerce')

# Drop rows with NaN values in the target column (they cannot be trained on).
# The original row index is kept as-is (no reset).
train_data = train_data.dropna(subset=['PrecipitationSumInches'])

# Features are every column except the target and the 'Date' column.
X_train = train_data.drop(columns=['PrecipitationSumInches', 'Date'])
y_train = train_data['PrecipitationSumInches']

# Partition the feature columns by dtype: 64-bit numeric columns on one side,
# object-typed columns on the other.
numerical_cols = list(X_train.select_dtypes(include=['int64', 'float64']).columns)
categorical_cols = list(X_train.select_dtypes(include=['object']).columns)

# Numeric features: replace missing entries with the column mean.
numerical_transformer = SimpleImputer(strategy='mean')

# Categorical features: replace missing entries with the most frequent value,
# then one-hot encode (the encoder is configured with handle_unknown='ignore').
categorical_imputer = SimpleImputer(strategy='most_frequent')
categorical_encoder = OneHotEncoder(handle_unknown='ignore')
categorical_transformer = Pipeline([
    ('imputer', categorical_imputer),
    ('onehot', categorical_encoder),
])

# Route each column group through its own transformer.
column_routes = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(column_routes)

# Candidate regressors as (name, estimator) pairs. The name is reused below as
# the pipeline step name and as the key into `params`. All share random_state=0.
models = list({
    'RandomForest': RandomForestRegressor(random_state=0),
    'GradientBoosting': GradientBoostingRegressor(random_state=0),
    'DecisionTree': DecisionTreeRegressor(random_state=0),
}.items())

# Hyperparameter search space per model, keyed by the model names above.
params = dict(
    RandomForest=dict(
        n_estimators=[200, 300, 400],
        max_depth=[20, 25, 30],
        min_samples_split=[2, 3, 4],
    ),
    GradientBoosting=dict(
        n_estimators=[100, 150, 200],
        max_depth=[3, 4, 5],
        learning_rate=[0.05, 0.1, 0.2],
    ),
    DecisionTree=dict(
        max_depth=[15, 20, 25],
        min_samples_split=[2, 3, 4],
    ),
)

# Combine preprocessing with each model in a pipeline and tune it with an
# exhaustive 5-fold cross-validated grid search. The best fitted pipeline per
# model is stored in `best_models`, keyed by model name.
best_models = {}

for model_name, model in models:
    # Prefix every hyperparameter with '<step name>__' so GridSearchCV routes
    # it to the estimator step inside the pipeline.
    param_grid = {f'{model_name}__{key}': value for key, value in params[model_name].items()}
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), (model_name, model)])

    # n_jobs=-1 runs the candidate/fold fits in parallel on all available
    # cores. Each estimator has a fixed random_state, so the selected
    # parameters and scores are the same as with a serial search.
    grid_search = GridSearchCV(
        pipeline,
        param_grid=param_grid,
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)

    # The scorer is negated MSE, so flip the sign to report a plain MSE.
    print(f"Best Parameters for {model_name}:", grid_search.best_params_)
    print(f"Best MSE for {model_name}:", -grid_search.best_score_)

    best_models[model_name] = grid_search.best_estimator_

# Further Model Tuning or Ensemble Methods can be applied here to improve performance
# You can also add more models and explore different hyperparameters

# Process test data.
# Use the test set's OWN 'Events' values, filled the same way as in training.
# Copying train_data['Events'] into the test frame (as was done before) aligns
# on row index, so each test row would receive the event label of an unrelated
# training row (or NaN where that index is absent from the training frame).
if 'Events' in test_data.columns:
    test_data['Events'] = test_data['Events'].fillna('No Event Occurred')
else:
    # NOTE(review): assumes a test file without 'Events' should be treated as
    # "no event" everywhere, confirm against the competition data.
    test_data['Events'] = 'No Event Occurred'

# 'Date' was not a training feature; errors='ignore' tolerates its absence.
X_test_processed = test_data.drop(columns=['Date'], errors='ignore')

# Predict on the test set using the best pipeline found for each model.
predictions = {
    model_name: model.predict(X_test_processed)
    for model_name, model in best_models.items()
}

# Create one submission DataFrame per model and save it to CSV.
for model_name, test_predictions in predictions.items():
    submission_df = pd.DataFrame({
        'Date': test_data['Date'],
        f'{model_name}_Predictions': test_predictions
    })

    # Keep the first row per date and discard rows with no date.
    submission_df = submission_df.drop_duplicates(subset=['Date']).dropna(subset=['Date'])

    submission_df.to_csv(f'/kaggle/working/{model_name}_Test_Predictions.csv', index=False)


Best Parameters for RandomForest: {'RandomForest__max_depth': 20, 'RandomForest__min_samples_split': 3, 'RandomForest__n_estimators': 400}
Best MSE for RandomForest: 0.1067415899615745
Best Parameters for GradientBoosting: {'GradientBoosting__learning_rate': 0.05, 'GradientBoosting__max_depth': 3, 'GradientBoosting__n_estimators': 100}
Best MSE for GradientBoosting: 0.10684482165255978
Best Parameters for DecisionTree: {'DecisionTree__max_depth': 20, 'DecisionTree__min_samples_split': 4}
Best MSE for DecisionTree: 0.15619275960765844
