In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV




# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/aimprove-contest-week-2/Test-set-AImprove-2 - Detailed.csv
/kaggle/input/aimprove-contest-week-2/Test-set-AImprove-2..csv
/kaggle/input/aimprove-contest-week-2/Train-set-AImprove2.csv


In [2]:

train_data = pd.read_csv('/kaggle/input/aimprove-contest-week-2/Train-set-AImprove2.csv')
test_data = pd.read_csv('/kaggle/input/aimprove-contest-week-2/Test-set-AImprove-2 - Detailed.csv')

train_data['Events'].fillna('No Event Occurred', inplace=True)

train_data['PrecipitationSumInches'].replace('T', 0.001, inplace=True)


X_train = train_data.drop(columns=['PrecipitationSumInches', 'Date'])
y_train = train_data['PrecipitationSumInches']


categorical_cols = [col for col in X_train.columns if X_train[col].dtype == 'object']


numerical_transformer = SimpleImputer(strategy='mean')
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, X_train.select_dtypes(include=['int64', 'float64']).columns),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Random Forest Regressor model
model = RandomForestRegressor(random_state=42)

param_grid = {
    'model__n_estimators': [200, 300, 400],
    'model__max_depth': [20, 30, 40],
    'model__min_samples_split': [2, 3, 4],
    # Add more hyperparameters to the grid for tuning
}



clf = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

random_search = RandomizedSearchCV(clf, param_distributions=param_grid, n_iter=10,scoring='neg_mean_squared_error', random_state=42)

random_search.fit(X_train, y_train)


# Get best estimator from the search
best_model = random_search.best_estimator_

# Train the best model on the entire training data
best_model.fit(X_train, y_train)


# Calculate MSE on training set
train_predictions = best_model.predict(X_train)
train_mse = mean_squared_error(y_train, train_predictions)
print("Mean Squared Error using RandomForestRegressor on Training Set: ", train_mse)

Mean Squared Error using RandomForestRegressor on Training Set:  0.01568200038572775


In [3]:
print("Training Data Columns:")
print(X_train.columns)


print("Test Data Columns:")
print(test_data.columns)

Training Data Columns:
Index(['TempHighF', 'TempAvgF', 'TempLowF', 'DewPointHighF', 'DewPointAvgF',
       'DewPointLowF', 'HumidityHighPercent', 'HumidityAvgPercent',
       'HumidityLowPercent', 'SeaLevelPressureHighInches',
       'SeaLevelPressureAvgInches', 'SeaLevelPressureLowInches',
       'VisibilityHighMiles', 'VisibilityAvgMiles', 'VisibilityLowMiles',
       'WindHighMPH', 'WindAvgMPH', 'WindGustMPH', 'Events'],
      dtype='object')
Test Data Columns:
Index(['Date', 'TempHighF', 'TempAvgF', 'TempLowF', 'DewPointHighF',
       'DewPointAvgF', 'DewPointLowF', 'HumidityHighPercent',
       'HumidityAvgPercent', 'HumidityLowPercent',
       'SeaLevelPressureHighInches', 'SeaLevelPressureAvgInches',
       'SeaLevelPressureLowInches', 'VisibilityHighMiles',
       'VisibilityAvgMiles', 'VisibilityLowMiles', 'WindHighMPH', 'WindAvgMPH',
       'WindGustMPH', 'PrecipitationSumInches'],
      dtype='object')


In [4]:
test_data['Events'] = train_data['Events']  # Use the same 'Events' column from the training set

X_test_processed = test_data.drop(columns=['Date'], errors='ignore')
test_predictions = best_model.predict(X_test_processed)


In [5]:
submission_df = pd.DataFrame({
    'Date': test_data['Date'],
    'Predictions': test_predictions
})

In [6]:
submission_df = submission_df.drop_duplicates(subset=['Date'])
submission_df = submission_df[submission_df['Date'] != '']

In [7]:
submission_df.to_csv('/kaggle/working/Test-set-AImprove-2.csv', index=False)