In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the data
df = pd.read_csv('/kaggle/input/bike-sharing-demand/train.csv')

# Display the first few rows of the dataframe
print(df.head())

              datetime  season  holiday  workingday  weather  temp   atemp  \
0  2011-01-01 00:00:00       1        0           0        1  9.84  14.395   
1  2011-01-01 01:00:00       1        0           0        1  9.02  13.635   
2  2011-01-01 02:00:00       1        0           0        1  9.02  13.635   
3  2011-01-01 03:00:00       1        0           0        1  9.84  14.395   
4  2011-01-01 04:00:00       1        0           0        1  9.84  14.395   

   humidity  windspeed  casual  registered  count  
0        81        0.0       3          13     16  
1        80        0.0       8          32     40  
2        80        0.0       5          27     32  
3        75        0.0       3          10     13  
4        75        0.0       0           1      1  


In [3]:
# Check for missing values
print(df.isnull().sum())

datetime      0
season        0
holiday       0
workingday    0
weather       0
temp          0
atemp         0
humidity      0
windspeed     0
casual        0
registered    0
count         0
dtype: int64


In [4]:
# Convert 'datetime' column to datetime format
df['datetime'] = pd.to_datetime(df['datetime'])

# Extract features from 'datetime' column
df['year'] = df['datetime'].dt.year
df['month'] = df['datetime'].dt.month
df['day'] = df['datetime'].dt.day
df['hour'] = df['datetime'].dt.hour

In [5]:
# Drop 'datetime' column as we have extracted the necessary features
df = df.drop(['datetime'], axis=1)

In [6]:
# Drop features which will not exist in test.csv (input for submission predictions)
df = df.drop(['casual','registered'], axis=1)

In [7]:
# Split the data into training and test sets
X = df.drop(['count'], axis=1)
y = df['count']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# Scale the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [9]:
# Define RMSLE function
def rmsle(y_true, y_pred):
    return np.sqrt(np.mean(np.power(np.log1p(y_pred) - np.log1p(y_true), 2)))

In [10]:
# Make a scorer for RMSLE
rmsle_scorer = make_scorer(rmsle, greater_is_better=False)

In [11]:
# Define the model
model = RandomForestRegressor()

In [12]:
# Define the parameters for Grid Search
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
}
# Perform Grid Search
grid_search = GridSearchCV(model, param_grid, cv=5, scoring=rmsle_scorer)
grid_search.fit(X_train, y_train)

In [13]:
# Print the best parameters and the corresponding RMSLE
print(f'Best parameters: {grid_search.best_params_}')
print(f'Best RMSLE: {np.sqrt(-grid_search.best_score_)}')

Best parameters: {'max_depth': None, 'n_estimators': 200}
Best RMSLE: 0.5936355117945386


In [14]:
# Use the best estimator to make predictions on the test set
best_model = grid_search.best_estimator_
predictions = best_model.predict(X_test)

In [15]:
# Calculate the RMSLE on the test set
test_rmsle = rmsle(y_test, predictions)
print(f'Test RMSLE: {test_rmsle}')

Test RMSLE: 0.3428653215430157


In [16]:
# Load the test data
test_df = pd.read_csv('/kaggle/input/bike-sharing-demand/test.csv')

# Keep the original 'datetime' column
original_datetime = test_df['datetime'].copy()

# Preprocess the test data in the same way as the training data
test_df['datetime'] = pd.to_datetime(test_df['datetime'])
test_df['year'] = test_df['datetime'].dt.year
test_df['month'] = test_df['datetime'].dt.month
test_df['day'] = test_df['datetime'].dt.day
test_df['hour'] = test_df['datetime'].dt.hour
test_df = test_df.drop(['datetime'], axis=1)

# Scale the test data using the same scaler fitted on the training data
test_data = scaler.transform(test_df)

# Use the best model to make predictions on the test data
test_predictions = best_model.predict(test_data)

# Create a dataframe for the output
output = pd.DataFrame({'datetime': original_datetime, 'count': test_predictions})

# Convert the 'count' column to integers as count of rented bikes should be an integer
output['count'] = output['count'].astype(int)

# Write the output dataframe to a csv file
output.to_csv('submission.csv', index=False)
