In [None]:
import pandas as pd
import numpy as np
import optuna
import seaborn as sns
from optuna.samplers import TPESampler
from sklearn.model_selection import train_test_split, KFold
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_log_error
from sklearn.preprocessing import MinMaxScaler


# Load dataset
df = pd.read_csv('/kaggle/input/bike-sharing-demand/train.csv')

# Preprocess the dataset
df['datetime'] = pd.to_datetime(df['datetime'])

# Extract time-related features
df['hour'] = df['datetime'].dt.hour
df['day'] = df['datetime'].dt.day
df['day_of_week'] = df['datetime'].dt.dayofweek
df['month'] = df['datetime'].dt.month
df['year'] = df['datetime'].dt.year

# Interaction features
df['hour_workingday'] = df['hour'] * df['workingday']
df['hour_temp'] = df['hour'] * df['temp']
df['hour_humidity'] = df['hour'] * df['humidity']

# One-hot encoding for categorical features
df = pd.get_dummies(df, columns=['season', 'weather', 'year'], drop_first=True)

# Drop the timestamp and the target-derived columns; model log1p(count)
X = df.drop(['datetime', 'count', 'casual', 'registered'], axis=1)
y = np.log1p(df['count'])

# Define numeric columns for scaling
numeric_columns = ['temp', 'atemp', 'humidity', 'windspeed', 'hour', 'day', 'day_of_week', 'month', 'hour_workingday', 'hour_temp', 'hour_humidity']

# Choose the train / hold-out rows BEFORE fitting the scaler, so the scaler
# only ever sees training rows (fitting it on the full frame would leak
# hold-out min/max statistics into the training features).
# The split parameters (test_size, random_state) are unchanged.
row_positions = np.arange(len(X))
train_pos, holdout_pos = train_test_split(row_positions, test_size=0.2, random_state=42)

# Fit on the training rows only, then scale every row with that fit.
# `X` stays fully scaled so later cells can keep slicing it directly.
scaler = MinMaxScaler()
scaler.fit(X.iloc[train_pos][numeric_columns])
X[numeric_columns] = scaler.transform(X[numeric_columns])

# Train-test split (positional slices of the scaled frame / target)
X_train, X_test = X.iloc[train_pos], X.iloc[holdout_pos]
y_train, y_test = y.iloc[train_pos], y.iloc[holdout_pos]

n_folds = 5


# Define objective function for Optuna
def objective(trial):
    """Optuna objective: mean K-fold RMSLE of an XGBoost regressor.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample one hyperparameter configuration.

    Returns
    -------
    float
        Mean RMSLE across the ``n_folds`` validation folds, computed on the
        count scale (targets and predictions are passed through ``expm1``
        because the model is trained on ``log1p(count)``).

    Notes
    -----
    Reads the module-level ``X_train``, ``y_train`` and ``n_folds``.
    """
    # Suggest hyperparameters. `step` is passed by keyword: passing it
    # positionally to `suggest_int` is deprecated in recent Optuna releases.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000, step=100),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 11, step=2),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 7, step=2),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0, step=0.1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0, step=0.1),
        'gamma': trial.suggest_float('gamma', 0, 0.2, step=0.1),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 0.1, step=0.01),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.1, 2.0, step=0.1)
    }

    xgb_model = XGBRegressor(**params, objective='reg:squarederror', n_jobs=-1, random_state=42)

    # Same shuffled folds for every trial, so trials are comparable
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

    rmsle_scores = []
    for train_index, val_index in kf.split(X_train):
        X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

        # Refit the model on this fold's training part
        xgb_model.fit(X_train_fold, y_train_fold)

        # Predict the held-out part of the fold (still in log1p space)
        y_val_pred = xgb_model.predict(X_val_fold)

        # Revert the logarithmic transformation. A prediction below 0 in log
        # space maps to a negative count, which is not a valid count and which
        # mean_squared_log_error may reject (depending on the scikit-learn
        # version), aborting the trial; clip such predictions to 0.
        y_val_exp = np.expm1(y_val_fold)
        y_val_pred_exp = np.clip(np.expm1(y_val_pred), 0, None)

        # RMSLE on this validation fold
        rmsle = np.sqrt(mean_squared_log_error(y_val_exp, y_val_pred_exp))
        rmsle_scores.append(rmsle)

    # Mean RMSLE across all folds is the value Optuna minimizes
    mean_rmsle = np.mean(rmsle_scores)

    return mean_rmsle

# Seeded TPE sampler so the hyperparameter search is repeatable
sampler = TPESampler(seed=42)
study = optuna.create_study(sampler=sampler, direction='minimize')
study.optimize(objective, n_trials=100)

# Report the configuration of the best trial
best_params = study.best_params
print("Best hyperparameters: ", best_params)

# Refit a single model on the whole training split with that configuration
best_xgb_model = XGBRegressor(
    objective='reg:squarederror',
    n_jobs=-1,
    random_state=42,
    **best_params,
)
best_xgb_model.fit(X_train, y_train)

# Label the importance scores with the feature names and rank them, highest first
importances = best_xgb_model.feature_importances_
feature_importances = pd.Series(data=importances, index=X_train.columns)
feature_importances_sorted = feature_importances.sort_values(ascending=False)

print("Feature importances:")
print(feature_importances_sorted)

# Objective value of the best trial, as recorded by the study
best_rmsle = study.best_value
print("Best Root Mean Squared Logarithmic Error ({}-fold CV): {:.5f}".format(n_folds, best_rmsle))


[32m[I 2023-05-02 02:08:01,102][0m A new study created in memory with name: no-name-1800b9af-11bf-40c4-9905-a8820adce7af[0m
[32m[I 2023-05-02 02:08:23,172][0m Trial 0 finished with value: 0.28059851930660695 and parameters: {'n_estimators': 400, 'learning_rate': 0.07969454818643935, 'max_depth': 9, 'min_child_weight': 5, 'subsample': 0.6, 'colsample_bytree': 0.6, 'gamma': 0.0, 'reg_alpha': 0.09, 'reg_lambda': 1.3000000000000003}. Best is trial 0 with value: 0.28059851930660695.[0m
[32m[I 2023-05-02 02:09:03,264][0m Trial 1 finished with value: 1.8623953135328049 and parameters: {'n_estimators': 800, 'learning_rate': 0.0010994335574766201, 'max_depth': 11, 'min_child_weight': 7, 'subsample': 0.7, 'colsample_bytree': 0.6, 'gamma': 0.0, 'reg_alpha': 0.03, 'reg_lambda': 1.1}. Best is trial 0 with value: 0.28059851930660695.[0m
[32m[I 2023-05-02 02:09:33,712][0m Trial 2 finished with value: 0.7325180313845119 and parameters: {'n_estimators': 500, 'learning_rate': 0.00382347522467

In [None]:
# Load the test dataset
test_df = pd.read_csv('/kaggle/input/bike-sharing-demand/test.csv')

# Preprocess the test dataset
test_df['datetime'] = pd.to_datetime(test_df['datetime'])

# Pull out time-related features
test_df['hour'] = test_df['datetime'].dt.hour
test_df['day'] = test_df['datetime'].dt.day
test_df['day_of_week'] = test_df['datetime'].dt.dayofweek
test_df['month'] = test_df['datetime'].dt.month
test_df['year'] = test_df['datetime'].dt.year

# Interaction features
test_df['hour_workingday'] = test_df['hour'] * test_df['workingday']
test_df['hour_temp'] = test_df['hour'] * test_df['temp']
test_df['hour_humidity'] = test_df['hour'] * test_df['humidity']

# One-hot encoding for categorical features
test_df = pd.get_dummies(test_df, columns=['season', 'weather', 'year'], drop_first=True)

# Build the submission features under their own name so the hold-out `X_test`
# produced by the train/test split is not overwritten. Reindexing to the
# training columns guarantees the same feature set and order the model was
# fit on: a dummy column absent from the test file is added as all zeros,
# and any column the model never saw is dropped.
X_submission = (
    test_df
    .drop(['datetime'], axis=1)
    .reindex(columns=X_train.columns, fill_value=0)
)

# Apply the already-fitted MinMax scaler to the numeric features
X_submission[numeric_columns] = scaler.transform(X_submission[numeric_columns])

# Make predictions on the test dataset using the best XGBoost model from Optuna
y_test_pred = best_xgb_model.predict(X_submission)

# Revert the logarithmic transformation; clip at 0 because a count cannot be
# negative (a log-space prediction below 0 would otherwise yield one)
y_test_pred_exp = np.clip(np.expm1(y_test_pred), 0, None)

# Create a submission dataframe
submission = pd.DataFrame({
    'datetime': test_df['datetime'],
    'count': y_test_pred_exp
})

# Save submission dataframe to a CSV file
submission.to_csv('submission.csv', index=False)

In [None]:
# Bar chart of the ranked feature importances
import matplotlib.pyplot as plt

# Drive the labels through the Axes returned by pandas instead of pyplot state
importance_ax = feature_importances_sorted.plot.bar()
importance_ax.set_title('Feature Importances')
importance_ax.set_ylabel('Importance')
plt.show()


In [None]:
# Distribution of errors on the hold-out split (histogram with KDE)

# Rebuild the hold-out feature rows from the scaled training frame using the
# index of `y_test`, so this cell does not depend on whatever `X_test`
# currently holds. `X` has the same columns, in the same order, as `X_train`.
X_holdout = X.loc[y_test.index]

# Bring both the true targets and the predictions back from log1p space to
# the count scale, so the error is expressed in rentals.
y_test_exp = np.expm1(y_test)
y_holdout_pred_exp = np.expm1(best_xgb_model.predict(X_holdout))

# Per-row error: actual minus predicted. The prediction array is positionally
# aligned with `y_test_exp` because it was produced from rows in that order.
error = y_test_exp - y_holdout_pred_exp

fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(error, bins=50, kde=True, ax=ax)
ax.set_xlabel('Error')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Errors')
plt.show()