In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split

In [None]:
# Load the Seoul bike-sharing dataset into a DataFrame.
# pandas is already imported in the notebook's import cell, so it is not
# re-imported here.
# NOTE(review): the file is read as latin1, presumably because the column
# headers contain the degree sign ('°C') in a non-UTF-8 encoding; confirm.
bike = pd.read_csv('SeoulBikeData.csv', encoding='latin1')

In [None]:
# Preview the first rows to sanity-check that the CSV loaded as expected
bike.head()

In [None]:
# Separate the predictors from the target column.
X = bike.drop(columns=['Rented Bike Count'])
y = bike['Rented Bike Count']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# X_test gets new columns assigned in later cells; taking an explicit copy
# avoids pandas' SettingWithCopyWarning on those assignments.
X_test = X_test.copy()

# Training features plus target in one frame, used for the exploratory analysis.
train_data = X_train.copy()
train_data['Rented Bike Count'] = y_train

# Histogram (with KDE overlay) for every numerical column of the training set.
numerical_columns = train_data.select_dtypes(include=['number']).columns
n_grid_cols = 3
# Ceiling division: exactly as many rows as needed, so no empty trailing row
# appears when the number of columns is a multiple of three.
n_grid_rows = max(1, -(-len(numerical_columns) // n_grid_cols))

fig, axes = plt.subplots(n_grid_rows, n_grid_cols, figsize=(20, 15), squeeze=False)
flat_axes = axes.ravel()

for ax, column in zip(flat_axes, numerical_columns):
    sns.histplot(train_data[column], kde=True, ax=ax)
    ax.set_title(f'Distribution of {column}')
    ax.set_xlabel(column)
    ax.set_ylabel('Frequency')

# Hide the grid cells that have no column to show.
for unused_ax in flat_axes[len(numerical_columns):]:
    unused_ax.set_visible(False)

fig.tight_layout()
plt.show()




In [None]:
# Count plots for the categorical columns of the training set.
# The columns are listed explicitly rather than selected by object dtype,
# which would also pick up the raw 'Date' strings.
categorical_columns = ["Seasons", "Holiday", "Functioning Day"]

# One figure with one panel per column (no stray empty figure is created).
fig, axes = plt.subplots(1, len(categorical_columns), figsize=(15, 5), squeeze=False)

for ax, column in zip(axes.ravel(), categorical_columns):
    # Order the categories from most to least frequent.
    sns.countplot(y=train_data[column], order=train_data[column].value_counts().index, ax=ax)
    ax.set_title(f'Counts for {column}')
    ax.set_xlabel('Count')
    ax.set_ylabel(column)

fig.tight_layout()
plt.show()

In [None]:
# Pairwise correlations between the numerical training columns, drawn as an
# annotated heatmap and also written to 'corr.png'.
numeric_train = train_data.select_dtypes(include=['number'])
correlation_matrix = numeric_train.corr()

fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    ax=ax,
)
ax.set_title('Correlation Matrix for Numerical Columns')
fig.savefig('corr.png')
plt.show()

In [None]:
# Mean rented-bike count for every (season, holiday) combination.
# The grouping keys are exactly the ones drawn in the plot.  Grouping by an
# extra key ('Functioning Day') would hand the bar plot several pre-averaged
# rows per bar, and it would then show their unweighted mean-of-means instead
# of the true group average.
average_bike_count = (
    train_data
    .groupby(['Seasons', 'Holiday'])['Rented Bike Count']
    .mean()
    .reset_index()
)

fig, ax = plt.subplots(figsize=(14, 8))
# Each bar is backed by a single pre-aggregated value, so error bars are disabled.
sns.barplot(
    data=average_bike_count,
    x='Seasons',
    y='Rented Bike Count',
    hue='Holiday',
    ci=None,
    palette='viridis',
    ax=ax,
)
ax.set_title('Average Rented Bike Count by Seasons and Holiday')
ax.set_xlabel('Seasons')
ax.set_ylabel('Average Rented Bike Count')
ax.legend(title='Holiday')
plt.show()

In [None]:
# Mean rented-bike count and number of rows, split by functioning / non-functioning day
functioning_day_groups = train_data.groupby('Functioning Day')['Rented Bike Count']

average_bike_count_functioning = (
    functioning_day_groups
    .agg(['mean', 'count'])
    .rename(columns={'mean': 'Average Rented Bike Count', 'count': 'Number of Instances'})
    .reset_index()
)

print(average_bike_count_functioning)

In [None]:
# Mean rented-bike count and number of rows, split by holiday / no holiday
holiday_groups = train_data.groupby('Holiday')['Rented Bike Count']

average_bike_count_holiday = (
    holiday_groups
    .agg(['mean', 'count'])
    .rename(columns={'mean': 'Average Rented Bike Count', 'count': 'Number of Instances'})
    .reset_index()
)

print(average_bike_count_holiday)

In [None]:
# Mean 'Rented Bike Count' for each hour of the day (training rows only)
hourly_groups = train_data.groupby('Hour')['Rented Bike Count']
average_hourly_rented_bike_count = hourly_groups.mean()

# Line chart of the hourly averages, one tick per hour
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(
    average_hourly_rented_bike_count.index,
    average_hourly_rented_bike_count.values,
    marker='o',
)
ax.set_title('Average Hourly Rented Bike Count')
ax.set_xlabel('Hour of the Day')
ax.set_ylabel('Average Rented Bike Count')
ax.set_xticks(range(24))
ax.grid(True)
plt.show()


In [None]:
# Dates are day/month/year strings.  Both frames are parsed with the same
# explicit format: letting pandas guess can read day-first strings such as
# '01/12/2017' month-first (or raise), so the train and test calendar features
# would be derived inconsistently.
DATE_FORMAT = '%d/%m/%Y'

# Numerical encoding of the weekday names: Monday=1 ... Sunday=7
mapping_dictDay = {'Monday': 1, 'Tuesday': 2, 'Wednesday': 3, 'Thursday': 4, 'Friday': 5, 'Saturday': 6, 'Sunday': 7}


def add_calendar_features(frame):
    """Return a copy of ``frame`` with 'Date' replaced by 'Year' and 'WeekDay'.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain a 'Date' column of strings in ``DATE_FORMAT``.

    Returns
    -------
    pandas.DataFrame
        New frame without 'Date', with 'Year' (calendar year) and 'WeekDay'
        (1=Monday ... 7=Sunday) appended, in that order.

    Raises
    ------
    ValueError
        If a date string does not match ``DATE_FORMAT``.
    """
    parsed_dates = pd.to_datetime(frame['Date'], format=DATE_FORMAT)
    return (
        frame
        .assign(
            Year=parsed_dates.dt.year,
            WeekDay=parsed_dates.dt.day_name().map(mapping_dictDay),
        )
        .drop(columns=['Date'])
    )


# Apply the identical transformation to the training and the test frame.
train_data = add_calendar_features(train_data)
X_test = add_calendar_features(X_test)


In [None]:
# Per-year mean and row count of 'Rented Bike Count' in the training data
rentals_by_year = train_data.groupby('Year')['Rented Bike Count']
average_and_counts_per_year = rentals_by_year.agg(['mean', 'count'])

print(average_and_counts_per_year)


In [None]:
# Define a function to compute the mean of the middle 50% values
def mean_of_middle_50_percent(group):
    """Return the mean of the values lying between the quartiles of ``group``.

    Values equal to the 25th or the 75th percentile are included.

    Parameters
    ----------
    group : pandas.Series
        Numeric values, e.g. one group of a ``groupby``.

    Returns
    -------
    float
        Mean of the values within [25th percentile, 75th percentile].
    """
    ordered = group.sort_values()

    # Quartile bounds of the interval to keep
    first_quartile = ordered.quantile(0.25)
    third_quartile = ordered.quantile(0.75)

    # Inclusive on both ends, matching >= lower and <= upper
    within_quartiles = ordered.between(first_quartile, third_quartile)

    return ordered[within_quartiles].mean()

# Interquartile mean of 'Rented Bike Count' for each year
yearly_rentals = train_data.groupby('Year')['Rented Bike Count']
mean_middle_50_per_year = yearly_rentals.apply(mean_of_middle_50_percent)

# Show the per-year values
print(mean_middle_50_per_year)

In [None]:
# Per-weekday mean and row count of 'Rented Bike Count' in the training data
rentals_by_weekday = train_data.groupby('WeekDay')['Rented Bike Count']
average_and_counts_by_weekday = rentals_by_weekday.agg(['mean', 'count'])

print(average_and_counts_by_weekday)


In [None]:
# Integer codes for the categorical columns.  Each mapping is defined once and
# applied to both frames, so the train and test encodings cannot drift apart.
CATEGORY_ENCODINGS = {
    'Holiday': {'Holiday': 0, 'No Holiday': 1},
    'Functioning Day': {'Yes': 1, 'No': 0},
    'Seasons': {'Autumn': 0, 'Spring': 1, 'Summer': 2, 'Winter': 3},
}

# Phase 1: encode into temporaries and validate, so a failure leaves both
# frames untouched.  Series.map yields NaN for every label that is missing from
# the mapping, which is also what happens if this cell is run a second time on
# already-encoded columns.
encoded_columns = {}
for frame_name, frame in (('train_data', train_data), ('X_test', X_test)):
    for column, codes in CATEGORY_ENCODINGS.items():
        encoded = frame[column].map(codes)
        unmapped = encoded.isna()
        if unmapped.any():
            unexpected = sorted(frame.loc[unmapped, column].astype(str).unique())
            raise ValueError(
                f"Cannot encode {frame_name}[{column!r}]: unexpected values {unexpected} "
                f"(was this cell already run?)"
            )
        encoded_columns[(frame_name, column)] = encoded

# Phase 2: everything validated, write the encoded columns back.
for frame_name, frame in (('train_data', train_data), ('X_test', X_test)):
    for column in CATEGORY_ENCODINGS:
        frame[column] = encoded_columns[(frame_name, column)]


In [None]:
# Preview the first rows of the training frame as it will be fed to the model
train_data.head()

In [None]:
# Train a Random Forest on every column except the target and 'Functioning Day'.
# The seed makes the fitted forest, and therefore the reported metrics,
# reproducible across runs.
# NOTE(review): rows with 'Functioning Day' == 0 are still part of the training
# set although the model cannot see that column; consider whether they should
# be excluded from fitting.
train_features = train_data.drop(columns=['Functioning Day', 'Rented Bike Count'])
rf_model = RandomForestRegressor(n_estimators=100, bootstrap=True, random_state=42)
rf_model.fit(train_features, train_data['Rented Bike Count'])

# Predict on the held-out test set
y_pred_rf = rf_model.predict(X_test.drop(columns=['Functioning Day']))

# Force the prediction to 0 wherever 'Functioning Day' is 0
functioning_day_column = X_test['Functioning Day']
y_pred_rf[(functioning_day_column == 0).to_numpy()] = 0

# RMSE as sqrt(MSE): `mean_squared_error(..., squared=False)` is deprecated and
# removed in recent scikit-learn releases, while this form works on all versions.
rf_rmse = np.sqrt(mean_squared_error(y_test, y_pred_rf))
print(f'Random Forest RMSE: {rf_rmse}')

rf_r2 = r2_score(y_test, y_pred_rf)
print(f'Random Forest R²: {rf_r2}')

mae = mean_absolute_error(y_test, y_pred_rf)
print(f"Random Forest Mean Absolute Error: {mae:.2f}")

In [None]:
# Performance on the training data (to compare against the test metrics).
# Dedicated variable names are used so the test-set predictions and metrics
# (`y_pred_rf`, `rf_rmse`, `rf_r2`, `mae`) are not overwritten with
# training-set values.

y_pred_rf_train = rf_model.predict(train_data.drop(columns=['Functioning Day', 'Rented Bike Count']))

# Force the prediction to 0 wherever 'Functioning Day' is 0
train_functioning_day = train_data['Functioning Day']
y_pred_rf_train[(train_functioning_day == 0).to_numpy()] = 0

y_train_actual = train_data['Rented Bike Count']

# RMSE as sqrt(MSE): `mean_squared_error(..., squared=False)` is deprecated and
# removed in recent scikit-learn releases, while this form works on all versions.
rf_rmse_train = np.sqrt(mean_squared_error(y_train_actual, y_pred_rf_train))
print(f'Random Forest RMSE: {rf_rmse_train}')

rf_r2_train = r2_score(y_train_actual, y_pred_rf_train)
print(f'Random Forest R²: {rf_r2_train}')

mae_train = mean_absolute_error(y_train_actual, y_pred_rf_train)
print(f"Random Forest Mean Absolute Error: {mae_train:.2f}")

In [None]:
# Print two prediction examples for the test dataset.
# The predictions are computed from exactly the rows being displayed, so the
# output cannot be affected by other cells reassigning `y_pred_rf`.
N_TEST_EXAMPLES = 2
example_rows = X_test.iloc[:N_TEST_EXAMPLES]
example_predictions = rf_model.predict(example_rows.drop(columns=['Functioning Day']))

# Same post-processing as for the evaluation: 0 rentals on non-functioning days
example_predictions[(example_rows['Functioning Day'] == 0).to_numpy()] = 0

for i in range(len(example_rows)):
    print(f"Test example {i + 1}:")
    print(f"Input features:\n{example_rows.iloc[i].to_dict()}")
    print(f"Actual: {y_test.iloc[i]}")
    print(f"Predicted: {example_predictions[i]}\n")



In [None]:
# Two hand-made observations, already in the encoded form used for training:
# Seasons 0=Autumn / 3=Winter, Holiday 1='No Holiday', Functioning Day 1='Yes',
# WeekDay 1=Monday / 7=Sunday.
# NOTE(review): the second example combines Winter with 30 °C; confirm that
# this is intentional.
synthesized_data = pd.DataFrame({
    'Hour': [10, 14],
    'Temperature(°C)': [25, 30],
    'Humidity(%)': [40, 50],
    'Wind speed (m/s)': [2.0, 3.5],
    'Visibility (10m)': [2000, 1500],
    'Dew point temperature(°C)': [15, 20],
    'Solar Radiation (MJ/m2)': [0.5, 0.1],
    'Rainfall(mm)': [0.0, 0.1],
    'Snowfall (cm)': [0.0, 0.1],
    'Seasons': [0, 3],
    'Holiday': [1, 1],
    'Functioning Day': [1, 1],
    'Year': [2017, 2017],
    'WeekDay': [1, 7],
})

# Select the model inputs by name, in the same column order as the training
# features, instead of relying on the dict above happening to list them in that
# order.  A missing feature raises a KeyError here.
model_feature_columns = train_data.columns.drop(['Functioning Day', 'Rented Bike Count'])
synthesized_features = synthesized_data[model_feature_columns]

# Make predictions on the synthesized data
synthesized_predictions = rf_model.predict(synthesized_features)

# Force the prediction to 0 wherever 'Functioning Day' is 0
synthesized_predictions[(synthesized_data['Functioning Day'] == 0).to_numpy()] = 0

# Print synthesized data predictions
for i in range(len(synthesized_data)):
    print(f"Synthesized example {i + 1}:")
    print(f"Input features:\n{synthesized_data.iloc[i].to_dict()}")
    print(f"Predicted: {synthesized_predictions[i]}\n")