In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

In [None]:
# Load the dataset
# Read the CSV into the `data` frame that the remaining cells operate on.
csv_path = '/mnt/data/day.csv'
data = pd.read_csv(csv_path)

# Display the first few rows of the dataset
# (bare last expression so the notebook renders it as a rich table)
data.head()

In [None]:
# Data Quality Checks
# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called bare; wrapping it in print() would append a stray "None"
# line after the report.
data.info()
print(data.describe())        # summary statistics of the numeric columns
print(data.isnull().sum())    # missing-value count per column
print(data.duplicated().sum())  # number of fully duplicated rows

# Handle duplicates
# Rebind instead of mutating in place: the result is the same frame without
# duplicated rows (original index labels are kept), but no other reference to
# the loaded frame is silently modified.
data = data.drop_duplicates()

In [None]:
# Convert numeric categories to strings based on the data dictionary
# Each entry maps a column name to its {numeric code: label} lookup.
# NOTE: Series.map yields NaN for any value not present in the lookup, so this
# cell is meant to run exactly once on the freshly loaded frame.
category_labels = {
    'season': {1: 'spring', 2: 'summer', 3: 'fall', 4: 'winter'},
    'weathersit': {1: 'clear', 2: 'mist', 3: 'light_rain', 4: 'heavy_rain'},
}
for column, labels in category_labels.items():
    data[column] = data[column].map(labels)

In [None]:
# Exploratory Data Analysis (EDA)
# Histogram of the target variable `cnt`.
sns.histplot(data['cnt'])
plt.title('Distribution of Total Bike Rentals')
plt.show()

# Box plot of the target. The series is passed by keyword because the meaning
# of the first positional argument of sns.boxplot has differed between seaborn
# releases (treated as `x` with a deprecation warning in some, as `data` in
# others), which changes the orientation of the plot. `x=` pins a horizontal
# box regardless of the installed version.
sns.boxplot(x=data['cnt'])
plt.title('Boxplot of Total Bike Rentals')
plt.show()

# Pairwise scatter plots (and per-variable distributions on the diagonal) of
# the continuous columns together with the target.
sns.pairplot(data[['temp', 'atemp', 'hum', 'windspeed', 'cnt']])
plt.show()

In [None]:
# Create Dummy Variables
# One-hot encode the listed label columns; drop_first=True omits the first
# level of each so the remaining indicators are not perfectly collinear.
# The encoded source columns are replaced by their indicator columns.
categorical_columns = ['season', 'weathersit']
data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

In [None]:
# Prepare Data for Modeling
# The target is `cnt`; the predictors are every remaining column except the
# ones listed here (the target itself plus columns excluded from the model).
excluded_columns = ['dteday', 'instant', 'casual', 'registered', 'cnt']
y = data['cnt']
X = data.drop(columns=excluded_columns)

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Scale the numerical features
# The scaler learns its mean/variance from the training split only and the
# same statistics are then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Build the Multiple Linear Regression Model
# fit() returns the estimator itself, so construction and fitting are chained.
model = LinearRegression().fit(X_train_scaled, y_train)

# Predict on the test set
y_test_pred = model.predict(X_test_scaled)

# Calculate R-squared score on the test set
test_r2 = r2_score(y_true=y_test, y_pred=y_test_pred)
print("Test R-squared:", test_r2)

# Calculate RMSE on the test set
# RMSE is the square root of the mean squared error.
test_mse = mean_squared_error(y_true=y_test, y_pred=y_test_pred)
rmse = np.sqrt(test_mse)
print("Test RMSE:", rmse)

In [None]:
# Residual Analysis
# Residual = observed value minus predicted value on the held-out test set.
residuals = y_test - y_test_pred

# Histogram (with a KDE overlay) of the residuals.
hist_ax = sns.histplot(residuals, kde=True)
hist_ax.set_title('Residuals Distribution')
plt.show()

# Residuals against predictions, with a dashed zero line as the reference.
scatter_ax = sns.scatterplot(x=y_test_pred, y=residuals)
scatter_ax.axhline(y=0, color='r', linestyle='--')
scatter_ax.set_title('Residuals vs Predicted')
plt.show()

In [None]:
# Model Interpretation
# One row per predictor, indexed by the feature name. The model was fitted on
# the standardized training matrix, so each coefficient is expressed per one
# standard deviation of its feature rather than per raw unit.
coefficients = pd.DataFrame({'Coefficient': model.coef_}, index=X.columns)
print(coefficients)

# Save the model coefficients to a file
# The feature names live in the index, so the index must be written out;
# with index=False the file would contain only an unlabeled column of numbers.
# index_label gives that column a header.
coefficients.to_csv('/mnt/data/model_coefficients.csv', index_label='Feature')