In [30]:
import pandas as pd

# Load the training dataset
data_url = "https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/bikes.csv"
data = pd.read_csv(data_url)

# Load the mini holdout dataset
mini_holdout_url = "https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/biking_holdout_test_mini.csv"
mini_holdout = pd.read_csv(mini_holdout_url)

# Inspect the datasets
print("Training Dataset:")
print(data.head())
print("\nMini Holdout Dataset:")
print(mini_holdout.head())

# Check column names
print("\nColumns in Training Dataset:", data.columns)
print("Columns in Mini Holdout Dataset:", mini_holdout.columns)

Training Dataset:
     dteday   hr  casual  registered  temp_c  feels_like_c     hum  windspeed  \
0  1/1/2011  0.0       3          13     3.0           3.0  0.7957        0.8   
1  1/1/2011  1.0       8          30     1.7           1.7  0.8272        0.8   
2  1/1/2011  2.0       5          26     1.9           1.9  0.8157        1.1   
3  1/1/2011  3.0       3           9     2.5           2.5  0.7831        0.8   
4  1/1/2011  4.0       0           1     2.0           2.0  0.8075        1.1   

   weathersit  season  holiday  workingday  
0           1       1        0           0  
1           1       1        0           0  
2           1       1        0           0  
3           1       1        0           0  
4           1       1        0           0  

Mini Holdout Dataset:
       dteday   hr  temp_c  feels_like_c     hum  windspeed  weathersit  \
0  11/15/2023  0.0     7.3           7.3  0.6667        0.0           1   
1  11/15/2023  1.0     6.2           6.2  0.7406    

In [31]:
# Add the 'count' column to the training dataset
data['count'] = data['casual'] + data['registered']

# Drop the 'casual' and 'registered' columns since they are no longer needed
data = data.drop(columns=['casual', 'registered'])

# Verify the new column
print("\nTraining Dataset with 'count':")
print(data.head())


Training Dataset with 'count':
     dteday   hr  temp_c  feels_like_c     hum  windspeed  weathersit  season  \
0  1/1/2011  0.0     3.0           3.0  0.7957        0.8           1       1   
1  1/1/2011  1.0     1.7           1.7  0.8272        0.8           1       1   
2  1/1/2011  2.0     1.9           1.9  0.8157        1.1           1       1   
3  1/1/2011  3.0     2.5           2.5  0.7831        0.8           1       1   
4  1/1/2011  4.0     2.0           2.0  0.8075        1.1           1       1   

   holiday  workingday  count  
0        0           0     16  
1        0           0     38  
2        0           0     31  
3        0           0     12  
4        0           0      1  


In [32]:
# Convert 'dteday' to datetime format
data['dteday'] = pd.to_datetime(data['dteday'])

# Extract temporal features
data['month'] = data['dteday'].dt.month
data['day_of_week'] = data['dteday'].dt.dayofweek
data['is_weekend'] = data['day_of_week'].isin([5, 6]).astype(int)

# Drop 'dteday' after extracting features
data = data.drop(columns=['dteday'])

In [33]:
from sklearn.preprocessing import OneHotEncoder

# Identify categorical features
categorical_features = ['season', 'weathersit']

# Initialize the encoder
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Fit and transform the categorical features
encoded_features = encoder.fit_transform(data[categorical_features])

# Convert to DataFrame and append to the dataset
encoded_columns = encoder.get_feature_names_out(categorical_features)
encoded_df = pd.DataFrame(encoded_features, columns=encoded_columns)
data = pd.concat([data.drop(columns=categorical_features), encoded_df], axis=1)

In [34]:
from sklearn.preprocessing import StandardScaler

# Identify numerical features
numerical_features = ['hr', 'temp_c', 'feels_like_c', 'hum', 'windspeed', 'month', 'day_of_week', 'is_weekend']

# Initialize the scaler
scaler = StandardScaler()

# Fit and transform the numerical features
data[numerical_features] = scaler.fit_transform(data[numerical_features])

In [35]:
from sklearn.model_selection import train_test_split

# Define features (X) and target (y)
X = data.drop(columns=['count'], errors='ignore')  # Features (all columns except 'count')
y = data['count']  # Target variable

# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Verify the shapes of the splits
print("\nX_train shape:", X_train.shape)
print("X_val shape:", X_val.shape)
print("y_train shape:", y_train.shape)
print("y_val shape:", y_val.shape)


X_train shape: (89980, 18)
X_val shape: (22495, 18)
y_train shape: (89980,)
y_val shape: (22495,)


In [36]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Build the neural network
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(1)  # Output layer for regression
])

# Compile the model
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Train the model
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=50,
    batch_size=32,
    verbose=1
)

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2812/2812[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step - loss: 113215.6016 - mae: 231.7284 - val_loss: 73636.0781 - val_mae: 192.9909
Epoch 2/50
[1m2812/2812[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - loss: 77699.9844 - mae: 199.8164 - val_loss: 71885.7969 - val_mae: 191.7579
Epoch 3/50
[1m2812/2812[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - loss: 71533.8203 - mae: 187.9653 - val_loss: 54286.4648 - val_mae: 156.8385
Epoch 4/50
[1m2812/2812[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - loss: 59421.4766 - mae: 167.9650 - val_loss: 48963.0117 - val_mae: 141.7675
Epoch 5/50
[1m2812/2812[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - loss: 56198.6836 - mae: 162.9470 - val_loss: 46102.5352 - val_mae: 133.1557
Epoch 6/50
[1m2812/2812[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - loss: 53646.4492 - mae: 158.6769 - val_loss: 43928.5742 - val_mae: 132.0983
Epoch 7/50


In [37]:
# Convert 'dteday' to datetime format
mini_holdout['dteday'] = pd.to_datetime(mini_holdout['dteday'])

# Extract temporal features
mini_holdout['month'] = mini_holdout['dteday'].dt.month
mini_holdout['day_of_week'] = mini_holdout['dteday'].dt.dayofweek
mini_holdout['is_weekend'] = mini_holdout['day_of_week'].isin([5, 6]).astype(int)

# Drop 'dteday' after extracting features
mini_holdout = mini_holdout.drop(columns=['dteday'])

# One-hot encode categorical features
encoded_features_holdout = encoder.transform(mini_holdout[categorical_features])
encoded_df_holdout = pd.DataFrame(encoded_features_holdout, columns=encoded_columns)
mini_holdout = pd.concat([mini_holdout.drop(columns=categorical_features), encoded_df_holdout], axis=1)

# Normalize numerical features
mini_holdout[numerical_features] = scaler.transform(mini_holdout[numerical_features])

In [38]:
# Generate predictions
predicted_counts = model.predict(mini_holdout).flatten()
predicted_counts = predicted_counts.round(0).astype(int)

# Save predictions to a CSV file
results = pd.DataFrame({'count': predicted_counts})
results.to_csv("team8-bike-rental-predictions.csv", index=False)

print("\nPredictions saved to team8-bike-rental-predictions.csv")

[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step

Predictions saved to team8-bike-rental-predictions.csv
