# ML Assignment 2 – Bike Sharing Classification

This notebook implements **6 classification models** on the Bike Sharing dataset:
1. Logistic Regression  
2. Decision Tree Classifier  
3. K-Nearest Neighbor Classifier  
4. Naive Bayes Classifier (Gaussian)  
5. Random Forest (Ensemble)  
6. XGBoost (Ensemble)  

**Evaluation metrics:** Accuracy, AUC, Precision, Recall, F1 Score, MCC  
**Data:** `bike_train.csv` (training), `bike_test.csv` (test).

## 1. Imports and configuration

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score, recall_score, f1_score,
    matthews_corrcoef, confusion_matrix, classification_report
)
import xgboost as xgb
import joblib
import os
import warnings
warnings.filterwarnings('ignore')

# Single seed reused by the train/validation split and by every estimator below that accepts random_state.
RANDOM_STATE = 42
# Directory that receives the scaler, feature list, fitted models and validation results.
MODEL_DIR = 'model'
# exist_ok=True keeps re-runs of this cell from failing once the directory exists.
os.makedirs(MODEL_DIR, exist_ok=True)

## 2. Load and explore data

In [None]:
# Read both CSV splits; the paths are relative to the notebook's working directory.
train_df, test_df = pd.read_csv('bike_train.csv'), pd.read_csv('bike_test.csv')
# Report (rows, columns) of each split, then preview the first training rows.
for caption, frame in zip(('Train shape:', 'Test shape:'), (train_df, test_df)):
    print(caption, frame.shape)
train_df.head()

## 3. Preprocessing

Target `count` is continuous; we bin it into 4 classes for multi-class classification.  
We use at least 12 features: season, holiday, workingday, weather, temp, atemp, humidity, windspeed + year, month, day, hour from datetime.

In [None]:
def preprocess_train(df):
    """Turn the raw training frame into features plus a quantile-binned class label.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'datetime', 'casual', 'registered' and 'count' columns.
        The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with 'year', 'month', 'day' and 'hour' derived from
        'datetime'; with 'datetime', 'casual', 'registered' and 'count'
        removed; and with an ordered categorical 'count_class' column whose
        categories are 0..k-1 (k == 4 unless duplicate quantile edges had to
        be dropped).

    Notes
    -----
    Unparseable datetimes become NaT (errors='coerce'), so their derived
    calendar fields are missing.
    """
    df = df.copy()
    stamps = pd.to_datetime(df['datetime'], errors='coerce')
    df['year'] = stamps.dt.year
    df['month'] = stamps.dt.month
    df['day'] = stamps.dt.day
    df['hour'] = stamps.dt.hour
    # Drop datetime and leaky columns (casual + registered = count)
    df = df.drop(columns=['datetime', 'casual', 'registered'])
    # Bin count by quartile. The labels are derived from the bins that were
    # actually produced: a fixed labels=[0, 1, 2, 3] raises ValueError as soon
    # as duplicates='drop' removes a repeated edge and leaves fewer than 4 bins.
    binned = pd.qcut(df['count'], q=4, duplicates='drop')
    n_bins = len(binned.cat.categories)
    df['count_class'] = binned.cat.rename_categories(list(range(n_bins)))
    return df.drop(columns=['count'])

def preprocess_test(df):
    """Derive calendar features for the test frame and drop the raw timestamp.

    Returns a new frame (the input is left untouched) in which 'datetime' is
    replaced by 'year', 'month', 'day' and 'hour' columns appended at the end.
    Timestamps that cannot be parsed are coerced to NaT, so their derived
    fields are missing.
    """
    stamps = pd.to_datetime(df['datetime'], errors='coerce')
    enriched = df.assign(
        year=stamps.dt.year,
        month=stamps.dt.month,
        day=stamps.dt.day,
        hour=stamps.dt.hour,
    )
    return enriched.drop(columns=['datetime'])

# Build the modelling frame and discard rows with any missing value in one
# chained expression, so re-running the cell always starts from train_df.
train_processed = preprocess_train(train_df).dropna()
# Every column except the label is used as a predictor.
feature_cols = [col for col in train_processed.columns if col != 'count_class']
print('Feature columns (count = {}):'.format(len(feature_cols)), feature_cols)
# Rows per class, listed in class order.
print('Class distribution:')
print(train_processed['count_class'].value_counts().sort_index())

In [None]:
# Separate the predictors from the integer-coded label.
X, y = train_processed[feature_cols], train_processed['count_class'].astype(int)
# Hold out 20% of the rows for validation, stratified on the label.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)

# Learn the scaling statistics from the training fold only, then apply them to both folds.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Persist the preprocessing artifacts next to the models.
for artifact, filename in ((scaler, 'scaler.joblib'), (feature_cols, 'feature_cols.joblib')):
    joblib.dump(artifact, os.path.join(MODEL_DIR, filename))
print('Train size:', X_train.shape[0], '| Validation size:', X_val.shape[0])

## 4. Evaluation helper and model training

Train all 6 models and compute Accuracy, AUC, Precision, Recall, F1, MCC.

In [None]:
def evaluate_model(y_true, y_pred, y_proba=None, name='Model'):
    """Score one classifier's predictions with the notebook's six metrics.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels, aligned with ``y_true``.
    y_proba : array-like or None, optional
        Class-probability matrix with one column per class. When omitted,
        AUC is reported as NaN.
    name : str, optional
        Accepted so call sites can label the model; it is not used in the
        computation.

    Returns
    -------
    dict
        Keys 'Accuracy', 'AUC', 'Precision', 'Recall', 'F1', 'MCC', in that
        order. Precision, recall and F1 are 'weighted' averages computed with
        zero_division=0.
    """
    acc = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)
    mcc = matthews_corrcoef(y_true, y_pred)

    # NaN (not 0.0) marks "AUC unavailable": 0.0 is itself a valid AUC value
    # and would be indistinguishable from a maximally wrong ranking.
    auc = float('nan')
    if y_proba is not None and len(np.unique(y_true)) >= 2:
        proba = np.asarray(y_proba)
        # Choose binary vs. multiclass from the number of probability columns
        # (what the model was trained on), not from the labels that happen to
        # appear in y_true. Otherwise a fold containing only two of several
        # classes would score column 1 as if the problem were binary.
        n_columns = proba.shape[1] if proba.ndim == 2 else 0
        if n_columns == 2:
            auc = roc_auc_score(y_true, proba[:, 1])
        elif n_columns > 2:
            # roc_auc_score is left to validate that the classes in y_true
            # are compatible with the probability columns.
            auc = roc_auc_score(y_true, proba, multi_class='ovr', average='weighted')
    return {
        'Accuracy': acc, 'AUC': auc, 'Precision': precision,
        'Recall': recall, 'F1': f1, 'MCC': mcc
    }

# Accumulates (model name, metrics dict) pairs for the comparison table.
results = []

In [None]:
# 1. Logistic Regression — trained on the standardized features.
lr = LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)
lr.fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_val_scaled)          # hard class labels
y_proba_lr = lr.predict_proba(X_val_scaled)   # per-class probabilities, used for AUC
res_lr = evaluate_model(y_val, y_pred_lr, y_proba_lr, 'Logistic Regression')
# Replace any earlier entry for this model so that re-running the cell does
# not add a duplicate row to the comparison table.
results = [entry for entry in results if entry[0] != 'Logistic Regression']
results.append(('Logistic Regression', res_lr))
joblib.dump(lr, os.path.join(MODEL_DIR, 'logistic_regression.joblib'))
print('Logistic Regression:', res_lr)

In [None]:
# 2. Decision Tree — trained on the unscaled features.
dt = DecisionTreeClassifier(random_state=RANDOM_STATE)
dt.fit(X_train, y_train)  # Tree often works without scaling
y_pred_dt = dt.predict(X_val)          # hard class labels
y_proba_dt = dt.predict_proba(X_val)   # per-class probabilities, used for AUC
res_dt = evaluate_model(y_val, y_pred_dt, y_proba_dt, 'Decision Tree')
# Replace any earlier entry for this model so that re-running the cell does
# not add a duplicate row to the comparison table.
results = [entry for entry in results if entry[0] != 'Decision Tree']
results.append(('Decision Tree', res_dt))
joblib.dump(dt, os.path.join(MODEL_DIR, 'decision_tree.joblib'))
print('Decision Tree:', res_dt)

In [None]:
# 3. K-Nearest Neighbor — trained on the standardized features.
knn = KNeighborsClassifier(n_neighbors=15)
knn.fit(X_train_scaled, y_train)
y_pred_knn = knn.predict(X_val_scaled)          # hard class labels
y_proba_knn = knn.predict_proba(X_val_scaled)   # per-class probabilities, used for AUC
res_knn = evaluate_model(y_val, y_pred_knn, y_proba_knn, 'kNN')
# Replace any earlier entry for this model so that re-running the cell does
# not add a duplicate row to the comparison table.
results = [entry for entry in results if entry[0] != 'kNN']
results.append(('kNN', res_knn))
joblib.dump(knn, os.path.join(MODEL_DIR, 'knn.joblib'))
print('kNN:', res_knn)

In [None]:
# 4. Naive Bayes (Gaussian) — trained on the standardized features.
nb = GaussianNB()
nb.fit(X_train_scaled, y_train)
y_pred_nb = nb.predict(X_val_scaled)          # hard class labels
y_proba_nb = nb.predict_proba(X_val_scaled)   # per-class probabilities, used for AUC
res_nb = evaluate_model(y_val, y_pred_nb, y_proba_nb, 'Naive Bayes')
# Replace any earlier entry for this model so that re-running the cell does
# not add a duplicate row to the comparison table.
results = [entry for entry in results if entry[0] != 'Naive Bayes']
results.append(('Naive Bayes', res_nb))
joblib.dump(nb, os.path.join(MODEL_DIR, 'naive_bayes.joblib'))
print('Naive Bayes:', res_nb)

In [None]:
# 5. Random Forest — trained on the unscaled features.
rf = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
rf.fit(X_train, y_train)  # RF typically no scaling
y_pred_rf = rf.predict(X_val)          # hard class labels
y_proba_rf = rf.predict_proba(X_val)   # per-class probabilities, used for AUC
# The label passed here matches the one stored in `results` below.
res_rf = evaluate_model(y_val, y_pred_rf, y_proba_rf, 'Random Forest (Ensemble)')
# Replace any earlier entry for this model so that re-running the cell does
# not add a duplicate row to the comparison table.
results = [entry for entry in results if entry[0] != 'Random Forest (Ensemble)']
results.append(('Random Forest (Ensemble)', res_rf))
joblib.dump(rf, os.path.join(MODEL_DIR, 'random_forest.joblib'))
print('Random Forest:', res_rf)

In [None]:
# 6. XGBoost — trained on the unscaled features.
xgb_model = xgb.XGBClassifier(n_estimators=100, random_state=RANDOM_STATE, eval_metric='mlogloss')
xgb_model.fit(X_train, y_train)  # XGBoost typically no scaling
y_pred_xgb = xgb_model.predict(X_val)          # hard class labels
y_proba_xgb = xgb_model.predict_proba(X_val)   # per-class probabilities, used for AUC
# The label passed here matches the one stored in `results` below.
res_xgb = evaluate_model(y_val, y_pred_xgb, y_proba_xgb, 'XGBoost (Ensemble)')
# Replace any earlier entry for this model so that re-running the cell does
# not add a duplicate row to the comparison table.
results = [entry for entry in results if entry[0] != 'XGBoost (Ensemble)']
results.append(('XGBoost (Ensemble)', res_xgb))
joblib.dump(xgb_model, os.path.join(MODEL_DIR, 'xgboost.joblib'))
print('XGBoost:', res_xgb)

## 5. Comparison table (all metrics)

In [None]:
# One row per model: the model name followed by its metric columns.
comparison_rows = [{'name': model_name, **metrics} for model_name, metrics in results]
comparison = pd.DataFrame(comparison_rows).set_index('name')
# Display rounded to four decimals; `comparison` itself keeps full precision.
comparison.round(4)

## 6. Confusion matrices and classification reports

In [None]:
# Validation-set predictions of every model, keyed by a short display name.
predictions = dict(zip(
    ('Logistic Regression', 'Decision Tree', 'kNN', 'Naive Bayes', 'Random Forest', 'XGBoost'),
    (y_pred_lr, y_pred_dt, y_pred_knn, y_pred_nb, y_pred_rf, y_pred_xgb),
))
# For each model: heading, confusion matrix, per-class report, blank separator line.
for name in predictions:
    y_pred = predictions[name]
    print('---', name, '---')
    print(confusion_matrix(y_val, y_pred))
    print(classification_report(y_val, y_pred, zero_division=0))
    print()

## 7. Predict on test set and save (optional)

Preprocess the test data and generate predictions with one of the trained models (here Random Forest, which uses the unscaled features). The validation metrics are also saved so that the Streamlit app can load everything it needs from `model/`.

In [None]:
# Save validation metrics and confusion matrices for Streamlit app (no need to ship train CSV)
# One (display name, predicted labels, predicted probabilities) triple per model.
val_artifacts = (
    ('Logistic Regression', y_pred_lr, y_proba_lr),
    ('Decision Tree', y_pred_dt, y_proba_dt),
    ('kNN', y_pred_knn, y_proba_knn),
    ('Naive Bayes', y_pred_nb, y_proba_nb),
    ('Random Forest (Ensemble)', y_pred_rf, y_proba_rf),
    ('XGBoost (Ensemble)', y_pred_xgb, y_proba_xgb),
)
name_to_pred = {label: labels_hat for label, labels_hat, _ in val_artifacts}
name_to_proba = {label: proba_hat for label, _, proba_hat in val_artifacts}

# Bundle, per model, the metrics dict, the confusion matrix and the text report.
val_results = {}
for name, y_pred, y_proba in val_artifacts:
    val_results[name] = dict(
        metrics=evaluate_model(y_val, y_pred, y_proba, name),
        confusion_matrix=confusion_matrix(y_val, y_pred),
        classification_report=classification_report(y_val, y_pred, zero_division=0),
    )
joblib.dump(val_results, os.path.join(MODEL_DIR, 'validation_results.joblib'))
print('Validation results saved to model/validation_results.joblib for Streamlit app.')

In [None]:
# Build the test feature matrix with the training columns, in the training order.
test_processed = preprocess_test(test_df)[feature_cols]
# Fill gaps with medians learned from the TRAINING fold rather than from the
# test set itself, so the test data does not influence its own preprocessing
# and the imputed values match what the models saw during fitting.
test_processed = test_processed.fillna(X_train.median())
# Standardized copy, produced with the scaler that was fitted on the training fold.
X_test_scaled = scaler.transform(test_processed)
# Example: predictions from Random Forest (no scaling)
test_pred_rf = rf.predict(test_processed)
# Attach the predicted class to a copy of the raw test rows and write it out.
test_out = test_df.copy()
test_out['count_class_pred'] = test_pred_rf
test_out.to_csv('bike_test_predictions.csv', index=False)
print('Test predictions saved to bike_test_predictions.csv')