# Дивизия Bike Sharing: Сравнение моделей

Цель: сравнить точность Random Forest, XGBoost и RNN на задаче определения типа пользователя (member/casual).

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense

In [9]:
# Загрузка данных
data = pd.read_csv('../data/202401-divvy-tripdata.csv', low_memory=False)

# Выберем сэмпл для ускорения работы
sample_data = data.sample(frac=0.1, random_state=42)

In [10]:
# Предобработка данных
# Преобразуем время начала поездки
sample_data['started_at'] = pd.to_datetime(sample_data['started_at'])
sample_data['hour'] = sample_data['started_at'].dt.hour
sample_data['day_of_week'] = sample_data['started_at'].dt.dayofweek

# Длительность поездки
sample_data['ended_at'] = pd.to_datetime(sample_data['ended_at'])
sample_data['duration_min'] = (sample_data['ended_at'] - sample_data['started_at']).dt.total_seconds() / 60

# Категориальные признаки
le_bike = LabelEncoder()
sample_data['bike_type_encoded'] = le_bike.fit_transform(sample_data['rideable_type'])

le_start = LabelEncoder()
sample_data['start_station_encoded'] = le_start.fit_transform(sample_data['start_station_name'].astype(str))

# Целевая переменная
y = sample_data['member_casual'].map({'member': 0, 'casual': 1})

# Фичи
X = sample_data[['hour', 'day_of_week', 'duration_min', 'bike_type_encoded', 'start_station_encoded']]

In [11]:
# Разделение на train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Нормализация
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Для RNN нужно добавить временную ось (samples, timesteps, features)
X_train_rnn = X_train_scaled.reshape((X_train_scaled.shape[0], 1, X_train_scaled.shape[1]))
X_test_rnn = X_test_scaled.reshape((X_test_scaled.shape[0], 1, X_test_scaled.shape[1]))

In [12]:
# Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
acc_rf = accuracy_score(y_test, y_pred_rf)
print(f'Accuracy (Random Forest): {acc_rf:.4f}')

Accuracy (Random Forest): 0.8109


In [13]:
# XGBoost
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_estimators=100, random_state=42)
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)
acc_xgb = accuracy_score(y_test, y_pred_xgb)
print(f'Accuracy (XGBoost): {acc_xgb:.4f}')

Accuracy (XGBoost): 0.8168


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [14]:
# RNN
model = Sequential([
    SimpleRNN(64, input_shape=(X_train_rnn.shape[1], X_train_rnn.shape[2])),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Обучение
model.fit(X_train_rnn, y_train, epochs=5, batch_size=32, validation_split=0.2, verbose=1)

# Оценка
loss, acc_rnn = model.evaluate(X_test_rnn, y_test, verbose=0)
print(f'Accuracy (RNN): {acc_rnn:.4f}')

Epoch 1/5


  super().__init__(**kwargs)


[1m290/290[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7722 - loss: 0.5262 - val_accuracy: 0.8387 - val_loss: 0.4387
Epoch 2/5
[1m290/290[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8323 - loss: 0.4481 - val_accuracy: 0.8387 - val_loss: 0.4380
Epoch 3/5
[1m290/290[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8314 - loss: 0.4505 - val_accuracy: 0.8387 - val_loss: 0.4364
Epoch 4/5
[1m290/290[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8322 - loss: 0.4461 - val_accuracy: 0.8387 - val_loss: 0.4341
Epoch 5/5
[1m290/290[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8330 - loss: 0.4444 - val_accuracy: 0.8387 - val_loss: 0.4337
Accuracy (RNN): 0.8237


In [15]:
# Вывод результатов
results = {
    'Random Forest': acc_rf,
    'XGBoost': acc_xgb,
    'RNN': acc_rnn
}

print('\nСравнение моделей:')
for model_name, acc in results.items():
    print(f'{model_name}: {acc:.4f}')


Сравнение моделей:
Random Forest: 0.8109
XGBoost: 0.8168
RNN: 0.8237
