In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense

In [2]:
# Загрузка данных
df = pd.read_csv('../data/trips.csv', encoding="utf_16_le")
df = df.sample(frac=0.2, random_state=42).copy()

In [3]:
# Преобразование временных меток
df['started_at'] = pd.to_datetime(df['started_at'])
df['ended_at'] = pd.to_datetime(df['ended_at'])

# Продолжительность поездки в минутах
df['duration'] = (df['ended_at'] - df['started_at']).dt.total_seconds() / 60

# Фильтр аномалий (поездки с отрицательной длительностью)
df = df[df['duration'] > 0]

# Извлекаем час и день недели
df['hour'] = df['started_at'].dt.hour
df['day_of_week'] = df['started_at'].dt.dayofweek.map({
    0: 'Mon', 1: 'Tue', 2: 'Wed', 3: 'Thu', 4: 'Fri', 5: 'Sat', 6: 'Sun'
})

# Удаляем ненужные параметры
df.drop(["started_at", "ended_at", "start_station_name", "end_station_name", "start_lat", "start_lng", "end_lat", "end_lng"], axis=1, inplace=True)
# df.drop(["started_at", "ended_at"], axis=1, inplace=True)

# Кодирование категориальных признаков
le = LabelEncoder()
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = le.fit_transform(df[col])

# Целевая переменная
y = df['member_casual']

# Признаки
X = df[['duration', 'rideable_type', 'hour', 'day_of_week']]

# Нормализация
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Разделение на train/test
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [4]:
# Для RNN нужно добавить временную ось (samples, timesteps, features)
X_train_rnn = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test_rnn = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

In [5]:
# Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
acc_rf = accuracy_score(y_test, y_pred_rf)
print(f'Accuracy (Random Forest): {acc_rf:.4f}')

Accuracy (Random Forest): 0.7681


In [6]:
# XGBoost
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_estimators=100, random_state=42)
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)
acc_xgb = accuracy_score(y_test, y_pred_xgb)
print(f'Accuracy (XGBoost): {acc_xgb:.4f}')

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy (XGBoost): 0.8270


In [9]:
# RNN
model = Sequential([
    SimpleRNN(64, input_shape=(X_train_rnn.shape[1], X_train_rnn.shape[2])),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Обучение
model.fit(X_train_rnn, y_train, epochs=15, batch_size=32, validation_split=0.2, verbose=1)

# Оценка
loss, acc_rnn = model.evaluate(X_test_rnn, y_test, verbose=0)
print(f'Accuracy (RNN): {acc_rnn:.4f}')

Epoch 1/15
[1m579/579[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - accuracy: 0.8254 - loss: 0.4753 - val_accuracy: 0.8353 - val_loss: 0.4437
Epoch 2/15
[1m579/579[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8292 - loss: 0.4482 - val_accuracy: 0.8355 - val_loss: 0.4393
Epoch 3/15
[1m579/579[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8352 - loss: 0.4394 - val_accuracy: 0.8353 - val_loss: 0.4424
Epoch 4/15
[1m579/579[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8225 - loss: 0.4571 - val_accuracy: 0.8355 - val_loss: 0.4387
Epoch 5/15
[1m579/579[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8274 - loss: 0.4491 - val_accuracy: 0.8355 - val_loss: 0.4396
Epoch 6/15
[1m579/579[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8320 - loss: 0.4412 - val_accuracy: 0.8355 - val_loss: 0.4389
Epoch 7/15
[1m579/579[0m 

In [10]:
# Вывод результатов
results = {
    'Random Forest': acc_rf,
    'XGBoost': acc_xgb,
    'RNN': acc_rnn
}

print('\nСравнение моделей:')
for model_name, acc in results.items():
    print(f'{model_name}: {acc:.4f}')


Сравнение моделей:
Random Forest: 0.7681
XGBoost: 0.8270
RNN: 0.8297
