In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import Perceptron
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Загрузка данных и сэмплирование
df = pd.read_csv('../data/trips.csv', encoding="utf_16_le")
df = df.sample(frac=0.2, random_state=42).copy()

In [None]:
# Преобразование временных меток
df['started_at'] = pd.to_datetime(df['started_at'])
df['ended_at'] = pd.to_datetime(df['ended_at'])

# Продолжительность поездки в минутах
df['duration'] = (df['ended_at'] - df['started_at']).dt.total_seconds() / 60

# Фильтр аномалий (поездки с отрицательной длительностью)
df = df[df['duration'] > 0]

# Извлекаем час и день недели
df['hour'] = df['started_at'].dt.hour
df['day_of_week'] = df['started_at'].dt.dayofweek.map({
    0: 'Mon', 1: 'Tue', 2: 'Wed', 3: 'Thu', 4: 'Fri', 5: 'Sat', 6: 'Sun'
})

# Удаляем ненужные параметры
df.drop(["started_at", "ended_at", "start_station_name", "end_station_name", "start_lat", "start_lng", "end_lat", "end_lng"], axis=1, inplace=True)
# df.drop(["started_at", "ended_at"], axis=1, inplace=True)

# Кодирование категориальных признаков
le = LabelEncoder()
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = le.fit_transform(df[col])

# Целевая переменная
y = df['member_casual']

# Признаки
X = df[['duration', 'rideable_type', 'hour', 'day_of_week']]

# Нормализация
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Разделение на train/test
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [4]:
# Персептрон
perceptron = Perceptron(max_iter=1000, tol=1e-3)
perceptron.fit(X_train, y_train)
y_pred_p = perceptron.predict(X_test)

# Случайный лес
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# XGBoost
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [5]:
acc_p = accuracy_score(y_test, y_pred_p)
acc_rf = accuracy_score(y_test, y_pred_rf)
acc_xgb = accuracy_score(y_test, y_pred_xgb)

print(f'Персептрон: {acc_p:.4f}')
print(f'Случайный лес: {acc_rf:.4f}')
print(f'XGBoost: {acc_xgb:.4f}')

Персептрон: 0.8222
Случайный лес: 0.7676
XGBoost: 0.8228
