In [None]:
pip install geopy

# Загрузка библиотек

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.cluster import KMeans
from geopy.distance import geodesic # геодезическое расстояние между точками по поверхности Земли
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor
%matplotlib inline

import warnings                                  # `do not disturbe` mode
warnings.filterwarnings('ignore')

# Загрузка данных

In [2]:
# Only the first 1,000,000 training rows are kept by the truncation that follows,
# so stop parsing there instead of loading the whole archive into memory first.
train = pd.read_csv("train.csv.zip", compression="zip", nrows=1_000_000)
test = pd.read_csv("test.csv")

In [3]:
# Keep only the first one million rows (positional slice).
train = train.iloc[:1_000_000]

# Очистка данных

In [4]:
# Render floats with two decimal places in DataFrame output.
pd.set_option("display.float_format", "{:.2f}".format)

In [None]:
def box_plot(df):
    """Print summary statistics and draw one box plot per feature column.

    Every column of ``df`` except ``pickup_datetime`` and ``key`` is treated as
    a feature. The feature names and their ``describe()`` table are printed,
    then each feature is drawn in its own panel on a grid three panels wide.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It does not have to contain ``pickup_datetime`` or
        ``key``; they are skipped when present.

    Returns
    -------
    None
    """
    non_feature_columns = ('pickup_datetime', 'key')
    # Filter instead of list.remove(): remove() raises ValueError when one of
    # the columns is missing (e.g. once the frame has been feature-engineered).
    numerical_features = [col for col in df.columns if col not in non_feature_columns]
    print(numerical_features)
    if not numerical_features:
        # Nothing to describe or plot.
        return
    print(df[numerical_features].describe())

    # Ceiling division gives exactly as many rows of three panels as needed;
    # "len // 3 + 1" added an empty row whenever the count was a multiple of 3.
    n_rows = -(-len(numerical_features) // 3)
    df[numerical_features].plot(kind='box', subplots=True, layout=(n_rows, 3), figsize=(10, 10), sharex=False, sharey=False)

In [None]:
# Summary statistics and box plots for the training sample before cleaning.
box_plot(train)

In [None]:
# Same diagnostics for the test set.
box_plot(test)

In [5]:
def clean_df(df):
    """Drop incomplete rows and rows whose values fall outside fixed bounds.

    A row is kept only if it has no missing values and satisfies all of:

    * ``0 < fare_amount <= 500``
    * ``0 < passenger_count <= 6``
    * pickup and dropoff longitude within ``[-75, -72]``
    * pickup and dropoff latitude within ``[40, 42]``

    Returns a new frame with a fresh 0..n-1 index. The previous row labels are
    preserved in an ``index`` column (``reset_index()`` is called without
    ``drop=True``); the input frame is not modified.
    """
    complete = df.dropna()

    fare = complete['fare_amount']
    passengers = complete['passenger_count']
    keep = (fare > 0) & (fare <= 500) & (passengers > 0) & (passengers <= 6)

    # Both trip endpoints must lie inside the same coordinate bounding box.
    for lon_col in ('pickup_longitude', 'dropoff_longitude'):
        keep &= (complete[lon_col] >= -75) & (complete[lon_col] <= -72)
    for lat_col in ('pickup_latitude', 'dropoff_latitude'):
        keep &= (complete[lat_col] >= 40) & (complete[lat_col] <= 42)

    return complete[keep].reset_index()

In [6]:
# clean_df() keeps the pre-filter row labels in an 'index' column; discard it.
train = clean_df(train).drop('index', axis=1)

In [None]:
# Re-run the diagnostics on the cleaned training data.
box_plot(train)

# Временные признаки

In [7]:
def add_time_features(df):
    """Return a new frame with calendar features derived from ``pickup_datetime``.

    Added columns: ``year``, ``month``, ``day``, ``hour``, ``day_of_week``
    (Monday = 0), the binary flags ``is_weekend`` and ``is_holiday``, and the
    cyclical encodings ``hour_sin``/``hour_cos`` and ``month_sin``/``month_cos``.
    The ``pickup_datetime`` column itself is not present in the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``pickup_datetime`` column parseable by
        ``pandas.to_datetime``.

    Returns
    -------
    pandas.DataFrame
        A new frame. ``df`` is left untouched, so the function can be called
        again on the same input (the in-place version failed on the second
        call because it had already dropped ``pickup_datetime``).
    """
    pickup = pd.to_datetime(df['pickup_datetime'])
    # drop() returns a new frame, so the assignments below never touch ``df``.
    out = df.drop(columns=['pickup_datetime'])

    out['year'] = pickup.dt.year
    out['month'] = pickup.dt.month
    out['day'] = pickup.dt.day
    out['hour'] = pickup.dt.hour
    out['day_of_week'] = pickup.dt.dayofweek

    # Binary flags
    out['is_weekend'] = out['day_of_week'].isin([5, 6]).astype(int)  # Saturday and Sunday
    # Example set of fixed-date holidays as (month, day): Jan 1, Jul 4, Dec 25.
    # Comparing the integer columns avoids formatting every timestamp to text.
    fixed_holidays = ((1, 1), (7, 4), (12, 25))
    is_holiday = np.zeros(len(out), dtype=bool)
    for month, day in fixed_holidays:
        is_holiday |= ((out['month'] == month) & (out['day'] == day)).to_numpy()
    out['is_holiday'] = is_holiday.astype(int)

    # Cyclical encodings: map hour (period 24) and month (period 12) onto a circle.
    out['hour_sin'] = np.sin(2 * np.pi * out['hour'] / 24)
    out['hour_cos'] = np.cos(2 * np.pi * out['hour'] / 24)
    out['month_sin'] = np.sin(2 * np.pi * out['month'] / 12)
    out['month_cos'] = np.cos(2 * np.pi * out['month'] / 12)

    return out

# Географические признаки

In [None]:
from matplotlib.colors import LinearSegmentedColormap

In [None]:
# Look for structure in the fares: every trip is drawn at its coordinate
# displacement (dropoff - pickup) and coloured by fare_amount.
cmap = LinearSegmentedColormap.from_list(name='name', colors=['green','yellow','red'])

# figsize goes to subplots(): a separate plt.figure(figsize=...) call only
# creates an extra empty figure and leaves this one at the default size.
f, ax = plt.subplots(figsize=(10, 10))
points = ax.scatter(train['dropoff_longitude']-train['pickup_longitude'], train['dropoff_latitude']-train['pickup_latitude'], c=train['fare_amount'],
                    s=10, cmap=cmap)
ax.set_xlabel('dropoff_longitude - pickup_longitude')
ax.set_ylabel('dropoff_latitude - pickup_latitude')
f.colorbar(points, label='fare_amount');

In [8]:
def distance(x1,y1,x2,y2):
    """Return the geodesic distance in kilometres between two points.

    The points are handed to geopy's ``geodesic`` as the tuples ``(x1, y1)``
    and ``(x2, y2)``.
    """
    start = (x1, y1)
    end = (x2, y2)
    return geodesic(start, end).km

In [9]:
def _kmeans_labels(coords, name, cluster_models, random_state):
    """Return 2-cluster KMeans labels for ``coords`` (columns: latitude, longitude).

    If ``cluster_models`` already holds a fitted model under ``name`` it is
    reused via ``predict``. Otherwise a new model is fitted on ``coords`` and,
    when ``cluster_models`` is a dict, stored in it under ``name``.
    """
    points = coords.to_numpy()
    if cluster_models is not None and name in cluster_models:
        return cluster_models[name].predict(points)

    model = KMeans(n_clusters=2, random_state=random_state).fit(points)
    # KMeans numbers its clusters arbitrarily. Sort the centroids by latitude,
    # then longitude, so that label 0 always denotes the lower-latitude cluster,
    # whichever data set the model was fitted on.
    centers = model.cluster_centers_
    order = np.lexsort((centers[:, 1], centers[:, 0]))
    model.cluster_centers_ = centers[order]
    if cluster_models is not None:
        cluster_models[name] = model
    # predict() assigns labels from the (re-ordered) centroids.
    return model.predict(points)


def add_geo_features(df, cluster_models=None, random_state=42):
    """Return a copy of ``df`` with geographic features added.

    Added columns, in order:

    * ``distance`` - geodesic distance in km between pickup and dropoff;
    * ``abs_long_diff`` / ``abs_lat_diff`` - absolute coordinate differences;
    * ``pickup_distance_to_nyc`` / ``dropoff_distance_to_nyc`` - geodesic
      distance in km from each endpoint to the fixed reference point ``nyc``;
    * ``cluster_dropoff`` / ``cluster_pickup`` - 2-cluster KMeans label of each
      endpoint;
    * ``dr_*`` / ``pi_*`` - sine and cosine of the dropoff / pickup latitude
      and longitude converted to radians.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``pickup_latitude``, ``pickup_longitude``, ``dropoff_latitude``
        and ``dropoff_longitude``. It is not modified.
    cluster_models : dict, optional
        Shared store for the fitted KMeans models (keys ``'dropoff'`` and
        ``'pickup'``). Pass the same dict for the training and the test frame
        (``models = {}``) so the test frame is labelled by the models fitted on
        the training frame. With the default ``None`` the models are fitted on
        ``df`` itself, as before.
    random_state : int, optional
        Seed for KMeans, so repeated runs give the same clusters.

    Returns
    -------
    pandas.DataFrame
    """
    # Reference point (latitude, longitude).
    nyc = (40.724944, -74.001541)

    out = df.copy()

    # Iterating the columns directly gives the same per-row distance() calls as
    # df.apply(..., axis=1) without building a Series object for every row.
    pickup_points = list(zip(out['pickup_latitude'], out['pickup_longitude']))
    dropoff_points = list(zip(out['dropoff_latitude'], out['dropoff_longitude']))

    # Distance between pickup and dropoff.
    out['distance'] = [
        distance(p_lat, p_lon, d_lat, d_lon)
        for (p_lat, p_lon), (d_lat, d_lon) in zip(pickup_points, dropoff_points)
    ]
    # Absolute coordinate differences.
    out['abs_long_diff'] = np.abs(out.dropoff_longitude - out.pickup_longitude)
    out['abs_lat_diff'] = np.abs(out.dropoff_latitude - out.pickup_latitude)

    # Distance from the pickup and dropoff points to the reference point.
    out['pickup_distance_to_nyc'] = [
        distance(nyc[0], nyc[1], lat, lon) for lat, lon in pickup_points
    ]
    out['dropoff_distance_to_nyc'] = [
        distance(nyc[0], nyc[1], lat, lon) for lat, lon in dropoff_points
    ]

    # K-means clustering of the endpoints.
    out['cluster_dropoff'] = _kmeans_labels(
        out[['dropoff_latitude', 'dropoff_longitude']], 'dropoff', cluster_models, random_state
    )
    out['cluster_pickup'] = _kmeans_labels(
        out[['pickup_latitude', 'pickup_longitude']], 'pickup', cluster_models, random_state
    )

    # Sine / cosine of the raw coordinates in radians. These encode position,
    # not a travel direction or bearing.
    out['dr_lat_sin'] = np.sin(np.radians(out['dropoff_latitude']))
    out['dr_lat_cos'] = np.cos(np.radians(out['dropoff_latitude']))
    out['dr_lon_sin'] = np.sin(np.radians(out['dropoff_longitude']))
    out['dr_lon_cos'] = np.cos(np.radians(out['dropoff_longitude']))

    out['pi_lat_sin'] = np.sin(np.radians(out['pickup_latitude']))
    out['pi_lat_cos'] = np.cos(np.radians(out['pickup_latitude']))
    out['pi_lon_sin'] = np.sin(np.radians(out['pickup_longitude']))
    out['pi_lon_cos'] = np.cos(np.radians(out['pickup_longitude']))

    return out

# Подготовка датасетов

In [10]:
def final_desc(df):
    """Print a textual overview of ``df`` and draw a histogram per feature.

    Prints the frame, the list of feature columns (every column except
    ``pickup_datetime`` and ``key``), their ``describe()`` table and the total
    number of missing values in the whole frame, then plots 50-bin histograms
    of the feature columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. ``pickup_datetime`` and ``key`` are skipped when
        present and are not required.

    Returns
    -------
    None
    """
    print(df)

    # Filter instead of list.remove(): the feature-engineering steps drop
    # 'pickup_datetime' (and later 'key'), and remove() raises ValueError for a
    # missing element.
    excluded = ('pickup_datetime', 'key')
    numerical_features = [col for col in df.columns if col not in excluded]

    print(numerical_features)

    if numerical_features:
        print(df[numerical_features].describe())

    # Total count of missing cells across all columns.
    print(df.isna().sum().sum())

    if numerical_features:
        df[numerical_features].hist(bins=50, figsize=(10, 10))

In [11]:
# Derive the calendar features for both data sets with the same function so
# their columns stay aligned.
train = add_time_features(train)
test = add_time_features(test)

In [12]:
# Add the geographic features to both data sets.
# NOTE(review): each call fits its own KMeans models, so the cluster features
# of train and test come from two independent fits - confirm this is intended.
train = add_geo_features(train)
test = add_geo_features(test)

In [None]:
# Display the engineered training frame.
train

# Разделение данных

In [13]:
# 'key' is a row identifier, not a predictor. Re-binding with errors='ignore'
# (instead of an in-place drop) keeps this cell re-runnable: the in-place
# version raised KeyError on a second execution because 'key' was already gone.
train = train.drop(columns=['key'], errors='ignore')

In [14]:
# Separate the regression target from the predictor columns.
y = train['fare_amount']
x = train.drop(columns=['fare_amount'])

In [15]:
# Hold out 15% of the rows for validation; the seed makes the split repeatable.
x_train, x_val, y_train, y_val = train_test_split(
    x, y, test_size=0.15, random_state=42
)
# The unsplit frames are no longer needed.
del x, y

# Создание и обучение модели

In [21]:
# Hyper-parameters of the CatBoost fare regressor, collected in one place.
catboost_params = {
    'iterations': 10000,
    'learning_rate': 0.05,
    'depth': 10,
    'loss_function': 'RMSE',
    'eval_metric': 'RMSE',
    'task_type': 'GPU',
    'random_seed': 42,
    'verbose': 200,   # log every 200th iteration
}
model = CatBoostRegressor(**catboost_params)

In [22]:
# Train on the training split; the validation split is passed as eval_set and
# shows up as the "test" column of the training log.
model.fit(x_train, y_train, eval_set=(x_val, y_val))

0:	learn: 9.3902261	test: 9.3717227	best: 9.3717227 (0)	total: 9.48ms	remaining: 1m 34s
200:	learn: 3.7194956	test: 3.9942561	best: 3.9942561 (200)	total: 1.74s	remaining: 1m 25s
400:	learn: 3.4564245	test: 3.9228298	best: 3.9228298 (400)	total: 3.51s	remaining: 1m 24s
600:	learn: 3.2912390	test: 3.8970534	best: 3.8969326 (599)	total: 5.26s	remaining: 1m 22s
800:	learn: 3.1727455	test: 3.8838791	best: 3.8838791 (800)	total: 7.02s	remaining: 1m 20s
1000:	learn: 3.0780370	test: 3.8788551	best: 3.8781526 (970)	total: 8.79s	remaining: 1m 19s
1200:	learn: 2.9989812	test: 3.8718285	best: 3.8716877 (1195)	total: 10.6s	remaining: 1m 17s
1400:	learn: 2.9334308	test: 3.8675245	best: 3.8675168 (1397)	total: 12.3s	remaining: 1m 15s
1600:	learn: 2.8733150	test: 3.8658376	best: 3.8656445 (1550)	total: 14.1s	remaining: 1m 13s
1800:	learn: 2.8226328	test: 3.8649811	best: 3.8648202 (1796)	total: 15.8s	remaining: 1m 11s
2000:	learn: 2.7759304	test: 3.8637500	best: 3.8637412 (1990)	total: 17.6s	remaining

<catboost.core.CatBoostRegressor at 0x22c3de2de10>

# Предсказание

In [23]:
# Keep the row identifiers aside; the model only receives the feature columns.
test_key = test['key']
x_pred = test.drop(columns=['key'])

In [24]:
# Predicted fare_amount for every row of the test features.
prediction = model.predict(x_pred)

In [25]:
# Two-column submission table: the row identifier, then the predicted fare.
submission = pd.DataFrame({'key': test['key']}).assign(fare_amount=prediction)

In [26]:
# Write the submission file without the DataFrame index column.
submission.to_csv('submission.csv', index=False)