In [3]:
pip install geopy



In [2]:
pip install ydata-profiling

Collecting ydata-profiling
  Downloading ydata_profiling-4.12.0-py2.py3-none-any.whl.metadata (20 kB)
Collecting visions<0.7.7,>=0.7.5 (from visions[type_image_path]<0.7.7,>=0.7.5->ydata-profiling)
  Downloading visions-0.7.6-py3-none-any.whl.metadata (11 kB)
Collecting htmlmin==0.1.12 (from ydata-profiling)
  Downloading htmlmin-0.1.12.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting phik<0.13,>=0.11.1 (from ydata-profiling)
  Downloading phik-0.12.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Collecting multimethod<2,>=1.4 (from ydata-profiling)
  Downloading multimethod-1.12-py3-none-any.whl.metadata (9.6 kB)
Collecting imagehash==4.3.1 (from ydata-profiling)
  Downloading ImageHash-4.3.1-py2.py3-none-any.whl.metadata (8.0 kB)
Collecting dacite>=1.8 (from ydata-profiling)
  Downloading dacite-1.8.1-py3-none-any.whl.metadata (15 kB)
Collecting PyWavelets (from imagehash==4.3.1->ydata-profiling)
  Downloading pywavelets-1.

# Загрузка библиотек

In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.cluster import KMeans
from geopy.distance import geodesic # геодезическое расстояние между точками по поверхности Земли
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor
from ydata_profiling import ProfileReport

%matplotlib inline

import warnings                                  # `do not disturbe` mode
warnings.filterwarnings('ignore')

# Загрузка данных

In [None]:
# Only the first N_TRAIN_ROWS training rows are used downstream, so stop parsing
# the archive there instead of loading every row and slicing afterwards.
N_TRAIN_ROWS = 1_000_000
train = pd.read_csv("train.csv.zip", compression="zip", nrows=N_TRAIN_ROWS)
test = pd.read_csv("test.csv")

In [None]:
# Keep only the first 1,000,000 rows of the training data.
train = train.iloc[:1_000_000]

In [None]:
# Build a ydata-profiling report object for the training sample.
# NOTE(review): `profile` is not displayed or saved in any visible cell — confirm it is still needed.
profile = ProfileReport(train, title="Profiling Report")

# Очистка данных

In [None]:
# Render floats with two decimal places in pandas output.
pd.set_option('display.float_format', '{:.2f}'.format)

In [None]:
def box_plot(df, exclude=('pickup_datetime', 'key')):
    """Print summary statistics and draw one box plot per feature column.

    Args:
        df: DataFrame to inspect.
        exclude: column names to leave out of the statistics and plots.
            Names that are not present in ``df`` are ignored, so the helper
            keeps working after those columns have been dropped.
    """
    numerical_features = [col for col in df.columns if col not in exclude]
    print(numerical_features)
    print(df[numerical_features].describe())

    # Three plots per row; the extra row guarantees enough grid cells for every feature.
    plots_per_row = 3
    n_rows = len(numerical_features) // plots_per_row + 1
    df[numerical_features].plot(
        kind='box',
        subplots=True,
        layout=(n_rows, plots_per_row),
        figsize=(10, 10),
        sharex=False,
        sharey=False,
    )

In [None]:
# Summary statistics and box plots for the training sample before cleaning.
box_plot(train)

In [None]:
# Summary statistics and box plots for the test set.
box_plot(test)

In [None]:
def clean_df(df):
    """Drop incomplete rows and rows outside the accepted fare/passenger/coordinate ranges.

    Rows containing any missing value are removed first. The remaining rows must
    satisfy every bound in the query below. The index is then reset with
    ``reset_index()``, which keeps the previous index as an extra column
    (named ``index`` for an unnamed index).

    Returns:
        A new, filtered DataFrame.
    """
    criteria = (
        " 0 < fare_amount <= 500"
        " and 0 < passenger_count <= 6 "
        " and -75 <= pickup_longitude <= -72 "
        " and -75 <= dropoff_longitude <= -72 "
        " and 40 <= pickup_latitude <= 42 "
        " and 40 <= dropoff_latitude <= 42 "
    )
    complete_rows = df.dropna()
    rows_in_range = complete_rows.query(criteria)
    return rows_in_range.reset_index()

In [None]:
# Clean the training data and discard the old-index column left behind by clean_df.
train = clean_df(train).drop(columns='index')

In [None]:
# Summary statistics and box plots for the training sample after cleaning.
box_plot(train)

# Временные признаки

In [None]:
def add_time_features(df):
    """Derive calendar features from ``pickup_datetime`` and drop the raw column.

    The input frame is not modified; all work happens on a copy.

    Added columns:
        year, month, day, hour, day_of_week: calendar parts of the pickup time.
        is_weekend: 1 when day_of_week is 5 or 6 (Saturday/Sunday), else 0.
        is_holiday: 1 for the fixed dates 01-01, 07-04 and 12-25 (an example
            holiday list), else 0.
        hour_sin, hour_cos, month_sin, month_cos: cyclical encodings of hour
            (period 24) and month (period 12).

    Args:
        df: DataFrame with a ``pickup_datetime`` column parseable by
            ``pd.to_datetime``.

    Returns:
        A new DataFrame with the columns above and without ``pickup_datetime``.
    """
    # Work on a copy so the caller's frame keeps its original columns.
    df = df.copy()
    pickup = pd.to_datetime(df['pickup_datetime'])

    df['year'] = pickup.dt.year
    df['month'] = pickup.dt.month
    df['day'] = pickup.dt.day
    df['hour'] = pickup.dt.hour
    df['day_of_week'] = pickup.dt.dayofweek

    # Binary flags
    df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)  # Saturday and Sunday
    df['is_holiday'] = pickup.dt.strftime('%m-%d').isin(['01-01', '07-04', '12-25']).astype(int)  # example holidays

    # Cyclical encodings
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
    df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)

    return df.drop(columns='pickup_datetime')

# Географические признаки

In [None]:
from matplotlib.colors import LinearSegmentedColormap

In [None]:
# Look for significant points: trip displacement (dropoff - pickup) coloured by fare.
cmap = LinearSegmentedColormap.from_list(name='name', colors=['green', 'yellow', 'red'])

# Create the figure and axes in one call so figsize applies to the figure that is
# actually drawn; a separate plt.figure() call would only leave an empty figure behind.
f, ax = plt.subplots(figsize=(10, 10))
points = ax.scatter(
    train['dropoff_longitude'] - train['pickup_longitude'],
    train['dropoff_latitude'] - train['pickup_latitude'],
    c=train['fare_amount'],
    s=10,
    cmap=cmap,
)
ax.set(
    xlabel='dropoff_longitude - pickup_longitude',
    ylabel='dropoff_latitude - pickup_latitude',
    title='Trip displacement coloured by fare_amount',
)
f.colorbar(points, label='fare_amount');

In [None]:
def distance(x1, y1, x2, y2):
    """Return the geodesic distance in kilometres between two points.

    Each point is handed to ``geodesic`` as an ``(x, y)`` tuple; the callers in
    this notebook pass latitude as ``x`` and longitude as ``y``.
    """
    start_point = (x1, y1)
    end_point = (x2, y2)
    return geodesic(start_point, end_point).km

In [None]:
def _ordered_kmeans_labels(coords, n_clusters, random_state):
    """Fit K-means on ``coords`` and return labels numbered by centroid position.

    K-means numbers its clusters arbitrarily. The ids are therefore renumbered by
    sorting the centroids on their first coordinate (ties broken by the second),
    so the numbering no longer depends on the initialisation.
    """
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=random_state)
    raw_labels = kmeans.fit_predict(coords)
    centers = kmeans.cluster_centers_
    # lexsort uses the LAST key as the primary one: sort by column 0, then column 1.
    order = np.lexsort((centers[:, 1], centers[:, 0]))
    new_id = np.empty(len(order), dtype=int)
    new_id[order] = np.arange(len(order))
    return new_id[raw_labels]


def add_geo_features(df, n_clusters=2, random_state=42):
    """Add distance, K-means cluster and trigonometric coordinate features.

    The input frame is not modified; all work happens on a copy.

    Args:
        df: DataFrame with ``pickup_latitude``, ``pickup_longitude``,
            ``dropoff_latitude`` and ``dropoff_longitude`` columns.
        n_clusters: number of K-means clusters for the pickup and dropoff points.
        random_state: seed for K-means so repeated runs give the same clusters.

    Returns:
        A new DataFrame with these extra columns: ``distance``,
        ``abs_long_diff``, ``abs_lat_diff``, ``pickup_distance_to_nyc``,
        ``dropoff_distance_to_nyc``, ``cluster_dropoff``, ``cluster_pickup``
        and the ``dr_*`` / ``pi_*`` sine/cosine columns.

    Note:
        The clusters are fitted on whatever frame is passed in, so two frames
        processed separately get independently fitted clusters.
    """
    # Work on a copy so the caller's frame keeps its original columns.
    df = df.copy()

    # Reference point (latitude, longitude) used for the "distance to NYC" features.
    nyc = (40.724944, -74.001541)

    # Geodesic distance between pickup and dropoff.
    df['distance'] = df.apply(
        lambda row: distance(row['pickup_latitude'], row['pickup_longitude'], row['dropoff_latitude'], row['dropoff_longitude']),
        axis=1
    )
    # Absolute coordinate differences between pickup and dropoff.
    df['abs_long_diff'] = np.abs(df.dropoff_longitude - df.pickup_longitude)
    df['abs_lat_diff'] = np.abs(df.dropoff_latitude - df.pickup_latitude)

    # Distance from the pickup and dropoff points to the reference point.
    df['pickup_distance_to_nyc'] = df.apply(
        lambda row: distance(nyc[0], nyc[1], row['pickup_latitude'], row['pickup_longitude']),
        axis=1
    )
    df['dropoff_distance_to_nyc'] = df.apply(
        lambda row: distance(nyc[0], nyc[1], row['dropoff_latitude'], row['dropoff_longitude']),
        axis=1
    )

    # K-means clustering of dropoff and pickup locations (seeded, stable label numbering).
    df['cluster_dropoff'] = _ordered_kmeans_labels(
        df[['dropoff_latitude', 'dropoff_longitude']], n_clusters, random_state
    )
    df['cluster_pickup'] = _ordered_kmeans_labels(
        df[['pickup_latitude', 'pickup_longitude']], n_clusters, random_state
    )

    # Sine/cosine transforms of the coordinates (converted to radians first).
    df['dr_lat_sin'] = np.sin(np.radians(df['dropoff_latitude']))
    df['dr_lat_cos'] = np.cos(np.radians(df['dropoff_latitude']))
    df['dr_lon_sin'] = np.sin(np.radians(df['dropoff_longitude']))
    df['dr_lon_cos'] = np.cos(np.radians(df['dropoff_longitude']))

    df['pi_lat_sin'] = np.sin(np.radians(df['pickup_latitude']))
    df['pi_lat_cos'] = np.cos(np.radians(df['pickup_latitude']))
    df['pi_lon_sin'] = np.sin(np.radians(df['pickup_longitude']))
    df['pi_lon_cos'] = np.cos(np.radians(df['pickup_longitude']))

    return df

# Подготовка датасетов

In [None]:
def final_desc(df, exclude=('pickup_datetime', 'key')):
    """Print a frame, its summary statistics and null count, then plot histograms.

    Args:
        df: DataFrame to describe.
        exclude: column names to leave out of the statistics and histograms.
            Names that are not present in ``df`` are ignored, so the helper
            also works on frames where those columns were already dropped.
    """
    print(df)

    numerical_features = [col for col in df.columns if col not in exclude]
    print(numerical_features)

    print(df[numerical_features].describe())

    # Total number of missing values across the whole frame.
    print(df.isna().sum().sum())

    df[numerical_features].hist(bins=50, figsize=(10, 10))

In [None]:
# Replace pickup_datetime with derived calendar features in both frames.
train = add_time_features(train)
test = add_time_features(test)

In [None]:
# Add distance, cluster and coordinate features to both frames.
# NOTE(review): add_geo_features fits its K-means models on whichever frame it receives,
# so train and test are clustered independently of each other.
train = add_geo_features(train)
test = add_geo_features(test)

In [None]:
# Display the training frame with the engineered features.
train

# Разделение данных

In [None]:
# Drop the row identifier; it is not a model feature.
# Rebinding (instead of inplace=True) with errors='ignore' lets this cell be re-run
# without a KeyError once the column is already gone.
train = train.drop(columns=['key'], errors='ignore')

In [None]:
# Separate the target column from the model inputs.
y = train['fare_amount']
x = train.drop(columns='fare_amount')
# `train` is deliberately kept alive here (it could be deleted to free memory).

In [None]:
# Hold out 15% of the rows for validation, with a fixed seed for a repeatable split.
x_train, x_val, y_train, y_val = train_test_split(
    x,
    y,
    test_size=0.15,
    random_state=42,
)
# The unsplit inputs are no longer needed.
del x, y

# Создание и обучение модели

In [None]:
# CatBoost regressor settings, collected in one place so they are easy to tune.
catboost_params = {
    'iterations': 10000,
    'learning_rate': 0.05,
    'depth': 10,
    'loss_function': 'RMSE',
    'eval_metric': 'RMSE',
    'task_type': 'GPU',
    'random_seed': 42,
    'verbose': 200,
}
model = CatBoostRegressor(**catboost_params)

In [1]:
# Train on the training split and evaluate on the held-out validation split.
# To persist the trained model afterwards: model.save_model('my_model.cbm')
# NOTE(review): the original inline note also sketched re-creating a model from that file
# through an `init_model` argument — verify against the CatBoost API before relying on it.
model.fit(x_train, y_train, eval_set=(x_val, y_val))

NameError: name 'model' is not defined

# Предсказание

In [None]:
# Keep the row identifiers aside and remove them from the model inputs.
test_key = test['key']
x_pred = test.drop(columns='key')

In [None]:
# Predict fares for the test rows with the fitted model.
prediction = model.predict(x_pred)

In [None]:
# Assemble the submission table: one predicted fare per test key.
# The dict order already yields the columns as key, fare_amount.
submission = pd.DataFrame({'key': test['key'], 'fare_amount': prediction})

In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv('submission.csv', index=False)

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter search for the fare model.
# The target (fare_amount) is continuous, so the search uses a CatBoostRegressor scored
# by RMSE rather than a classifier scored by accuracy. A separate name is used so the
# already fitted `model` is not overwritten.
grid_model = CatBoostRegressor(loss_function='RMSE', random_seed=42, verbose=0)

param_grid = {
    'iterations': [100, 200],        # number of boosting iterations
    'depth': [6, 8, 10],             # tree depth
    'learning_rate': [0.01, 0.1],    # learning rate
    'l2_leaf_reg': [3, 5, 7]         # L2 regularisation
}

# scikit-learn maximises the score, hence the negated RMSE scorer.
grid_search = GridSearchCV(
    estimator=grid_model,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=3,
    n_jobs=-1,
)

# Fit the grid search on the training split.
grid_search.fit(x_train, y_train)

# Best parameters and best cross-validated score (sign flipped back to a plain RMSE).
print("Лучшие параметры:", grid_search.best_params_)
print("Лучший результат (RMSE):", -grid_search.best_score_)

# Evaluate the best estimator on the held-out validation split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(x_val)
val_rmse = np.sqrt(np.mean((y_val - y_pred) ** 2))
print("RMSE на валидационной выборке:", val_rmse)