In [2]:
import itertools
import os

import joblib
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pulp
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBRegressor

In [27]:
# Notebook configuration: input CSV and the folder that receives every
# artifact written by the evaluation cell (plots, metric tables).
DATA_PATH = 'transport_dataset_ml.csv'
OUTPUT_DIR = 'model_outputs'

# Create the output folder up front; a pre-existing folder is not an error.
os.makedirs(name=OUTPUT_DIR, exist_ok=True)

In [28]:
# Read the full dataset, then separate the regression target (`price_rub`)
# from the remaining columns, which form the feature frame.
df = pd.read_csv(DATA_PATH)
target_col = 'price_rub'
y = df[target_col]
X = df.drop(columns=target_col)

In [5]:
# Columns that are standardized vs. one-hot encoded. Any column not listed
# here is dropped by the transformer (remainder='drop').
numeric_features = ['distance_km', 'weight_tons', 'volume_m3', 'fuel_price']
categorical_features = [
    'origin_city', 'destination_city', 'cargo_type',
    'transport_type', 'season', 'day_of_week',
]

numeric_scaler = StandardScaler()
# handle_unknown='ignore': categories unseen during fit do not raise at transform time.
category_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_scaler, numeric_features),
        ('cat', category_encoder, categorical_features),
    ],
    remainder='drop',
)

In [6]:
# Hold out 20% of the rows for the final test evaluation; the fixed seed
# keeps the split reproducible across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [7]:
# Candidate regressors keyed by the label used in result tables and in the
# names of the saved plot files. Insertion order defines evaluation order.
models = {}
models['LinearRegression'] = LinearRegression()
models['RandomForest'] = RandomForestRegressor(n_estimators=100, random_state=42)
models['XGBoost'] = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
models['LightGBM'] = lgb.LGBMRegressor(
    n_estimators=100,
    learning_rate=0.1,
    force_col_wise='true',
    random_state=42,
)
models['MLP'] = MLPRegressor(random_state=42, max_iter=500, early_stopping=True)


In [8]:
# Evaluate every candidate model:
#   1. 5-fold cross-validation on the training split (MAE, RMSE, R2);
#   2. a single fit on the full training split, scored on the held-out test split;
#   3. an actual-vs-predicted scatter plot, plus a top-10 feature-importance
#      bar chart for models that expose `feature_importances_`.
# Metric tables are written to OUTPUT_DIR as CSV and printed.
cv_results = []
test_results = []

# All three CV metrics are taken from the same five fits per model
# (one cross_validate call) instead of refitting once per metric.
cv_scoring = ['neg_mean_absolute_error', 'neg_mean_squared_error', 'r2']

for name, model in models.items():
    pipe = Pipeline([('preproc', preprocessor), ('model', model)])

    cv_scores = cross_validate(pipe, X_train, y_train, cv=5, scoring=cv_scoring)
    # scikit-learn reports error metrics negated, hence the sign flips.
    mae_cv = -cv_scores['test_neg_mean_absolute_error'].mean()
    # RMSE is the square root of the fold-averaged MSE.
    rmse_cv = np.sqrt(-cv_scores['test_neg_mean_squared_error'].mean())
    r2_cv = cv_scores['test_r2'].mean()
    cv_results.append({'Model': name, 'CV MAE': round(mae_cv, 2),
                       'CV RMSE': round(rmse_cv, 2), 'CV R2': round(r2_cv, 2)})

    # Final fit on the whole training split. This fits `model` and the shared
    # `preprocessor` in place (Pipeline does not clone its steps).
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    test_results.append({'Model': name, 'MAE': round(mae, 2),
                         'RMSE': round(rmse, 2), 'R2': round(r2, 2)})

    # Actual vs. predicted scatter with the y = x reference line.
    fig, ax = plt.subplots(figsize=(5, 5))
    ax.scatter(y_test, y_pred, s=10, alpha=0.6)
    mn, mx = min(y_test.min(), y_pred.min()), max(y_test.max(), y_pred.max())
    ax.plot([mn, mx], [mn, mx], '--', c='gray')
    ax.set_xlabel('Фактические цены')
    ax.set_ylabel('Прогнозные цены')
    ax.set_title(f'{name}: Фактические vs Прогнозные')
    fig.tight_layout()
    fig.savefig(os.path.join(OUTPUT_DIR, f'{name}_scatter.png'))
    plt.close(fig)

    # `model` is the very object stored in the pipeline, so one check suffices.
    if hasattr(model, 'feature_importances_'):
        feat_imp = model.feature_importances_
        cat_names = (
            pipe.named_steps['preproc']
            .named_transformers_['cat']
            .get_feature_names_out(categorical_features)
        )
        # Same order as the ColumnTransformer output: numeric block, then one-hot block.
        feat_names = numeric_features + list(cat_names)
        idx = np.argsort(feat_imp)[-10:]  # indices of the 10 largest importances
        fig, ax = plt.subplots(figsize=(6, 4))
        ax.barh(np.array(feat_names)[idx], feat_imp[idx])
        ax.set_xlabel('Важность признака')
        ax.set_title(f'{name}: Топ-10 признаков')
        fig.tight_layout()
        fig.savefig(os.path.join(OUTPUT_DIR, f'{name}_featimp.png'))
        plt.close(fig)

cv_df = pd.DataFrame(cv_results).set_index('Model')
test_df = pd.DataFrame(test_results).set_index('Model')

cv_df.to_csv(os.path.join(OUTPUT_DIR, 'cv_results.csv'))
test_df.to_csv(os.path.join(OUTPUT_DIR, 'test_results.csv'))

print("Кросс-валидация:\n", cv_df)
print("\nТестовые результаты:\n", test_df)
print(f"\nГрафики и важности сохранены в папке: {OUTPUT_DIR}")

[LightGBM] [Info] Total Bins 1078
[LightGBM] [Info] Number of data points in the train set: 64000, number of used features: 33
[LightGBM] [Info] Start training from score 4659.171268




[LightGBM] [Info] Total Bins 1078
[LightGBM] [Info] Number of data points in the train set: 64000, number of used features: 33
[LightGBM] [Info] Start training from score 4658.764364




[LightGBM] [Info] Total Bins 1078
[LightGBM] [Info] Number of data points in the train set: 64000, number of used features: 33
[LightGBM] [Info] Start training from score 4655.543410




[LightGBM] [Info] Total Bins 1078
[LightGBM] [Info] Number of data points in the train set: 64000, number of used features: 33
[LightGBM] [Info] Start training from score 4658.786494




[LightGBM] [Info] Total Bins 1078
[LightGBM] [Info] Number of data points in the train set: 64000, number of used features: 33
[LightGBM] [Info] Start training from score 4647.185876




[LightGBM] [Info] Total Bins 1078
[LightGBM] [Info] Number of data points in the train set: 64000, number of used features: 33
[LightGBM] [Info] Start training from score 4659.171268




[LightGBM] [Info] Total Bins 1078
[LightGBM] [Info] Number of data points in the train set: 64000, number of used features: 33
[LightGBM] [Info] Start training from score 4658.764364




[LightGBM] [Info] Total Bins 1078
[LightGBM] [Info] Number of data points in the train set: 64000, number of used features: 33
[LightGBM] [Info] Start training from score 4655.543410




[LightGBM] [Info] Total Bins 1078
[LightGBM] [Info] Number of data points in the train set: 64000, number of used features: 33
[LightGBM] [Info] Start training from score 4658.786494




[LightGBM] [Info] Total Bins 1078
[LightGBM] [Info] Number of data points in the train set: 64000, number of used features: 33
[LightGBM] [Info] Start training from score 4647.185876




[LightGBM] [Info] Total Bins 1078
[LightGBM] [Info] Number of data points in the train set: 64000, number of used features: 33
[LightGBM] [Info] Start training from score 4659.171268




[LightGBM] [Info] Total Bins 1078
[LightGBM] [Info] Number of data points in the train set: 64000, number of used features: 33
[LightGBM] [Info] Start training from score 4658.764364




[LightGBM] [Info] Total Bins 1078
[LightGBM] [Info] Number of data points in the train set: 64000, number of used features: 33
[LightGBM] [Info] Start training from score 4655.543410




[LightGBM] [Info] Total Bins 1078
[LightGBM] [Info] Number of data points in the train set: 64000, number of used features: 33
[LightGBM] [Info] Start training from score 4658.786494




[LightGBM] [Info] Total Bins 1078
[LightGBM] [Info] Number of data points in the train set: 64000, number of used features: 33
[LightGBM] [Info] Start training from score 4647.185876




[LightGBM] [Info] Total Bins 1078
[LightGBM] [Info] Number of data points in the train set: 80000, number of used features: 33
[LightGBM] [Info] Start training from score 4655.890283




Кросс-валидация:
                   CV MAE  CV RMSE  CV R2
Model                                   
LinearRegression  770.19  1089.25   0.89
RandomForest       22.38    30.86   1.00
XGBoost            25.20    33.74   1.00
LightGBM           31.75    42.33   1.00
MLP               100.91   139.95   1.00

Тестовые результаты:
                      MAE     RMSE    R2
Model                                  
LinearRegression  768.30  1084.61  0.89
RandomForest       21.22    29.14  1.00
XGBoost            24.51    32.70  1.00
LightGBM           31.08    41.24  1.00
MLP                70.13    94.92  1.00

Графики и важности сохранены в папке: model_outputs


In [9]:
# Persist the XGBoost pipeline (preprocessing + regressor) for the
# optimisation stage below. Only this pipeline is fitted here: refitting the
# other models would cost training time and their results would be discarded.
xgb_pipeline = Pipeline([('preproc', preprocessor), ('model', models['XGBoost'])])
xgb_pipeline.fit(X_train, y_train)
joblib.dump(xgb_pipeline, 'xgboost_pipeline.pkl')
print("Сохранён XGBoost-пайплайн в файл xgboost_pipeline.pkl")

Сохранён XGBoost-пайплайн в файл xgboost_pipeline.pkl
[LightGBM] [Info] Total Bins 1078
[LightGBM] [Info] Number of data points in the train set: 80000, number of used features: 33
[LightGBM] [Info] Start training from score 4655.890283


In [29]:
# Reload the persisted XGBoost pipeline and the dataset for the
# transport-optimisation stage. The pickle is the file written by this
# notebook; never point joblib.load at an untrusted file.
model = joblib.load('xgboost_pipeline.pkl')
# Use the configured dataset path rather than repeating the literal.
df = pd.read_csv(DATA_PATH)

In [30]:
# Total tonnage per origin city (supply) and per destination city (demand).
# Both are sums of the same `weight_tons` column, grouped by different keys.
tons_by_origin = df.groupby('origin_city')['weight_tons'].sum()
tons_by_destination = df.groupby('destination_city')['weight_tons'].sum()

supply = tons_by_origin.to_dict()
demand = tons_by_destination.to_dict()

# Node lists for the transport problem, in dictionary (group) order.
suppliers = list(supply)
consumers = list(demand)

In [31]:
# One row per (origin, destination) pair: the full supplier x consumer grid.
route_pairs = list(itertools.product(suppliers, consumers))
grid = pd.DataFrame(route_pairs, columns=['origin_city', 'destination_city'])

In [32]:
# Build one representative feature row per route: the mean of each numeric
# column and the most frequent value of each categorical column.
numeric_cols = ['distance_km', 'weight_tons', 'volume_m3', 'fuel_price']
categorical_cols = ['cargo_type', 'transport_type', 'season', 'day_of_week']


def _most_frequent(values):
    """Return the first mode of `values` (ties resolved by `Series.mode` order)."""
    return values.mode().iat[0]


by_route = df.groupby(['origin_city', 'destination_city'])
agg_num = by_route[numeric_cols].mean().reset_index()
agg_cat = by_route[categorical_cols].agg(_most_frequent).reset_index()


In [33]:
# Attach the per-route numeric and categorical aggregates to the route grid.
# Starting from the key columns only keeps this cell idempotent: re-running it
# rebuilds the feature columns instead of producing `_x`/`_y` suffixed
# duplicates of columns added by a previous run.
route_keys = ['origin_city', 'destination_city']
grid = (
    grid[route_keys]
    .merge(agg_num, on=route_keys, how='left')
    .merge(agg_cat, on=route_keys, how='left')
)


In [34]:
# Baseline cost per route: the mean observed price over all shipments
# on that (origin, destination) pair.
avg = (
    df
    .groupby(['origin_city', 'destination_city'])['price_rub']
    .mean()
    .reset_index(name='avg_cost')
)
# Drop any `avg_cost` left by a previous run of this cell so a re-run does not
# create `avg_cost_x` / `avg_cost_y` and break later cells that expect the
# plain `avg_cost` column.
grid = (
    grid
    .drop(columns=['avg_cost'], errors='ignore')
    .merge(avg, on=['origin_city', 'destination_city'], how='left')
)

In [35]:
# Predict a price for each route's representative row. The baseline column is
# removed first so it is not part of the frame handed to the pipeline.
route_features = grid.drop(columns=['avg_cost'])
grid['pred_cost'] = model.predict(route_features)
grid

Unnamed: 0,origin_city,destination_city,distance_km,weight_tons,volume_m3,fuel_price,cargo_type,transport_type,season,day_of_week,avg_cost,pred_cost
0,Екатеринбург,Екатеринбург,1510.14666,10.26722,50.657626,59.619687,общий,авиа,зима,Сб,4550.812753,6471.020508
1,Екатеринбург,Казань,1545.446246,10.142774,50.476433,60.390973,общий,авиа,зима,Пн,4652.023392,6672.660645
2,Екатеринбург,Москва,1539.615022,10.242313,49.839631,60.128426,опасный,морской,осень,Ср,4579.449978,4034.02417
3,Екатеринбург,Новосибирск,1556.96109,10.227408,51.10973,59.923487,общий,жд,лето,Пт,4685.611242,3002.842529
4,Екатеринбург,Санкт-Петербург,1569.300346,10.378346,50.349294,60.231478,опасный,жд,зима,Чт,4666.061133,4451.127441
5,Казань,Екатеринбург,1556.824755,10.259885,51.405697,59.959402,опасный,жд,осень,Пт,4717.597951,4451.127441
6,Казань,Казань,1558.821383,10.280469,49.378214,60.19303,хрупкий,авто,весна,Чт,4660.148168,3658.835449
7,Казань,Москва,1546.668917,10.38665,51.034687,59.670431,скоропортящийся,авто,осень,Вс,4585.354726,3954.361084
8,Казань,Новосибирск,1550.740246,10.31845,50.249612,60.087461,опасный,авто,зима,Ср,4649.967716,5012.309082
9,Казань,Санкт-Петербург,1542.520099,10.152242,50.843909,60.166392,опасный,морской,весна,Вт,4651.959411,4036.223389


In [36]:
def _route_cost_table(cost_column):
    """Map every (supplier, consumer) pair to its value in `grid[cost_column]`.

    The first matching grid row is used for each pair, converted to a
    plain Python float.
    """
    table = {}
    for src, dst in itertools.product(suppliers, consumers):
        on_route = (grid['origin_city'] == src) & (grid['destination_city'] == dst)
        table[(src, dst)] = float(grid.loc[on_route, cost_column].iat[0])
    return table


# Cost matrices for the transport problem: historical average vs. model prediction.
C_bar = _route_cost_table('avg_cost')
C_hat = _route_cost_table('pred_cost')


In [37]:
def solve_transport(C):
    """Solve the transport LP for cost table `C` and return the optimal objective.

    Minimises sum(C[i, j] * x[i][j]) over non-negative continuous flows
    x[i][j], with each supplier's outgoing flow equal to `supply[i]` and each
    consumer's incoming flow equal to `demand[j]`. Reads the module-level
    `suppliers`, `consumers`, `supply` and `demand`.

    Args:
        C: mapping indexed by (supplier, consumer) giving the cost coefficient
            of that route.

    Returns:
        The objective value at the optimum.

    Raises:
        RuntimeError: if the solver does not report an optimal solution
            (e.g. the problem is infeasible), instead of silently returning
            a meaningless objective value.

    NOTE(review): in this notebook the coefficients come from per-shipment
    prices while the flows are bounded by tonnage sums; confirm the intended
    units of the objective.
    """
    routes = list(itertools.product(suppliers, consumers))

    prob = pulp.LpProblem('Transport_Problem', pulp.LpMinimize)
    x = pulp.LpVariable.dicts('x', (suppliers, consumers), lowBound=0, cat='Continuous')

    # Objective function: total cost over all routes.
    prob += pulp.lpSum(C[i, j] * x[i][j] for i, j in routes)

    # Supply constraints: everything available at a supplier is shipped.
    for i in suppliers:
        prob += pulp.lpSum(x[i][j] for j in consumers) == supply[i]

    # Demand constraints: every consumer receives exactly its demand.
    for j in consumers:
        prob += pulp.lpSum(x[i][j] for i in suppliers) == demand[j]

    status = prob.solve(pulp.PULP_CBC_CMD(msg=False))
    if status != pulp.LpStatusOptimal:
        raise RuntimeError(
            f"Transport problem was not solved to optimality: {pulp.LpStatus[status]}"
        )
    return pulp.value(prob.objective)

In [38]:
# Solve the same transport problem under both cost tables and report how the
# optimal objective changes when ML-predicted costs replace historical averages.
Z_base = solve_transport(C_bar)
Z_ml = solve_transport(C_hat)

delta_z = Z_base - Z_ml
relative_change = delta_z / Z_base * 100

print(f"Baseline (avg cost): Z = {Z_base:.2f}")
print(f"ML-predicted cost:   Z = {Z_ml:.2f}")
print(f"Absolute difference: ΔZ = {delta_z:.2f}")
print(f"Relative change:     {relative_change:.2f}%")

Baseline (avg cost): Z = 4724959916.99
ML-predicted cost:   Z = 3480245996.87
Absolute difference: ΔZ = 1244713920.12
Relative change:     26.34%
