In [47]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.stats.diagnostic import het_breuschpagan, normal_ad
from sklearn.preprocessing import OneHotEncoder
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")

In [19]:
df = pd.read_csv('cars_clean_clean')

In [20]:
final_featues = ['log_age', 
               'brand_Geely', 
               'is_restyling', 
               'color_group_Холодные', 
               'log_power_2', 
               'city_group_Москва', 
               'gearbox_механика', 
               'drive_полный',
               'car_class_Crossover']
numeric_feats = ['mileage', 'power_1', 'power_2', 'age']
cat_feats = ['brand', 'color_group', 'gearbox', 'drive', 'fuel_type', 'city_group', 'car_class']
dummies = ['is_restyling', 'is_pro', 'is_max', 'is_premium']
target = 'price'

In [21]:
def prepare_data(df, target_col, categorical_cols, numeric_cols=None, dummies=None):
    """Подготовка данных с обработкой категориальных и числовых признаков"""
    # Создание dummy-переменных
    encoder = OneHotEncoder(drop='first', sparse_output=False)
    encoded = encoder.fit_transform(df[categorical_cols])
    encoded_df = pd.DataFrame(encoded, 
                            columns=encoder.get_feature_names_out(categorical_cols))
    
    # Обработка числовых признаков
    if numeric_cols is None:
        numeric_cols = df.drop(columns=[target_col] + categorical_cols).columns
    
    X_numeric = df[numeric_cols].copy()
    X_dum = df[dummies].copy()
    
    # Создание преобразованных признаков
    X_transformed = pd.DataFrame()
    for col in numeric_cols:
        X_transformed[f'log_{col}'] = np.log1p(X_numeric[col])
        X_transformed[f'sq_{col}'] = X_numeric[col]**2
    
    # Объединение всех признаков
    X = pd.concat([X_numeric, X_transformed, encoded_df, X_dum], axis=1)
    y = df[target_col]
    
    return X, y, np.log1p(y), encoder

In [22]:
# Подготовка данных
X_full, y, log_y, encoder = prepare_data(df, target, cat_feats, numeric_feats, dummies)

In [24]:
X_final = sm.add_constant(X_full[final_featues])
model_linear_final = sm.OLS(log_y, X_final).fit(cov_type='HC3')  
print(model_linear_final.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.894
Model:                            OLS   Adj. R-squared:                  0.893
Method:                 Least Squares   F-statistic:                     1216.
Date:                Thu, 08 May 2025   Prob (F-statistic):               0.00
Time:                        08:35:32   Log-Likelihood:                 355.17
No. Observations:                1315   AIC:                            -690.3
Df Residuals:                    1305   BIC:                            -638.5
Df Model:                           9                                         
Covariance Type:                  HC3                                         
                           coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                   11.3904 

# Новое наблюдение

In [55]:
model_linear_final.predict(X_final)

0       14.666750
1       13.567497
2       14.666750
3       14.673421
4       14.279631
          ...    
1310    15.174606
1311    14.762645
1312    14.680269
1313    14.492286
1314    15.256982
Length: 1315, dtype: float64

In [66]:
data = {
    'const': [1], 
    'log_age': [np.log(2)],
    'brand_Geely': [1],
    'is_restyling': [0],
    'color_group_Холодные': [1],
    'log_power_2': [np.log(361)],
    'city_group_Москва': [1],
    'gearbox_механика': [0],
    'drive_полный': [0],
    'car_class_Crossover': [0]
}
new_obs = pd.DataFrame(data)

pred = model_linear_final.predict(new_obs)

# Рассчитываем доверительный интервал
_, pred_lower, pred_upper = wls_prediction_std(model_linear_final, exog=new_obs, alpha=0.05)

print(f"Предсказанная цена (в логарифмах): {pred[0]:.4f}")
print(f"Доверительный интервал 95%: [{pred_lower[0]:.4f}, {pred_upper[0]:.4f}]")

# Преобразуем обратно из логарифмов
print(f"\nПредсказанная цена (в исходной шкале): {np.exp(pred[0]):.2f} RUB")
print(f"Доверительный интервал 95%: [{np.exp(pred_lower[0]):.2f}, {np.exp(pred_upper[0]):.2f}] RUB")

Предсказанная цена (в логарифмах): 15.2159
Доверительный интервал 95%: [14.8432, 15.5886]

Предсказанная цена (в исходной шкале): 4056637.95 RUB
Доверительный интервал 95%: [2794539.51, 5888738.17] RUB


In [45]:
new_obs 

Unnamed: 0,const,log_age,brand_Geely,is_restyling,color_group_Холодные,log_power_2,city_group_Москва,gearbox_механика,drive_полный,car_class_Crossover
0,1,1.609438,1,1,1,10.021271,1,0,1,0


In [36]:
X_full[final_featues].shape

(1315, 9)

In [37]:
X_

Unnamed: 0,const,log_age,brand_Geely,is_restyling,color_group_Холодные,log_power_2,city_group_Москва,gearbox_механика,drive_полный,car_class_Crossover
0,1.0,0.693147,0.0,0,0.0,5.181784,1.0,0.0,1.0,0.0
1,1.0,0.693147,0.0,0,0.0,4.736198,1.0,1.0,0.0,0.0
2,1.0,0.693147,0.0,0,0.0,5.181784,1.0,0.0,1.0,0.0
3,1.0,1.098612,1.0,1,1.0,5.303305,0.0,0.0,0.0,0.0
4,1.0,1.386294,0.0,1,0.0,4.997212,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
1310,1.0,0.000000,1.0,1,0.0,5.476464,1.0,0.0,1.0,1.0
1311,1.0,0.693147,1.0,0,1.0,5.204007,1.0,0.0,0.0,0.0
1312,1.0,0.693147,1.0,0,0.0,5.204007,1.0,0.0,0.0,0.0
1313,1.0,1.386294,1.0,0,1.0,5.181784,0.0,0.0,1.0,0.0


In [39]:
sm.add_constant(new_obs)

Unnamed: 0,log_age,brand_Geely,is_restyling,color_group_Холодные,log_power_2,city_group_Москва,gearbox_механика,drive_полный,car_class_Crossover
0,1.609438,1,1,1,10.021271,1,0,1,0
