In [153]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error as rmse

In [154]:
df = pd.read_csv('Walmart.csv')
df

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,1,05-02-2010,1643690.90,0,42.31,2.572,211.096358,8.106
1,1,12-02-2010,1641957.44,1,38.51,2.548,211.242170,8.106
2,1,19-02-2010,1611968.17,0,39.93,2.514,211.289143,8.106
3,1,26-02-2010,1409727.59,0,46.63,2.561,211.319643,8.106
4,1,05-03-2010,1554806.68,0,46.50,2.625,211.350143,8.106
...,...,...,...,...,...,...,...,...
6430,45,28-09-2012,713173.95,0,64.88,3.997,192.013558,8.684
6431,45,05-10-2012,733455.07,0,64.89,3.985,192.170412,8.667
6432,45,12-10-2012,734464.36,0,54.47,4.000,192.327265,8.667
6433,45,19-10-2012,718125.53,0,56.47,3.969,192.330854,8.667


In [155]:
df['Date'] = pd.to_datetime(df['Date'], format='%d-%m-%Y')
df['DateSec'] = (df['Date']).astype(np.int64)/1000000000
df.sort_values(by=['Date', 'Store'], ascending=True, inplace=True)
df

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,DateSec
0,1,2010-02-05,1643690.90,0,42.31,2.572,211.096358,8.106,1.265328e+09
143,2,2010-02-05,2136989.46,0,40.19,2.572,210.752605,8.324,1.265328e+09
286,3,2010-02-05,461622.22,0,45.71,2.572,214.424881,7.368,1.265328e+09
429,4,2010-02-05,2135143.87,0,43.76,2.598,126.442065,8.623,1.265328e+09
572,5,2010-02-05,317173.10,0,39.70,2.572,211.653972,6.566,1.265328e+09
...,...,...,...,...,...,...,...,...,...
5862,41,2012-10-26,1316542.59,0,41.80,3.686,199.219532,6.195,1.351210e+09
6005,42,2012-10-26,514756.08,0,70.50,4.301,131.193097,6.943,1.351210e+09
6148,43,2012-10-26,587603.55,0,69.17,3.506,214.741539,8.839,1.351210e+09
6291,44,2012-10-26,361067.07,0,46.97,3.755,131.193097,5.217,1.351210e+09


In [156]:
train_x, test_x, train_y, test_y = train_test_split(df.drop(columns='Weekly_Sales'), df['Weekly_Sales'], test_size=0.2, random_state=42, shuffle=False)

In [157]:
fig = go.Figure()

fig.add_trace(go.Scatter(
                    x=train_x['Date'],
                    y=train_y,
                    line=dict(color='green', width=2),
                    name='Данные для обучения',
                   )
)
fig.add_trace(go.Scatter(
                    x=test_x['Date'],
                    y=test_y,
                    line=dict(color='red', width=2),
                    name='Данные для верификации',
                    )
)

fig.show()

In [158]:
train_x.columns

Index(['Store', 'Date', 'Holiday_Flag', 'Temperature', 'Fuel_Price', 'CPI',
       'Unemployment', 'DateSec'],
      dtype='object')

# МНК

In [170]:
from scipy.optimize import curve_fit 

X = train_x.drop(columns='Date').values
y = train_y.values

# Определяем функцию для нескольких переменных
# Правильный подход — передавать параметры как один массив
def mapping_func(X, *params):
    return np.dot(X, params[:-1]) + params[-1]

# Начальные значения коэффициентов (можно подобрать лучше)
initial_guess = [1] * (X.shape[1] + 1)


params_opt, params_cov = curve_fit(mapping_func, X, y, p0=initial_guess)

print("Оптимальные параметры:", params_opt)

# Для предсказаний
y_pred_train = mapping_func(train_x.drop(columns='Date'), *params_opt)
y_pred = mapping_func(test_x.drop(columns='Date'), *params_opt)

fig = go.Figure()

fig.add_trace(go.Scatter(
                    x=train_x['Date'],
                    y=train_y,
                    line=dict(color='green', width=2),
                    name='Данные для обучения', 
                   )
)

fig.add_trace(go.Scatter(
                    x=test_x['Date'],
                    y=test_y,
                    line=dict(color='red', width=2),
                    name='Тестовые данные',
                    )
)

fig.add_trace(go.Scatter(
                    x=pd.concat([train_x['Date'], test_x['Date']], ignore_index=True),
                    y=list(y_pred_train) + list(y_pred),
                    line=dict(color='blue', width=2),
                    name='МНК',
                    )
)


fig.show()
print('RMSE: ', rmse(test_y, y_pred))

Оптимальные параметры: [-1.56012972e+04  7.72171192e+04 -8.56080686e+02  1.37303437e+04
 -2.36825398e+03 -2.13327874e+04  1.54764019e-04  1.78389949e+06]


RMSE:  493017.01200154534


In [164]:
from sklearn.neighbors import KNeighborsRegressor

neigh = KNeighborsRegressor(n_neighbors=3)

neigh.fit(train_x.drop(columns='Date'), train_y)

y_pred_train = neigh.predict(train_x.drop(columns='Date'))
y_pred = neigh.predict(test_x.drop(columns='Date'))

fig = go.Figure()

fig.add_trace(go.Scatter(
                    x=train_x['Date'],
                    y=train_y,
                    line=dict(color='green', width=2),
                    name='Данные для обучения', 
                   )
)

fig.add_trace(go.Scatter(
                    x=test_x['Date'],
                    y=test_y,
                    line=dict(color='red', width=2),
                    name='Тестовые данные',
                    )
)

fig.add_trace(go.Scatter(
                    x=pd.concat([train_x['Date'], test_x['Date']], ignore_index=True),
                    y=pd.concat([pd.Series(y_pred_train), pd.Series(y_pred)]),
                    line=dict(color='blue', width=2),
                    name='KNN3',
                    )
)
fig.show()
print('RMSE: ', rmse(test_y, y_pred))

RMSE:  642473.0700022383


In [168]:
from tqdm.auto import tqdm
import pmdarima as pm
import statsmodels.api as sm

stores = train_x['Store'].unique()


train_pred = pd.DataFrame({'y': [], 'Store': [], 'Date': []})
test_pred = pd.DataFrame({'y': [], 'Store': [], 'Date': []})
for store in tqdm(stores):
    store_data = train_x[train_x['Store'] == store].drop(columns=['DateSec'])
    store_y = train_y[train_x['Store'] == store]
    
    store_data.set_index('Date', inplace=True)
    store_y.index = store_data.index
    
    store_data = store_data.asfreq('W-FRI')
    store_y = store_y.asfreq('W-FRI')
    
    exog = store_data.drop(columns=['Store'], errors='ignore')
    
    auto_model = pm.auto_arima(
        store_y,
        exogenous=exog,
        seasonal=True,       
        stepwise=True,
        suppress_warnings=True,
        trace=False
    )
    
    order = auto_model.order
    model = sm.tsa.ARIMA(endog=store_y, exog=exog, order=order)
    result = model.fit()

    y_pred = result.predict(start=store_y.index[0], end=store_y.index[-1], exog=exog)
    pred_df = pd.DataFrame({'y': y_pred, 'Store': [store]*len(y_pred), 'Date': store_data.index})
    train_pred = pd.concat([train_pred, pred_df], ignore_index=True)
    
    # Для теста
    store_data = test_x[test_x['Store'] == store].drop(columns=['DateSec'])
    store_y = test_y[test_x['Store'] == store]
    
    store_data.set_index('Date', inplace=True)
    store_y.index = store_data.index
    
    store_data = store_data.asfreq('W-FRI')
    store_y = store_y.asfreq('W-FRI')
    exog = store_data.drop(columns=['Store'], errors='ignore')
    y_pred = result.predict(start=store_y.index[0], end=store_y.index[-1], exog=exog)
    pred_df = pd.DataFrame({'y': y_pred, 'Store': [store]*len(y_pred), 'Date': store_data.index})
    test_pred = pd.concat([test_pred, pred_df], ignore_index=True)


  0%|          | 0/45 [00:00<?, ?it/s]


The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.


The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.


Non-stationary starting autoregressive parameters found. Using zeros as starting parameters.


Non-invertible starting MA parameters found. Using zeros as starting parameters.


Maximum Likelihood optimization failed to converge. Check mle_retvals


Maximum Likelihood optimization failed to converge. Check mle_retvals



In [169]:
train_pred.sort_values(by=['Date', 'Store'], inplace=True)
test_pred.sort_values(by=['Date', 'Store'], inplace=True)
total_pred = pd.concat([train_pred, test_pred], ignore_index=True)

fig = go.Figure()

fig.add_trace(go.Scatter(
                    x=train_x['Date'],
                    y=train_y,
                    line=dict(color='green', width=2),
                    name='Данные для обучения', 
                   )
)

fig.add_trace(go.Scatter(
                    x=test_x['Date'],
                    y=test_y,
                    line=dict(color='red', width=2),
                    name='Тестовые данные',
                    )
)

fig.add_trace(go.Scatter(
                    x=total_pred['Date'],
                    y=total_pred['y'],
                    line=dict(color='blue', width=2),
                    name='ARIMAX',
                    )
)
fig.show()
print('RMSE: ', rmse(test_y, test_pred['y']))

RMSE:  148920.21357010503
