In [None]:
import datetime
import warnings
from time import sleep

import numpy as np
import pandas as pd
import pmdarima as pm
from dateutil import rrule
from pmdarima.arima import ndiffs
from scipy import stats
from tqdm import tqdm

def invboxcox(y, lmbda):
    """Invert a Box-Cox transform.

    Parameters
    ----------
    y : scalar or array-like supporting arithmetic with a float
        Value(s) on the Box-Cox-transformed scale.
    lmbda : float
        The Box-Cox lambda that was used for the forward transform.

    Returns
    -------
    ``np.exp(y)`` when ``lmbda == 0``; otherwise
    ``np.exp(np.log(lmbda * y + 1) / lmbda)``, i.e. ``(lmbda * y + 1) ** (1 / lmbda)``.
    Where ``lmbda * y + 1`` is not positive the logarithm, and therefore the
    result, is not a finite number.
    """
    if lmbda == 0:
        return np.exp(y)
    return np.exp(np.log(lmbda * y + 1) / lmbda)


def weeks_between(start_date, end_date):
    """Return how many weekly recurrences an rrule yields for the given range.

    The recurrence starts at ``start_date`` and is bounded by ``end_date``
    (passed as ``until``); the result is the number of occurrences produced.
    """
    weekly_occurrences = rrule.rrule(
        rrule.WEEKLY,
        dtstart=start_date,
        until=end_date,
    )
    # Walk the whole recurrence and count its items.
    return sum(1 for _ in weekly_occurrences)

# Install a blanket 'ignore' filter: no category or module is given, so every
# warning raised after this point is suppressed (including any emitted by the
# model fitting further down).
warnings.filterwarnings('ignore')


In [None]:
# Read the news table (the first CSV column becomes the index) and keep only
# the three columns used below: text, CustomName and date.
news_all = pd.read_csv('news_with_clusters.csv', sep=',', index_col=0).loc[
    :, ['text', 'CustomName', 'date']
]

In [None]:
# CustomName labels that make up the "blue" group.
# NOTE: 'Политика' is listed twice; this is harmless for membership tests.
BLUE_GROUP = [
    'Политика',
    'Уголовные дела',
    'Крупные траты',
    'Общество и санкции',
    'Уход компаний из России',
    'Мероприятия',
    'Политика',
    'Законы и штрафы',
    'Авиасообщение',
    'Военные действия',
    'Транспортные ограничения',
    'Транспортная инфраструктура',
]

# CustomName labels that make up the "health" group.
HEALTH_GROUP = [
    'Здравоохранение',
    'Мероприятия',
    'Общество',
    'Цены',
    'Авиасообщение',
]

In [None]:
# Keep only the rows whose CustomName is one of the BLUE_GROUP labels.
news_all = news_all.loc[news_all['CustomName'].isin(BLUE_GROUP)]

In [None]:
# Initial model selection on the data before 2020.
#
# Weekly article counts are Box-Cox transformed, differenced at lag 52 and then
# at lag 1, min-max scaled, and handed to auto_arima; the selected model, its
# AIC and its parameters are kept for the cells below.
weeks_in_year = 52

# Weekly article counts, indexed by the parsed date.
news = news_all.copy()
news["date"] = pd.to_datetime(news_all["date"])
news = news.set_index("date")
df_news = news.resample("W").agg({"text": "count"})

# Only weekly bins labelled before 2020-01-01 take part in model selection.
df_news = df_news[df_news.index < '2020-01-01']

# stats.boxcox returns the transformed values and the lambda it fitted.
df_news["text_box"], lmbda = stats.boxcox(df_news["text"])

# Lag-52 difference, then a lag-1 difference of that result.
df_news["text_box_diff"] = df_news["text_box"].diff(weeks_in_year)
df_news["text_box_diff2"] = df_news["text_box_diff"].diff(1)

# Min-max scale the twice-differenced series.
min_value = df_news["text_box_diff2"].min()
max_value = df_news["text_box_diff2"].max()
df_news["text_box_diff2"] = (
    df_news["text_box_diff2"].sub(min_value).div(max_value - min_value)
)

df = df_news['text_box_diff2'].copy()

results = []

# The differencing steps leave NaN in the leading rows; drop them before testing.
current_ts = df.dropna()

# Differencing order suggested by each test (KPSS first, then ADF); use the larger.
kpss_diffs, adf_diffs = (
    ndiffs(current_ts, alpha=0.05, test=test_name, max_d=6)
    for test_name in ('kpss', 'adf')
)
n_diffs = max(adf_diffs, kpss_diffs)

auto = pm.auto_arima(
    current_ts,
    d=n_diffs,
    seasonal=True,
    m=52,
    stepwise=True,
    suppress_warnings=True,
    error_action="ignore",
    max_p=6,
    max_order=None,
    trace=True,
)

# Snapshot of the selected model for later comparison.
start_aic = auto.aic()
start_params = auto.params().to_dict()
start_weights = auto

In [None]:
# Remember the (p, d, q) and seasonal orders chosen by auto_arima so the same
# specification can be re-fitted later.
calculated_order, calculated_seasonal_order = auto.order, auto.seasonal_order

In [None]:
# Rolling weekly re-fit of the ARIMA specification chosen by auto_arima.
#
# For every weekly cutoff from 2019-12-29 through 2022-07-03 the weekly article
# counts are rebuilt from all news dated up to the cutoff, pushed through the
# same transform chain as the initial fit (Box-Cox -> lag-52 difference ->
# lag-1 difference -> min-max scaling), and an ARIMA with the fixed
# `calculated_order` / `calculated_seasonal_order` is fitted.  The parameters
# and AIC of each fit are collected into `parms_df`, one row per cutoff, with
# row -1 holding the values of the initial auto_arima fit.
weeks_in_year = 52
models_list = []
first_cutoff = datetime.date(2019, 12, 29)
last_cutoff = datetime.date(year=2022, month=7, day=3)
datetime_mask = first_cutoff
weeks_num = weeks_between(first_cutoff, last_cutoff)

# Parse the dates once, outside the loop: the parsed column does not depend on
# the cutoff, and comparing real timestamps (rather than the raw CSV strings)
# does not rely on the date text happening to sort chronologically.
news_dated = news_all.copy()
news_dated["date"] = pd.to_datetime(news_dated["date"])

# Row -1: parameters of the initial auto_arima fit.  A new dict is built so
# that `start_params` itself is left unmodified.
param_rows = [{**start_params, 'week': str(first_cutoff), 'aic': start_aic}]
row_labels = [-1]

c = 0
with tqdm(total=weeks_num) as pbar:
    while datetime_mask <= last_cutoff:
        # All news dated up to the current cutoff.  The cutoff string is parsed
        # by pandas, the same way as the `df_news.index < '2020-01-01'` filter
        # used for the initial fit.
        news = news_dated.loc[news_dated["date"] <= str(datetime_mask)]

        # Advance the cutoff before recording anything: each row below is
        # labelled with the week following the data it was fitted on.
        datetime_mask += datetime.timedelta(weeks=1)

        news = news.set_index("date")
        df_news = news.resample("W").agg({"text": "count"})
        df_news["text_box"], lmbda = stats.boxcox(df_news["text"])
        df_news["text_box_diff"] = df_news.text_box - df_news.text_box.shift(weeks_in_year)
        df_news["text_box_diff2"] = df_news["text_box_diff"] - df_news[
            "text_box_diff"
        ].shift(1)
        min_value = df_news["text_box_diff2"].min()
        max_value = df_news["text_box_diff2"].max()
        df_news["text_box_diff2"] = (df_news["text_box_diff2"] - min_value) / (
            max_value - min_value
        )

        # The two shifts leave NaN in the leading rows.  pmdarima expects a
        # series without NaN, so drop them before fitting, exactly as is done
        # before the auto_arima call for the initial fit.
        model = pm.ARIMA(
            order=calculated_order, seasonal_order=calculated_seasonal_order
        ).fit(df_news["text_box_diff2"].dropna())

        params = model.params().to_dict()
        params.update({'week': str(datetime_mask)})
        params.update({'aic': model.aic()})
        param_rows.append(params)
        row_labels.append(c)

        c += 1
        pbar.update(1)
        pbar.set_description(f"Processed {datetime_mask}")

# Build the table once from the collected rows instead of concatenating a
# one-row frame on every iteration.
parms_df = pd.DataFrame(param_rows, index=row_labels)

In [None]:
# Drop the intercept column and index the table by its 'week' label.
# Both steps are guarded so that re-running this cell on an already processed
# `parms_df` is a no-op instead of raising KeyError.
parms_df = parms_df.drop(columns=['intercept'], errors='ignore')
if 'week' in parms_df.columns:
    parms_df = parms_df.set_index('week')

In [None]:
# Line plot of the AIC column across the rows of parms_df.
parms_df['aic'].plot()

In [None]:
# Write every row except the last one to CSV.
parms_df.iloc[:-1].to_csv('params_blue_14122023.csv')

In [None]:
# Display the full parameter table as the cell output.
parms_df