#### Loading the model - it's a statsmodels ARIMA model (built on SARIMAX)

In [291]:
import pickle

# Load the model
# NOTE: pickle.load can execute arbitrary code while unpickling, so only load
# 'arima_model.pkl' when it comes from a trusted source.
with open('arima_model.pkl', 'rb') as file:
    model = pickle.load(file)

# Print the concrete class of the unpickled object to see what kind of model it is.
print(type(model))

<class 'statsmodels.tsa.arima.model.ARIMAResultsWrapper'>


#### Getting to know the trained model

In [257]:
# Inspect the configuration stored on the loaded model (orders, differencing, trend, exog).
model.specification

{'seasonal_periods': 0,
 'measurement_error': False,
 'time_varying_regression': False,
 'simple_differencing': False,
 'enforce_stationarity': True,
 'enforce_invertibility': True,
 'hamilton_representation': False,
 'concentrate_scale': False,
 'trend_offset': 1,
 'order': (1, 1, 1),
 'seasonal_order': (0, 0, 0, 0),
 'k_diff': 1,
 'k_seasonal_diff': 0,
 'k_ar': 1,
 'k_ma': 1,
 'k_seasonal_ar': 0,
 'k_seasonal_ma': 0,
 'k_ar_params': 1,
 'k_ma_params': 1,
 'trend': 'n',
 'k_trend': 0,
 'k_exog': 0,
 'mle_regression': False,
 'state_regression': False}

In [258]:
# Look at the raw observations (endog) stored on the loaded model.
model.data.endog

array([ 754.6793069 , 1499.29335814, 1048.69939846, 1394.97661636,
       1092.70379712, 1490.24105116, 1558.47719719, 1932.70686367,
       1198.4815614 , 1665.85119687, 1515.3797235 , 2094.81126194,
       1166.78979203, 1842.78325175, 1214.22888405, 1795.3538669 ,
        919.75473729, 1589.57199468, 1610.39426497, 1436.78976987,
       1122.10310613, 1344.37460557, 1169.13897813, 2027.22612542,
        759.33186385, 1517.80307699, 1536.88330068, 1699.95855624,
        856.09592473, 1481.3855282 , 1342.03707168, 1664.4863664 ])

### Conclusions:
1. The old data has no time index, so it is not really a time series; here I would gather more info from the creators of the model or the data engineers. For the purposes of this task, I am assuming quarterly intervals starting in 2003.
2. I need to do time-series preprocessing on the training data, as SARIMAX is intended for time series.
3. Since I'll be dealing with time series, I'll use rolling windows for cross-validation.
4. I will retrain the model on both the old and the new C1 data, not just append, as I need to find the best parameters along the way.

#### Reading the new data from C1

In [260]:
import pandas as pd

# Read the C1 sales history, keep only products P1-P3, and order the rows
# chronologically (year, then quarter) with a fresh 0..n-1 index.
new_data = (
    pd.read_csv("historical_sales_volume.csv")
    .loc[lambda frame: frame["product"].isin(["P1", "P2", "P3"])]
    .sort_values(by=['year', 'quarter'])
    .reset_index(drop=True)
)
new_data.head()

Unnamed: 0,year,quarter,product,volumeSales
0,2011,1,P1,164.946214
1,2011,1,P2,353.856667
2,2011,1,P3,235.876426
3,2011,2,P1,273.167296
4,2011,2,P2,643.979884


#### Preprocessing the new data, adding index from created date

In [276]:
def quarter_to_date(year: int, quarter: int):
    """Return the timestamp of the first day of a calendar quarter.

    Args:
        year: Calendar year.
        quarter: Quarter number, one of 1, 2, 3 or 4.

    Returns:
        A pandas Timestamp for the first day of the quarter's first month
        (January, April, July or October).

    Raises:
        KeyError: If ``quarter`` is not one of 1-4.
    """
    # Zero-padded first month of each quarter, used to build an ISO date string.
    first_month_by_quarter = {1: '01', 2: '04', 3: '07', 4: '10'}
    month = first_month_by_quarter[quarter]
    return pd.to_datetime(f'{year}-{month}-01')


def get_dates_as_index(data: pd.DataFrame, date_col: str, year_col: str, quarter_col: str, to_period: str,
                       value_col: str = 'volumeSales'):
    """Sum a value column per quarter and return it indexed by period.

    Each row's (year, quarter) pair is turned into the first day of that
    quarter, rows sharing a date are summed, and the resulting datetime index
    is converted to a PeriodIndex. The input frame is not modified.

    Args:
        data: Frame holding at least ``year_col``, ``quarter_col`` and ``value_col``.
        date_col: Name given to the derived date key, and therefore to the
            index of the returned Series.
        year_col: Name of the column holding the calendar year.
        quarter_col: Name of the column holding the quarter number (1-4).
        to_period: Period frequency passed to ``DatetimeIndex.to_period`` (e.g. "Q").
        value_col: Name of the column to sum. Defaults to 'volumeSales'.

    Returns:
        A Series of summed ``value_col`` values, sorted by date, with a
        PeriodIndex named ``date_col``.

    Raises:
        KeyError: If any value in ``quarter_col`` is not one of 1, 2, 3, 4.
    """
    quarters = data[quarter_col]
    is_valid = quarters.isin([1, 2, 3, 4])
    if not is_valid.all():
        bad_values = sorted(set(quarters[~is_valid].tolist()), key=repr)
        raise KeyError(f'invalid quarter value(s) in column {quarter_col!r}: {bad_values}')

    # Assemble dates column-wise instead of row by row: quarter q starts in month 3*(q-1)+1.
    dates = pd.to_datetime(
        pd.DataFrame({
            'year': data[year_col].astype(int),
            'month': (quarters.astype(int) - 1) * 3 + 1,
            'day': 1,
        })
    )

    # assign() works on a copy, so the caller's frame keeps its original columns.
    totals = data.assign(**{date_col: dates}).groupby(date_col)[value_col].sum()
    totals.index = pd.DatetimeIndex(totals.index).to_period(to_period)
    return totals


# Collapse the per-product rows into one summed value per quarter, indexed by quarterly Period.
new_series = get_dates_as_index(data=new_data, date_col="dates", year_col="year", quarter_col="quarter", to_period="Q")

new_series.head()

date
2011Q1     754.679307
2011Q2    1499.293358
2011Q3    1048.699398
2011Q4    1394.976616
2012Q1    1092.703797
Freq: Q-DEC, Name: volumeSales, dtype: float64

#### Preprocessing the old data

In [277]:
# Old data is not really time-series, I am assuming quarterly intervals starting in 2003.
# Take the observations straight from the loaded model so this cell does not depend on a
# variable left over from an earlier, no-longer-present cell (works after Restart & Run All).
original_endog = model.data.endog
original_series = pd.Series(original_endog)
original_series.head()

0     754.679307
1    1499.293358
2    1048.699398
3    1394.976616
4    1092.703797
dtype: float64

In [278]:
def convert_to_time_series(series: pd.Series, start_date: str, to_period: str):
    """Attach a consecutive period index to a series that has no time index.

    The values are kept in their existing order; the original index and the
    series name are discarded.

    Args:
        series: Values to re-index.
        start_date: Date falling in the first period (e.g. '2003-01-01').
        to_period: Period frequency of the new index (e.g. "Q" for quarterly).

    Returns:
        A new Series with the same values and a PeriodIndex of
        ``len(series)`` consecutive periods starting at the period that
        contains ``start_date``.
    """
    # Build the PeriodIndex directly instead of date_range(...).to_period(...):
    # the period containing start_date is the same either way, and this avoids
    # passing a period alias such as "Q" to date_range as an offset alias,
    # which newer pandas versions deprecate.
    periods = pd.period_range(start=start_date, periods=len(series), freq=to_period)
    return pd.Series(series.values, index=periods)

# Give the old observations a quarterly index; the 2003 start date is an assumption, not taken from the data.
old_series = convert_to_time_series(series=original_series, start_date='2003-01-01', to_period="Q")
old_series.head()

2003Q1     754.679307
2003Q2    1499.293358
2003Q3    1048.699398
2003Q4    1394.976616
2004Q1    1092.703797
Freq: Q-DEC, dtype: float64

#### Combining old and new data

In [279]:
# Stack the old observations (assumed to start in 2003Q1) ahead of the new C1 series.
# NOTE(review): the first five values printed for old_series are identical to the first
# five printed for new_series (754.68, 1499.29, 1048.70, 1394.98, 1092.70). Confirm the
# pickled model was not trained on this same C1 data; if it was, this concat duplicates
# the observations under the assumed 2003-2010 dates.
combined_series = pd.concat([old_series, new_series])
combined_series.head()

2003Q1     754.679307
2003Q2    1499.293358
2003Q3    1048.699398
2003Q4    1394.976616
2004Q1    1092.703797
Freq: Q-DEC, dtype: float64