In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Display options: show every column, use a 100-character console width,
# allow up to 200 rows, and print floats with three decimals.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 100)
pd.set_option('display.max_rows', 200)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
import warnings
# NOTE(review): this silences every warning for the rest of the notebook,
# including deprecation warnings from pandas and the modelling libraries.
warnings.filterwarnings('ignore')

In [2]:
# Load the raw training table from a CSV file in the working directory.
train = pd.read_csv("train.csv")

In [3]:
# Set the descriptive merchant columns aside before narrowing `train`.
static_df = train[['merchant_id', 'mcc_id', 'settlement_period', 'working_type', 'merchant_segment']]
# NOTE(review): `train` is overwritten with a three-column subset, so this cell
# cannot be re-run without reloading the CSV (the line above would no longer
# find the descriptive columns).
train = train[['merchant_id', 'month_id', 'net_payment_count']]
train = train.sort_values(by=['merchant_id', 'month_id']).reset_index(drop=True)
# month_id is parsed with the year-month format '%Y%m'.
train['month_id'] = pd.to_datetime(train['month_id'], format='%Y%m')

In [4]:
def filter_by_multiple_gaps(df, threshold):
    """Keep, for every merchant, only the final run of rows without a large gap.

    Rows are ordered by ``merchant_id`` and ``month_id``. Within a merchant,
    a row whose distance to the previous row is ``threshold`` days or more
    starts a new run; only the rows of the last run are returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``merchant_id`` and a datetime ``month_id`` column.
    threshold : int or float
        Gap size in days. A gap equal to the threshold already counts as a break.

    Returns
    -------
    pandas.DataFrame
        The retained rows, ordered by merchant and month, with a fresh
        0..n-1 index. An empty input yields an empty frame with the same columns.
    """
    if df.empty:
        return df.reset_index(drop=True)

    ordered = df.sort_values(by=['merchant_id', 'month_id'], kind='stable')
    merchant_key = ordered['merchant_id']

    # Days since the merchant's previous row; NaN on each merchant's first row,
    # which therefore never counts as a break.
    gap_days = ordered.groupby('merchant_id')['month_id'].diff().dt.days
    starts_new_run = (gap_days >= threshold).astype(int)

    # Number the runs per merchant, then keep only the rows of the last run.
    run_number = starts_new_run.groupby(merchant_key).cumsum()
    last_run_number = run_number.groupby(merchant_key).transform('max')
    keep = (run_number == last_run_number).to_numpy()

    return ordered[keep].reset_index(drop=True)

# Keep only each merchant's most recent stretch without a gap of 240 days or more.
train = filter_by_multiple_gaps(train, 240)

In [5]:
# First and last observed month per merchant.
last_transaction = train.groupby('merchant_id')['month_id'].max()
first_observation = train.groupby('merchant_id')['month_id'].min()

# Cohort modelled here: merchants first observed after 2020-12-01
# and no later than 2022-12-01.
cohort_ids = first_observation[
    (first_observation > '2020-12-01') & (first_observation <= '2022-12-01')
].index
model_2021_df = train[train['merchant_id'].isin(cohort_ids)]

# Observation count and last observed month per cohort merchant.
merchant_counts = model_2021_df['merchant_id'].value_counts()
last_observation = model_2021_df.groupby('merchant_id')['month_id'].max()

# Merchants whose last observation is 2023-06-01 or earlier are treated as churned.
# The mask is applied to the Series it was computed from, so the selection does
# not rely on pandas aligning a boolean key against a differently ordered index.
filtered_merchant_ids = last_observation[last_observation <= '2023-06-01'].index

# Churned merchants go to churn_df; everything else stays in model_2021_df.
churn_df = model_2021_df[model_2021_df['merchant_id'].isin(filtered_merchant_ids)]
model_2021_df = model_2021_df[~model_2021_df['merchant_id'].isin(filtered_merchant_ids)]

# Sanity check of the two partitions.
churn_df.shape, model_2021_df.shape

((32454, 3), (61015, 3))

In [6]:
# import pandas as pd
# import numpy as np

# def detect_and_cap_outliers(df):
#     df['rolling_mean'] = df['net_payment_count'].rolling(window=8, min_periods=1).mean()
#     df['rolling_std'] = df['net_payment_count'].rolling(window=8, min_periods=1).std()  # ddof=0 for population std
#     df['lower_limit'] = df['rolling_mean'] - 4 * df['rolling_std']
#     df['upper_limit'] = df['rolling_mean'] + 4 * df['rolling_std']
    
#     # Aykırı değerleri tespit
#     df['is_outlier'] = (df['net_payment_count'] < df['lower_limit']) | (df['net_payment_count'] > df['upper_limit'])
    
#     # Aykırı değerleri baskıla
#     df['net_payment_count'] = np.where(df['net_payment_count'] < df['lower_limit'], df['lower_limit'],
#                                               np.where(df['net_payment_count'] > df['upper_limit'], df['upper_limit'],
#                                                        df['net_payment_count']))
#     return df

# # Her bir merchant_id için aykırı değerleri tespit etmek, baskılamak ve sonuçları görmek
# merchant_ids = model_2021_df['merchant_id'].unique()
# capped_results = []  # Sonuçları saklamak için boş bir liste

# for merchant_id in merchant_ids:
#     merchant_df = model_2021_df[model_2021_df['merchant_id'] == merchant_id].copy()
#     capped_df = detect_and_cap_outliers(merchant_df)
#     capped_results.append(capped_df)

# # Sonuçların birleştirilmesi
# model_2021_df = pd.concat(capped_results).reset_index(drop=True)
# model_2021_df = model_2021_df[['merchant_id', 'month_id' ,'net_payment_count']]

# # İlk birkaç sonucu göster
# model_2021_df.head()

In [7]:
# Spot check: the remaining history of a single merchant.
model_2021_df[model_2021_df['merchant_id'] == 'merchant_6283']

Unnamed: 0,merchant_id,month_id,net_payment_count
238162,merchant_6283,2022-11-01,3
238163,merchant_6283,2022-12-01,3
238164,merchant_6283,2023-02-01,5
238165,merchant_6283,2023-03-01,6
238166,merchant_6283,2023-04-01,3
238167,merchant_6283,2023-06-01,4
238168,merchant_6283,2023-09-01,5


In [8]:
# Spot check: a second merchant whose history skips some months.
model_2021_df[model_2021_df['merchant_id'] == 'merchant_37330']

Unnamed: 0,merchant_id,month_id,net_payment_count
121790,merchant_37330,2022-10-01,3
121791,merchant_37330,2023-02-01,5
121792,merchant_37330,2023-03-01,4
121793,merchant_37330,2023-04-01,4
121794,merchant_37330,2023-06-01,2
121795,merchant_37330,2023-07-01,2


In [9]:
# The 200 merchants with the fewest observations (value_counts sorts descending).
model_2021_df['merchant_id'].value_counts().tail(200)

merchant_id
merchant_2364     6
merchant_45743    5
merchant_61829    5
merchant_13916    5
merchant_30741    5
merchant_8174     5
merchant_64912    5
merchant_55158    5
merchant_36516    5
merchant_16776    5
merchant_53523    5
merchant_55305    5
merchant_42925    5
merchant_50359    5
merchant_14101    5
merchant_12808    5
merchant_651      5
merchant_29719    5
merchant_13056    5
merchant_22302    5
merchant_22340    5
merchant_33461    5
merchant_18301    5
merchant_45012    5
merchant_51523    5
merchant_26345    5
merchant_30309    5
merchant_35461    5
merchant_27726    5
merchant_7847     5
merchant_54655    5
merchant_53785    5
merchant_24872    5
merchant_61604    5
merchant_17499    5
merchant_44203    5
merchant_47832    5
merchant_60097    5
merchant_36554    5
merchant_9252     5
merchant_49289    5
merchant_4180     5
merchant_23692    5
merchant_31786    5
merchant_22891    5
merchant_48802    5
merchant_11265    5
merchant_56746    5
merchant_46434    5
merchant

In [10]:
# Overall mean of the target across the modelled merchants.
model_2021_df.net_payment_count.mean()

256.9006801606162

In [11]:
# Number of observations per merchant.
merchant_observation_counts = model_2021_df['merchant_id'].value_counts()

# Merchants with at least 14 observations.
# NOTE: the variable names still say "11", but the threshold applied is 14.
merchants_more_than_11 = merchant_observation_counts[merchant_observation_counts >= 14].index

# Merchants with fewer than 14 observations.
merchants_less_than_11 = merchant_observation_counts[merchant_observation_counts < 14].index

# Split the observations into the long-history ("up") and short-history ("down") frames.
model_2021_df_up = model_2021_df[model_2021_df['merchant_id'].isin(merchants_more_than_11)]
model_2021_df_down = model_2021_df[model_2021_df['merchant_id'].isin(merchants_less_than_11)]

# Check the sizes of the two frames.
model_2021_df_up.shape, model_2021_df_down.shape

((48651, 3), (12364, 3))

In [12]:
# One placeholder submission row per churned merchant for each of the months
# 2023-10, 2023-11 and 2023-12; the id is the year-month followed by the merchant id.
unique_merchant_ids = churn_df['merchant_id'].unique()

submission_entries = [
    [f'2023{month}{merchant_id}', merchant_id, None]
    for merchant_id in unique_merchant_ids
    for month in ['10', '11', '12']
]

# Assemble the rows into the churn submission frame.
churn = pd.DataFrame(submission_entries, columns=['id', 'merchant_id', 'net_payment_count'])

churn.head(20)

Unnamed: 0,id,merchant_id,net_payment_count
0,202310merchant_1,merchant_1,
1,202311merchant_1,merchant_1,
2,202312merchant_1,merchant_1,
3,202310merchant_10,merchant_10,
4,202311merchant_10,merchant_10,
5,202312merchant_10,merchant_10,
6,202310merchant_10002,merchant_10002,
7,202311merchant_10002,merchant_10002,
8,202312merchant_10002,merchant_10002,
9,202310merchant_10007,merchant_10007,


In [13]:
# Churned merchants get a prediction of zero; keep only the submission columns.
churn = churn.assign(net_payment_count=0)[['id', 'net_payment_count']]
churn.head(3)

Unnamed: 0,id,net_payment_count
0,202310merchant_1,0
1,202311merchant_1,0
2,202312merchant_1,0


In [14]:
def filler(df, target_date='2023-09-01'):
    """Ensure every merchant has a row at ``target_date``.

    Merchants without a row whose ``month_id`` equals ``target_date`` get one
    appended. The new row copies the merchant's first non-null value of every
    other column, sets ``month_id`` to ``target_date`` and
    ``net_payment_count`` to 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``merchant_id``, ``month_id`` and ``net_payment_count``.
    target_date : str or datetime-like, default '2023-09-01'
        Month every merchant must have a row for.

    Returns
    -------
    pandas.DataFrame
        A new frame sorted by ``merchant_id`` and ``month_id``; the input is
        not modified. The index is not reset after sorting.
    """
    target_date = pd.to_datetime(target_date)

    # Merchants that already have a row on the target date.
    merchants_with_target = df.loc[df['month_id'] == target_date, 'merchant_id']
    missing_mask = ~df['merchant_id'].isin(merchants_with_target)

    if not missing_mask.any():
        # Nothing to add: skip the concat so no empty frame gets mixed in.
        combined = df.reset_index(drop=True)
    else:
        # One template row per missing merchant, re-dated to the target with a zero count.
        missing_records = df[missing_mask].groupby('merchant_id').first().reset_index()
        missing_records['month_id'] = target_date
        missing_records['net_payment_count'] = 0
        combined = pd.concat([df, missing_records], ignore_index=True)

    return combined.sort_values(by=['merchant_id', 'month_id'])

# Make sure every merchant in both frames has a 2023-09-01 row (zero if it was missing).
# NOTE(review): both frames are overwritten in place, so the pre-fill versions are lost.
model_2021_df_up = filler(model_2021_df_up)
model_2021_df_down = filler(model_2021_df_down)

In [15]:
# Mean target for the long-history merchants from 2023-07-01 onwards.
model_2021_df_up[model_2021_df_up['month_id'] >= '2023-07-01']['net_payment_count'].mean()

435.52402597402596

In [16]:
# Collapse the repeated descriptive rows into distinct rows.
# NOTE(review): a merchant whose attributes differ between rows still appears
# more than once here — confirm the time-series frame accepts that.
static_features_df = static_df.drop_duplicates()

In [17]:
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor

# Wrap the long-history frame as an AutoGluon time-series frame, keyed by
# merchant and month, with the descriptive columns attached as static features.
model_2021_up_data = TimeSeriesDataFrame.from_data_frame(
    model_2021_df_up,
    id_column="merchant_id",
    timestamp_column="month_id",
    static_features_df=static_features_df
)
model_2021_up_data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,net_payment_count
item_id,timestamp,Unnamed: 2_level_1
merchant_10027,2022-02-01,5
merchant_10027,2022-03-01,7
merchant_10027,2022-04-01,4
merchant_10027,2022-05-01,6
merchant_10027,2022-06-01,11


In [18]:
# Convert every series to the monthly "M" frequency.
# Presumably this inserts NaN rows for months without data (the next cell fills them) — TODO confirm.
model_2021_up_data = model_2021_up_data.convert_frequency(freq="M")

In [19]:
# Treat missing target values as zero payments.
model_2021_up_data['net_payment_count'] = model_2021_up_data['net_payment_count'].fillna(0)

In [20]:
import holidays

# Turkish public holidays for every calendar year covered by the series.
timestamps = model_2021_up_data.index.get_level_values("timestamp")
country_holidays = holidays.country_holidays(
    country="TR",
    years=range(timestamps.min().year, timestamps.max().year + 1),
)
# Shown sorted by date for a quick visual check.
pd.Series(country_holidays).sort_index()

2021-01-01                                       New Year's Day
2021-04-23              National Sovereignty and Children's Day
2021-05-01                                           Labour Day
2021-05-13                                        Ramadan Feast
2021-05-14                                Ramadan Feast Holiday
2021-05-15                                Ramadan Feast Holiday
2021-05-19       Commemoration of Ataturk, Youth and Sports Day
2021-07-15                     Democracy and National Unity Day
2021-07-20                                      Sacrifice Feast
2021-07-21                              Sacrifice Feast Holiday
2021-07-22                              Sacrifice Feast Holiday
2021-07-23                              Sacrifice Feast Holiday
2021-08-30                                          Victory Day
2021-10-29                                         Republic Day
2022-01-01                                       New Year's Day
2022-04-23              National Soverei

In [21]:
import datetime
# Months matching the minimum/maximum values of the chart mentioned earlier.
custom_dates = {
    datetime.date(2021, 1, 1): "Lowest",
    datetime.date(2021, 5, 1): "Highest",
    datetime.date(2021, 7, 1): "Lowest",
    datetime.date(2021, 11, 1): "Highest",
    datetime.date(2022, 1, 1): "Lowest",
    datetime.date(2023, 3, 1): "Lowest",
}
# Candidate events that are NOT part of custom_dates. They are kept as comments
# rather than a bare string literal, which the notebook would echo as cell output.
#     datetime.date(2020, 4, 1): "Corona",
#     datetime.date(2020, 5, 1): "Corona",
#     datetime.date(2020, 6, 1): "Corona",
#     datetime.date(2021, 4, 1): "Corona",
#     datetime.date(2021, 5, 1): "Corona",
#     datetime.date(2023, 3, 1): "Deprem" (earthquake; one of the minimum values falls here),
#     datetime.date(2023, 5, 1): "Secim" (election),
#     datetime.date(2023, 6, 1): "Secim" (election),

'\n    datetime.date(2020, 4, 1): "Corona",\n    datetime.date(2020, 5, 1): "Corona",\n    datetime.date(2020, 6, 1): "Corona",\n    # datetime.date(2021, 4, 1): "Corona",\n    # datetime.date(2021, 5, 1): "Corona",\n    datetime.date(2023, 3, 1): "Deprem", Minimum degerlerden biri buraya tekabul ediyor\n    datetime.date(2023, 5, 1): "Secim",\n    datetime.date(2023, 6, 1): "Secim",\n'

In [22]:
# Merge the public holidays with the custom event labels.
# The merge is done in a plain dict: the holidays container applies its own
# name-merging when an existing date is assigned (see the library's HolidayBase
# documentation), which would stack on top of the manual ", " concatenation below.
merged_dates = dict(country_holidays)
for event_date, event in custom_dates.items():
    if event_date in merged_dates:
        # The date is already a holiday: append the custom label to its name.
        merged_dates[event_date] = merged_dates[event_date] + ", " + event
    else:
        merged_dates[event_date] = event

In [23]:
def add_date_features(
    ts_df: pd.DataFrame,
    country_holidays: dict,
    include_individual_holidays: bool = True,
    include_holiday_indicator: bool = True,
    match_by: str = "day",
) -> pd.DataFrame:
    """Add holiday / custom-event indicator columns to a time-indexed frame.

    Parameters
    ----------
    ts_df : pandas.DataFrame (including TimeSeriesDataFrame)
        Frame whose index has a level named ``"timestamp"``. It is copied,
        never modified.
    country_holidays : dict
        Mapping of ``datetime.date`` to a label. Every distinct label becomes
        one indicator column.
    include_individual_holidays : bool
        Add one 0/1 column per distinct label.
    include_holiday_indicator : bool
        Add a ``"Holiday"`` column that is 1 when any label matches.
    match_by : {"day", "month"}
        ``"day"`` (default, the original behaviour) marks a row only when its
        timestamp falls exactly on a labelled date. ``"month"`` marks a row
        when any labelled date lies in the same calendar month. That mode is
        meant for monthly series, whose timestamps rarely coincide with a
        holiday's exact day.

    Returns
    -------
    pandas.DataFrame
        Copy of ``ts_df`` with the indicator columns added.

    Raises
    ------
    ValueError
        If ``match_by`` is not ``"day"`` or ``"month"``.
    """
    if match_by not in ("day", "month"):
        raise ValueError(f"match_by must be 'day' or 'month', got {match_by!r}")

    ts_df = ts_df.copy()
    timestamps = ts_df.index.get_level_values("timestamp")
    # One float 0/1 column per distinct label, indexed by the labelled date.
    country_holidays_df = pd.get_dummies(pd.Series(country_holidays)).astype(float)

    if match_by == "month":
        # Collapse the labelled days to one row per calendar month.
        country_holidays_df.index = pd.to_datetime(list(country_holidays_df.index)).to_period("M")
        country_holidays_df = country_holidays_df.groupby(level=0).max()
        lookup_keys = timestamps.to_period("M")
    else:
        lookup_keys = timestamps.date

    # Rows without a matching date/month get 0 in every indicator.
    holidays_df = country_holidays_df.reindex(lookup_keys).fillna(0)
    if include_individual_holidays and len(holidays_df.columns) > 0:
        ts_df[holidays_df.columns] = holidays_df.values
    if include_holiday_indicator:
        # fillna covers the case of no labels at all, where the row-wise max is NaN.
        ts_df["Holiday"] = holidays_df.max(axis=1).fillna(0).values
    return ts_df

In [24]:
# Add the holiday / custom-event indicator columns to the training frame.
# NOTE(review): the series use the "M" frequency while the labels are keyed by
# exact day, so confirm that the indicators are not almost entirely zero.
model_2021_up_data = add_date_features(model_2021_up_data, merged_dates)

In [25]:
# Names of every column except the target, i.e. the date-feature columns.
# NOTE(review): holiday_columns is not referenced again in the visible cells.
holiday_columns = list(model_2021_up_data.columns)
del holiday_columns[holiday_columns.index('net_payment_count')]

In [26]:
from autogluon.common import space

# Predictor for the long-history merchants: 3-step monthly forecast of
# net_payment_count, scored with MAE, with a dense set of quantiles around 0.3.
# NOTE(review): known_covariates_names is not passed (the fit log reports
# 'known_covariates_names': []), so the added date columns are not registered
# as known covariates — confirm this is intended.
predictor = TimeSeriesPredictor(
    prediction_length=3,
    target="net_payment_count",
    eval_metric="MAE",
    freq='M',
    quantile_levels=[0.3, 0.31, 0.32, 0.33, 0.34 ,0.35, 0.4, 0.5],
#     known_covariates_names=["month", "year"]
)

# Fit with the "best_quality" preset, a 6-hour time limit and 3 validation windows.
# refit_full=True presumably yields the "*_FULL" model requested at prediction time — TODO confirm.
predictor.fit(
    model_2021_up_data,
    presets="best_quality",
    time_limit= 3600 * 6,
    num_val_windows=3,
    refit_every_n_windows=1,
    refit_full=True
    # excluded_model_types=["TemporalFusionTransformer", "PatchTST"],
    
#     hyperparameters={
# #       "SeasonalNaive": {"n_jobs": 6},
# #       "Naive": {"n_jobs": 6},
#       "AutoETS": {"n_jobs": 6},
# #       "DynamicOptimizedTheta": {"n_jobs": 6},
#       "RecursiveTabular": {"n_jobs": 6},
# #       "AutoCES": {"n_jobs": 6},
#         "AutoARIMA": {"n_jobs": 6},
# # #       "ADIDA": {},
# # #       "IMAPA": {},
# # #       "DLinear": {},
# # #       "SimpleFeedForward": {},
#       "DeepAR": {},
#         },
#     hyperparameter_tune_kwargs={
# #     "num_trials": 5,
# #     "scheduler": "local",
# #     "searcher": "random",
#     "n_jobs": -1
# },
)

Beginning AutoGluon training... Time limit = 21600s
AutoGluon will save models to 'AutogluonModels\ag-20240227_194528'
AutoGluon Version:  1.0.0
Python Version:     3.10.10
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.19045
CPU Count:          12
GPU Count:          0
Memory Avail:       3.76 GB / 15.42 GB (24.4%)
Disk Space Avail:   72.10 GB / 476.34 GB (15.1%)
Setting presets to: best_quality

Fitting with arguments:
{'enable_ensemble': True,
 'eval_metric': MAE,
 'freq': 'M',
 'hyperparameters': 'default',
 'known_covariates_names': [],
 'num_val_windows': 3,
 'prediction_length': 3,
 'quantile_levels': [0.3, 0.31, 0.32, 0.33, 0.34, 0.35, 0.4, 0.5],
 'random_seed': 123,
 'refit_every_n_windows': 1,
 'refit_full': True,
 'target': 'net_payment_count',
 'time_limit': 21600,
 'verbosity': 2}

Provided train_data has 54454 rows, 2196 time series. Median time series length is 25 (min=14, max=33). 

Provided dataset contains following columns:
	target:   

<autogluon.timeseries.predictor.TimeSeriesPredictor at 0x22246c7a7d0>

In [27]:
# Validation scores of the fitted models (score_val is negative MAE, so closer to 0 is better).
predictor.leaderboard()

Unnamed: 0,model,score_val,pred_time_val,fit_time_marginal,fit_order
0,WeightedEnsemble,-102.914,12.905,7.759,12
1,AutoETS,-105.064,8.434,34.213,4
2,DynamicOptimizedTheta,-106.289,3.325,15.982,5
3,PatchTST,-109.328,0.872,228.871,11
4,DeepAR,-110.062,3.604,329.017,9
5,RecursiveTabular,-111.061,0.274,113.113,7
6,TemporalFusionTransformer,-113.906,1.744,485.439,10
7,AutoARIMA,-114.444,62.814,159.705,6
8,DirectTabular,-186.86,0.229,5.256,8
9,CrostonSBA,-193.685,1.166,10.113,2


In [28]:
from autogluon.timeseries.utils.forecast import get_forecast_horizon_index_ts_dataframe

# Index of the 3 forecast steps for every series, then the same date features
# computed on an otherwise empty frame over that index.
future_index = get_forecast_horizon_index_ts_dataframe(model_2021_up_data, prediction_length=3)
# NOTE(review): future_timestamps is not used again in the visible cells.
future_timestamps = future_index.get_level_values("timestamp")
known_covariates = add_date_features(pd.DataFrame(index=future_index), merged_dates)

In [29]:
# Forecast with the refit ensemble.
# NOTE(review): the predictor was created without known_covariates_names, so it
# is unclear whether known_covariates has any effect here — confirm.
predictions_up = predictor.predict(model_2021_up_data, known_covariates=known_covariates, model='WeightedEnsemble_FULL')

In [30]:
# Flatten the forecast and build the submission id: year-month followed by the merchant id.
results = predictions_up.copy().reset_index()
results['id'] = results['timestamp'].dt.strftime('%Y%m') + results['item_id']

# Submit the 0.3 quantile (not the mean) as net_payment_count.
# A quantile forecast can fall below zero while a payment count cannot, so
# negative values are clipped to 0 (this never increases the absolute error).
model_2021_up_sub = (
    results[['id', '0.3']]
    .rename(columns={'0.3': 'net_payment_count'})
    .assign(net_payment_count=lambda sub: sub['net_payment_count'].clip(lower=0))
)

model_2021_up_sub.to_csv('model_2021_up_sub.csv', index=False)

In [31]:
# Write the all-zero predictions for the churned merchants.
churn.to_csv('churn_2021.csv', index=False)

In [32]:
# Recomputed for the short-history model; identical to the earlier assignment of static_features_df.
static_features_df = static_df.drop_duplicates()

In [33]:
# Wrap the short-history frame as an AutoGluon time-series frame, keyed by
# merchant and month, with the descriptive columns attached as static features.
model_2021_df_down_data = TimeSeriesDataFrame.from_data_frame(
    model_2021_df_down,
    id_column="merchant_id",
    timestamp_column="month_id",
    static_features_df=static_features_df
)
model_2021_df_down_data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,net_payment_count
item_id,timestamp,Unnamed: 2_level_1
merchant_10000,2022-05-01,3
merchant_10000,2022-06-01,4
merchant_10000,2022-12-01,10
merchant_10000,2023-01-01,6
merchant_10000,2023-02-01,4


In [34]:
# Convert to the monthly "M" frequency and treat missing target values as zero payments.
# The displayed head shows month-end timestamps and 0.0 for the months that had no row.
model_2021_df_down_data = model_2021_df_down_data.convert_frequency(freq="M")
model_2021_df_down_data['net_payment_count'] = model_2021_df_down_data['net_payment_count'].fillna(0)
model_2021_df_down_data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,net_payment_count
item_id,timestamp,Unnamed: 2_level_1
merchant_10000,2022-05-31,3.0
merchant_10000,2022-06-30,4.0
merchant_10000,2022-07-31,0.0
merchant_10000,2022-08-31,0.0
merchant_10000,2022-09-30,0.0


In [35]:
import holidays

# Turkish public holidays for every calendar year covered by the short-history series.
timestamps = model_2021_df_down_data.index.get_level_values("timestamp")
country_holidays = holidays.country_holidays(
    country="TR",
    years=range(timestamps.min().year, timestamps.max().year + 1),
)
# Shown sorted by date for a quick visual check.
pd.Series(country_holidays).sort_index()

2021-01-01                                       New Year's Day
2021-04-23              National Sovereignty and Children's Day
2021-05-01                                           Labour Day
2021-05-13                                        Ramadan Feast
2021-05-14                                Ramadan Feast Holiday
2021-05-15                                Ramadan Feast Holiday
2021-05-19       Commemoration of Ataturk, Youth and Sports Day
2021-07-15                     Democracy and National Unity Day
2021-07-20                                      Sacrifice Feast
2021-07-21                              Sacrifice Feast Holiday
2021-07-22                              Sacrifice Feast Holiday
2021-07-23                              Sacrifice Feast Holiday
2021-08-30                                          Victory Day
2021-10-29                                         Republic Day
2022-01-01                                       New Year's Day
2022-04-23              National Soverei

In [36]:
import datetime
# Months matching the minimum/maximum values of the chart mentioned earlier.
custom_dates = {
    datetime.date(2021, 1, 1): "Lowest",
    datetime.date(2021, 5, 1): "Highest",
    datetime.date(2021, 7, 1): "Lowest",
    datetime.date(2021, 11, 1): "Highest",
    datetime.date(2022, 1, 1): "Lowest",
    datetime.date(2023, 3, 1): "Lowest",
}
# Candidate events that are NOT part of custom_dates. They are kept as comments
# rather than a bare string literal, which the notebook would echo as cell output.
#     datetime.date(2020, 4, 1): "Corona",
#     datetime.date(2020, 5, 1): "Corona",
#     datetime.date(2020, 6, 1): "Corona",
#     datetime.date(2021, 4, 1): "Corona",
#     datetime.date(2021, 5, 1): "Corona",
#     datetime.date(2023, 3, 1): "Deprem" (earthquake; one of the minimum values falls here),
#     datetime.date(2023, 5, 1): "Secim" (election),
#     datetime.date(2023, 6, 1): "Secim" (election),

'\n    datetime.date(2020, 4, 1): "Corona",\n    datetime.date(2020, 5, 1): "Corona",\n    datetime.date(2020, 6, 1): "Corona",\n    # datetime.date(2021, 4, 1): "Corona",\n    # datetime.date(2021, 5, 1): "Corona",\n    datetime.date(2023, 3, 1): "Deprem", Minimum degerlerden biri buraya tekabul ediyor\n    datetime.date(2023, 5, 1): "Secim",\n    datetime.date(2023, 6, 1): "Secim",\n'

In [37]:
# Merge the public holidays with the custom event labels.
# The merge is done in a plain dict: the holidays container applies its own
# name-merging when an existing date is assigned (see the library's HolidayBase
# documentation), which would stack on top of the manual ", " concatenation below.
merged_dates = dict(country_holidays)
for event_date, event in custom_dates.items():
    if event_date in merged_dates:
        # The date is already a holiday: append the custom label to its name.
        merged_dates[event_date] = merged_dates[event_date] + ", " + event
    else:
        merged_dates[event_date] = event

In [38]:
def add_date_features(
    ts_df: pd.DataFrame,
    country_holidays: dict,
    include_individual_holidays: bool = True,
    include_holiday_indicator: bool = True,
    match_by: str = "day",
) -> pd.DataFrame:
    """Add holiday / custom-event indicator columns to a time-indexed frame.

    Parameters
    ----------
    ts_df : pandas.DataFrame (including TimeSeriesDataFrame)
        Frame whose index has a level named ``"timestamp"``. It is copied,
        never modified.
    country_holidays : dict
        Mapping of ``datetime.date`` to a label. Every distinct label becomes
        one indicator column.
    include_individual_holidays : bool
        Add one 0/1 column per distinct label.
    include_holiday_indicator : bool
        Add a ``"Holiday"`` column that is 1 when any label matches.
    match_by : {"day", "month"}
        ``"day"`` (default, the original behaviour) marks a row only when its
        timestamp falls exactly on a labelled date. ``"month"`` marks a row
        when any labelled date lies in the same calendar month. That mode is
        meant for monthly series, whose timestamps rarely coincide with a
        holiday's exact day.

    Returns
    -------
    pandas.DataFrame
        Copy of ``ts_df`` with the indicator columns added.

    Raises
    ------
    ValueError
        If ``match_by`` is not ``"day"`` or ``"month"``.
    """
    if match_by not in ("day", "month"):
        raise ValueError(f"match_by must be 'day' or 'month', got {match_by!r}")

    ts_df = ts_df.copy()
    timestamps = ts_df.index.get_level_values("timestamp")
    # One float 0/1 column per distinct label, indexed by the labelled date.
    country_holidays_df = pd.get_dummies(pd.Series(country_holidays)).astype(float)

    if match_by == "month":
        # Collapse the labelled days to one row per calendar month.
        country_holidays_df.index = pd.to_datetime(list(country_holidays_df.index)).to_period("M")
        country_holidays_df = country_holidays_df.groupby(level=0).max()
        lookup_keys = timestamps.to_period("M")
    else:
        lookup_keys = timestamps.date

    # Rows without a matching date/month get 0 in every indicator.
    holidays_df = country_holidays_df.reindex(lookup_keys).fillna(0)
    if include_individual_holidays and len(holidays_df.columns) > 0:
        ts_df[holidays_df.columns] = holidays_df.values
    if include_holiday_indicator:
        # fillna covers the case of no labels at all, where the row-wise max is NaN.
        ts_df["Holiday"] = holidays_df.max(axis=1).fillna(0).values
    return ts_df

In [39]:
# Add the holiday / custom-event indicator columns to the short-history frame.
# NOTE(review): the timestamps are month ends (e.g. 2022-05-31 in the head
# shown earlier) while the labels are keyed by exact day, so confirm that the
# indicators are not almost entirely zero.
model_2021_df_down_data = add_date_features(model_2021_df_down_data, merged_dates)

In [40]:
# Names of every column except the target, i.e. the date-feature columns.
# NOTE(review): holiday_columns is not referenced again in the visible cells.
holiday_columns = list(model_2021_df_down_data.columns)
del holiday_columns[holiday_columns.index('net_payment_count')]

In [41]:
from autogluon.common import space

# Predictor for the short-history merchants: 3-step monthly forecast of
# net_payment_count, scored with MAE. quantile_levels is left at the library
# default (the fit log lists 0.1 ... 0.9).
# NOTE(review): this rebinds `predictor`, so the long-history predictor is no
# longer reachable from this name. known_covariates_names is not passed either
# (fit log: 'known_covariates_names': []).
predictor = TimeSeriesPredictor(
    prediction_length=3,
    target="net_payment_count",
    eval_metric="MAE",
    freq='M',
    # quantile_levels=[0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.6, 0.7],
#     known_covariates_names=["month", "year"]
)

# Fit with the "best_quality" preset, a 6-hour time limit and a single validation
# window; TemporalFusionTransformer and AutoARIMA are excluded.
predictor.fit(
    model_2021_df_down_data,
    presets="best_quality",
    time_limit= 3600 * 6,
    excluded_model_types=["TemporalFusionTransformer", "AutoARIMA"],
    num_val_windows=1,
    refit_every_n_windows=1,
    refit_full=True,
    
#     hyperparameters={
#     #   "SeasonalNaive": {"n_jobs": -1},
#       "Naive": {"n_jobs": 6},
#       "AutoETS": {"n_jobs": 6},
#       "DynamicOptimizedTheta": {"n_jobs": 6},
#       "RecursiveTabular": {"n_jobs": 6},
#       "AutoCES": {"n_jobs": 6},
# #       "ADIDA": {},
# #       "IMAPA": {},
# #       "DLinear": {},
# #       "SimpleFeedForward": {},
# #       "DeepAR": {},
#         },
#     hyperparameter_tune_kwargs={
# #     "num_trials": 5,
# #     "scheduler": "local",
# #     "searcher": "random",
#     "n_jobs": -1
# },
)

Beginning AutoGluon training... Time limit = 21600s
AutoGluon will save models to 'AutogluonModels\ag-20240227_201121'
AutoGluon Version:  1.0.0
Python Version:     3.10.10
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.19045
CPU Count:          12
GPU Count:          0
Memory Avail:       5.36 GB / 15.42 GB (34.7%)
Disk Space Avail:   71.79 GB / 476.34 GB (15.1%)
Setting presets to: best_quality

Fitting with arguments:
{'enable_ensemble': True,
 'eval_metric': MAE,
 'excluded_model_types': ['TemporalFusionTransformer', 'AutoARIMA'],
 'freq': 'M',
 'hyperparameters': 'default',
 'known_covariates_names': [],
 'num_val_windows': 1,
 'prediction_length': 3,
 'quantile_levels': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
 'random_seed': 123,
 'refit_every_n_windows': 1,
 'refit_full': True,
 'target': 'net_payment_count',
 'time_limit': 21600,
 'verbosity': 2}

Provided train_data has 19783 rows, 1362 time series. Median time series length is 13 (min=10

<autogluon.timeseries.predictor.TimeSeriesPredictor at 0x22253c7d750>

In [42]:
# Validation scores of the short-history models (score_val is negative MAE, so closer to 0 is better).
predictor.leaderboard()

Unnamed: 0,model,score_val,pred_time_val,fit_time_marginal,fit_order
0,WeightedEnsemble,-16.458,25.345,1.664,10
1,AutoETS,-16.876,24.766,0.032,4
2,PatchTST,-17.14,0.579,36.406,9
3,DeepAR,-18.513,2.075,74.84,8
4,DirectTabular,-21.706,0.144,1.109,7
5,NPTS,-26.864,1.184,0.034,3
6,SeasonalNaive,-28.874,3.957,0.032,1
7,CrostonSBA,-32.334,7.874,0.031,2
8,DynamicOptimizedTheta,-32.96,11.743,0.031,5
9,RecursiveTabular,-42.628,0.221,12.64,6


In [43]:
from autogluon.timeseries.utils.forecast import get_forecast_horizon_index_ts_dataframe

# Index of the 3 forecast steps for every short-history series, then the same
# date features computed on an otherwise empty frame over that index.
future_index = get_forecast_horizon_index_ts_dataframe(model_2021_df_down_data, prediction_length=3)
# NOTE(review): future_timestamps is not used again in the visible cells.
future_timestamps = future_index.get_level_values("timestamp")
known_covariates = add_date_features(pd.DataFrame(index=future_index), merged_dates)

In [44]:
# Forecast the short-history merchants with the refit ensemble.
# NOTE(review): the predictor was created without known_covariates_names, so it
# is unclear whether known_covariates has any effect here — confirm.
predictions = predictor.predict(model_2021_df_down_data, known_covariates=known_covariates, model='WeightedEnsemble_FULL')

In [45]:
# First version of the short-history submission, built from the 0.3 quantile.
# NOTE: a later cell writes the 0.4 quantile to the same
# 'model_2021_down_sub.csv' path, which replaces the file written here.
results = predictions.copy().reset_index()
results['id'] = results['timestamp'].dt.strftime('%Y%m') + results['item_id']

# Select 'id' and the 0.3 quantile (not the mean) and rename the quantile to 'net_payment_count'.
model_2021_down_sub = results[['id', '0.3']].rename(columns={'0.3': 'net_payment_count'})

model_2021_down_sub.to_csv('model_2021_down_sub.csv', index=False)

In [46]:
# Inspect the forecast columns (mean and quantiles); the output shows that the
# low quantiles can be negative.
predictions.head(30)

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
item_id,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
merchant_10000,2023-10-31,2.853,-0.92,0.397,1.329,2.118,2.853,3.588,4.378,5.31,6.626
merchant_10000,2023-11-30,2.83,-0.923,0.393,1.32,2.102,2.83,3.558,4.341,5.267,6.584
merchant_10000,2023-12-31,2.858,-0.926,0.4,1.334,2.123,2.858,3.592,4.382,5.315,6.641
merchant_10036,2023-10-31,0.931,-1.422,-0.603,-0.022,0.471,0.931,1.391,1.884,2.466,3.284
merchant_10036,2023-11-30,1.0,-1.377,-0.548,0.039,0.536,1.0,1.463,1.96,2.547,3.376
merchant_10036,2023-12-31,1.094,-1.346,-0.497,0.106,0.618,1.094,1.57,2.082,2.685,3.534
merchant_10072,2023-10-31,83.106,50.314,62.533,70.537,77.09,83.106,89.123,95.675,103.68,115.899
merchant_10072,2023-11-30,84.012,42.986,58.377,68.37,76.528,84.012,91.496,99.654,109.647,125.037
merchant_10072,2023-12-31,82.629,35.374,52.918,64.453,73.922,82.629,91.336,100.806,112.34,129.884
merchant_10073,2023-10-31,21.853,0.423,8.148,13.413,17.798,21.853,25.909,30.294,35.558,43.283


In [47]:
# Flatten the forecast and build the submission id: year-month followed by the merchant id.
results = predictions.copy().reset_index()
results['id'] = results['timestamp'].dt.strftime('%Y%m') + results['item_id']

# Submit the 0.4 quantile (not the mean) as net_payment_count.
# The forecast quantiles can be negative (see the predictions preview) while a
# payment count cannot, so negative values are clipped to 0; this never
# increases the absolute error.
model_2021_down = (
    results[['id', '0.4']]
    .rename(columns={'0.4': 'net_payment_count'})
    .assign(net_payment_count=lambda sub: sub['net_payment_count'].clip(lower=0))
)

# Final short-history submission file.
model_2021_down.to_csv('model_2021_down_sub.csv', index=False)

In [48]:
# Confirm that neither submission frame contains missing values.
model_2021_down.isnull().sum(), model_2021_up_sub.isnull().sum()

(id                   0
 net_payment_count    0
 dtype: int64,
 id                   0
 net_payment_count    0
 dtype: int64)