In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_columns', None)
pd.set_option('display.width', 100)
pd.set_option('display.max_rows', 200)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
import warnings
warnings.filterwarnings('ignore')

In [2]:
train = pd.read_csv("train.csv")

In [3]:
static_df = train[['merchant_id', 'mcc_id', 'settlement_period', 'working_type', 'merchant_segment']]
# 
train = train[['merchant_id', 'month_id', 'net_payment_count']]
train = train.sort_values(by=['merchant_id', 'month_id']).reset_index(drop=True)
train['month_id'] = pd.to_datetime(train['month_id'], format='%Y%m')

In [4]:
def filter_by_multiple_gaps(df, threshold):
    # month_id sütununu datetime'a çevirme
    
    result_df = pd.DataFrame()  # Sonuçları saklamak için boş bir dataframe
    
    for merchant_id, group in df.groupby('merchant_id'):
        group = group.sort_values(by='month_id')  # Her grubu tarihe göre sıralama
        valid_indices = []  # Geçerli indeksleri saklamak için boş bir liste
        
        prev_date = None
        for index, row in group.iterrows():
            if prev_date is not None:
                date_diff = (row['month_id'] - prev_date).days
                if date_diff < threshold:
                    # Eğer fark belirlenen eşikten küçükse, bu indeksi sakla
                    valid_indices.append(index)
                else:
                    # Eşikten büyük bir fark bulunduğunda, geçerli indeksleri sıfırla ve bu indeksi ekle
                    valid_indices = [index]
            else:
                # İlk satır her zaman geçerli olarak kabul edilir
                valid_indices.append(index)
            prev_date = row['month_id']
        
        # Geçerli indekslere sahip satırları sonuç DataFrame'ine ekle
        result_df = pd.concat([result_df, group.loc[valid_indices]])
    
    return result_df.reset_index(drop=True)

train = filter_by_multiple_gaps(train, 240)

In [5]:
train[train['merchant_id'] == 'merchant_37587']

Unnamed: 0,merchant_id,month_id,net_payment_count
122846,merchant_37587,2023-06-01,10
122847,merchant_37587,2023-07-01,40


In [6]:
# Son transaction ve ilk gözlem tarihlerini bulma
last_transaction = train.groupby('merchant_id')['month_id'].max()
first_observation = train.groupby('merchant_id')['month_id'].min()

# Model DataFrames
model_2020_df = train[train['merchant_id'].isin(first_observation[first_observation <= '2020-12-01'].index)]
# model_2021_df = train[train['merchant_id'].isin(first_observation[(first_observation > '2020-09-01') & (first_observation <= '2022-09-01')].index)]
# model_2022_df = train[train['merchant_id'].isin(first_observation[(first_observation > '2021-09-01') & (first_observation <= '2022-09-01')].index)]
# model_2023_df = train[train['merchant_id'].isin(first_observation[first_observation > '2022-09-01'].index)]

# Her merchant_id için gözlem sayısını say
merchant_counts = model_2020_df['merchant_id'].value_counts()

# Her merchant_id için en son gözlem tarihini bul
last_observation = model_2020_df.groupby('merchant_id')['month_id'].max()

# Gözlem sayısı 3'ten az olan veya son gözlem tarihi 202301'den düşük olan merchant_id'leri bul
filtered_merchant_ids = merchant_counts[(last_observation <= '2023-06-01')].index
# (merchant_counts <= 3) | 

# Bu merchant_id'leri no_model DataFrame'ine ata
churn_df = model_2020_df[model_2020_df['merchant_id'].isin(filtered_merchant_ids)]

# Diğer verileri updated_train DataFrame'ine ata
model_2020_df = model_2020_df[~model_2020_df['merchant_id'].isin(filtered_merchant_ids)]

# Set ve DataFrames'in boyutlarını kontrol etme
churn_df.shape, model_2020_df.shape

((65517, 3), (103043, 3))

In [7]:
# import pandas as pd
# import numpy as np

# def detect_and_cap_outliers(df):
#     df['rolling_mean'] = df['net_payment_count'].rolling(window=10, min_periods=1).mean()
#     df['rolling_std'] = df['net_payment_count'].rolling(window=10, min_periods=1).std()  # ddof=0 for population std
#     df['lower_limit'] = df['rolling_mean'] - 5 * df['rolling_std']
#     df['upper_limit'] = df['rolling_mean'] + 5 * df['rolling_std']
    
#     # Aykırı değerleri tespit
#     df['is_outlier'] = (df['net_payment_count'] < df['lower_limit']) | (df['net_payment_count'] > df['upper_limit'])
    
#     # Aykırı değerleri baskıla
#     df['net_payment_count'] = np.where(df['net_payment_count'] < df['lower_limit'], df['lower_limit'],
#                                               np.where(df['net_payment_count'] > df['upper_limit'], df['upper_limit'],
#                                                        df['net_payment_count']))
#     return df

# # Her bir merchant_id için aykırı değerleri tespit etmek, baskılamak ve sonuçları görmek
# merchant_ids = model_2020_df['merchant_id'].unique()
# capped_results = []  # Sonuçları saklamak için boş bir liste

# for merchant_id in merchant_ids:
#     merchant_df = model_2020_df[model_2020_df['merchant_id'] == merchant_id].copy()
#     capped_df = detect_and_cap_outliers(merchant_df)
#     capped_results.append(capped_df)

# # Sonuçların birleştirilmesi
# model_2020_df = pd.concat(capped_results).reset_index(drop=True)
# model_2020_df = model_2020_df[['merchant_id', 'month_id' ,'net_payment_count']]

# # İlk birkaç sonucu göster
# model_2020_df.head()

In [8]:
model_2020_df.net_payment_count.mean()

753.0110051143697

In [9]:
# Gözlem sayılarını hesaplama
merchant_observation_counts = model_2020_df['merchant_id'].value_counts()

# 11'den fazla gözlem içeren merchant'ları belirleme
merchants_more_than_11 = merchant_observation_counts[merchant_observation_counts >= 0].index

# 11'den az gözlem içeren merchant'ları belirleme
merchants_less_than_11 = merchant_observation_counts[merchant_observation_counts < 0].index

# Bu merchant'ların gözlemlerini ilgili DataFrame'lere ayırma
model_2020_df_up = model_2020_df[model_2020_df['merchant_id'].isin(merchants_more_than_11)]
model_2020_df_down = model_2020_df[model_2020_df['merchant_id'].isin(merchants_less_than_11)]

# Sonuçların boyutlarını kontrol etme
model_2020_df_up.shape, model_2020_df_down.shape

((103043, 3), (0, 3))

In [10]:
model_2020_df['merchant_id'].value_counts()

merchant_id
merchant_41252    45
merchant_4046     45
merchant_780      45
merchant_41175    45
merchant_7773     45
                  ..
merchant_35547    12
merchant_27265    12
merchant_52759    12
merchant_26183    10
merchant_10239    10
Name: count, Length: 2736, dtype: int64

In [11]:
#AZ GÖZLEMLİLERLE İLGİLEN

In [12]:
unique_merchant_ids = churn_df['merchant_id'].unique()

# Her merchant_id için 2023-10, 2023-11, ve 2023-12 tarihleri için id oluştur
submission_entries = []
for merchant_id in unique_merchant_ids:
    for month in ['10', '11', '12']:
        submission_id = f'2023{month}{merchant_id}'
        submission_entries.append([submission_id, merchant_id, None])

# sub_no_model DataFrame'ini oluştur
churn_2020 = pd.DataFrame(submission_entries, columns=['id', 'merchant_id', 'net_payment_count'])

churn_2020.head(20)

Unnamed: 0,id,merchant_id,net_payment_count
0,202310merchant_10001,merchant_10001,
1,202311merchant_10001,merchant_10001,
2,202312merchant_10001,merchant_10001,
3,202310merchant_10005,merchant_10005,
4,202311merchant_10005,merchant_10005,
5,202312merchant_10005,merchant_10005,
6,202310merchant_10008,merchant_10008,
7,202311merchant_10008,merchant_10008,
8,202312merchant_10008,merchant_10008,
9,202310merchant_10013,merchant_10013,


In [13]:
churn_2020['net_payment_count'] = 0
churn_2020 = churn_2020[['id', 'net_payment_count']]
churn_2020.to_csv('churn_2020.csv', index=False)
churn_2020.head(3)

Unnamed: 0,id,net_payment_count
0,202310merchant_10001,0
1,202311merchant_10001,0
2,202312merchant_10001,0


In [14]:
def filler(df):
    # Define the target date
    target_date = pd.to_datetime('2023-09-01')

    # Find merchants that don't have a record on the target date
    merchants_missing_target_date = df[~df['merchant_id'].isin(
        df[df['month_id'] == target_date]['merchant_id']
    )]['merchant_id'].unique()

    # Get the first record of each merchant to preserve the static features
    first_records_per_merchant = df[df['merchant_id'].isin(merchants_missing_target_date)].groupby('merchant_id').first().reset_index()

    # Create missing records for the target date
    missing_records = first_records_per_merchant.copy()
    missing_records['month_id'] = target_date
    missing_records['net_payment_count'] = 0

    # Append the missing records to the original DataFrame
    df = pd.concat([df, missing_records], ignore_index=True)

    # Sort the updated DataFrame
    df.sort_values(by=['merchant_id', 'month_id'], inplace=True)
    
    return df

model_2020_df_up = filler(model_2020_df_up) 
model_2020_df_down = filler(model_2020_df_down) 

In [15]:
# model_2021_df[model_2021_df['merchant_id'] == 'merchant_55336']

In [16]:
# model_2021_df['merchant_id'].value_counts()

In [17]:
# model_2020_df = model_2020_df[model_2020_df['month_id'] > '2020-12-01']

In [18]:
static_features_df = static_df.drop_duplicates()

In [19]:
# from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor

# model_2020_df_down_data = TimeSeriesDataFrame.from_data_frame(
#     model_2020_df_down,
#     id_column="merchant_id",
#     timestamp_column="month_id",
#     static_features_df=static_features_df
# )
# model_2020_df_down_data.head()

In [20]:
# model_2020_df_down_data = model_2020_df_down_data.convert_frequency(freq="M")

In [21]:
# model_2020_df_down_data['net_payment_count'] = model_2020_df_down_data['net_payment_count'].fillna(0)

In [22]:
# model_2020_df_down_data.head()

In [23]:
# from autogluon.common import space

# predictor = TimeSeriesPredictor(
#     prediction_length=3,
#     target="net_payment_count",
#     eval_metric="MAE",
#     freq='M',
#     quantile_levels=[0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.6, 0.7],
# #     known_covariates_names=["month", "year"]
# )

# predictor.fit(
#     model_2020_df_down_data,
#     presets="best_quality",
#     time_limit= 3600 * 6,
#     # excluded_model_types=["TemporalFusionTransformer", "DeepAR"],
    
#     hyperparameters={
# #     #   "SeasonalNaive": {"n_jobs": -1},
# #       "Naive": {"n_jobs": 6},
# #       "AutoETS": {"n_jobs": 6},
# #       "DynamicOptimizedTheta": {"n_jobs": 6},
# #       "RecursiveTabular": {"n_jobs": 6},
# #       "AutoCES": {"n_jobs": 6},
# # #       "ADIDA": {},
# # #       "IMAPA": {},
# # #       "DLinear": {},
# # #       "SimpleFeedForward": {},
#       "DeepAR": {"n_jobs": 6},
#       "PatchTST": {"n_jobs": 6},
#         },
# #     hyperparameter_tune_kwargs={
# # #     "num_trials": 5,
# # #     "scheduler": "local",
# # #     "searcher": "random",
# #     "n_jobs": -1
# # },
# )

In [24]:
# predictor.leaderboard()

In [25]:
# predictions = predictor.predict(model_2020_df_down_data, model='WeightedEnsemble')

In [26]:
# results = predictions.copy().reset_index()
# results['id'] = results['timestamp'].dt.strftime('%Y%m') + results['item_id']

# # Select the 'id' and 'mean' columns and rename 'mean' to 'net_payment_count'
# model_2020_down_sub = results[['id', '0.5']].rename(columns={'0.5': 'net_payment_count'})

In [27]:
# model_2020_down_sub.to_csv('model_2020_down_sub.csv', index=False)

In [28]:
static_features_df = static_df.drop_duplicates()

In [29]:
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor

model_2020_df_up_data = TimeSeriesDataFrame.from_data_frame(
    model_2020_df_up,
    id_column="merchant_id",
    timestamp_column="month_id",
    static_features_df=static_features_df
)
model_2020_df_up_data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,net_payment_count
item_id,timestamp,Unnamed: 2_level_1
merchant_10057,2020-06-01,3
merchant_10057,2021-01-01,4
merchant_10057,2021-02-01,6
merchant_10057,2021-03-01,4
merchant_10057,2021-04-01,3


In [30]:
model_2020_df_up_data = model_2020_df_up_data.convert_frequency(freq="M")
model_2020_df_up_data['net_payment_count'] = model_2020_df_up_data['net_payment_count'].fillna(0)
model_2020_df_up_data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,net_payment_count
item_id,timestamp,Unnamed: 2_level_1
merchant_10057,2020-06-30,3.0
merchant_10057,2020-07-31,0.0
merchant_10057,2020-08-31,0.0
merchant_10057,2020-09-30,0.0
merchant_10057,2020-10-31,0.0


In [31]:
import holidays

timestamps = model_2020_df_up_data.index.get_level_values("timestamp")
country_holidays = holidays.country_holidays(
    country="TR",
    years=range(timestamps.min().year, timestamps.max().year + 1),
)
pd.Series(country_holidays).sort_index()

2020-01-01                                       New Year's Day
2020-04-23              National Sovereignty and Children's Day
2020-05-01                                           Labour Day
2020-05-19       Commemoration of Ataturk, Youth and Sports Day
2020-05-24                                        Ramadan Feast
2020-05-25                                Ramadan Feast Holiday
2020-05-26                                Ramadan Feast Holiday
2020-07-15                     Democracy and National Unity Day
2020-07-31                                      Sacrifice Feast
2020-08-01                              Sacrifice Feast Holiday
2020-08-02                              Sacrifice Feast Holiday
2020-08-03                              Sacrifice Feast Holiday
2020-08-30                                          Victory Day
2020-10-29                                         Republic Day
2021-01-01                                       New Year's Day
2021-04-23              National Soverei

In [32]:
import datetime
# Yukarıda bahsettiğim grafikteki minimum-maksimum değerleri:
custom_dates = {
    datetime.date(2021, 1, 1): "Lowest",
    datetime.date(2021, 5, 1): "Highest",
    datetime.date(2021, 7, 1): "Lowest",
    datetime.date(2021, 11, 1): "Highest",
    datetime.date(2022, 1, 1): "Lowest",
    datetime.date(2023, 3, 1): "Lowest",
}
'''
    datetime.date(2020, 4, 1): "Corona",
    datetime.date(2020, 5, 1): "Corona",
    datetime.date(2020, 6, 1): "Corona",
    # datetime.date(2021, 4, 1): "Corona",
    # datetime.date(2021, 5, 1): "Corona",
    datetime.date(2023, 3, 1): "Deprem", Minimum degerlerden biri buraya tekabul ediyor
    datetime.date(2023, 5, 1): "Secim",
    datetime.date(2023, 6, 1): "Secim",
'''

'\n    datetime.date(2020, 4, 1): "Corona",\n    datetime.date(2020, 5, 1): "Corona",\n    datetime.date(2020, 6, 1): "Corona",\n    # datetime.date(2021, 4, 1): "Corona",\n    # datetime.date(2021, 5, 1): "Corona",\n    datetime.date(2023, 3, 1): "Deprem", Minimum degerlerden biri buraya tekabul ediyor\n    datetime.date(2023, 5, 1): "Secim",\n    datetime.date(2023, 6, 1): "Secim",\n'

In [33]:
merged_dates = country_holidays.copy()
for date, event in custom_dates.items():
    if date in merged_dates:
        merged_dates[date] += ", " + event
    else:
        merged_dates[date] = event

In [34]:
def add_date_features(
    ts_df: TimeSeriesDataFrame, 
    country_holidays: dict,
    include_individual_holidays: bool = True,
    include_holiday_indicator: bool = True,
) -> TimeSeriesDataFrame:
    """Tatil günleri ve bahsedilen maksimum-minimum degerleri ekle"""
    ts_df = ts_df.copy()
    timestamps = ts_df.index.get_level_values("timestamp")
    country_holidays_df = pd.get_dummies(pd.Series(country_holidays)).astype(float)
    holidays_df = country_holidays_df.reindex(timestamps.date).fillna(0)
    if include_individual_holidays:
        ts_df[holidays_df.columns] = holidays_df.values
    if include_holiday_indicator:
        ts_df["Holiday"] = holidays_df.max(axis=1).values
    return ts_df

In [36]:
model_2020_df_up_data = add_date_features(model_2020_df_up_data, merged_dates)

In [37]:
holiday_columns = model_2020_df_up_data.columns.to_list()
holiday_columns.remove('net_payment_count')

In [38]:
from autogluon.common import space

predictor = TimeSeriesPredictor(
    prediction_length=3,
    target="net_payment_count",
    eval_metric="MAE",
    freq='M',
    quantile_levels=[0.3, 0.31, 0.32, 0.33, 0.34 ,0.35, 0.4, 0.5],
#     known_covariates_names=["month", "year"]
)

predictor.fit(
    model_2020_df_up_data,
    presets="best_quality",
    time_limit= 3600 * 6,
    num_val_windows=4,
    refit_every_n_windows=1,
    refit_full=True,
    # excluded_model_types=["CrostonSBA", "NPTS", "DirectTabular", 
    #                      "TemporalFusionTransformer", "PatchTST"],
    
    # hyperparameters={
    #   "AutoARIMA": {"n_jobs": 6},
    # # #   "SeasonalNaive": {"n_jobs": 6},
    # # #   "Naive": {"n_jobs": 6},
    # #   "AutoETS": {"n_jobs": 6},
    #   "DynamicOptimizedTheta": {"n_jobs": 6},
    #   "RecursiveTabular": {"n_jobs": 6},
    # #   "AutoCES": {"n_jobs": 6},
    # # #   "ADIDA": {},
    # # #   "IMAPA": {},
    # # #   "DLinear": {},
    # # #   "SimpleFeedForward": {},
    #   "DeepAR": {"n_jobs": 6},
    #     },
    # hyperparameter_tune_kwargs={
# #     "num_trials": 5,
# #     "scheduler": "local",
# #     "searcher": "random",
#     "n_jobs": -1
# },
)

Beginning AutoGluon training... Time limit = 21600s
AutoGluon will save models to 'AutogluonModels\ag-20240227_185725'
AutoGluon Version:  1.0.0
Python Version:     3.10.10
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.19045
CPU Count:          12
GPU Count:          0
Memory Avail:       6.43 GB / 15.42 GB (41.7%)
Disk Space Avail:   71.79 GB / 476.34 GB (15.1%)
Setting presets to: best_quality

Fitting with arguments:
{'enable_ensemble': True,
 'eval_metric': MAE,
 'freq': 'M',
 'hyperparameters': 'default',
 'known_covariates_names': [],
 'num_val_windows': 4,
 'prediction_length': 3,
 'quantile_levels': [0.3, 0.31, 0.32, 0.33, 0.34, 0.35, 0.4, 0.5],
 'random_seed': 123,
 'refit_every_n_windows': 1,
 'refit_full': True,
 'target': 'net_payment_count',
 'time_limit': 21600,
 'verbosity': 2}

Provided train_data has 113842 rows, 2736 time series. Median time series length is 43 (min=34, max=45). 

Provided dataset contains following columns:
	target:  

<autogluon.timeseries.predictor.TimeSeriesPredictor at 0x23a13213190>

In [39]:
from autogluon.timeseries.utils.forecast import get_forecast_horizon_index_ts_dataframe

future_index = get_forecast_horizon_index_ts_dataframe(model_2020_df_up_data, prediction_length=3)
future_timestamps = future_index.get_level_values("timestamp")
known_covariates = add_date_features(pd.DataFrame(index=future_index), merged_dates)

In [40]:
predictions_up = predictor.predict(model_2020_df_up_data, known_covariates=known_covariates, model='WeightedEnsemble_FULL')

In [41]:
results = predictions_up.copy().reset_index()
results['id'] = results['timestamp'].dt.strftime('%Y%m') + results['item_id']

# Select the 'id' and 'mean' columns and rename 'mean' to 'net_payment_count'
model_2020_up_sub = results[['id', '0.35']].rename(columns={'0.35': 'net_payment_count'})

model_2020_up_sub.to_csv('model_2020_up_sub.csv', index=False)

In [None]:
model_2020_up_sub.isnull().sum()

id                   0
net_payment_count    0
dtype: int64

In [None]:
# model_2020_down_sub.isnull().sum()