In [1]:
import pandas as pd
import numpy as np
import gc
import tqdm

# Notebook-wide pandas display settings: show every column, wrap output at 100
# characters, print up to 200 rows and render floats with three decimals.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.width', 100),
    ('display.max_rows', 200),
    ('display.float_format', lambda x: '%.3f' % x),
):
    pd.set_option(option_name, option_value)
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Directory holding the competition files; on Kaggle use
# "/kaggle/input/gdz-elektrik-datathon" instead.
path = "."

# Read the five CSV inputs in one pass; each name maps to `<path>/<name>.csv`.
train, test, weather, holidays, submission = (
    pd.read_csv(f'{path}/{csv_name}.csv')
    for csv_name in ('train', 'test', 'weather', 'holidays', 'sample_submission')
)

In [3]:
# Keep observations from 2021-01-01 onward. 'tarih' is still a string column here, so
# this is a lexicographic comparison (assumes ISO 'YYYY-MM-DD' values — TODO confirm).
train = train[train['tarih'] >= '2021-01-01']

# Add an observation for the last day so that every district has a row on `new_date`.
new_date = '2024-01-31'

# All districts present in the filtered training data.
unique_districts = train['ilce'].unique()

# Districts that already have a row on `new_date`, collected with a single mask instead of
# re-scanning the whole frame once per district.
districts_on_new_date = set(train.loc[train['tarih'] == new_date, 'ilce'])
missing_districts = [district for district in unique_districts if district not in districts_on_new_date]

# One zero-valued row per district that lacks an observation on `new_date`.
# Columns not listed here (e.g. 'bildirimli_sum') are NaN in these rows after the concat.
missing_rows = pd.DataFrame({
    'tarih': [new_date] * len(missing_districts),
    'ilce': missing_districts,
    'bildirimsiz_sum': [0] * len(missing_districts)
})

# Append the padding rows to the training data.
train = pd.concat([train, missing_rows], ignore_index=True)

In [4]:
# Disabled experiment (`if False`): nothing in this cell runs.
# If enabled, it would divide 'bildirimsiz_sum' by the mean of its (year, month) group and
# multiply the result by the December 2023 group mean.
if False:
    train['tarih'] = pd.to_datetime(train['tarih'])
    train['year'] = train['tarih'].apply(lambda x:x.year)
    train['month'] = train['tarih'].apply(lambda x:x.month)
    
    # Mean target per (year, month); the 2023-12 row is kept as the rescaling reference.
    to_norm = train.groupby(['year', 'month'])['bildirimsiz_sum'].mean().reset_index().rename(columns={'bildirimsiz_sum': 'bildirimsiz_sum_mean'})
    inverse_scale = to_norm[(to_norm['year']==2023) & (to_norm['month']==12)]
    
    # Divide each observation by the mean of its own month.
    train = train.merge(to_norm[['bildirimsiz_sum_mean', 'year', 'month']], on=['year', 'month'])
    train['bildirimsiz_sum'] = train['bildirimsiz_sum'] / train['bildirimsiz_sum_mean']
    del train['bildirimsiz_sum_mean']
    
    # Scale back up using the first (2023, 12) reference row.
    train['bildirimsiz_sum'] = train['bildirimsiz_sum'] * inverse_scale.iloc[0]['bildirimsiz_sum_mean']
    
    # Drop the helper columns again.
    del train['year']
    del train['month']
    
    # Replace any remaining NaN in the frame with 0.
    train = train.fillna(0)

In [5]:
# Parse 'tarih' into datetimes so a continuous daily calendar can be built per district.
train['tarih'] = pd.to_datetime(train['tarih'])

# Distinct districts in the training data.
unique_districts = train['ilce'].unique()

# Columns whose missing values are replaced by 0 on days without a record.
count_cols = ['bildirimsiz_sum', 'bildirimli_sum']

# For every district, expand its rows onto all calendar days between its first and
# last observation.
all_frames = []
for district in unique_districts:
    # Rows belonging to this district only.
    district_df = train[train['ilce'] == district]

    # Every day between the district's first and last observed date (inclusive).
    min_date = district_df['tarih'].min()
    max_date = district_df['tarih'].max()
    all_dates = pd.date_range(start=min_date, end=max_date)

    # Skeleton frame: one row per calendar day, tagged with the district name.
    full_df = pd.DataFrame(all_dates, columns=['tarih'])
    full_df['ilce'] = district

    # Left-join the observations onto the skeleton; days without data become NaN.
    merged_df = pd.merge(full_df, district_df, on=['tarih', 'ilce'], how='left')

    # Fill the gaps with 0. The result is assigned back explicitly: calling
    # `merged_df[col].fillna(0, inplace=True)` acts on a temporary Series and has no
    # effect on `merged_df` when pandas Copy-on-Write is active.
    merged_df[count_cols] = merged_df[count_cols].fillna(0)

    all_frames.append(merged_df)

# Stack the per-district frames back into one table.
complete_df = pd.concat(all_frames)

train = complete_df.copy()

### Fourier Cycles

In [6]:

if True:
    # Both frames need datetime 'tarih' values so the cycle table can be merged on them.
    train['tarih'] = pd.to_datetime(train['tarih'])
    test['tarih'] = pd.to_datetime(test['tarih'])

    # One row per distinct date seen in train or test, in chronological order.
    train_days = set(list(train['tarih'].unique()))
    test_days = set(list(test['tarih'].unique()))
    cycle_df = pd.DataFrame({'tarih': list(train_days.union(test_days))})
    cycle_df = cycle_df.sort_values(by='tarih')

    # 1-based position of each date within the sorted calendar.
    day_number = np.arange(1, len(cycle_df) + 1, dtype='int64')

    # For every period length, store the day position modulo that period.
    cyclic_loops = [25, 111, 74, 22, 122, 34, 51, 61]
    for cycle in cyclic_loops:
        cycle_df[f"cycle_{cycle}"] = day_number % cycle

    # Attach the cycle columns to both frames by date.
    train = train.merge(cycle_df, on='tarih', how='left')
    test = test.merge(cycle_df, on='tarih', how='left')

### Holidays

In [7]:
# Zero-pad month ('Ay') and day ('Gün') to two digits, then assemble a
# 'year-month-day' string in 'Tarih'.
for date_part in ('Ay', 'Gün'):
    holidays[date_part] = holidays[date_part].map(lambda x: f'{x:02d}')

holidays['Tarih'] = (
    holidays['Yıl'].astype(str)
    + '-' + holidays['Ay'].astype(str)
    + '-' + holidays['Gün'].astype(str)
)

holidays.head()

Unnamed: 0,Yıl,Ay,Gün,Tatil Adı,Tarih
0,2021,1,1,New Year's Day,2021-01-01
1,2021,4,23,National Sovereignty and Children's Day,2021-04-23
2,2021,5,1,Labour Day,2021-05-01
3,2021,5,19,"Commemoration of Ataturk, Youth and Sports Day",2021-05-19
4,2021,7,15,Democracy and National Unity Day,2021-07-15


In [8]:
# Drop the separate year/month/day columns and rename the holiday-name column
# ('Tatil Adı') to 'Bayram_Flag'.
holidays = (
    holidays
    .drop(columns=['Ay', 'Yıl', 'Gün'])
    .rename(columns={'Tatil Adı': 'Bayram_Flag'})
)

In [9]:
# Six extra dates that are added to the holiday table under the label "Arefe Günü".
arefe_dates = ["2021-05-12", "2021-07-19", "2022-05-01", "2022-07-08", "2023-04-20", "2023-06-27"]

new_data = {
    "Bayram_Flag": ["Arefe Günü"] * len(arefe_dates),
    "Tarih": arefe_dates,
}

# Build a frame from the new records and append it to the existing holiday table.
new_df = pd.DataFrame(new_data)
holidays = pd.concat([holidays, new_df], ignore_index=True)

holidays.head()

Unnamed: 0,Bayram_Flag,Tarih
0,New Year's Day,2021-01-01
1,National Sovereignty and Children's Day,2021-04-23
2,Labour Day,2021-05-01
3,"Commemoration of Ataturk, Youth and Sports Day",2021-05-19
4,Democracy and National Unity Day,2021-07-15


In [10]:
# One holiday row carries two holidays joined by ';'. Replace it with one row per holiday,
# both on the original date.
combined_flag = "National Sovereignty and Children's Day; Ramadan Feast Holiday* (*estimated)"
duplicate_row_index = holidays[holidays['Bayram_Flag'] == combined_flag].index

# Guard against the combined row being absent (for example when this cell is re-run after
# the row was already split); indexing `.values[0]` on an empty selection raises IndexError.
if len(duplicate_row_index) > 0:
    # Two replacement rows for every date that carried the combined label.
    new_rows = pd.DataFrame([
        {"Bayram_Flag": single_flag, "Tarih": duplicate_row_tarih}
        for duplicate_row_tarih in holidays.loc[duplicate_row_index, 'Tarih']
        for single_flag in ("Ramadan Feast Holiday* (*estimated)", "National Sovereignty and Children's Day")
    ])

    # Remove the combined row(s) and append the split rows.
    holidays = pd.concat([holidays.drop(duplicate_row_index), new_rows], ignore_index=True)

In [11]:
# Convert train['tarih'] from datetime back to 'YYYY-MM-DD' strings.
train['tarih'] = train['tarih'].dt.strftime('%Y-%m-%d')
# The same conversion for test is commented out, so test['tarih'] keeps whatever dtype it had
# before this cell.
# NOTE(review): if that dtype is datetime64 (the cycle-feature cell converts it when enabled),
# joining test against string date keys needs matching key dtypes — confirm before merging.
# test['tarih'] = test['tarih'].dt.strftime('%Y-%m-%d')

In [12]:
def _attach_holiday_flag(frame, holiday_table):
    """Left-join holiday names onto ``frame`` by calendar day.

    The join key is a 'YYYY-MM-DD' string derived from ``frame['tarih']``, so the merge
    works whether 'tarih' currently holds strings or datetimes. Merging a datetime64
    column directly against the string 'Tarih' column raises ValueError in pandas.
    'tarih' itself is returned unchanged.

    Parameters:
        frame: DataFrame with a 'tarih' column parseable by ``pd.to_datetime``.
        holiday_table: DataFrame with 'Tarih' ('YYYY-MM-DD' strings) and 'Bayram_Flag'.

    Returns:
        A new DataFrame with the columns of ``frame`` plus 'Bayram_Flag'. Days with no
        holiday get 'Özel Değil'. When ``holiday_table`` holds several rows for one date,
        the left merge yields one output row per matching holiday.
    """
    day_key = pd.to_datetime(frame['tarih']).dt.strftime('%Y-%m-%d')
    flagged = (
        frame.assign(_holiday_day=day_key)
        .merge(holiday_table, left_on='_holiday_day', right_on='Tarih', how='left')
        .drop(columns=['_holiday_day', 'Tarih'])
    )
    # Assign the filled column back rather than using a chained in-place fillna, which
    # has no effect on the frame when pandas Copy-on-Write is active.
    flagged['Bayram_Flag'] = flagged['Bayram_Flag'].fillna('Özel Değil')
    return flagged


# Transfer the Bayram_Flag values to train and test, using the date as the join key.
train = _attach_holiday_flag(train, holidays)
test = _attach_holiday_flag(test, holidays)

### Koordinatlar

In [13]:
# Rename the weather location column to 'ilce', lower-case its values and replace the
# 'ızmır' spelling with 'izmir'.
weather = weather.rename(columns={'name': 'ilce'})
weather['ilce'] = weather['ilce'].str.lower().str.replace('ızmır', 'izmir')

In [14]:
# Distinct (day, district, lat, lon) combinations from the weather table.
# Built as a fresh chained frame instead of assigning into, and de-duplicating in place,
# a slice of `weather` (the pattern that triggers SettingWithCopyWarning).
koordinat = (
    weather[['date', 'ilce', 'lat', 'lon']]
    .assign(date=lambda frame: pd.to_datetime(frame['date']).dt.date)  # keep the calendar day only
    .drop_duplicates()
)

In [15]:
koordinat.head()

Unnamed: 0,date,ilce,lat,lon
0,2021-01-01,manisa-ahmetli,38.618,28.671
24,2021-01-02,manisa-ahmetli,38.618,28.671
48,2021-01-03,manisa-ahmetli,38.618,28.671
72,2021-01-04,manisa-ahmetli,38.618,28.671
96,2021-01-05,manisa-ahmetli,38.618,28.671


In [16]:
train.head()

Unnamed: 0,tarih,ilce,bildirimsiz_sum,bildirimli_sum,Bayram_Flag
0,2021-01-01,izmir-aliaga,5.0,0.0,New Year's Day
1,2021-01-02,izmir-aliaga,13.0,0.0,Özel Değil
2,2021-01-03,izmir-aliaga,4.0,0.0,Özel Değil
3,2021-01-04,izmir-aliaga,9.0,0.0,Özel Değil
4,2021-01-05,izmir-aliaga,2.0,0.0,Özel Değil


In [17]:
# First (lat, lon) pair found for each district, indexed by district name.
unique_coords = (
    koordinat
    .drop_duplicates(subset=['ilce'])
    .set_index('ilce')
    [['lat', 'lon']]
)

# Add the coordinates to train and test by looking up each row's 'ilce'.
train, test = (frame.join(unique_coords, on='ilce') for frame in (train, test))

In [18]:
train.head()

Unnamed: 0,tarih,ilce,bildirimsiz_sum,bildirimli_sum,Bayram_Flag,lat,lon
0,2021-01-01,izmir-aliaga,5.0,0.0,New Year's Day,38.8,26.971
1,2021-01-02,izmir-aliaga,13.0,0.0,Özel Değil,38.8,26.971
2,2021-01-03,izmir-aliaga,4.0,0.0,Özel Değil,38.8,26.971
3,2021-01-04,izmir-aliaga,9.0,0.0,Özel Değil,38.8,26.971
4,2021-01-05,izmir-aliaga,2.0,0.0,Özel Değil,38.8,26.971


### Weather

In [19]:
# Weather columns that are aggregated per day in the cells below.
# A later cell appends the derived wind components 'x' and 'y' to this list.
to_extract_cols = ['t_2m:C', 'effective_cloud_cover:p',
                   'global_rad:W', 'relative_humidity_2m:p', 'wind_dir_10m:d',
                   'wind_speed_10m:ms', 'prob_precip_1h:p', 't_apparent:C']

In [20]:
def add_suffix(cols, suffix):
    """Return a new list with ``suffix`` appended to every name in ``cols``."""
    return [name + suffix for name in cols]

In [21]:
# Calendar day (datetime.date) of each weather record; used below as the daily grouping key.
weather['tarih'] = pd.to_datetime(weather['date']).dt.date

In [22]:
# Decompose wind into two components: x = sin(direction) * speed, y = cos(direction) * speed,
# with the direction column converted from degrees to radians.
wind_dir_rad = weather['wind_dir_10m:d'] / 180 * np.pi
weather['x'] = np.sin(wind_dir_rad) * weather['wind_speed_10m:ms']
weather['y'] = np.cos(wind_dir_rad) * weather['wind_speed_10m:ms']

# Running sum of the components within each (day, district) group, in the current row order
# of `weather` (presumably chronological within a day — TODO confirm).
weather[['x', 'y']] = weather.groupby(['tarih', 'ilce'])[['x', 'y']].cumsum()

# Register the derived columns for aggregation exactly once, so re-running this cell does not
# append duplicate names (duplicates would break the later column selection and rename).
to_extract_cols.extend(col for col in ('x', 'y') if col not in to_extract_cols)

In [23]:
# Group handles reused by the aggregation cells below:
# `grouped` is per (day, district); `grouped_only_date` is per day across all districts.
grouped = weather.groupby(['tarih', 'ilce'])
grouped_only_date = weather.groupby(['tarih'])

In [24]:
from functools import reduce

# One frame per statistic, computed per (tarih, ilce); every aggregated column is renamed
# to '<column>_<statistic>'.
data_frames = []
for stat_name in ('mean', 'std', 'min', 'max', 'median'):
    renamed_cols = dict(zip(to_extract_cols, add_suffix(to_extract_cols, '_' + stat_name)))
    stat_frame = getattr(grouped[to_extract_cols], stat_name)().reset_index()
    data_frames.append(stat_frame.rename(columns=renamed_cols))

# Outer-join the statistic frames on the (tarih, ilce) key, left to right.
df_merged = reduce(
    lambda left, right: left.merge(right, on=['tarih', 'ilce'], how='outer'),
    data_frames,
)

In [25]:
# One frame per statistic, computed per day over all districts; every aggregated column is
# renamed to '<column>_only_date_<statistic>'. Only the first frame keeps the 'tarih' key,
# because the frames are placed side by side below.
data_frames = []
for position, stat_name in enumerate(('mean', 'std', 'min', 'max', 'median')):
    renamed_cols = dict(zip(to_extract_cols, add_suffix(to_extract_cols, '_only_date_' + stat_name)))
    stat_frame = getattr(grouped_only_date[to_extract_cols], stat_name)().reset_index()
    stat_frame = stat_frame.rename(columns=renamed_cols)
    if position > 0:
        stat_frame = stat_frame.drop(columns='tarih')
    data_frames.append(stat_frame)

# Column-wise concatenation; rows line up through the index produced by reset_index().
day_only_weather = pd.concat(data_frames, axis=1)

### PCA

In [26]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Number of components kept from the per-district daily weather statistics.
component_cnt=2

# Disabled alternative (`if False`): t-SNE embedding of the same columns.
if False:
    to_tsne_cols = df_merged.columns[2:]

    tsne = TSNE(n_components=component_cnt, learning_rate='auto',
                     init='random', perplexity=3, random_state=42)
    tsne_cols = ["weather_tsne_{0}".format(i) for i in range(component_cnt)]
    df_merged[tsne_cols] = tsne.fit_transform(df_merged[to_tsne_cols])

if True:
    # Every column after the two key columns ('tarih', 'ilce') is a weather statistic.
    to_pca_cols = df_merged.columns[2:]

    # Fixed seed, matching the TSNE branch: PCA may pick a randomized SVD solver, and
    # without a seed the components can differ between runs.
    pca = PCA(n_components=component_cnt, random_state=42)
    pca_cols = ["weather_pca_{0}".format(i) for i in range(component_cnt)]
    df_merged[pca_cols] = pca.fit_transform(df_merged[to_pca_cols])

    # Keep only the keys and the components; the raw statistics are dropped.
    # NOTE(review): re-running this cell without rebuilding df_merged would fit PCA on the
    # component columns themselves.
    df_merged = df_merged.drop(columns=to_pca_cols)

In [27]:
# Bring the 'tarih' key of all four frames to datetime so the merges below line up.
for keyed_frame in (train, df_merged, day_only_weather, test):
    keyed_frame['tarih'] = pd.to_datetime(keyed_frame['tarih'])

# Per-district daily weather features are joined on (tarih, ilce); the all-district daily
# statistics are joined on tarih alone.
train = (
    train
    .merge(df_merged, on=['tarih', 'ilce'], how='left')
    .merge(day_only_weather, on=['tarih'], how='left')
)
test = (
    test
    .merge(df_merged, on=['tarih', 'ilce'], how='left')
    .merge(day_only_weather, on=['tarih'], how='left')
)
train.head()

Unnamed: 0,tarih,ilce,bildirimsiz_sum,bildirimli_sum,Bayram_Flag,lat,lon,t_2m:C_only_date_mean,effective_cloud_cover:p_only_date_mean,global_rad:W_only_date_mean,relative_humidity_2m:p_only_date_mean,wind_dir_10m:d_only_date_mean,wind_speed_10m:ms_only_date_mean,prob_precip_1h:p_only_date_mean,t_apparent:C_only_date_mean,x_only_date_mean,y_only_date_mean,t_2m:C_only_date_std,effective_cloud_cover:p_only_date_std,global_rad:W_only_date_std,relative_humidity_2m:p_only_date_std,wind_dir_10m:d_only_date_std,wind_speed_10m:ms_only_date_std,prob_precip_1h:p_only_date_std,t_apparent:C_only_date_std,x_only_date_std,y_only_date_std,t_2m:C_only_date_min,effective_cloud_cover:p_only_date_min,global_rad:W_only_date_min,relative_humidity_2m:p_only_date_min,wind_dir_10m:d_only_date_min,wind_speed_10m:ms_only_date_min,prob_precip_1h:p_only_date_min,t_apparent:C_only_date_min,x_only_date_min,y_only_date_min,t_2m:C_only_date_max,effective_cloud_cover:p_only_date_max,global_rad:W_only_date_max,relative_humidity_2m:p_only_date_max,wind_dir_10m:d_only_date_max,wind_speed_10m:ms_only_date_max,prob_precip_1h:p_only_date_max,t_apparent:C_only_date_max,x_only_date_max,y_only_date_max,t_2m:C_only_date_median,effective_cloud_cover:p_only_date_median,global_rad:W_only_date_median,relative_humidity_2m:p_only_date_median,wind_dir_10m:d_only_date_median,wind_speed_10m:ms_only_date_median,prob_precip_1h:p_only_date_median,t_apparent:C_only_date_median,x_only_date_median,y_only_date_median
0,2021-01-01,izmir-aliaga,5.0,0.0,New Year's Day,38.8,26.971,11.662,55.002,70.061,87.785,147.031,2.294,1.183,12.32,9.572,-23.538,2.383,21.316,108.669,7.464,48.183,1.34,1.192,3.351,13.533,24.562,4.3,1.2,0.0,51.9,5.9,0.0,1.0,2.8,-45.907,-124.996,17.0,99.9,426.1,100.0,358.4,8.5,18.6,19.9,50.752,28.167,11.8,55.35,0.0,89.4,148.05,1.9,1.0,12.3,8.926,-17.23
1,2021-01-02,izmir-aliaga,13.0,0.0,Özel Değil,38.8,26.971,11.713,31.31,95.989,81.099,114.472,1.771,1.0,12.447,17.553,-7.212,3.32,17.284,148.497,12.95,46.927,0.839,0.0,4.397,13.408,12.435,2.0,0.0,0.0,41.3,3.8,0.0,1.0,-0.1,-1.158,-54.586,18.5,98.1,480.8,100.0,357.1,5.9,1.0,21.1,74.874,34.951,11.5,29.5,0.0,83.1,115.5,1.7,1.0,11.9,15.085,-5.076
2,2021-01-03,izmir-aliaga,4.0,0.0,Özel Değil,38.8,26.971,11.176,59.064,46.919,77.365,123.741,1.832,3.267,11.471,16.313,-9.936,2.857,33.877,75.67,10.478,58.081,1.061,9.971,3.444,13.34,17.43,1.2,0.0,0.0,42.6,0.5,0.0,1.0,-0.8,-0.116,-84.688,17.1,100.0,394.4,100.0,359.9,5.8,92.9,19.4,77.131,39.128,11.4,58.6,0.0,78.0,125.45,1.6,1.0,11.6,12.795,-5.907
3,2021-01-04,izmir-aliaga,9.0,0.0,Özel Değil,38.8,26.971,12.199,43.497,86.694,71.237,119.867,2.996,1.449,12.261,23.801,-13.923,3.653,21.726,133.772,13.471,34.999,2.03,2.733,4.229,19.885,22.577,0.9,0.0,0.0,37.3,2.1,0.1,1.0,-1.3,-2.025,-145.438,19.7,99.8,487.5,100.0,351.5,14.1,40.9,20.9,98.782,31.41,12.2,47.7,0.0,70.75,126.35,2.3,1.0,12.4,18.831,-7.268
4,2021-01-05,izmir-aliaga,2.0,0.0,Özel Değil,38.8,26.971,12.59,40.122,92.357,77.991,174.951,2.312,8.339,13.265,3.309,-24.841,2.555,28.949,140.37,12.551,57.485,1.546,21.975,3.476,13.946,21.044,5.5,0.0,0.0,45.9,2.5,0.2,1.0,4.4,-72.101,-90.347,17.4,100.0,434.8,100.0,356.7,13.8,95.0,20.4,31.508,25.11,12.7,32.5,0.0,78.9,177.65,2.0,1.0,13.1,5.543,-21.568


In [30]:
train.shape, test.shape

((53007, 57), (1363, 56))

In [31]:
def dt_features(df):
    """Add calendar features derived from the datetime column 'tarih'.

    Writes two columns into ``df`` in place and returns the same object:
    'ay' (month number) and 'haftanın_günü' (day of week, Monday = 0).
    """
    date_parts = df['tarih'].dt
    df['ay'] = date_parts.month
    df['haftanın_günü'] = date_parts.dayofweek
    return df

# Add the month and day-of-week columns to both frames (dt_features also mutates its argument).
train = dt_features(train)
test = dt_features(test)

# Display the resulting training frame.
train

Unnamed: 0,tarih,ilce,bildirimsiz_sum,bildirimli_sum,Bayram_Flag,lat,lon,t_2m:C_only_date_mean,effective_cloud_cover:p_only_date_mean,global_rad:W_only_date_mean,relative_humidity_2m:p_only_date_mean,wind_dir_10m:d_only_date_mean,wind_speed_10m:ms_only_date_mean,prob_precip_1h:p_only_date_mean,t_apparent:C_only_date_mean,x_only_date_mean,y_only_date_mean,t_2m:C_only_date_std,effective_cloud_cover:p_only_date_std,global_rad:W_only_date_std,relative_humidity_2m:p_only_date_std,wind_dir_10m:d_only_date_std,wind_speed_10m:ms_only_date_std,prob_precip_1h:p_only_date_std,t_apparent:C_only_date_std,x_only_date_std,y_only_date_std,t_2m:C_only_date_min,effective_cloud_cover:p_only_date_min,global_rad:W_only_date_min,relative_humidity_2m:p_only_date_min,wind_dir_10m:d_only_date_min,wind_speed_10m:ms_only_date_min,prob_precip_1h:p_only_date_min,t_apparent:C_only_date_min,x_only_date_min,y_only_date_min,t_2m:C_only_date_max,effective_cloud_cover:p_only_date_max,global_rad:W_only_date_max,relative_humidity_2m:p_only_date_max,wind_dir_10m:d_only_date_max,wind_speed_10m:ms_only_date_max,prob_precip_1h:p_only_date_max,t_apparent:C_only_date_max,x_only_date_max,y_only_date_max,t_2m:C_only_date_median,effective_cloud_cover:p_only_date_median,global_rad:W_only_date_median,relative_humidity_2m:p_only_date_median,wind_dir_10m:d_only_date_median,wind_speed_10m:ms_only_date_median,prob_precip_1h:p_only_date_median,t_apparent:C_only_date_median,x_only_date_median,y_only_date_median,ay,gün,haftanın_günü
0,2021-01-01,izmir-aliaga,5.000,0.000,New Year's Day,38.800,26.971,11.662,55.002,70.061,87.785,147.031,2.294,1.183,12.320,9.572,-23.538,2.383,21.316,108.669,7.464,48.183,1.340,1.192,3.351,13.533,24.562,4.300,1.200,0.000,51.900,5.900,0.000,1.000,2.800,-45.907,-124.996,17.000,99.900,426.100,100.000,358.400,8.500,18.600,19.900,50.752,28.167,11.800,55.350,0.000,89.400,148.050,1.900,1.000,12.300,8.926,-17.230,1,1,4
1,2021-01-02,izmir-aliaga,13.000,0.000,Özel Değil,38.800,26.971,11.713,31.310,95.989,81.099,114.472,1.771,1.000,12.447,17.553,-7.212,3.320,17.284,148.497,12.950,46.927,0.839,0.000,4.397,13.408,12.435,2.000,0.000,0.000,41.300,3.800,0.000,1.000,-0.100,-1.158,-54.586,18.500,98.100,480.800,100.000,357.100,5.900,1.000,21.100,74.874,34.951,11.500,29.500,0.000,83.100,115.500,1.700,1.000,11.900,15.085,-5.076,1,2,5
2,2021-01-03,izmir-aliaga,4.000,0.000,Özel Değil,38.800,26.971,11.176,59.064,46.919,77.365,123.741,1.832,3.267,11.471,16.313,-9.936,2.857,33.877,75.670,10.478,58.081,1.061,9.971,3.444,13.340,17.430,1.200,0.000,0.000,42.600,0.500,0.000,1.000,-0.800,-0.116,-84.688,17.100,100.000,394.400,100.000,359.900,5.800,92.900,19.400,77.131,39.128,11.400,58.600,0.000,78.000,125.450,1.600,1.000,11.600,12.795,-5.907,1,3,6
3,2021-01-04,izmir-aliaga,9.000,0.000,Özel Değil,38.800,26.971,12.199,43.497,86.694,71.237,119.867,2.996,1.449,12.261,23.801,-13.923,3.653,21.726,133.772,13.471,34.999,2.030,2.733,4.229,19.885,22.577,0.900,0.000,0.000,37.300,2.100,0.100,1.000,-1.300,-2.025,-145.438,19.700,99.800,487.500,100.000,351.500,14.100,40.900,20.900,98.782,31.410,12.200,47.700,0.000,70.750,126.350,2.300,1.000,12.400,18.831,-7.268,1,4,0
4,2021-01-05,izmir-aliaga,2.000,0.000,Özel Değil,38.800,26.971,12.590,40.122,92.357,77.991,174.951,2.312,8.339,13.265,3.309,-24.841,2.555,28.949,140.370,12.551,57.485,1.546,21.975,3.476,13.946,21.044,5.500,0.000,0.000,45.900,2.500,0.200,1.000,4.400,-72.101,-90.347,17.400,100.000,434.800,100.000,356.700,13.800,95.000,20.400,31.508,25.110,12.700,32.500,0.000,78.900,177.650,2.000,1.000,13.100,5.543,-21.568,1,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53002,2024-01-27,manisa-ahmetli,0.000,0.000,Özel Değil,38.618,28.671,6.572,45.533,114.183,72.592,159.478,1.363,1.770,7.141,3.272,7.062,3.600,42.249,179.514,14.624,117.913,0.747,2.852,4.958,11.161,8.891,-3.200,0.000,0.000,40.300,0.000,0.100,1.000,-4.800,-30.973,-10.821,13.900,100.000,570.800,99.300,359.700,5.800,20.400,17.500,49.199,39.225,6.800,27.500,0.000,76.050,116.350,1.300,1.000,6.800,2.385,5.090,1,27,5
53003,2024-01-28,manisa-ahmetli,0.000,0.000,Özel Değil,38.618,28.671,5.894,23.641,130.604,73.348,97.912,3.141,1.002,5.373,10.919,27.092,2.886,26.486,191.111,15.372,125.333,1.794,0.054,4.647,18.585,26.857,-4.400,0.000,0.000,39.300,0.100,0.100,1.000,-7.600,-28.427,-12.622,12.300,100.000,550.300,100.000,360.000,9.300,2.500,16.600,99.586,154.699,5.800,13.350,0.000,76.250,33.750,2.700,1.000,4.300,5.244,18.994,1,28,6
53004,2024-01-29,manisa-ahmetli,0.000,1.000,Özel Değil,38.618,28.671,4.243,41.977,113.066,71.978,71.774,4.633,1.000,2.159,16.485,44.269,2.799,32.546,168.690,12.043,110.809,2.438,0.000,3.845,21.643,40.562,-4.300,0.000,0.000,33.500,0.000,0.100,1.000,-7.000,-36.051,-24.350,9.500,100.000,539.900,100.000,359.800,13.300,1.000,12.100,120.232,221.676,4.300,35.900,0.000,73.400,25.300,4.400,1.000,1.200,10.400,32.978,1,29,0
53005,2024-01-30,manisa-ahmetli,0.000,0.000,Özel Değil,38.618,28.671,4.929,70.932,98.148,65.672,60.437,5.298,4.945,2.450,23.230,55.280,2.853,31.074,145.230,13.434,94.875,2.551,11.803,3.912,26.189,45.941,-4.000,0.400,0.000,33.900,0.100,0.300,1.000,-8.300,-20.995,-15.054,10.600,100.000,497.600,91.900,359.800,13.900,94.900,13.500,137.015,270.574,5.000,84.650,0.000,67.500,29.400,5.100,1.000,1.800,15.419,44.993,1,30,1


### Gözlem Ağırlıkları

In [32]:
# Logarithmic observation weights: the k-th distinct date (k = 0 for the earliest) gets
# weight log1p(k) + 1, so later dates weigh more.
date_rank = np.arange(train['tarih'].nunique())
weight_df = pd.DataFrame({
    'tarih': np.sort(train['tarih'].unique()),
    'weights': np.log1p(date_rank) + 1,
})

# Attach each row's weight by date.
train = train.merge(weight_df, on='tarih', how='left')

In [34]:
%%time
# Train an AutoGluon tabular regressor on the engineered training frame.
from autogluon.tabular import TabularDataset, TabularPredictor
from autogluon.features.generators import AutoMLPipelineFeatureGenerator

# Training budget passed to fit(): 16 * 3600 = 57600 (reported in seconds by the training log).
time_limit = 3600*16

# Feature pipeline with AutoGluon's datetime feature generation switched off.
feature_generator = AutoMLPipelineFeatureGenerator(enable_datetime_features=False)
#feature_generator.fit(X=train.drop(columns='bildirimsiz_sum'), y=train['bildirimsiz_sum'])
# Regression on 'bildirimsiz_sum', scored by mean absolute error; the 'weights' column
# of `train` is declared as the per-row sample weight.
automl = TabularPredictor(label='bildirimsiz_sum', problem_type='regression',
                          eval_metric="mean_absolute_error", 
                          sample_weight='weights',
                          # weight_evaluation=True    
                          )

# Fit with 5-fold bagging and one stacking level, restricted to the listed model types.
# NOTE(review): ag_args_fit requests 1 GPU and 8 CPUs — confirm the machine provides them.
automl.fit(train, presets='medium_quality', time_limit=time_limit, num_bag_folds=5, num_bag_sets=0, num_stack_levels=1, dynamic_stacking=False, 
            included_model_types=['XGB', 'CAT', 'XT', 'RF', 'GBM'], ag_args_fit={'num_gpus': 1, 'num_cpus': 8}, feature_generator=feature_generator,
          )

No path specified. Models will be saved in: "AutogluonModels\ag-20240602_193647"
Presets specified: ['medium_quality']
Beginning AutoGluon training ... Time limit = 57600s
AutoGluon will save models to "AutogluonModels\ag-20240602_193647"
AutoGluon Version:  1.1.1b20240426
Python Version:     3.10.10
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.19045
CPU Count:          12
Memory Avail:       5.65 GB / 15.42 GB (36.7%)
Disk Space Avail:   553.64 GB / 931.51 GB (59.4%)
Train Data Rows:    53007
Train Data Columns: 59
Label Column:       bildirimsiz_sum
Problem Type:       regression
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    5788.94 MB
	Train Data (Original)  Memory Usage: 31.06 MB (0.5% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.


CPU times: total: 1h 25s
Wall time: 13min 37s


<autogluon.tabular.predictor.predictor.TabularPredictor at 0x26e97dcb100>

In [47]:
automl.leaderboard()

Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L3,-2.358,mean_absolute_error,10.155,675.828,0.002,0.163,3,True,10
1,CatBoost_BAG_L2,-2.362,mean_absolute_error,6.483,627.95,0.065,62.325,2,True,7
2,XGBoost_BAG_L2,-2.366,mean_absolute_error,6.802,574.147,0.384,8.522,2,True,9
3,WeightedEnsemble_L2,-2.384,mean_absolute_error,3.534,540.291,0.002,0.092,2,True,5
4,CatBoost_BAG_L1,-2.398,mean_absolute_error,0.104,416.092,0.104,416.092,1,True,2
5,ExtraTreesMSE_BAG_L2,-2.4,mean_absolute_error,9.704,604.818,3.286,39.193,2,True,8
6,RandomForestMSE_BAG_L2,-2.423,mean_absolute_error,9.858,677.492,3.441,111.867,2,True,6
7,XGBoost_BAG_L1,-2.452,mean_absolute_error,0.691,30.819,0.691,30.819,1,True,4
8,RandomForestMSE_BAG_L1,-2.581,mean_absolute_error,2.738,93.287,2.738,93.287,1,True,1
9,ExtraTreesMSE_BAG_L1,-2.59,mean_absolute_error,2.885,25.426,2.885,25.426,1,True,3


In [48]:
# Predict the test rows with one specific model, selected by name.
# NOTE(review): 'WeightedEnsemble_L3' only exists if the fit produced that stack level —
# confirm against automl.leaderboard() after retraining.
predictions = automl.predict(test, model='WeightedEnsemble_L3')

In [49]:
# Submission key '<YYYY-MM-DD>-<ilce>', built with vectorised string operations instead of
# a row-wise apply that slices str(timestamp).
test['unique_id'] = pd.to_datetime(test['tarih']).dt.strftime('%Y-%m-%d') + "-" + test['ilce']

In [51]:
# Store the predictions on the test frame.
test['bildirimsiz_sum'] = predictions

# Work on an explicit copy so the post-processing below never writes into a slice of `test`.
submission = test[['unique_id', 'bildirimsiz_sum']].copy()

# Negative predictions are floored at 0, then values are rounded to the nearest whole number.
submission['bildirimsiz_sum'] = submission['bildirimsiz_sum'].clip(lower=0).round()

In [53]:
submission.head()

Unnamed: 0,unique_id,bildirimsiz_sum
0,2024-02-01-izmir-aliaga,4.0
1,2024-02-01-izmir-bayindir,3.0
2,2024-02-01-izmir-bayrakli,3.0
3,2024-02-01-izmir-bergama,5.0
4,2024-02-01-izmir-bornova,8.0


In [54]:
submission.bildirimsiz_sum.mean()

4.87234

In [55]:
# Write the submission file to the working directory, without the index column.
submission.to_csv(r'submission.csv', index=False)