In [1]:
import re
import datetime
import numpy as np 
import pandas as pd  
import seaborn as sns  
import matplotlib.pyplot as plt  

import warnings 
warnings.filterwarnings('ignore')

import catboost as cb
from sklearn.model_selection import KFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error

In [2]:
# Directory holding the competition CSV files. Every table below is read from
# here, so pointing the notebook at another machine means editing this one line.
DATA_DIR = "/home/enes/Desktop/GDZ_datathon/data"

train_data = pd.read_csv(f"{DATA_DIR}/train.csv")
test_data = pd.read_csv(f"{DATA_DIR}/test.csv")
holidays_data = pd.read_csv(f"{DATA_DIR}/holidays.csv")
sample_submission = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")
weather_data = pd.read_csv(f"{DATA_DIR}/weather.csv")

In [3]:
# formatting
# Give the raw columns the names the rest of the notebook works with.
holidays_data = holidays_data.rename(columns={'Tatil Adı': 'Bayram_Flag'})
weather_data = weather_data.rename(columns={'name': 'ilce'})
# Lower-case the weather location names (presumably so they match the 'ilce'
# values of train/test in the later merge -- confirm against the data).
weather_data = weather_data.assign(ilce=lambda frame: frame['ilce'].str.lower())

# Row identifier of the test set: "<tarih>-<ilce>".
test_data = test_data.assign(unique_id=lambda frame: frame['tarih'] + '-' + frame['ilce'])

In [4]:
# add basic flag
# il_kodu is 0 when the 'ilce' string contains 'izmir' and 1 otherwise.
for frame in (train_data, test_data):
    frame['il_kodu'] = frame['ilce'].map(lambda name: 0 if 'izmir' in name else 1)

In [5]:
# merge by average

# Split the timestamp into calendar parts so the readings can be grouped per day.
weather_data["date"] = pd.to_datetime(weather_data["date"], format="%Y-%m-%d %H:%M:%S")
weather_data["Yıl"] = weather_data["date"].dt.year
weather_data["Ay"] = weather_data["date"].dt.month
weather_data["Gün"] = weather_data["date"].dt.day
weather_data["Saat"] = weather_data["date"].dt.hour
weather_data = weather_data.drop(columns=["date"])

# One row per (ilce, day): the mean of that day's readings.
# numeric_only=True keeps the aggregation working when the weather file carries
# a non-numeric column: pandas >= 2.0 raises on such columns instead of
# silently dropping them as older versions did.
# NOTE(review): a later cell reads a wind-direction column in degrees; if it is
# averaged here, an arithmetic mean is not a circular mean (350 and 10 average
# to 180). Confirm whether that matters for the wind-sector feature.
grouped_data = weather_data.groupby(['ilce', 'Yıl', 'Ay', 'Gün'])
daily_weather = grouped_data.mean(numeric_only=True).reset_index()
# The averaged hour of day is meaningless at daily granularity.
daily_weather = daily_weather.drop(columns=["Saat"])

# Left merge: every weather day is kept; days absent from holidays_data get NaN
# in the holiday columns.
# NOTE(review): assumes holidays_data has at most one row per (Yıl, Ay, Gün);
# duplicates would duplicate weather rows -- confirm.
daily_weather_holidays = pd.merge(daily_weather, holidays_data, on=['Yıl', 'Ay', 'Gün'], how='left')

# Parse the train dates; the calendar features and the merge with the weather
# table are built in the next cell.
train_data["tarih"] = pd.to_datetime(train_data["tarih"], format="%Y-%m-%d")

In [6]:
# date formatting
def _with_calendar_columns(frame):
    """Add gun_adi / Yıl / Ay / Gün derived from the datetime column 'tarih'.

    The four columns are written onto ``frame`` itself; the returned frame is
    the same data without the 'tarih' column.
    """
    stamps = frame["tarih"].dt
    frame["gun_adi"] = stamps.strftime("%A")  # English weekday name, e.g. "Monday"
    frame["Yıl"] = stamps.year
    frame["Ay"] = stamps.month
    frame["Gün"] = stamps.day
    return frame.drop(columns=["tarih"])


# Keys shared by the daily weather/holiday table and the train/test tables.
merge_keys = ['ilce', 'Yıl', 'Ay', 'Gün']

# how='right' keeps every train row, in its original order, even when no
# weather record matches it.
train_data = _with_calendar_columns(train_data)
df_train = daily_weather_holidays.merge(train_data, on=merge_keys, how='right')

# test data merge
test_data["tarih"] = pd.to_datetime(test_data["tarih"], format="%Y-%m-%d")
test_data = _with_calendar_columns(test_data)

# merge
df_test = daily_weather_holidays.merge(test_data, on=merge_keys, how='right')

In [7]:
# separate il-ilce
def _add_location_and_day_features(frame):
    """Split 'ilce' into 'il' / 'ilcee' and add 'gun_tipi' and 'sicaklik_fark'.

    'ilce' is expected to look like "<il>-<ilcee>". The two new columns are
    written onto ``frame`` before 'ilce' is dropped; the returned frame carries
    the remaining features.
    """
    frame[['il', 'ilcee']] = frame['ilce'].str.split('-', expand=True)
    frame = frame.drop(columns=['ilce'])

    # day mapping: Saturday/Sunday -> "Haftasonu" (weekend), anything else -> "Haftaiçi" (weekday)
    is_weekend = frame["gun_adi"].isin(["Saturday", "Sunday"])
    frame["gun_tipi"] = is_weekend.map({True: "Haftasonu", False: "Haftaiçi"})

    # temperature: difference between the 't_2m:C' and 't_apparent:C' columns
    frame["sicaklik_fark"] = frame["t_2m:C"] - frame["t_apparent:C"]
    return frame


df_train = _add_location_and_day_features(df_train)
df_test = _add_location_and_day_features(df_test)

In [8]:
# Replace every character in a column name that is neither a word character
# nor whitespace (for example the ':' in "t_2m:C") with '_'.
def clean_feature_names(df):
    """Return a copy of ``df`` whose column names contain only word characters and whitespace.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string column names.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same data and the sanitised column names. The
        input frame is left untouched, so re-running the calling cell always
        starts from the same column names.
    """
    cleaned = df.copy()
    cleaned.columns = [re.sub(r'[^\w\s]', '_', col) for col in df.columns]
    return cleaned

# Sanitise the column names of both feature tables in the same way.
df_train, df_test = (clean_feature_names(frame) for frame in (df_train, df_test))


In [9]:
# wind direction
def ruzgar_yonu_hesapla(degree):
    """Map a wind direction in degrees to one of eight 45-degree sector names.

    Sectors are half-open and start at 0: [0, 45) -> "Kuzey", [45, 90) ->
    "Kuzeydoğu", ..., [315, 360) -> "Kuzeybatı". Angles outside [0, 360) wrap
    around, so 360 is treated like 0 and -10 like 350.

    Parameters
    ----------
    degree : float
        Wind direction in degrees; may be missing.

    Returns
    -------
    str or float
        The sector name, or NaN when ``degree`` is missing or not finite.
        A missing reading therefore stays missing instead of being labelled
        "Kuzeybatı".
    """
    if pd.isna(degree) or not np.isfinite(degree):
        return np.nan

    # NOTE(review): the sectors begin at 0 rather than being centred on the
    # compass points (north would then be 337.5-22.5); kept as the notebook
    # defined them.
    yonler = (
        "Kuzey", "Kuzeydoğu", "Doğu", "Güneydoğu",
        "Güney", "Güneybatı", "Batı", "Kuzeybatı",
    )
    # The final modulo guards against float rounding: a tiny negative angle can
    # wrap to exactly 360.0, which would index one past the last sector.
    sector = int((degree % 360) // 45) % len(yonler)
    return yonler[sector]

# Derive the wind sector name from the wind direction column of each table.
for frame in (df_train, df_test):
    frame['ruzgar_yonu'] = frame['wind_dir_10m_d'].map(ruzgar_yonu_hesapla)

In [10]:
# Ordinal encodings for the categorical columns.
# Each lookup table maps a raw value to an integer code. A value that is absent
# from its table becomes NaN after .map().

# il - ilce encoding
il = {'izmir': 0, 'manisa': 1}

# District codes are the positions in this list (0..46).
ilce = {
    name: code
    for code, name in enumerate([
        'akhisar', 'salihli', 'menderes', 'cesme', 'yunusemre', "torbali", "konak", "odemis", "bornova", "sehzadeler", "urla",
        "turgutlu", "bergama", "alasehir", "menemen", "dikili", "kemalpasa", "buca", "seferihisar", "tire", "saruhanli",
        "aliaga", "bayindir", "karabaglar", "kiraz", "karaburun", "bayrakli", "foca", "soma", "karsiyaka", "cigli",
        "gordes", "kula", "sarigol", "selendi", "kirkagac", "demirci", "gaziemir", "kinik", "selcuk", "guzelbahce",
        "koprubasi", "narlidere", "balcova", "beydag", "ahmetli", "golmarmara",
    ])
}

# year encoding
Yıl = {year: year - 2021 for year in (2021, 2022, 2023, 2024)}

# holiday encoding: codes start at 1 because 0 is reserved for "not a holiday"
Tatil = {
    name: code
    for code, name in enumerate([
        'Sacrifice Feast Holiday* (*estimated)',
        'Ramadan Feast Holiday* (*estimated)',
        "New Year's Day",
        'Democracy and National Unity Day',
        'Labour Day',
        'Victory Day',
        "Commemoration of Ataturk, Youth and Sports Day",
        'Ramadan Feast* (*estimated)',
        'Sacrifice Feast* (*estimated)',
        'Republic Day',
        "National Sovereignty and Children's Day",
        "National Sovereignty and Children's Day; Ramadan Feast Holiday* (*estimated)",
    ], start=1)
}

# weekday-name encoding
Günler = {
    name: code
    for code, name in enumerate(
        ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    )
}

# day-type encoding (weekday / weekend)
Gün_tipi = {'Haftaiçi': 0, 'Haftasonu': 1}

# wind-sector encoding
Rüzgar_yönü = {
    name: code
    for code, name in enumerate(
        ['Kuzey', 'Kuzeydoğu', 'Doğu', 'Güneydoğu', 'Güney', 'Güneybatı', 'Batı', 'Kuzeybatı']
    )
}

# Column -> lookup table, applied identically to the train and test tables.
column_codes = {
    'il': il,
    'ilcee': ilce,
    'Yıl': Yıl,
    'Bayram_Flag': Tatil,
    'gun_adi': Günler,
    'gun_tipi': Gün_tipi,
    'ruzgar_yonu': Rüzgar_yönü,
}

for frame in (df_train, df_test):
    for column, codes in column_codes.items():
        frame[column] = frame[column].map(codes)
    # not holiday = 0 (this also covers any holiday name missing from Tatil)
    frame['Bayram_Flag'] = frame['Bayram_Flag'].fillna(0)

# The raw wind direction is dropped once the sector has been encoded.
df_train = df_train.drop(columns='wind_dir_10m_d')
df_test = df_test.drop(columns='wind_dir_10m_d')


In [11]:
# DROP
# Author's rationale for the removed features:
#   "Gün" (day of month)                       -> low correlation
#   "t_apparent_C" (apparent/felt temperature) -> multicollinearity
dropped_features = ["Gün", "t_apparent_C"]

df_train = df_train.drop(columns=dropped_features)
df_test = df_test.drop(columns=dropped_features)

In [12]:
# One Hot Encoding - NOT APPLIED
# The encoded arrays are computed but never merged into df_train / df_test,
# because the two concat lines at the end of this cell are commented out.
columns_to_encode = ['Yıl', 'ilcee']

# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# name, so try the current keyword first and fall back for older installations.
try:
    encoder = OneHotEncoder(sparse_output=False, drop='first')
except TypeError:
    encoder = OneHotEncoder(sparse=False, drop='first')

# Fit on train only, then apply the same categories to test.
# NOTE(review): with the default handle_unknown='error', transform() raises if
# the test table contains a category that never occurs in train -- confirm
# this is acceptable before enabling the encoding.
encoded_train = encoder.fit_transform(df_train[columns_to_encode])
encoded_test = encoder.transform(df_test[columns_to_encode])

encoded_columns = encoder.get_feature_names_out(columns_to_encode)

#df_train = pd.concat([df_train.drop(columns_to_encode, axis=1), pd.DataFrame(encoded_train, columns=encoded_columns)], axis=1)
#df_test = pd.concat([df_test.drop(columns_to_encode, axis=1), pd.DataFrame(encoded_test, columns=encoded_columns)], axis=1)

In [13]:
# train-test-split
# Target / feature separation. No hold-out split is made here: the
# train_test_split call below is commented out.
target_column = 'bildirimsiz_sum'

X = df_train.drop(columns=[target_column])
y = df_train[target_column]
# 'unique_id' only identifies the test rows; it is not a model feature.
X_test = df_test.drop(columns=["unique_id"])

#X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# MODEL
# Baseline: fit on the full training set and score on those same rows, so the
# numbers printed here are in-sample metrics, not a validation estimate.
baseline_params = {
    'eval_metric': 'MAE',
    'loss_function': 'MAE',
    'silent': True,
    'random_state': 42,
}
model = cb.CatBoostRegressor(**baseline_params)
model.fit(X, y)

y_pred = model.predict(X)

mae, r2 = mean_absolute_error(y, y_pred), r2_score(y, y_pred)

print("Mean Absolute Error:", mae)
print("R^2 Score:", r2)
# Values noted by the author (MAE, R^2): 2.388729683470563, 0.44899031955046553

In [14]:
# MODEL TRAIN
# 5-fold cross-validation of the CatBoost regressor, followed by a final fit on
# all training rows. The final fit is what later cells use for the submission.

# Earlier hyper-parameter experiment, kept for reference:
#model = cb.CatBoostRegressor(
#    eval_metric='MAE',
#    loss_function='MAE',
#    silent=True,
#    random_state=42,
#    depth=10, 
#    iterations=1000, 
#    learning_rate=0.1, 
#    l2_leaf_reg=3 
#)


# grow_policy='Depthwise'
model_params = dict(eval_metric='MAE', loss_function='MAE', silent=True, random_state=42)

kf = KFold(n_splits=5, shuffle=True, random_state=42)

cv_scores = []
mae_scores = []
r2_scores = []

for train_index, val_index in kf.split(X):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # MODEL
    # A fresh estimator per fold, so no fold can depend on state left behind
    # by a previous fit.
    fold_model = cb.CatBoostRegressor(**model_params)
    fold_model.fit(X_train, y_train)
    y_pred = fold_model.predict(X_val)

    # CV
    # cv_scores and mae_scores hold the same metric; it is computed once.
    fold_mae = mean_absolute_error(y_val, y_pred)
    cv_scores.append(fold_mae)
    mae_scores.append(fold_mae)
    r2_scores.append(r2_score(y_val, y_pred))

print("Cross-validation scores:", cv_scores)
print("Mean CV score:", sum(cv_scores) / len(cv_scores))
print("Mean R^2 score:", sum(r2_scores) / len(r2_scores))
print("Mean Absolute Error score:", sum(mae_scores) / len(mae_scores))

# Final model for the submission: trained on every training row. Reusing the
# estimator from the last CV iteration would leave `model` fitted on only that
# fold's training subset (80 % of the data).
model = cb.CatBoostRegressor(**model_params)
model.fit(X, y)

Cross-validation scores: [2.638758904917828, 2.6402330538986267, 2.601399775631201, 2.6597488085074628, 2.6719740114720003]
Mean CV score: 2.642422910885424
Mean R^2 score: 0.37972759560461966
Mean Absolute Error score: 2.642422910885424


In [15]:
# Convert the continuous predictions to whole numbers by rounding to the
# nearest integer; a bare astype(int) truncates toward zero (2.9 -> 2) and so
# shifts every prediction downward. Clipping at 0 keeps slightly negative
# predictions at 0, as truncation did.
# NOTE(review): assumes 'bildirimsiz_sum' is a non-negative count -- confirm.
y_pred = np.clip(np.rint(model.predict(X_test)), 0, None).astype(int)

In [16]:
# Predictions are attached by position.
# NOTE(review): this assumes sample_submission lists its rows in the same order
# as the test table -- confirm, or align on the id column instead.
sample_submission = sample_submission.assign(bildirimsiz_sum=y_pred)
sample_submission.to_csv('submission.csv', index=False)