In [1]:
import re
import datetime
import numpy as np 
import pandas as pd  
import seaborn as sns  
import matplotlib.pyplot as plt  

import warnings 
warnings.filterwarnings('ignore')

import catboost as cb
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score, mean_absolute_error

In [2]:
# All competition CSVs live in one directory; keep it in a single place so the
# notebook can be pointed at another location by editing one line.
DATA_DIR = "/home/enes/Desktop/GDZ_datathon/data"

train_data = pd.read_csv(f"{DATA_DIR}/train.csv")
test_data = pd.read_csv(f"{DATA_DIR}/test.csv")
holidays_data = pd.read_csv(f"{DATA_DIR}/holidays.csv")
sample_submission = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")
weather_data = pd.read_csv(f"{DATA_DIR}/weather.csv")

In [3]:
# Harmonise names before joining: the holiday-name column becomes 'Bayram_Flag'
# and the weather 'name' column becomes a lower-cased 'ilce' key
# (presumably to match the 'ilce' values in train/test - confirm).
holidays_data = holidays_data.rename(columns={'Tatil Adı': 'Bayram_Flag'})
weather_data = weather_data.rename(columns={'name': 'ilce'})
weather_data['ilce'] = weather_data['ilce'].str.lower()

In [4]:
# Collapse the timestamped weather readings to one averaged row per district
# and calendar day, then attach the holiday name (if any) for that date.
weather_timestamp = pd.to_datetime(weather_data["date"], format="%Y-%m-%d %H:%M:%S")
weather_data = weather_data.drop(columns=["date"]).assign(
    **{
        "Yıl": weather_timestamp.dt.year,
        "Ay": weather_timestamp.dt.month,
        "Gün": weather_timestamp.dt.day,
        "Saat": weather_timestamp.dt.hour,
    }
)

day_keys = ['ilce', 'Yıl', 'Ay', 'Gün']
grouped_data = weather_data.groupby(day_keys)
# 'Saat' is averaged together with the measurements; its daily mean carries no
# information, so it is removed right after the aggregation.
daily_weather = grouped_data.mean().reset_index().drop(columns=["Saat"])
# Left join: every weather day is kept, non-holidays get a missing 'Bayram_Flag'.
daily_weather_holidays = daily_weather.merge(holidays_data, on=['Yıl', 'Ay', 'Gün'], how='left')

# Parse the training dates; the calendar parts are derived from them in the next cell.
train_data["tarih"] = pd.to_datetime(train_data["tarih"], format="%Y-%m-%d")

In [5]:
# Replace the raw date in the training frame with weekday name, year, month and day.
train_dates = train_data["tarih"]
train_data = train_data.drop(columns=["tarih"]).assign(
    **{
        "gun_adi": train_dates.dt.strftime("%A"),
        "Yıl": train_dates.dt.year,
        "Ay": train_dates.dt.month,
        "Gün": train_dates.dt.day,
    }
)

# Right join keeps every training row (in its original order) and adds the
# daily weather / holiday columns where a matching district-day exists.
df_train = daily_weather_holidays.merge(train_data, on=['ilce', 'Yıl', 'Ay', 'Gün'], how='right')

# Same treatment for the test frame: parse the date, derive the parts, drop the raw column.
test_dates = pd.to_datetime(test_data["tarih"], format="%Y-%m-%d")
test_data = test_data.drop(columns=["tarih"]).assign(
    **{
        "gun_adi": test_dates.dt.strftime("%A"),
        "Yıl": test_dates.dt.year,
        "Ay": test_dates.dt.month,
        "Gün": test_dates.dt.day,
    }
)

df_test = daily_weather_holidays.merge(test_data, on=['ilce', 'Yıl', 'Ay', 'Gün'], how='right')

In [6]:
# Split the combined key into two columns, 'il' (part before the hyphen) and
# 'ilcee' (the rest).
# n=1 splits on the first hyphen only, so the result always has at most two
# columns; without it a key containing a second hyphen would produce a third
# column and the two-column assignment would raise.
df_train[['il', 'ilcee']] = df_train['ilce'].str.split('-', n=1, expand=True)
df_test[['il', 'ilcee']] = df_test['ilce'].str.split('-', n=1, expand=True)

df_train = df_train.drop(columns=['ilce'])
df_test = df_test.drop(columns=['ilce'])

# Weekend / weekday flag derived from the English weekday name in 'gun_adi'.
weekend_days = ["Saturday", "Sunday"]
day_type_labels = {True: "Haftasonu", False: "Haftaiçi"}
df_train["gun_tipi"] = df_train["gun_adi"].isin(weekend_days).map(day_type_labels)
df_test["gun_tipi"] = df_test["gun_adi"].isin(weekend_days).map(day_type_labels)

# Gap between the 't_2m:C' and 't_apparent:C' temperature columns.
df_train["sicaklik_fark"] = df_train["t_2m:C"] - df_train["t_apparent:C"]
df_test["sicaklik_fark"] = df_test["t_2m:C"] - df_test["t_apparent:C"]

In [7]:
# replace punctuation such as ':' with '_' in column names
def clean_feature_names(df):
    """Rename the columns of ``df`` so they contain only word characters and whitespace.

    Every character that is neither a word character nor whitespace is replaced
    by an underscore. The frame is modified in place and also returned.
    """
    non_word = re.compile(r'[^\w\s]')
    df.columns = [non_word.sub('_', name) for name in df.columns]
    return df

# Apply the same column-name clean-up to both frames.
df_train, df_test = [clean_feature_names(frame) for frame in (df_train, df_test)]

In [8]:
# wind direction
def ruzgar_yonu_hesapla(degree):
    """Map a wind direction in degrees to one of eight 45-degree sector names.

    Sectors start at 0: [0, 45) is "Kuzey", [45, 90) is "Kuzeydoğu", and so on
    up to [315, 360) which is "Kuzeybatı".
    NOTE(review): compass sectors are usually centred on the cardinal point
    (north = 337.5-22.5); confirm that starting the sectors at 0 is intended.

    Parameters
    ----------
    degree : number
        Direction in degrees. Values outside [0, 360) are wrapped, so 360 is
        treated like 0 and -45 like 315.

    Returns
    -------
    str or float
        The sector name, or NaN when ``degree`` is missing or not finite.
        Previously such values fell through to the "Kuzeybatı" fallback, which
        turned missing readings into a real-looking direction.
    """
    if pd.isna(degree) or not np.isfinite(degree):
        return np.nan

    yonler = (
        "Kuzey",
        "Kuzeydoğu",
        "Doğu",
        "Güneydoğu",
        "Güney",
        "Güneybatı",
        "Batı",
        "Kuzeybatı",
    )
    # The trailing modulo guards against float rounding that can make
    # ``degree % 360`` evaluate to exactly 360.0 for tiny negative inputs.
    sector = int((degree % 360) // 45) % len(yonler)
    return yonler[sector]

# Add the wind-sector label derived from the cleaned 'wind_dir_10m_d' column to both frames.
for frame in (df_train, df_test):
    frame['ruzgar_yonu'] = frame['wind_dir_10m_d'].apply(ruzgar_yonu_hesapla)

In [9]:
# MAP ENCODE FUNCTION
def map_encode(df, columns):
    """Return a copy of ``df`` with the given columns replaced by integer codes.

    Each listed column is converted to strings, its distinct values are sorted,
    and every value is replaced by its position in that sorted list. The codes
    therefore depend on which values occur in ``df``. The input frame is left
    unchanged.
    """
    encoded = df.copy()

    for name in columns:
        as_text = encoded[name].astype(str)
        ordered_labels = sorted(as_text.unique())
        label_to_code = dict(zip(ordered_labels, range(len(ordered_labels))))
        encoded[name] = as_text.map(label_to_code)

    return encoded

columns_to_encode = ['il', 'ilcee', 'gun_adi', 'gun_tipi', 'ruzgar_yonu']

# Encode train and test in ONE pass so that a given category receives the same
# integer in both frames. map_encode numbers the sorted distinct values of the
# frame it is given, so encoding the frames separately assigns different codes
# to the same label whenever one frame lacks a category the other has
# (e.g. a wind sector that never occurs in the test period).
train_rows = len(df_train)
stacked_codes = map_encode(
    pd.concat(
        [df_train[columns_to_encode], df_test[columns_to_encode]],
        ignore_index=True,
    ),
    columns_to_encode,
)
# Write the codes back by position; assign() returns new frames and keeps the
# existing column order.
df_train = df_train.assign(
    **{name: stacked_codes[name].iloc[:train_rows].to_numpy() for name in columns_to_encode}
)
df_test = df_test.assign(
    **{name: stacked_codes[name].iloc[train_rows:].to_numpy() for name in columns_to_encode}
)

In [10]:
# MAP ENCODING
# Year -> 0-based code (2021 -> 0 ... 2024 -> 3); any other year maps to NaN.
Yıl = {year: code for code, year in enumerate(range(2021, 2025))}

# Holiday name -> code, numbered from 1 in the order listed below.
holiday_names = [
    'Sacrifice Feast Holiday* (*estimated)',
    'Ramadan Feast Holiday* (*estimated)',
    "New Year's Day",
    'Democracy and National Unity Day',
    'Labour Day',
    'Victory Day',
    "Commemoration of Ataturk, Youth and Sports Day",
    'Ramadan Feast* (*estimated)',
    'Sacrifice Feast* (*estimated)',
    'Republic Day',
    "National Sovereignty and Children's Day",
    "National Sovereignty and Children's Day; Ramadan Feast Holiday* (*estimated)",
]
Tatil = {name: code for code, name in enumerate(holiday_names, start=1)}

for frame in (df_train, df_test):
    frame['Yıl'] = frame['Yıl'].map(Yıl)
    # Rows with no holiday, or a holiday name missing from the table, come out
    # of the lookup as NaN and are coded 0.
    frame['Bayram_Flag'] = frame['Bayram_Flag'].map(Tatil).fillna(0)

In [11]:
# DROP
# - Gün: low correlation (author's note)
# - t_apparent_C (apparent temperature): multicollinearity (author's note)
# - wind_dir_10m_d: raw direction; the 'ruzgar_yonu' label was derived from it earlier
obsolete_columns = ["Gün", "t_apparent_C", "wind_dir_10m_d"]
df_train = df_train.drop(columns=obsolete_columns)
df_test = df_test.drop(columns=obsolete_columns)

In [12]:
# Separate the target column from the training features; the test frame is
# used as-is (X_test is the same object as df_test, not a copy).
target_column = 'bildirimsiz_sum'
X = df_train.drop(columns=[target_column])
y = df_train[target_column]
X_test = df_test

In [17]:
# TRAIN MODEL

# Options left as notes in the original cell (not applied):
#   grow_policy='Depthwise', iterations=5000
# The original call contained a stray ", ," which is a SyntaxError.
model = cb.CatBoostRegressor(eval_metric='MAE', loss_function='MAE', silent=True, random_state=42)

# 5-fold shuffled cross-validation with a fixed seed.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

cv_scores = []   # per-fold MAE (printed as the raw cross-validation scores)
mae_scores = []  # same per-fold MAE, kept under its original name for the mean
r2_scores = []   # per-fold R^2

for train_index, val_index in kf.split(X):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    # Compute the fold MAE once and record it in both lists.
    fold_mae = mean_absolute_error(y_val, y_pred)
    cv_scores.append(fold_mae)
    mae_scores.append(fold_mae)
    r2_scores.append(r2_score(y_val, y_pred))

print("Cross-validation scores:", cv_scores)
print("Mean R^2 score:", sum(r2_scores) / len(r2_scores))
print("Mean Absolute Error score:", sum(mae_scores) / len(mae_scores))

# After the loop `model` holds the fit from the LAST fold only, i.e. it has
# seen 4/5 of the training rows. Refit on all rows so that later predictions
# on the test set use every available training example.
model.fit(X, y)

Cross-validation scores: [2.638794093875536, 2.6400937681687378, 2.596486397664881, 2.6574741561028112, 2.6787140396344835]
Mean CV accuracy: 2.6423124910892897
Mean R^2 score: 0.3853904470294142
Mean Absolute Error score: 2.6423124910892897


In [18]:
# Predict on the test features and convert to whole numbers.
# astype(int) on its own truncates toward zero (2.9 -> 2), which biases the
# predictions downward; round to the nearest integer first.
# Negative values are clipped to 0 - assumes 'bildirimsiz_sum' is a
# non-negative count; TODO confirm.
y_pred = np.clip(np.rint(model.predict(X_test)), 0, None).astype(int)

In [19]:
# Put the predictions into the submission template and write it to the
# current working directory, without the index column.
submission_path = 'submission.csv'
sample_submission['bildirimsiz_sum'] = y_pred
sample_submission.to_csv(submission_path, index=False)