In [3]:
import re
import numpy as np 
import pandas as pd  

import warnings 
warnings.filterwarnings('ignore')

In [4]:
# Directory holding the competition CSV files. Defined once so the notebook can
# be repointed by editing a single line.
# NOTE(review): this is still an absolute, machine-specific path — consider a
# path relative to the project before sharing the notebook.
DATA_DIR = "/home/enes/Desktop/GDZ_datathon/data"

train_data = pd.read_csv(f"{DATA_DIR}/train.csv")
test_data = pd.read_csv(f"{DATA_DIR}/test.csv")
holidays_data = pd.read_csv(f"{DATA_DIR}/holidays.csv")
sample_submission = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")
weather_data = pd.read_csv(f"{DATA_DIR}/weather.csv")

In [5]:
# Column formatting.
# The holiday-name column becomes 'Bayram_Flag'.
holidays_data = holidays_data.rename(columns={'Tatil Adı': 'Bayram_Flag'})

# The weather 'name' column becomes the lower-cased district key 'ilce'.
# NOTE(review): str.lower() turns 'İ' into 'i' plus a combining dot — confirm the
# district names still match the spelling used in train/test after lower-casing.
weather_data = weather_data.rename(columns={'name': 'ilce'})
weather_data['ilce'] = weather_data['ilce'].str.lower()

In [6]:
# Collapse the timestamped weather rows into one averaged row per district and
# calendar day, then attach the holiday table.
# NOTE(review): this cell consumes the 'date' column, so it cannot be re-run
# without reloading weather_data first.

# Split the timestamp into calendar parts; the timestamp itself is not kept.
_weather_ts = pd.to_datetime(weather_data["date"], format="%Y-%m-%d %H:%M:%S")
weather_data = weather_data.drop(columns=["date"]).assign(
    **{
        "Yıl": _weather_ts.dt.year,
        "Ay": _weather_ts.dt.month,
        "Gün": _weather_ts.dt.day,
        "Saat": _weather_ts.dt.hour,
    }
)

# Average every remaining column within each (district, year, month, day) group.
# NOTE(review): a plain mean is applied to all columns, including any wind
# direction in degrees (where averaging e.g. 350 and 10 gives 180) — confirm this
# is acceptable. mean() also assumes every non-key column is numeric.
grouped_data = weather_data.groupby(['ilce', 'Yıl', 'Ay', 'Gün'])
daily_weather = grouped_data.mean().reset_index().drop(columns=["Saat"])

# Left join keeps every weather day; days without a holiday get NaN.
# NOTE(review): if holidays_data lists a date more than once, the matching
# weather rows are duplicated here — verify the holiday dates are unique.
daily_weather_holidays = daily_weather.merge(holidays_data, how='left', on=['Yıl', 'Ay', 'Gün'])

# Parse the train dates so the calendar parts can be extracted in the next cell.
train_data["tarih"] = pd.to_datetime(train_data["tarih"], format="%Y-%m-%d")

In [7]:
# Date features and weather/holiday join, applied identically to train and test.
def _add_date_parts(df):
    """Return a copy of ``df`` with 'tarih' replaced by calendar columns.

    Adds 'gun_adi' (English weekday name), 'Yıl', 'Ay' and 'Gün', in that order,
    and drops 'tarih'. The input frame is left untouched.

    Args:
        df: frame with a 'tarih' column holding dates (strings in
            ``%Y-%m-%d`` form or already-parsed datetimes).

    Returns:
        A new DataFrame with the four calendar columns appended.
    """
    tarih = pd.to_datetime(df["tarih"], format="%Y-%m-%d")
    return df.drop(columns=["tarih"]).assign(
        **{
            # day_name() always yields English names, whereas strftime("%A")
            # follows the process locale; the weekend check later in the
            # notebook compares against "Saturday"/"Sunday".
            "gun_adi": tarih.dt.day_name(),
            "Yıl": tarih.dt.year,
            "Ay": tarih.dt.month,
            "Gün": tarih.dt.day,
        }
    )


train_data = _add_date_parts(train_data)
test_data = _add_date_parts(test_data)

# Right joins keep every train/test row; rows without a matching weather day get
# NaN weather values.
# NOTE(review): weather 'ilce' values were lower-cased earlier, train/test 'ilce'
# values are used as-is — confirm the two spellings actually match.
_merge_keys = ['ilce', 'Yıl', 'Ay', 'Gün']
df_train = pd.merge(daily_weather_holidays, train_data, on=_merge_keys, how='right')
df_test = pd.merge(daily_weather_holidays, test_data, on=_merge_keys, how='right')

In [8]:
# Derived features, applied identically to train and test.
def _add_derived_features(df):
    """Return a copy of ``df`` with the province/district split and two derived columns.

    * 'ilce' ("<il>-<ilce>") is split at the first hyphen into 'il' and 'ilcee'
      and then dropped. Splitting only once (``n=1``) keeps a district name that
      itself contains a hyphen in one piece instead of producing a third column
      that cannot be assigned.
    * 'gun_tipi' is "Haftasonu" for Saturday/Sunday and "Haftaiçi" otherwise.
    * 'sicaklik_fark' is 't_2m:C' minus 't_apparent:C'.

    Args:
        df: frame containing 'ilce', 'gun_adi', 't_2m:C' and 't_apparent:C'.

    Returns:
        A new DataFrame; the input frame is left untouched.
    """
    parts = df['ilce'].str.split('-', n=1, expand=True)
    out = df.drop(columns=['ilce'])
    out[['il', 'ilcee']] = parts

    is_weekend = out["gun_adi"].isin(["Saturday", "Sunday"])
    out["gun_tipi"] = is_weekend.map({True: "Haftasonu", False: "Haftaiçi"})

    out["sicaklik_fark"] = out["t_2m:C"] - out["t_apparent:C"]
    return out


df_train = _add_derived_features(df_train)
df_test = _add_derived_features(df_test)

In [9]:
# replace every character in the column names that is neither a word character nor whitespace (e.g. ':') with '_'
def clean_feature_names(df):
    """Sanitise the column labels of ``df`` in place and return the same frame.

    Every character that is neither a word character nor whitespace is replaced
    by an underscore, e.g. 't_2m:C' becomes 't_2m_C'.

    Args:
        df: DataFrame whose column labels are strings.

    Returns:
        The very same DataFrame object, with its ``columns`` reassigned.
    """
    special_chars = re.compile(r'[^\w\s]')
    renamed = []
    for label in df.columns:
        renamed.append(special_chars.sub('_', label))
    df.columns = renamed
    return df

# Apply the same column-name clean-up to both frames.
df_train, df_test = (clean_feature_names(_frame) for _frame in (df_train, df_test))

In [10]:
# wind direction
def ruzgar_yonu_hesapla(degree):
    """Map a wind direction in degrees to one of eight 45-degree sector names.

    Sectors are half-open and start at 0: [0, 45) is "Kuzey", [45, 90) is
    "Kuzeydoğu", ... and [315, 360) is "Kuzeybatı".

    Args:
        degree: wind direction in degrees. Angles outside [0, 360) are wrapped
            onto the circle first, so 360 is treated as 0 and -10 as 350.

    Returns:
        The sector name, or NaN when ``degree`` is missing or infinite, so that
        a missing measurement is not silently labelled as a real direction.
    """
    if pd.isna(degree) or np.isinf(degree):
        return np.nan

    yonler = (
        "Kuzey",
        "Kuzeydoğu",
        "Doğu",
        "Güneydoğu",
        "Güney",
        "Güneybatı",
        "Batı",
        "Kuzeybatı",
    )
    # The trailing "% 8" guards the float edge case where a tiny negative angle
    # wraps to exactly 360.0 and would otherwise index past the last sector.
    sektor = int((degree % 360) // 45) % 8
    return yonler[sektor]

# Label every row of both frames with the compass sector of its wind direction.
for _frame in (df_train, df_test):
    _frame['ruzgar_yonu'] = _frame['wind_dir_10m_d'].apply(ruzgar_yonu_hesapla)

In [11]:
# MAP ENCODE FUNCTION
def map_encode(df, columns):
    """Integer-encode the given columns of a copy of ``df``.

    Each listed column is cast to ``str``; its distinct values are sorted and
    numbered 0, 1, 2, ... in that (lexicographic) order, and the column is
    replaced by those numbers. Codes are derived only from the values present
    in ``df`` itself.

    Args:
        df: source DataFrame; it is not modified.
        columns: iterable of column names to encode.

    Returns:
        A new DataFrame with the listed columns replaced by integer codes.
    """
    encoded = df.copy()

    for name in columns:
        as_text = encoded[name].astype(str)
        labels = sorted(as_text.unique())
        codes = dict(zip(labels, range(len(labels))))
        encoded[name] = as_text.map(codes)

    return encoded

columns_to_encode = ['il', 'ilcee', 'gun_adi', 'gun_tipi', 'ruzgar_yonu']

# Encode train and test in a single pass. map_encode numbers the distinct values
# it sees, so encoding the two frames separately would give the same category
# different integers whenever one frame lacks a value the other has (for
# example a wind sector that never occurs in the test period).
_n_train = len(df_train)
_encoded = map_encode(
    pd.concat(
        [df_train[columns_to_encode], df_test[columns_to_encode]],
        ignore_index=True,
    ),
    columns_to_encode,
)

# Write the codes back positionally: the first _n_train rows belong to train,
# the remainder to test.
df_train = df_train.assign(
    **{col: _encoded[col].to_numpy()[:_n_train] for col in columns_to_encode}
)
df_test = df_test.assign(
    **{col: _encoded[col].to_numpy()[_n_train:] for col in columns_to_encode}
)

In [12]:
# MAP ENCODING
# NOTE(review): this cell is not re-runnable — a second run looks up the already
# encoded values in the dictionaries below and turns them into NaN / 0.

# Year -> ordinal code (2021 -> 0 ... 2024 -> 3); any other year maps to NaN.
Yıl = {2021 + offset: offset for offset in range(4)}

# Holiday name -> integer code; names not listed here map to NaN.
Tatil = {
    'Sacrifice Feast Holiday* (*estimated)': 1,
    'Ramadan Feast Holiday* (*estimated)': 2,
    "New Year's Day": 3,
    'Democracy and National Unity Day': 4,
    'Labour Day': 5,
    'Victory Day': 6,
    "Commemoration of Ataturk, Youth and Sports Day": 7,
    'Ramadan Feast* (*estimated)': 8,
    'Sacrifice Feast* (*estimated)': 9,
    'Republic Day': 10,
    "National Sovereignty and Children's Day": 11,
    "National Sovereignty and Children's Day; Ramadan Feast Holiday* (*estimated)": 12,
}

for _frame in (df_train, df_test):
    _frame['Yıl'] = _frame['Yıl'].map(Yıl)
    # Rows without a (known) holiday name end up as 0, i.e. "not a holiday".
    _frame['Bayram_Flag'] = _frame['Bayram_Flag'].map(Tatil).fillna(0)

In [13]:
# DROP
# Columns removed from both frames:
#   Gün            - day of month; dropped for low correlation
#   t_apparent_C   - apparent ("feels like") temperature; dropped for multicollinearity
#   wind_dir_10m_d - raw wind direction in degrees
_dropped_columns = ["Gün", "t_apparent_C", "wind_dir_10m_d"]

df_train = df_train.drop(columns=_dropped_columns)
df_test = df_test.drop(columns=_dropped_columns)

In [16]:
# PYCARET
# Model comparison with PyCaret's regression module.
# NOTE(review): star import placed mid-notebook; setup, compare_models and
# predict_model used below come from it.
from pycaret.regression import *

# NOTE(review): `y` is not defined in any cell shown here (the execution counter
# jumps from In [13] to In [16]), so this line depends on kernel state left by
# cells that are no longer visible — confirm where the target values come from.
df_train['bildirimsiz_sum'] = y
data = df_train

# Set up the experiment with 'bildirimsiz_sum' as the target. session_id is
# presumably the random seed (echoed as "Session id 42" in the setup output).
exp_reg = setup(data = data, target = 'bildirimsiz_sum', session_id=42)

# Compare the candidate models ordered by MAE; presumably the top-ranked model
# is what gets returned — confirm against the PyCaret documentation.
best_model = compare_models(sort = 'MAE') 

# Run the selected model on the test frame.
predictions = predict_model(best_model, data = df_test)


Unnamed: 0,Description,Value
0,Session id,42
1,Target,bildirimsiz_sum
2,Target type,Regression
3,Original data shape,"(48148, 19)"
4,Transformed data shape,"(48148, 19)"
5,Transformed train set shape,"(33703, 19)"
6,Transformed test set shape,"(14445, 19)"
7,Numeric features,18
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,2.7499,17.1035,4.1323,0.4144,0.5463,0.7442,139.218
rf,Random Forest Regressor,2.8752,18.5622,4.3052,0.3642,0.5626,0.7811,2.838
et,Extra Trees Regressor,2.9004,18.7653,4.3286,0.3572,0.5664,0.788,1.165
gbr,Gradient Boosting Regressor,2.9569,19.89,4.4561,0.3192,0.5885,0.8421,0.965
huber,Huber Regressor,3.5018,28.7864,5.3627,0.014,0.6925,0.9204,0.272
lr,Linear Regression,3.5894,26.8607,5.1801,0.08,0.7152,1.1292,0.286
ridge,Ridge Regression,3.5894,26.8607,5.1801,0.08,0.7152,1.1292,0.016
br,Bayesian Ridge,3.5897,26.8612,5.1802,0.08,0.7152,1.1297,0.015
lar,Least Angle Regression,3.6879,27.8703,5.2761,0.0448,0.7362,1.1514,0.017
en,Elastic Net,3.7226,28.1294,5.3014,0.0364,0.7394,1.2119,0.015
