In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, root_mean_squared_error
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFECV
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge
from sklearn.feature_selection import SelectKBest, f_regression
import lightgbm as lgb
import xgboost as xgb
from pygam import LinearGAM
import numpy as np
from catboost import CatBoostRegressor

In [25]:
def load_and_split_train_data(train_file, target_column, id_column):
    data = pd.read_csv(train_file)
    ids = data[id_column]  # Keep the id column for reference
    X = data.drop(
        [target_column, id_column], axis=1
    )  # Drop target and id from features
    y = data[target_column]  # Target

    # Split into 70% training, 15% validation, 15% test
    X_train, X_temp, y_train, y_temp, ids_train, ids_temp = train_test_split(
        X, y, ids, test_size=0.3, random_state=42
    )
    X_val, X_test, y_val, y_test, ids_val, ids_test = train_test_split(
        X_temp, y_temp, ids_temp, test_size=0.5, random_state=42
    )

    return X_train, X_val, X_test, y_train, y_val, y_test, ids_train, ids_val, ids_test

In [26]:
def load_test_data(test_file, id_column):
    data = pd.read_csv(test_file)
    ids = data[id_column]  # Keep id for reference
    X_test = data.drop(id_column, axis=1)  # Drop id from features
    return X_test, ids

In [None]:
def get_search_spaces():
    search_spaces = {
        "LightGBM": {
            "pipeline": Pipeline(
                [
                    (
                        "feature_selection",
                        RFECV(estimator=lgb.LGBMRegressor(), step=1, cv=KFold(5)),
                    ),
                    ("regressor", lgb.LGBMRegressor()),
                ]
            ),
            "space": {
                "regressor__num_leaves": hp.choice(
                    "regressor__num_leaves", np.arange(2, 50 + 1, dtype=int)
                ),
                "regressor__learning_rate": hp.uniform(
                    "regressor__learning_rate", 0.01, 0.3
                ),
                "regressor__n_estimators": hp.choice(
                    "regressor__n_estimators",
                    np.arange(100, 250 + 1, dtype=int),  # değiştirildi
                ),
                "regressor__max_depth": hp.choice(
                    "regressor__max_depth", np.arange(3, 15 + 1, dtype=int)
                ),
                "regressor__min_child_samples": hp.choice(
                    "regressor__min_child_samples", np.arange(5, 30 + 1, dtype=int)
                ),
            },
        },
        "XGBoost": {
            "pipeline": Pipeline(
                [
                    (
                        "feature_selection",
                        RFECV(estimator=xgb.XGBRegressor(), step=1, cv=KFold(5)),
                    ),
                    ("regressor", xgb.XGBRegressor()),
                ]
            ),
            "space": {
                "regressor__max_depth": hp.choice(
                    "regressor__max_depth", np.arange(3, 10, dtype=int)
                ),
                "regressor__learning_rate": hp.uniform(
                    "regressor__learning_rate", 0.01, 0.3
                ),
                "regressor__n_estimators": hp.choice(
                    "regressor__n_estimators", np.arange(100, 1000 + 1, dtype=int)
                ),
                "regressor__min_child_weight": hp.quniform(
                    "regressor__min_child_weight", 1, 10, 1
                ),
                "regressor__gamma": hp.uniform("regressor__gamma", 0, 5),  # gamma >= 0
            },
        },
        "Ridge": {
            "pipeline": Pipeline(
                [
                    (
                        "feature_selection",
                        RFECV(estimator=Ridge(), step=1, cv=KFold(5)),
                    ),
                    ("regressor", Ridge()),
                ]
            ),
            "space": {
                "regressor__alpha": hp.loguniform(
                    "regressor__alpha", np.log(0.001), np.log(100)
                )
            },
        },
        "GAM": {
            "pipeline": Pipeline(
                [
                    (
                        "feature_selection",
                        SelectKBest(score_func=f_regression),
                    ),
                    ("regressor", LinearGAM()),
                ]
            ),
            "space": {
                "feature_selection__k": hp.choice(
                    "feature_selection__k", np.arange(5, 25 + 1, dtype=int)
                ),
                "regressor__lam": hp.loguniform(
                    "regressor__lam", np.log(0.01), np.log(10)
                ),
                "regressor__n_splines": hp.quniform("regressor__n_splines", 10, 50, 1),
            },
        },
        "CatBoost": {
            "pipeline": Pipeline(
                [
                    (
                        "feature_selection",
                        SelectKBest(score_func=f_regression),
                    ),
                    (
                        "regressor",
                        CatBoostRegressor(
                            verbose=100,
                            early_stopping_rounds=50,
                        ),
                    ),
                ]
            ),
            "space": {
                "feature_selection__k": hp.choice(
                    "feature_selection__k", np.arange(5, 25 + 1, dtype=int)
                ),
                "regressor__depth": hp.choice(
                    "regressor__depth", np.arange(3, 10 + 1, dtype=int)
                ),
                "regressor__learning_rate": hp.uniform(
                    "regressor__learning_rate", 0.01, 0.3
                ),
                "regressor__n_estimators": hp.choice(
                    "regressor__n_estimators", np.arange(100, 1000, dtype=int)
                ),
                "regressor__l2_leaf_reg": hp.uniform("regressor__l2_leaf_reg", 1, 10),
                "regressor__bagging_temperature": hp.uniform(
                    "regressor__bagging_temperature", 0, 1
                ),
            },
        },
    }
    return search_spaces

In [49]:
# Objective function for HyperOpt
def objective(params, model_name, X_train, y_train, X_val, y_val):
    model_data = get_search_spaces()[model_name]
    pipeline = model_data["pipeline"].set_params(**params)
    pipeline.fit(X_train, y_train)
    val_pred = pipeline.predict(X_val)
    rmse = root_mean_squared_error(y_val, val_pred)
    return {"loss": rmse, "status": STATUS_OK}

In [50]:
# HyperOpt optimization
def hyperopt_optimization(X_train, y_train, X_val, y_val):
    best_models = {}
    search_spaces = get_search_spaces()

    for model_name, model_data in search_spaces.items():
        print(f"Optimizing {model_name}...")
        trials = Trials()
        best_params = fmin(
            fn=lambda params: objective(
                params, model_name, X_train, y_train, X_val, y_val
            ),
            space=model_data["space"],
            algo=tpe.suggest,
            max_evals=50,
            trials=trials,
        )
        best_models[model_name] = best_params
        print(f"Best params for {model_name}: {best_params}")

    return best_models

In [51]:
def test_best_model(best_params, model_name, X_train, y_train, X_test, y_test):
    search_spaces = get_search_spaces()
    pipeline = search_spaces[model_name]["pipeline"].set_params(**best_params)
    pipeline.fit(X_train, y_train)
    test_pred = pipeline.predict(X_test)
    test_rmse = root_mean_squared_error(y_test, test_pred)
    print(f"Test RMSE for {model_name}: {test_rmse}")
    return test_rmse

In [52]:
# Modify predict_on_test_data to keep ids and merge with predictions
def predict_on_test_data(
    best_params, model_name, X_train, y_train, test_data, test_ids
):
    search_spaces = get_search_spaces()
    pipeline = search_spaces[model_name]["pipeline"].set_params(**best_params)
    pipeline.fit(X_train, y_train)
    predictions = pipeline.predict(test_data)
    # Create a DataFrame to store ids and predictions
    predictions_df = pd.DataFrame({"id": test_ids, "Predictions": predictions})
    return predictions_df

In [53]:
# Load and split train.csv
dataset_address = "./datathon-dataset/processed-data/train_data_cleaned.csv"
target_column = "degerlendirme_puani"
test_dataset_address = "./datathon-dataset/processed-data/test_data_cleaned.csv"
X_train, X_val, X_test, y_train, y_val, y_test, ids_train, ids_val, ids_test = (
    load_and_split_train_data(dataset_address, target_column, "id")
)

In [54]:
# Optimize using HyperOpt
best_models = hyperopt_optimization(X_train, y_train, X_val, y_val)

Optimizing Ridge...
100%|██████████| 50/50 [01:02<00:00,  1.24s/trial, best loss: 8.77262349419707] 
Best params for Ridge: {'regressor__alpha': 97.02651413141194}
Optimizing GAM...
100%|██████████| 50/50 [03:02<00:00,  3.65s/trial, best loss: 7.5553384831467545]
Best params for GAM: {'feature_selection__k': 20, 'regressor__lam': 0.01724780874664662, 'regressor__n_splines': 12.0}


In [55]:
# Evaluate the best models on the test set
best_model_name = None
best_rmse = float("inf")
for model_name, best_params in best_models.items():
    print(f"\nEvaluating {model_name} on test set...")
    rmse = test_best_model(best_params, model_name, X_train, y_train, X_test, y_test)
    if rmse < best_rmse:
        best_rmse = rmse
        best_model_name = model_name

print(f"\nBest model is {best_model_name} with RMSE: {best_rmse}")


Evaluating Ridge on test set...
Test RMSE for Ridge: 8.841124040674602

Evaluating GAM on test set...
Test RMSE for GAM: 7.5463824425387305

Best model is GAM with RMSE: 7.5463824425387305


In [None]:
test_data, test_ids = load_test_data(test_dataset_address, "id")
best_params = best_models[best_model_name]

In [None]:
test_data

Unnamed: 0.1,Unnamed: 0,basvuru_yili,cinsiyet,dogum_yeri,ikametgah_sehri,universite_adi,universite_turu,burslu_ise_burs_yuzdesi,burs_aliyor_mu?,bölüm,...,anne_calisma_durumu,anne_sektor,baba_egitim_durumu,baba_calisma_durumu,baba_sektor,kardes_sayisi,ingilizce_biliyor_musunuz?,ingilizce_seviyeniz?,dogum_yili,dogum_ayi
0,0,2023,1,5,5,199.0,0,0,0,347.0,...,1,2,2,1,3,2.0,1,0.0,2002,6
1,1,2023,1,42,42,14.0,0,0,0,432.0,...,0,0,1,1,2,3.0,1,0.0,2004,5
2,2,2023,0,34,34,179.0,1,100,1,44.0,...,0,0,4,0,0,0.0,1,0.0,2002,4
3,3,2023,0,47,47,171.0,0,0,0,228.0,...,0,0,3,1,2,4.0,0,0.0,2003,3
4,4,2023,0,55,34,78.0,0,0,0,230.0,...,1,1,4,1,1,1.0,1,0.0,2002,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11044,11044,2023,1,66,6,13.0,0,0,0,111.0,...,0,0,2,0,0,0.0,1,0.0,2002,1
11045,11045,2023,1,42,42,180.0,0,0,0,159.0,...,0,0,1,1,1,3.0,0,0.0,2001,9
11046,11046,2023,0,6,6,146.0,0,0,0,104.0,...,0,0,4,1,3,2.0,1,0.0,2004,6
11047,11047,2023,1,2,34,50.0,0,0,0,9.0,...,0,0,2,1,3,2.0,1,0.0,2001,10


In [None]:
predictions = predict_on_test_data(
    best_params, best_model_name, X_train, y_train, test_data, test_ids
)

In [None]:
predictions

Unnamed: 0,id,Predictions
0,0,36.346371
1,1,37.746964
2,2,40.939720
3,3,34.311832
4,4,49.334766
...,...,...
11044,11044,36.306202
11045,11045,34.778908
11046,11046,38.024315
11047,11047,37.819725


In [None]:
predictions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11049 entries, 0 to 11048
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           11049 non-null  int64  
 1   Predictions  11049 non-null  float32
dtypes: float32(1), int64(1)
memory usage: 129.6 KB


In [None]:
predictions["Predictions"] = predictions["Predictions"].astype("float64")

In [None]:
predictions.rename(columns={"Predictions": "Degerlendirme Puani"}, inplace=True)

In [None]:
# Save predictions to CSV
pd.DataFrame(predictions).to_csv("predictions.csv", index=False)
print("Predictions saved to predictions.csv")

Predictions saved to predictions.csv
