In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import os
from pathlib import Path

# Folder that holds the competition CSV files. It defaults to the original
# location; set the PLAYGROUND_DATA_DIR environment variable to run the
# notebook on another machine without editing every read_csv call.
DATA_DIR = Path(os.environ.get('PLAYGROUND_DATA_DIR', r'C:\Users\Emincan\Desktop\Playground'))

train = pd.read_csv(DATA_DIR / 'train.csv')
# test = pd.read_csv(DATA_DIR / 'test.csv')
sub = pd.read_csv(DATA_DIR / 'sample_submission.csv')

train.head()

In [None]:
train.shape

In [None]:
train.info()

In [None]:
# Re-assign instead of dropping in place so the cell can be re-run safely:
# errors='ignore' turns the drop into a no-op once 'id' has already been removed.
train = train.drop(columns='id' , errors='ignore')

In [None]:
# Simple preprocessing for the categorical variable 'Sex'.
# Alternative (one-hot encoding), currently disabled:
# train = pd.get_dummies(train, columns=['Sex'])

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# Replace the 'Sex' labels with the integer codes assigned by the encoder.
train['Sex'] = le.fit_transform(train['Sex'])

# Convert the encoded column to the pandas 'category' dtype.
train['Sex'] = train['Sex'].astype('category')

In [None]:
train.info()

In [None]:
train.describe().T

In [None]:
train['Age'].nunique()

In [None]:
train['Age'].value_counts()

In [None]:
plt.figure(figsize=(14,8) ,dpi=100)

sns.heatmap(train.corr(numeric_only=True) , annot = True);

# Utils

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score , confusion_matrix, classification_report, mean_absolute_error, mean_squared_error, r2_score , roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV , RandomizedSearchCV
from imblearn.over_sampling import SMOTE , ADASYN



class fonks:
    """
    Namespace of helper functions shared by the whole notebook.

    Purpose of the class:
    - Define the frequently used helpers once and call them easily.
    - Keep the notebook code tidy and easy to change.

    All helpers are static methods and are called on the class itself,
    e.g. ``fonks.trainTest(X, y)``.
    """

    label = 'Fertility'

    # Task names that select the classification metrics. Both spellings are
    # accepted by every helper that takes a ``task`` argument.
    _CLASS_TASKS = ('class', 'classification')

    @staticmethod
    def overSample(X , y , tactic = 'smote'):
        """
        Oversample ``X`` / ``y`` with SMOTE or ADASYN.

        Parameters
        ----------
        X : array-like or pandas.DataFrame
            Feature matrix.
        y : array-like or pandas.Series
            Labels that drive the resampling.
        tactic : str, optional
            ``'smote'`` (default) or ``'adasyn'``.

        Returns
        -------
        tuple
            X_resampled, y_resampled

        Raises
        ------
        ValueError
            If ``tactic`` is not one of the supported names.
        """
        if tactic == 'smote':
            sampler = SMOTE()
        elif tactic == 'adasyn':
            sampler = ADASYN()
        else:
            # An unknown tactic used to fall through and return None, which only
            # surfaced later as an unpacking error at the call site.
            raise ValueError(f"Unknown oversampling tactic {tactic!r}; expected 'smote' or 'adasyn'.")

        X_resampled , y_resampled = sampler.fit_resample(X , y)
        return X_resampled , y_resampled

    @staticmethod
    def trainTest(X , y , test_size = 0.2 , stratify = None , random_state = 53):
        """
        Split the data into train and test parts.

        Parameters
        ----------
        X : pandas.DataFrame or array-like
            Features to split.
        y : pandas.Series or array-like
            Target values to split.
        test_size : float, optional
            Share of the data placed in the test split, by default 0.2.
        stratify : {None, 'y'} or array-like, optional
            ``'y'`` stratifies on ``y``. ``None`` (default), any other string
            or a bool disables stratification. Any other value is forwarded
            to ``train_test_split`` as the stratification labels.
        random_state : int, optional
            Seed of the shuffle, by default 53.

        Returns
        -------
        tuple
            X_train, X_test, y_train, y_test
        """
        # Check the type first: comparing an array to 'y' with == is element-wise
        # and cannot be used as a condition.
        if isinstance(stratify, str):
            stratify_labels = y if stratify == 'y' else None
        elif stratify is None or isinstance(stratify, bool):
            stratify_labels = None
        else:
            stratify_labels = stratify

        X_train, X_test, y_train, y_test = train_test_split(
            X , y , test_size=test_size , random_state=random_state , stratify=stratify_labels)

        return X_train, X_test, y_train, y_test

    @staticmethod
    def simple_scores(y_train, y_train_pred, y_test, y_pred, name='Model' , task = 'reg'):
        """
        Print the train and test scores of a model.

        Works for both regression and classification tasks.

        Parameters
        ----------
        y_train : array-like
            True values of the train split.
        y_train_pred : array-like
            Predictions for the train split.
        y_test : array-like
            True values of the test split.
        y_pred : array-like
            Predictions for the test split.
        name : str, optional
            Name of the model shown in the output, by default 'Model'.
        task : str, optional
            ``'reg'`` (default) prints R2, MAE, RMSE and MSE;
            ``'class'`` / ``'classification'`` prints accuracy, recall,
            precision and F1 for the positive label 1.

        Returns
        -------
        None
            The scores are printed, train block first, then the test block.

        Raises
        ------
        ValueError
            If ``task`` is not a supported task name.
        """
        if task in fonks._CLASS_TASKS:
            metrics = (
                ('Accuracy', accuracy_score),
                ('Recall', lambda true, pred: recall_score(true, pred, pos_label=1)),
                ('Precision', lambda true, pred: precision_score(true, pred, pos_label=1)),
                ('F1', lambda true, pred: f1_score(true, pred, pos_label=1)),
            )
        elif task == 'reg':
            metrics = (
                ('R2', r2_score),
                ('Mae', mean_absolute_error),
                ('RMSE', lambda true, pred: mean_squared_error(true, pred)**0.5),
                ('MSE', mean_squared_error),
            )
        else:
            raise ValueError(f"Unknown task {task!r}; expected 'reg' or 'class'.")

        splits = (('Train', y_train, y_train_pred), ('Test', y_test, y_pred))
        for position, (split_name, actual, predicted) in enumerate(splits):
            if position:
                # Visual separator between the train and the test block.
                print("\n****************\n")
            for metric_name, metric in metrics:
                print(f'{name} {split_name} {metric_name} : {metric(actual, predicted)}')

    @staticmethod
    def get_r2_score(y_true, y_pred):
        """Return the R2 score."""
        return r2_score(y_true,y_pred)

    @staticmethod
    def get_mae_score(y_true, y_pred):
        """Return the mean absolute error."""
        return mean_absolute_error(y_true,y_pred)

    @staticmethod
    def get_rmse_score(y_true, y_pred):
        """Return the root mean squared error."""
        return mean_squared_error(y_true,y_pred)**0.5

    @staticmethod
    def get_mse_score(y_true, y_pred):
        """Return the mean squared error."""
        return mean_squared_error(y_true,y_pred)

    @staticmethod
    def get_acc_score(y_true, y_pred):
        """Return the accuracy."""
        return accuracy_score(y_true , y_pred)

    @staticmethod
    def get_f1_score(y_true, y_pred):
        """Return the F1 score of the positive label 1."""
        return f1_score(y_true , y_pred , pos_label=1)

    @staticmethod
    def get_recall(y_true, y_pred):
        """Return the recall of the positive label 1."""
        return recall_score(y_true, y_pred, pos_label=1)

    @staticmethod
    def get_roc_auc(y_true, y_pred):
        """Return the ROC AUC; ``y_pred`` holds the scores of the positive class."""
        # roc_auc_score has no pos_label parameter; passing it raised TypeError.
        return roc_auc_score(y_true, y_pred)

    @staticmethod
    def eval_metric(y_train, y_train_pred, y_test, y_pred, name='Model'):
        """
        Print the confusion matrix and classification report of both splits.

        Designed for classification tasks only.

        Parameters
        ----------
        y_train : array-like
            True values of the train split.
        y_train_pred : array-like
            Predictions for the train split.
        y_test : array-like
            True values of the test split.
        y_pred : array-like
            Predictions for the test split.
        name : str, optional
            Name of the model shown in the output, by default 'Model'.

        Returns
        -------
        None
            Prints the test confusion matrix and report, then the train ones.
        """
        print(f"Test_Set {name}")
        print(confusion_matrix(y_test, y_pred))
        print(classification_report(y_test, y_pred))
        print()
        print(f"Train_Set {name}")
        print(confusion_matrix(y_train, y_train_pred))
        print(classification_report(y_train, y_train_pred))

    @staticmethod
    def train_test_df(y_train, y_train_pred, y_test, y_pred, name='Model' , task='reg'):
        """
        Build a DataFrame with the train and test scores of a model.

        Works for both regression and classification tasks.

        Parameters
        ----------
        y_train : array-like
            True values of the train split.
        y_train_pred : array-like
            Predictions for the train split.
        y_test : array-like
            True values of the test split.
        y_pred : array-like
            Predictions for the test split.
        name : str, optional
            Name of the model, used as column prefix, by default 'Model'.
        task : str, optional
            ``'reg'`` (default) or ``'class'`` / ``'classification'``.

        Returns
        -------
        pandas.DataFrame
            Columns ``<name>_train`` and ``<name>_test``; rows are
            R2 / mae / mse / rmse for regression and
            Accuracy / Recall / Precision / F1 for classification.

        Raises
        ------
        ValueError
            If ``task`` is not a supported task name.
        """
        if task in fonks._CLASS_TASKS:
            metrics = {
                "Accuracy" : accuracy_score,
                "Recall" : lambda true, pred: recall_score(true, pred , pos_label=1),
                "Precision" : lambda true, pred: precision_score(true, pred , pos_label=1),
                # Plain F1: the square root previously applied here was a
                # leftover of the rmse formula and distorted the reported score.
                "F1" : lambda true, pred: f1_score(true, pred , pos_label=1),
            }
        elif task == 'reg':
            metrics = {
                "R2" : r2_score,
                "mae" : mean_absolute_error,
                "mse" : mean_squared_error,
                "rmse" : lambda true, pred: np.sqrt(mean_squared_error(true, pred)),
            }
        else:
            raise ValueError(f"Unknown task {task!r}; expected 'reg' or 'class'.")

        splits = (("_train", y_train, y_train_pred), ("_test", y_test, y_pred))
        scores = {
            name + suffix: {metric_name: metric(actual, predicted) for metric_name, metric in metrics.items()}
            for suffix, actual, predicted in splits
        }
        return pd.DataFrame(scores)

    @staticmethod
    def summary(df):
        """
        Print the shape of ``df`` and return a per-column overview.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame to summarise.

        Returns
        -------
        pandas.DataFrame
            One row per column of ``df`` with its data type, number and
            percentage of missing values, number of unique values, min, max
            and the values found in the first three rows.
        """
        print(f'data shape: {df.shape}')

        summ = df.dtypes.to_frame('data type')
        missing_counts = df.isnull().sum().values
        summ['#missing'] = missing_counts
        summ['%missing'] = missing_counts / len(df)* 100
        summ['#unique'] = df.nunique().values

        desc = df.describe(include='all').transpose()
        for stat in ('min', 'max'):
            # describe() only produces min/max when a numeric column exists.
            summ[stat] = desc[stat].values if stat in desc.columns else np.nan

        # Positional access: "first value" means the first row, whatever the
        # index labels are. Rows that do not exist are reported as NaN.
        for position, column_name in enumerate(('first value', 'second value', 'third value')):
            summ[column_name] = df.iloc[position].values if position < len(df) else np.nan

        return summ

    @staticmethod
    def plot_correlation_heatmap(df: pd.DataFrame, title_name: str='Train correlation') -> None:
        """
        Plot the lower triangle of the correlation matrix of ``df``.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame whose numeric columns are correlated.
        title_name : str, optional
            Title of the figure, by default 'Train correlation'.
        """
        # numeric_only=True matches the heatmap drawn earlier in the notebook
        # and keeps non-numeric columns from breaking the computation.
        corr = df.corr(numeric_only=True)
        fig, ax = plt.subplots(figsize=(14, 8))
        # Hide the diagonal and the upper triangle, which mirror the lower one.
        mask = np.triu(np.ones_like(corr, dtype=bool))
        sns.heatmap(corr, mask=mask, linewidths=.5, cmap='YlOrRd', annot=True, ax=ax)
        ax.set_title(title_name)
        plt.show()

    # # plot_correlation_heatmap(original, 'Original Dataset Correlation')
    # plot_correlation_heatmap(train, 'Train Dataset Correlation')

In [None]:
fonks.plot_correlation_heatmap(train)

# Categorical NaN Fill

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

def nanFiller(df, label , nan_cols=None , random_state=None):
    """
    Fill the missing values of a categorical column with model predictions.

    A ``GradientBoostingClassifier`` is trained on the rows where ``label`` is
    present, using every remaining column as a feature, and its predictions
    are written into the rows where ``label`` is missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to repair. It is modified in place.
    label : str
        Name of the column whose missing values are filled.
    nan_cols : list of str, optional
        Additional columns to leave out of the features (for example other
        columns that still contain missing values). By default only
        ``label`` is excluded.
    random_state : int, optional
        Seed forwarded to the classifier, by default None.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``label`` filled.

    Raises
    ------
    ValueError
        If ``label`` has no observed value to learn from.
    """
    missing = df[label].isnull()

    # Nothing to fill: return early instead of asking the model to predict on
    # an empty frame, which raises inside scikit-learn.
    if not missing.any():
        return df
    if missing.all():
        raise ValueError(f"Column {label!r} has no observed values to train on.")

    excluded = [label] if nan_cols is None else list(nan_cols) + [label]
    features = df.drop(excluded, axis=1)

    # Train on the rows where the label is known ...
    model = GradientBoostingClassifier(random_state=random_state)
    model.fit(features[~missing], df.loc[~missing, label])

    # ... and write the predictions into the rows where it is missing.
    df.loc[missing, label] = model.predict(features[missing])

    return df

# X and y

In [None]:
X = train.drop('Age' , axis = 1)
y = train['Age']

# OverSample

In [None]:
# NOTE(review): the resampling runs before the train/test split further down, so
# synthetic rows derived from the full data set can end up in both splits —
# confirm this is intended.
# NOTE(review): the target `Age` is handed to the sampler as the class label even
# though the models fitted later are regressors — verify this use of oversampling.
X_oversized , y_oversized = fonks.overSample(X,y)

In [None]:
X_oversized = nanFiller(X_oversized , 'Sex')

In [None]:
X.shape

In [None]:
X_oversized.isna().sum()

In [None]:
y.value_counts(dropna=False)

In [None]:
y_oversized.value_counts(dropna=False)

# Train | Test Split

In [None]:
X_train , X_test , y_train , y_test = fonks.trainTest(X_oversized , y_oversized , test_size= 0.2 )

# X_train , X_test , y_train , y_test = fonks.trainTest(X , y , test_size= 0.2 )

# Models

### Linear Model

In [None]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()

lr.fit(X_train , y_train)

lr_train_pred = np.round(lr.predict(X_train))
lr_pred = np.round(lr.predict(X_test))

fonks.simple_scores(y_train, lr_train_pred , y_test , lr_pred , 'LR')

In [None]:
lr_r2 = fonks.get_r2_score(y_test , lr_pred)
lr_mae = fonks.get_mae_score(y_test , lr_pred)
lr_rmse = fonks.get_rmse_score(y_test , lr_pred)
# nb_rocauc = fonks.get_roc_auc(y_test , nb_pred , "NB")
# nb_rocauc = roc_auc_score(y_test, nb_pred, multi_class='ovo')

lr_df = fonks.train_test_df(y_train, lr_train_pred , y_test , lr_pred , 'LR')
lr_df

### Ridge Model

In [None]:
from sklearn.linear_model import RidgeCV

rdg_grid = RidgeCV(alphas=np.arange(0.001,100,1) , cv = 5)

rdg_grid.fit(X_train , y_train)

print(f"Ridge Best Score : {rdg_grid.best_score_}")
print(f"Ridge Best Alpha : {rdg_grid.alpha_}")

In [None]:
from sklearn.linear_model import Ridge

rdg = Ridge(alpha = 4)

rdg.fit(X_train , y_train)

rdg_train_pred = np.round(rdg.predict(X_train))
rdg_pred = np.round(rdg.predict(X_test))

fonks.simple_scores(y_train, rdg_train_pred , y_test , rdg_pred , 'Ridge')

In [None]:
rdg_r2 = fonks.get_r2_score(y_test , rdg_pred)
rdg_mae = fonks.get_mae_score(y_test , rdg_pred)
rdg_rmse = fonks.get_rmse_score(y_test , rdg_pred)
# nb_rocauc = fonks.get_roc_auc(y_test , nb_pred , "NB")
# nb_rocauc = roc_auc_score(y_test, nb_pred, multi_class='ovo')

rdg_df = fonks.train_test_df(y_train, rdg_train_pred , y_test , rdg_pred , 'Ridge')
all_df = pd.concat([lr_df , rdg_df] , axis = 1)
all_df

### Lasso

In [None]:
from sklearn.linear_model import LassoCV

las_grid = LassoCV(alphas=np.arange(0.001,100,1) , cv = 5)

las_grid.fit(X_train , y_train)

# print(f"Lasso Best Score : {las_grid.best_score_}")
print(f"Lasso Best Alpha : {las_grid.alpha_}")

In [None]:
# The best alpha found is 0.001, the smallest value in the grid. That is too low to regularize meaningfully, so Lasso is not a good model for this dataset.

### SVR

In [None]:
# Grid search over the SVR regularization strength C, scored by negative MAE.
# NOTE(review): SVR and GridSearchCV are imported before patch_sklearn() is called.
# scikit-learn-intelex documents that patching should precede the scikit-learn
# imports it is meant to accelerate — confirm these are really the patched estimators.
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
import dpctl
from sklearnex import patch_sklearn, config_context , unpatch_sklearn

patch_sklearn()

with config_context(target_offload = 'gpu:0'):
    svr_base = SVR()

    # Wider grid tried earlier, kept for reference:
    # params = {'kernel' : ['linear' , 'poly' , 'rbf' ] , 'degree' : [2,3] , 'gamma' : ['scale' , 'auto'] , 'C' : np.arange(0.01,100,10) }
    
    params = {'kernel' : ['rbf' ] , 'gamma' : ['scale'] , 'C' : [500,600,700,800]}

    svr_grid = GridSearchCV(svr_base , params , scoring='neg_mean_absolute_error' , cv = 3)
    svr_grid.fit(X_train , y_train)

print(f'Best parameters: {svr_grid.best_params_}')
print(f'Best score: {svr_grid.best_score_:.2f}')

# Restore the stock scikit-learn implementations for the following cells.
unpatch_sklearn()

In [None]:
from sklearn.svm import SVR

svr = SVR(kernel = 'rbf' , C=500 , gamma='scale')
# svr = svr_grid.best_estimator_
svr.fit(X_train , y_train)

svr_train_pred = np.round(svr.predict(X_train))
svr_pred = np.round(svr.predict(X_test))

fonks.simple_scores(y_train, svr_train_pred , y_test , svr_pred , 'SVR')

In [None]:
svr_r2 = fonks.get_r2_score(y_test , svr_pred)
svr_mae = fonks.get_mae_score(y_test , svr_pred)
svr_rmse = fonks.get_rmse_score(y_test , svr_pred)

svr_df = fonks.train_test_df(y_train, svr_train_pred , y_test , svr_pred , 'SVR')
all_df = pd.concat([all_df , svr_df] , axis = 1)
all_df

### Decision Tree Model

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
import dpctl
from sklearnex import patch_sklearn, config_context , unpatch_sklearn

patch_sklearn()

with config_context(target_offload = 'gpu:0'):
    dt_base = DecisionTreeRegressor(random_state= 53)

    params = {'max_depth' : np.arange(2,9) , 'min_samples_split' : np.arange(2,5)}

    dt_grid = GridSearchCV(dt_base , params , cv = 3 , scoring= 'neg_mean_absolute_error' )
    dt_grid.fit(X_train , y_train)

print(f"DT Best Params is : {dt_grid.best_params_}")
print(f"DT Best Score is : {dt_grid.best_score_}")

unpatch_sklearn()

In [None]:
from sklearn.tree import DecisionTreeRegressor

dt = DecisionTreeRegressor(max_depth=8 , min_samples_split=3 , random_state= 53)
dt.fit(X_train , y_train)

dt_train_pred = np.round(dt.predict(X_train))
dt_pred = np.round(dt.predict(X_test))

fonks.simple_scores(y_train, dt_train_pred , y_test , dt_pred , 'DT')

In [None]:
dt_r2 = fonks.get_r2_score(y_test , dt_pred)
dt_mae = fonks.get_mae_score(y_test , dt_pred)
dt_rmse = fonks.get_rmse_score(y_test , dt_pred)

dt_df = fonks.train_test_df(y_train, dt_train_pred , y_test , dt_pred , 'DT')
all_df = pd.concat([all_df , dt_df] , axis = 1)
all_df

### Random Forest Model

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
import dpctl
from sklearnex import patch_sklearn, config_context , unpatch_sklearn

patch_sklearn()

with config_context(target_offload = 'gpu:0'):
    rf_base = RandomForestRegressor()

    params = {'n_estimators' : np.arange(100,600,100) , 'max_depth' : np.append(np.arange(2,8) , None) , 'max_features' : np.append(np.arange(2,6) , None) , 'min_samples_split' : [2,3]}

    rf_grid = GridSearchCV(rf_base , params , scoring= 'neg_mean_absolute_error' , cv = 3)
    rf_grid.fit(X_train , y_train)

print(f"RF Best Params is : {rf_grid.best_params_}")
print(f"RF Best Score is : {rf_grid.best_score_}")

unpatch_sklearn()

In [None]:
from sklearn.ensemble import RandomForestRegressor

# rf = RandomForestRegressor(n_estimators = 230 , max_depth=6 , min_samples_split=2 , max_features=5)
# rf = RandomForestRegressor(n_estimators = 500 , max_depth=7 , min_samples_split=2 , max_features=None)
rf = rf_grid.best_estimator_
rf.fit(X_train , y_train)

rf_train_pred = rf.predict(X_train)
rf_pred = rf.predict(X_test)

fonks.simple_scores(y_train, rf_train_pred , y_test , rf_pred , 'RF')

In [None]:
rf_r2 = fonks.get_r2_score(y_test , rf_pred)
rf_mae = fonks.get_mae_score(y_test , rf_pred)
rf_rmse = fonks.get_rmse_score(y_test , rf_pred)

rf_df = fonks.train_test_df(y_train, rf_train_pred , y_test , rf_pred , 'RF')
all_df = pd.concat([all_df , rf_df] , axis = 1)
all_df

### Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

patch_sklearn()

with config_context(target_offload = 'gpu:0'):
    gb_base = GradientBoostingRegressor()

    params = {'n_estimators' : np.arange(100,600,100) , 'learning_rate' : [0.01,0.1,1] , 'subsample' : [0.8 , 1] , 'max_depth' : np.arange(2,7)}

    gb_grid = GridSearchCV(gb_base , params , scoring= 'neg_mean_absolute_error' , cv = 3)
    gb_grid.fit(X_train , y_train)

print(f"GB Best Params is : {gb_grid.best_params_}")
print(f"GB Best Score is : {gb_grid.best_score_}")

unpatch_sklearn()

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# gb = GradientBoostingRegressor(n_estimators= 240 , max_depth=10 , learning_rate= 0.01 , subsample= 0.8 )
# gb = GradientBoostingRegressor(n_estimators= 500 , max_depth=6 , learning_rate= 0.01 , subsample= 0.8 )
gb = gb_grid.best_estimator_
gb.fit(X_train , y_train)

gb_train_pred = gb.predict(X_train)
gb_pred = gb.predict(X_test)

fonks.simple_scores(y_train, gb_train_pred , y_test , gb_pred , 'GB')

In [None]:
gb_r2 = fonks.get_r2_score(y_test , gb_pred)
gb_mae = fonks.get_mae_score(y_test , gb_pred)
gb_rmse = fonks.get_rmse_score(y_test , gb_pred)

gb_df = fonks.train_test_df(y_train, gb_train_pred , y_test , gb_pred , 'GB')
all_df = pd.concat([all_df , gb_df] , axis = 1)
all_df

### XGB Model

In [None]:
from xgboost import XGBRegressor

patch_sklearn()

with config_context(target_offload = 'gpu:0'):
    xgb_base = XGBRegressor(tree_method = 'gpu_hist')

    params = {
        'learning_rate': [0.01, 0.05, 0.1],
        'n_estimators': [100, 150 , 200, 300 , 400 , 500, 600],
        'max_depth': [3, 4, 5, 6 , 7],
        'min_child_weight': [1, 3, 5],
        'gamma': [0, 0.1, 0.2],
        'subsample': [0.7, 0.8, 0.9],
        'colsample_bytree': [0.6, 0.7, 0.8],
        'reg_alpha': [0, 0.01, 0.05],
        'reg_lambda': [0, 0.01, 0.05],
    }

    params2 = {
        'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
        'max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
        'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
        'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
        'n_estimators': [100, 200, 300, 400, 500],
        'gamma': [0, 1, 5, 10],
        'min_child_weight': [1, 3, 5, 7],
        'scale_pos_weight': [1, 2, 3, 4, 5]
    }

    xgb_rnd = RandomizedSearchCV(xgb_base , params , scoring= 'neg_mean_absolute_error' , cv = 3 , n_iter= 150)
    xgb_rnd.fit(X_train , y_train)

print(f"XGB Best Params is : {xgb_rnd.best_params_}")
print(f"XGB Best Score is : {xgb_rnd.best_score_}")

unpatch_sklearn()

In [None]:
from xgboost import XGBRegressor

# xgb = XGBRegressor(n_estimators = 500 , min_child_weight = 5 , max_depth = 3 , learning_rate = 0.1 , gamma = 0, colsample_bytree = 0.7 , subsample= 0.7 , reg_alpha = 0.05 , reg_lambda=0.01)
# xgb = XGBRegressor(n_estimators = 400 , min_child_weight = 5 , max_depth = 8 , learning_rate = 0.01 , gamma = 0.1 , colsample_bytree = 0.8 , subsample = 0.9 , reglambda = 0 , reg_alpha = 0.05)
xgb = xgb_rnd.best_estimator_
xgb.fit(X_train , y_train)

xgb_train_pred = xgb.predict(X_train)
xgb_pred = xgb.predict(X_test)

fonks.simple_scores(y_train, xgb_train_pred , y_test , xgb_pred , 'XGB')

In [None]:
xgb_r2 = fonks.get_r2_score(y_test , xgb_pred)
xgb_mae = fonks.get_mae_score(y_test , xgb_pred)
xgb_rmse = fonks.get_rmse_score(y_test , xgb_pred)

xgb_df = fonks.train_test_df(y_train, xgb_train_pred , y_test , xgb_pred , 'XGB')
all_df = pd.concat([all_df , xgb_df] , axis = 1)
all_df

### LightGBM

In [None]:
from lightgbm import LGBMRegressor

patch_sklearn()

with config_context(target_offload='gpu:0'):
        lgb_base = LGBMRegressor(device = 'gpu')

        params = {
                'n_estimators': [100, 200, 300 , 400 , 500 , 600],
                'max_depth': [3, 5, 7 , 6 , 7, 8],
                'learning_rate': [1,0.1,0.01],
                'subsample': [0.7,0.8, 0.9, 1.0],
                'colsample_bytree': [0.8, 0.9, 1.0],
                }

        lgb_rnd = RandomizedSearchCV(lgb_base , params , scoring='neg_mean_absolute_error' , cv = 3,  n_iter= 150)
        lgb_rnd.fit(X_train,y_train)


print(f'Best parameters: {lgb_rnd.best_params_}')
print(f'Best score: {lgb_rnd.best_score_:.2f}')

unpatch_sklearn()

In [None]:
from lightgbm import LGBMRegressor

# lgb = LGBMRegressor(n_estimators= 100 , learning_rate = 0.1 , max_depth = 6 , subsample = 0.9 , colsample_bytree = 1)
# lgb = LGBMRegressor(n_estimators= 100 , learning_rate = 0.1 , max_depth = 8 , subsample = 0.8 , colsample_bytree = 0.8)
lgb = lgb_rnd.best_estimator_
lgb.fit(X_train , y_train)

lgb_train_pred = lgb.predict(X_train)
lgb_pred = lgb.predict(X_test)

fonks.simple_scores(y_train, lgb_train_pred , y_test , lgb_pred , 'LGB')

In [None]:
lgb_r2 = fonks.get_r2_score(y_test , lgb_pred)
lgb_mae = fonks.get_mae_score(y_test , lgb_pred)
lgb_rmse = fonks.get_rmse_score(y_test , lgb_pred)

# Use LightGBM's own train predictions; the XGBoost ones were passed here by
# mistake, so the 'LGB_train' column showed XGBoost's train scores.
lgb_df = fonks.train_test_df(y_train, lgb_train_pred , y_test , lgb_pred , 'LGB')
all_df = pd.concat([all_df , lgb_df] , axis = 1)
all_df

### CatBoost

In [None]:
params = {'learning_rate': 0.03,
          'objective':'MAE',
          'depth': 6,
          'early_stopping_rounds':1000,
          'iterations': 10000,
          'use_best_model': True,
          'eval_metric': "RMSE",
          'random_state': 986,
          'allow_writing_files': False,
          'thread_count':24
          }

In [None]:
# X_train['X1 transaction date'] = X_train['X1 transaction date'].astype(int).astype('category')
# X_test['X1 transaction date'] = X_test['X1 transaction date'].astype(int).astype('category')

In [None]:
from catboost import CatBoostRegressor

# Take the categorical feature names from the columns that actually exist:
# the label-encoded 'Sex' column, or 'Sex_F' / 'Sex_I' / 'Sex_M' when the one-hot
# alternative in the preprocessing cell is enabled. The hard-coded one-hot names
# are not present in X_train with the current (label-encoded) preprocessing.
# NOTE(review): CatBoost expects categorical columns to hold integers or strings —
# confirm the dtype of 'Sex' after oversampling.
sex_columns = [column for column in X_train.columns if str(column).startswith('Sex')]

cb = CatBoostRegressor(**params,
                       cat_features=sex_columns
                        )
cb.fit(X_train,y_train,
        eval_set=[(X_test,y_test)],
        verbose=500)

cb_train_pred = np.round(cb.predict(X_train))
cb_pred = np.round(cb.predict(X_test))

fonks.simple_scores(y_train, cb_train_pred , y_test , cb_pred , 'CB')

In [None]:
cb_r2 = fonks.get_r2_score(y_test , cb_pred)
cb_mae = fonks.get_mae_score(y_test , cb_pred)
cb_rmse = fonks.get_rmse_score(y_test , cb_pred)

cb_df = fonks.train_test_df(y_train, cb_train_pred , y_test , cb_pred , 'CB')
all_df = pd.concat([all_df , cb_df] , axis = 1)
all_df

### NN Model

In [None]:
import tensorflow as tf

model = tf.keras.Sequential([
  tf.keras.layers.Dense(64, activation='relu'),
  tf.keras.layers.Dense(32, activation='relu'),
  tf.keras.layers.Dense(1, activation='linear')
])

model.compile(optimizer='adam', loss='mse')

model.fit(X_train, y_train, epochs=10)

model_train_pred = model.predict(X_train)
model_pred = model.predict(X_test)

fonks.simple_scores(y_train, model_train_pred , y_test , model_pred , 'Model')

In [None]:
all_df

In [None]:
cb_pred

In [None]:
lgb_pred

In [None]:
xgb_pred

In [None]:
gb_pred

In [None]:
rf_pred

In [None]:
svr_pred

In [None]:
# `sameWeight_preds` is first assigned in the "Combine Same Weight Model" section
# further down, so evaluating it here raised NameError on a fresh kernel.

In [None]:
# `differentWeight_preds` is first assigned in the "Combine Different Weight Model"
# section further down, so evaluating it here raised NameError on a fresh kernel.

# Combine Same Weight Model

In [None]:
sameWeight_preds = np.round((cb_pred + lgb_pred + xgb_pred + gb_pred + rf_pred + svr_pred) / 6)

sameWeight_r2 = fonks.get_r2_score(y_test , sameWeight_preds)
sameWeight_mae = fonks.get_mae_score(y_test , sameWeight_preds)
sameWeight_rmse = fonks.get_rmse_score(y_test , sameWeight_preds)

# Combine Different Weight Model

In [None]:
differentWeight_preds = np.round(cb_pred * 0.4 + lgb_pred * 0.1  + xgb_pred * 0.25 + gb_pred * 0.05 + rf_pred * 0.05 + svr_pred * 0.15)

differentWeight_r2 = fonks.get_r2_score(y_test , differentWeight_preds)
differentWeight_mae = fonks.get_mae_score(y_test , differentWeight_preds)
differentWeight_rmse = fonks.get_rmse_score(y_test , differentWeight_preds)

# Simple Ensemble Same Weight Model

In [None]:
simple_sameWeight_preds = np.round((cb_pred + xgb_pred + svr_pred) / 3)

simple_sameWeight_r2 = fonks.get_r2_score(y_test , simple_sameWeight_preds)
simple_sameWeight_mae = fonks.get_mae_score(y_test , simple_sameWeight_preds)
simple_sameWeight_rmse = fonks.get_rmse_score(y_test , simple_sameWeight_preds)

# Simple Ensemble Different Weight Model

In [None]:
simple_differentWeight_preds = np.round((cb_pred * 0.49 + xgb_pred * 0.35 + svr_pred * 0.16))

simple_differentWeight_r2 = fonks.get_r2_score(y_test , simple_differentWeight_preds)
simple_differentWeight_mae = fonks.get_mae_score(y_test , simple_differentWeight_preds)
simple_differentWeight_rmse = fonks.get_rmse_score(y_test , simple_differentWeight_preds)

# Compare Models

In [None]:
# One row per model. The three metric lists follow the order of "Model"; the last
# entry of MAE and RMSE previously repeated the R2 of the simple weighted ensemble.
compare = pd.DataFrame({"Model": ["LinearRegression", "Ridge", "SVR", "DecisionTree", "RandomForest", "GradientBoosting",
                                 "XGBoost", "LGBM" , "CatBoost" , 'SameWeightModel' , 'DifferentWeightModel' , 'SimpleSameWeightModel' , 'SimpleDifferentWeightModel'],
                        "R2": [lr_r2, rdg_r2, svr_r2, dt_r2, rf_r2 , gb_r2, xgb_r2, lgb_r2 , cb_r2 , sameWeight_r2 , differentWeight_r2 , simple_sameWeight_r2 , simple_differentWeight_r2 ],
                        "MAE": [lr_mae, rdg_mae, svr_mae, dt_mae, rf_mae, gb_mae, xgb_mae, lgb_mae , cb_mae , sameWeight_mae , differentWeight_mae , simple_sameWeight_mae , simple_differentWeight_mae],
                        "RMSE": [lr_rmse, rdg_rmse, svr_rmse, dt_rmse, rf_rmse, gb_rmse, xgb_rmse, lgb_rmse , cb_rmse , sameWeight_rmse , differentWeight_rmse , simple_sameWeight_rmse , simple_differentWeight_rmse],
                        })

def labels(ax):
    """
    Write the length of every bar next to it.

    For each patch of ``ax`` the bar width, formatted with three decimals, is
    placed at the end of the bar, left-aligned and vertically centred on it.
    """
    for bar in ax.patches:
        bar_length = bar.get_width()
        bar_middle = bar.get_y() + bar.get_height() / 2
        ax.text(bar_length, bar_middle, f'{bar_length:1.3f}', ha='left', va='center')
    
plt.figure(figsize=(14,10))
plt.subplot(311)
compare = compare.sort_values(by="R2", ascending=False)
ax=sns.barplot(x="R2", y="Model", data=compare, palette="Blues_d")
labels(ax)

plt.subplot(312)
compare = compare.sort_values(by="MAE", ascending=True)
ax=sns.barplot(x="MAE", y="Model", data=compare, palette="Blues_d")
labels(ax)

plt.subplot(313)
compare = compare.sort_values(by="RMSE", ascending=True)
ax=sns.barplot(x="RMSE", y="Model", data=compare, palette="Blues_d")
labels(ax)
plt.show()

In [None]:
# Done

# Compare Preds

In [None]:
from sklearn.metrics import jaccard_score
from scipy.stats import pearsonr

def pearson_compare(df1 , df2 , feature = 'Age'):
    """
    Print the Pearson correlation between the same column of two frames.

    ``feature`` names the column read from both ``df1`` and ``df2``.
    """
    first_values, second_values = (frame[feature].values for frame in (df1, df2))

    # pearsonr returns (statistic, p-value); only the statistic is reported.
    coefficient = pearsonr(first_values, second_values)[0]

    print("Pearson Korelasyon Katsayısı:", coefficient)
    
 
def jaccard_compare_preds(pred1, pred2):
    """Print the weighted-average Jaccard similarity of two prediction vectors."""
    similarity = jaccard_score(pred1, pred2, average='weighted')
    print("Jaccard Benzerlik Katsayısı:", similarity)
    
def jaccard_compare_dfs(df1 , df2 , feature = 'Age'):
    """
    Print the weighted-average Jaccard similarity of one column of two frames.

    ``feature`` names the column read from both ``df1`` and ``df2``.
    """
    column_pair = [frame[feature].values for frame in (df1, df2)]
    similarity = jaccard_score(*column_pair, average='weighted')
    print("Jaccard Benzerlik Katsayısı:", similarity)


In [None]:
jaccard_compare_preds(simple_differentWeight_preds , simple_sameWeight_preds)

In [None]:
jaccard_compare_preds(simple_differentWeight_preds , differentWeight_preds)

In [None]:
# NOTE(review): `simple_differentWeight_preds` holds predictions for `X_test`, the
# hold-out split of the oversampled training data, while `sub` is the sample
# submission file. The competition test set is never loaded (the read of test.csv is
# commented out), so these values presumably do not correspond to the submission
# rows — verify before submitting.
sub['Age'] = simple_differentWeight_preds