<a href="https://colab.research.google.com/github/Batboldsanghi/Advanced-Lane/blob/main/Excluding_8th_features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install catboost
!pip install scikit-learn
!pip install ipywidgets
!jupyter nbextension enable --py widgetsnbextension
!pip install shap

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.0.6-cp37-none-manylinux1_x86_64.whl (76.6 MB)
[K     |████████████████████████████████| 76.6 MB 78 kB/s 
Installing collected packages: catboost
Successfully installed catboost-1.0.6
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Enabling notebook extension jupyter-js-widgets/extension...
Paths used for configuration of notebook: 
    	/root/.jupyter/nbconfig/notebook.json
      - Validating: [32mOK[0m
Paths used for configuration of notebook: 
    	/root/.jupyter/nbconfig/notebook.json
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting shap
  Downloading shap-0.41.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (569 kB)
[

In [3]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

In [4]:
colab = True

if colab:
    from google.colab import drive
    drive.mount('/content/drive')

    !pip install -q kaggle
    !mkdir ~/.kaggle
    !cp /content/drive/MyDrive/keys/kaggle.json ~/.kaggle/ 
    !chmod 600 ~/.kaggle/kaggle.json

Mounted at /content/drive


In [6]:
!cp /content/drive/MyDrive/code/utils_rfm.py .

from utils_rfm import plot_feature_importance

In [7]:
if colab:
    !kaggle datasets download -d huseyincot/amex-agg-data-pickle
    !unzip amex-agg-data-pickle.zip

Downloading amex-agg-data-pickle.zip to /content
 99% 2.75G/2.76G [00:26<00:00, 130MB/s]
100% 2.76G/2.76G [00:27<00:00, 110MB/s]
Archive:  amex-agg-data-pickle.zip
  inflating: test_agg.pkl            
  inflating: train_agg.pkl           


In [9]:
if colab:
    # NOTE(review): unpickling can execute arbitrary code; these files come
    # from a downloaded Kaggle archive, so only load them if the source is trusted.
    train, test = (
        pd.read_pickle(pkl_name, compression="gzip")
        for pkl_name in ("train_agg.pkl", "test_agg.pkl")
    )

In [10]:
# Columns stored as float16 in `test` are rounded to two decimals in both
# frames. The rounding is done in float32 and the result cast back to float16.
float16_columns = [name for name in test.columns if test[name].dtype == 'float16']
for frame in (train, test):
    for name in float16_columns:
        widened = frame[name].astype('float32')
        frame[name] = widened.round(decimals=2).astype('float16')

# Competition Metric

In [11]:
def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
    """Average of the normalized weighted Gini and the top-4% capture rate.

    Parameters
    ----------
    y_true : pd.DataFrame
        Frame with a ``target`` column. Rows whose target equals 0 are given
        weight 20, every other row weight 1.
    y_pred : pd.DataFrame
        Frame with a ``prediction`` column. It is joined to ``y_true``
        column-wise, so both frames must share the same index.

    Returns
    -------
    float
        ``0.5 * (G + D)`` where ``G`` is the normalized weighted Gini and
        ``D`` the share of positives found within the top 4% of total weight.
    """

    def _ranked(labels: pd.DataFrame, scores: pd.DataFrame) -> pd.DataFrame:
        # Join labels and scores, order rows by descending score, attach weights.
        ranked = pd.concat([labels, scores], axis='columns')
        ranked = ranked.sort_values('prediction', ascending=False)
        ranked['weight'] = ranked['target'].apply(lambda t: 20 if t == 0 else 1)
        return ranked

    def top_four_percent_captured(labels: pd.DataFrame, scores: pd.DataFrame) -> float:
        ranked = _ranked(labels, scores)
        weight_budget = int(0.04 * ranked['weight'].sum())
        # Rows whose cumulative weight still fits inside the 4% budget.
        within_budget = ranked['weight'].cumsum() <= weight_budget
        is_positive = ranked['target'] == 1
        return (is_positive & within_budget).sum() / is_positive.sum()

    def weighted_gini(labels: pd.DataFrame, scores: pd.DataFrame) -> float:
        ranked = _ranked(labels, scores)
        weight = ranked['weight']
        weighted_target = ranked['target'] * weight
        random_curve = (weight / weight.sum()).cumsum()
        lorentz_curve = weighted_target.cumsum() / weighted_target.sum()
        return ((lorentz_curve - random_curve) * weight).sum()

    def normalized_weighted_gini(labels: pd.DataFrame, scores: pd.DataFrame) -> float:
        # Scoring the labels against themselves gives the normalizing Gini value.
        perfect_scores = labels.rename(columns={'target': 'prediction'})
        return weighted_gini(labels, scores) / weighted_gini(labels, perfect_scores)

    gini_part = normalized_weighted_gini(y_true, y_pred)
    capture_part = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (gini_part + capture_part)

In [12]:
# Every column of the test frame counts as a model feature.
features = test.columns.tolist()

# Categorical columns, referenced through their "_last" variant
# (presumably the last-statement aggregate of each raw column — TODO confirm).
cat_features = [
    f"{base_name}_last"
    for base_name in (
        "B_30", "B_38",
        "D_114", "D_116", "D_117", "D_120", "D_126",
        "D_63", "D_64", "D_66", "D_68",
    )
]

# One encoder instance is re-fitted per column: it learns the labels from the
# training column and applies the same mapping to the matching test column.
le_encoder = LabelEncoder()
for categorical_feature in cat_features:
    train_codes = le_encoder.fit_transform(train[categorical_feature])
    train[categorical_feature] = train_codes
    test[categorical_feature] = le_encoder.transform(test[categorical_feature])

# My Part: Hold-out Split and CatBoost Baseline

In [13]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

In [14]:
# Separate the label (kept as a one-column frame) from the feature columns.
train_y = train["target"].to_frame()
train_x = train.drop(columns="target")

# Two stratified 80/20 splits in a row: first carve out the test split,
# then split the remainder into train and validation.
X_train_valid, X_test, y_train_valid, y_test = train_test_split(
    train_x,
    train_y,
    test_size=0.20,
    random_state=42,
    stratify=train_y,
)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_valid,
    y_train_valid,
    test_size=0.20,
    random_state=42,
    stratify=y_train_valid,
)

In [15]:
# Share of all labelled rows that ended up in the training split.
X_train.shape[0] / train_x.shape[0]

0.6399993027000761

In [16]:
from catboost import metrics
from catboost import Pool
import shap

# Wrap each split in a CatBoost Pool (features + labels).
# NOTE(review): `cat_features` is not passed here, so the label-encoded
# categorical columns are handed over like any other column — confirm intended.
train_pool, validation_pool, test_pool = (
    Pool(data=split_x, label=split_y)
    for split_x, split_y in (
        (X_train, y_train),
        (X_valid, y_valid),
        (X_test, y_test),
    )
)

In [None]:
%%time

cb_clf = CatBoostClassifier(
    iterations=500,
    learning_rate=0.1,
    depth=5,
    border_count=100,
    l2_leaf_reg=30,
    #task_type="GPU",
    custom_metric=[metrics.PRAUC(),metrics.F1(), metrics.Precision(), metrics.Recall()] # metrics.AUC(),
)

cb_clf.fit(
    train_pool,
    eval_set=validation_pool,
    verbose=500,
    #plot=True,
)

In [None]:
# Score the held-out split; the second predict_proba column is used as the score.
preds = cb_clf.predict_proba(X_test)[:, 1]
y_pred = pd.DataFrame({'prediction': preds})

# The metric joins labels and predictions on the index, so the labels are
# re-indexed from zero to line up with the freshly built prediction frame.
y_test_aligned = y_test.reset_index(drop=True)
val_score = amex_metric(y_test_aligned, y_pred)
print(f"Amex metric: {val_score}")