In [3]:
%load_ext autoreload
%autoreload 2



import pandas as pd
import numpy as np
from pathlib import Path
import logging

# Emit DEBUG-level log records; each line is prefixed with a timestamp and the
# name of the function that logged it.
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s :: %(funcName)s :: %(message)s')

# Allow pandas to render up to 20,000 rows before truncating a displayed frame.
pd.set_option('display.max_rows', 20000)

from ccf.preprocess import get_bins

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
def merge_cats_due_to_quantity(
    features_path: Path,
    field_name: str,
    min_count_in_cat: int
) -> pd.DataFrame:
    """Load a features CSV and collapse rare categories of one column into ``-1``.

    Every value of ``field_name`` that occurs fewer than ``min_count_in_cat``
    times in the file is replaced by the sentinel ``-1``; all other values and
    all other columns are returned unchanged.

    Args:
        features_path: Path to the CSV file to read.
        field_name: Name of the integer-coded categorical column to process.
        min_count_in_cat: Minimum number of rows a category needs in order to
            be kept; categories with strictly fewer rows are merged.

    Returns:
        The loaded frame with ``field_name`` cast to ``int32`` and its rare
        categories replaced by ``-1``.

    Notes:
        * The column is cast to ``int32``, so it must not contain missing or
          non-integer values.
        * ``-1`` is used as the merge bucket; a genuine category coded ``-1``
          would become indistinguishable from the merged ones.
    """
    features = pd.read_csv(features_path)
    logging.debug("0. Features load - complete.")

    features = features.astype({field_name: 'int32'})
    # Filter the value_counts Series itself instead of going through
    # ``.to_frame()``: the resulting column is named after the field only in
    # pandas < 2.0 (it is ``count`` from 2.0 on), so indexing the frame by
    # ``field_name`` raises KeyError on current pandas.
    counts = features[field_name].value_counts()
    cats_for_merging = counts.index[counts < min_count_in_cat].to_numpy()
    logging.debug("1. Create cats_for_merging - complete.")

    fields_value = features[field_name].to_numpy()
    # Assign the ndarray directly; wrapping it in ``list`` would only box every
    # element into a Python-level object before pandas converts it back.
    features[field_name] = np.where(
        np.isin(fields_value, cats_for_merging),
        -1,
        fields_value
    )
    logging.debug("2. Convert cats - complete.")
    return features

In [9]:
# Collapse every content_id that occurs fewer than 3,000 times into the -1 bucket.
features_2 = merge_cats_due_to_quantity(
    features_path=Path('../data/full_no_tail/features.csv'),
    field_name='content_id',
    min_count_in_cat=3_000,
)

2020-12-05 12:26:53,725 :: merge_cats_due_to_quantity :: 0. Features load - complete.
2020-12-05 12:26:55,835 :: merge_cats_due_to_quantity :: 1. Create cats_for_merging - complete.
2020-12-05 12:26:56,825 :: merge_cats_due_to_quantity :: 2. Convert cats - complete.


In [10]:
# Sanity check: (rows, columns) of the frame returned by the previous cell.
features_2.shape

(2500000, 108)

In [11]:
# Persist the frame with merged content_id categories; the row index is not written.
features_2.to_csv(Path('../data/full_no_tail/features_contid.csv'), index=False)

In [12]:
# Drop the notebook's reference to the frame; it was written to
# features_contid.csv in the previous cell.
# NOTE(review): any later cell that needs these features has to reload them
# from that CSV, because the name no longer exists after this cell runs.
del features_2

# New binarization

In [69]:
# Load the base table that the next cell joins to the features on `id`.
X = pd.read_csv('../data/base/X.csv')

In [70]:
# `features_2` is removed by the earlier `del features_2` cell, so on a fresh
# Restart & Run All this cell would raise NameError. Reload the frame from the
# CSV that was saved before the deletion.
features_2 = pd.read_csv(Path('../data/full_no_tail/features_contid.csv'))

# Left join: every row of X is kept; ids without a match in features_2 get NaN
# in the feature columns.
data = pd.merge(
    X,
    features_2,
    on='id',
    how='left',
)

In [71]:
# Bin content_id against the target using ccf.preprocess.get_bins.
# NOTE(review): the keyword names look like decision-tree hyperparameters
# (no depth limit, leaves of at least 2 samples) — confirm in ccf.preprocess.
dict_ = get_bins(
    name='content_id',
    data=data[['content_id', 'target']],
    list_label=list(data['target']),
    min_weight_fraction_leaf=0.0,
    min_samples_leaf=2,
    min_samples_split=2,
    max_depth=None,
)

In [72]:
# Display element 1 of the get_bins result.
# NOTE(review): the keys look like bin thresholds on content_id and the values
# like per-bin target rates — confirm against get_bins.
dict_[1]

{1.5: 0.62,
 25.5: 0.5,
 110.0: 1.0,
 180.0: 0.33,
 193.0: 1.0,
 210.5: 0.5,
 217.5: 0.71,
 223.5: 0.0,
 238.5: 0.5,
 247.5: 0.5,
 281.5: 0.75,
 292.5: 0.5,
 295.0: 1.0,
 303.5: 0.67,
 314.5: 1.0,
 349.0: 0.33,
 373.5: 1.0,
 377.5: 0.6,
 412.5: 1.0,
 444.5: 0.0,
 455.0: 0.5,
 473.0: 1.0,
 502.0: 0.25,
 523.5: 1.0,
 579.0: 0.33,
 584.5: 1.0,
 590.0: 0.25,
 615.5: 0.5,
 646.0: 0.5,
 662.5: 0.33,
 683.0: 0.0,
 735.0: 1.0,
 759.0: 0.5,
 829.0: 1.0,
 844.5: 0.5,
 870.5: 0.5,
 993.0: 1.0,
 995.0: 0.5,
 999.0: 1.0,
 1015.5: 0.0,
 1032.0: 0.5,
 1059.0: 1.0,
 1084.0: 0.0,
 1111.5: 1.0,
 1113.0: 0.0,
 1137.5: 0.75,
 1179.0: 0.5,
 1191.5: 1.0,
 1202.0: 0.67,
 1260.5: 0.0,
 1272.5: 0.5,
 1280.5: 1.0,
 1286.5: 0.5,
 1305.5: 1.0,
 1337.0: 0.5,
 1650.5: 0.67,
 2063.5: 0.0,
 2329.0: 1.0,
 2593.5: 0.47,
 2594.5: 0.13,
 2770.5: 0.53,
 2946.5: 0.0,
 2947.5: 0.21,
 3174.5: 0.31,
 3269.5: 0.5,
 3363.5: 0.2,
 3364.5: 0.69,
 3474.5: 0.29,
 3596.5: 0.0,
 3634.0: 1.0,
 3638.0: 0.0,
 3662.5: 0.5,
 3746.0: 1.0,
