In [1]:
from pathlib import Path

import pandas as pd
import numpy as np
import scipy as sp
import implicit
import bisect

import feature_utils

In [2]:
# Filesystem layout: input data locations plus the directories holding
# fitted feature transformers and the generated per-user features.
data_root = Path('data/')
features_root = Path('feature_transformers/')
features_dir = Path('user_features/')
converted_data_path = data_root.joinpath('data_converted')

In [3]:
# Categorical columns; each one has a serialized enumerator that is loaded by name.
CAT_FEATURES = [
    'region_name',
    'city_name',
    'cpe_manufacturer_name',
    'cpe_model_name',
    'url_host',
    'cpe_type_cd',
    'cpe_model_os_type',
    'part_of_day',
]

In [4]:
# Hard-coded dataset cardinalities; they size every per-user / per-URL array below.
n_users = 415_317
n_urls = 199_683

In [5]:
# Loading
# Restore one serialized enumerator per categorical feature (<feature>.json)
# and the price scaler from the transformers directory.
cat_feature_enumerators = {}
for feature_name in CAT_FEATURES:
    enumerator_path = (features_root / feature_name).with_suffix('.json')
    cat_feature_enumerators[feature_name] = feature_utils.CatNumerator.load(enumerator_path)

price_scaler = feature_utils.ZeroOneScaler.load(features_root / 'price.json')

In [6]:
# Development helper: re-import feature_utils so that edits to the module are
# picked up without restarting the kernel.
# NOTE(review): objects created before this cell runs (e.g. the enumerators
# loaded above) keep referring to the previously imported module version.
from importlib import reload
reload(feature_utils)

<module 'feature_utils' from '/home/andy/mts_2023/feature_utils.py'>

# Categorical statistics

In [6]:
# Categorical columns for which per-user category statistics are computed and stored.
stat_features = [
    'region_name',
    'city_name',
    'cpe_manufacturer_name',
    'cpe_model_name',
    'cpe_type_cd',
]

In [55]:
# One summarizer per categorical feature, sized by the number of users and the
# number of categories known to that feature's enumerator.
stat_calculators = {}
for feature_name in stat_features:
    n_categories = len(cat_feature_enumerators[feature_name])
    stat_calculators[feature_name] = feature_utils.CatStatSummarizer(n_users, n_categories)

# Stream the converted data part by part and feed every summarizer with
# (user id, category value, request count) triples.
for part in feature_utils.read_dir(converted_data_path):
    user_ids = part['user_id'].values
    request_cnts = part['request_cnt'].values
    for feature_name, summarizer in stat_calculators.items():
        summarizer.update(user_ids, part[feature_name].values, request_cnts)

In [58]:
# Persist each feature's summary as one .npz archive with the arrays 'top' and 'numb'.
for feature_name, summarizer in stat_calculators.items():
    np.savez(
        features_dir / f'{feature_name}.npz',
        top=summarizer.get_top_cats(),
        numb=summarizer.get_cat_numbs(),
    )

In [7]:
# Re-open the stored statistics archives, keyed by feature name.
cat_stats = {}
for feature_name in stat_features:
    cat_stats[feature_name] = np.load(features_dir / f'{feature_name}.npz')

# Mean targets

In [12]:
# Ground-truth labels of the public training users.
users_gt = pd.read_parquet(data_root / 'public_train.pqt')

# Dense per-user target arrays indexed by user id; -1 marks "label unknown".
user_ages = np.full(n_users, -1.)
user_genders = np.full(n_users, -1.)

# A gender label is usable when it is neither null nor the literal string 'NA'.
gender_mask = users_gt['is_male'].notna() & (users_gt['is_male'] != 'NA')
user_genders[users_gt.loc[gender_mask, 'user_id']] = users_gt.loc[gender_mask, 'is_male'].astype(np.int32)

# An age label is usable when it is not null.
age_mask = users_gt['age'].notna()
user_ages[users_gt.loc[age_mask, 'user_id']] = users_gt.loc[age_mask, 'age'].astype(np.int32)

In [19]:
# Features whose stored 'top' array (one entry per user) is used as the
# grouping key for the mean-target features.
cat_features = [
    'region_name',
    'city_name',
    'cpe_manufacturer_name',
    'cpe_model_name',
    'cpe_type_cd',
]
user_cats = {}
for feature_name in cat_features:
    user_cats[feature_name] = np.load(features_dir / f'{feature_name}.npz')['top']

In [17]:
# Global target means over the labelled users (missing ages are skipped by pandas' mean).
global_mean_age = users_gt['age'].mean()
global_mean_gender = users_gt.loc[gender_mask, 'is_male'].astype(np.float32).mean()
(global_mean_gender, global_mean_age)

(0.5119889974594116, 38.78977329545665)

In [41]:
# Every category starts with SMOOTHING_WEIGHT pseudo-observations of the global
# mean (sum = weight * mean, counter = weight), so rare categories stay close to the prior.
SMOOTHING_WEIGHT = 16


def _smoothed_mean_calculators(prior_mean):
    """Build one prior-seeded KeyedMeanCalculator per categorical feature.

    Each calculator is sized to hold every category id present in ``user_cats``
    for that feature (``max id + 1`` slots).
    """
    return {
        feature_name: feature_utils.KeyedMeanCalculator(
            sums=np.full(top_cats.max() + 1, SMOOTHING_WEIGHT * prior_mean, np.float32),
            counters=np.full(top_cats.max() + 1, SMOOTHING_WEIGHT, np.int32),
        )
        for feature_name, top_cats in user_cats.items()
    }


cat_mean_genders = _smoothed_mean_calculators(global_mean_gender)
cat_mean_ages = _smoothed_mean_calculators(global_mean_age)

# Total request count each labelled user contributed; needed later to remove a
# user's own contribution from the category means.
user_age_contributions = np.zeros(n_users, np.int32)
user_gender_contributions = np.zeros(n_users, np.int32)

for part in feature_utils.read_dir(converted_data_path):
    part_users = part['user_id'].values
    part_request_cnts = part['request_cnt'].values

    # Look up labels for every row; negative values mean the user is unlabelled.
    part_genders = user_genders[part_users]
    part_ages = user_ages[part_users]
    part_gender_mask = part_genders >= 0
    part_age_mask = part_ages >= 0

    np.add.at(user_gender_contributions, part_users[part_gender_mask], part_request_cnts[part_gender_mask])
    np.add.at(user_age_contributions, part_users[part_age_mask], part_request_cnts[part_age_mask])

    for feature_name in cat_features:
        # Key each row by the category stored for its user, not by the row's own value.
        feature_values = user_cats[feature_name][part_users]
        cat_mean_genders[feature_name].update(
            feature_values[part_gender_mask],
            part_genders[part_gender_mask],
            part_request_cnts[part_gender_mask],
        )
        cat_mean_ages[feature_name].update(
            feature_values[part_age_mask],
            part_ages[part_age_mask],
            part_request_cnts[part_age_mask],
        )

  0%|          | 0/10 [00:00<?, ?it/s]

In [46]:
# For every feature, query the category mean of each user's category while
# passing the user's own label and request total as the values/multipliers to
# exclude, then store the result as a per-user feature file.
for feature_name in cat_features:
    top_cats = user_cats[feature_name]
    cat_mean_gender = cat_mean_genders[feature_name].get(
        top_cats,
        vals_to_exclude=np.maximum(user_genders, 0),
        multipliers_to_exclude=user_gender_contributions,
    )
    cat_mean_age = cat_mean_ages[feature_name].get(
        top_cats,
        vals_to_exclude=np.maximum(user_ages, 0),
        multipliers_to_exclude=user_age_contributions,
    )
    np.save(features_dir / f'{feature_name}_mean_gender.npy', cat_mean_gender)
    np.save(features_dir / f'{feature_name}_mean_age.npy', cat_mean_age)

# URL mean targets

In [14]:
# Per-URL target accumulators. Only rows of labelled users are fed in, with
# request_cnt passed as the third argument of update().
mean_gender_calculator = feature_utils.KeyedMeanCalculator(n_urls)
mean_age_calculator = feature_utils.KeyedMeanCalculator(n_urls)

for part in feature_utils.read_dir(converted_data_path):
    part_user_ids = part['user_id'].values
    part_urls = part['url_host'].values
    part_requests = part['request_cnt'].values

    # Gender: keep rows whose user has a known (non-negative) label.
    part_genders = user_genders[part_user_ids]
    part_gender_mask = part_genders >= 0
    mean_gender_calculator.update(
        part_urls[part_gender_mask],
        part_genders[part_gender_mask],
        part_requests[part_gender_mask],
    )

    # Age: same procedure with the age labels.
    part_ages = user_ages[part_user_ids]
    part_age_mask = part_ages >= 0
    mean_age_calculator.update(
        part_urls[part_age_mask],
        part_ages[part_age_mask],
        part_requests[part_age_mask],
    )

In [15]:
# Sparse (user x URL) matrix of request counts, summed over all data parts.
interaction_counters = sp.sparse.coo_matrix((n_users, n_urls), dtype=np.int64)

for part in feature_utils.read_dir(converted_data_path):
    counts = part['request_cnt']
    coordinates = (part['user_id'], part['url_host'])
    part_interactions = sp.sparse.coo_matrix((counts, coordinates), shape=(n_users, n_urls))
    interaction_counters += part_interactions

  0%|          | 0/10 [00:00<?, ?it/s]

## For users

In [12]:
# Per-user averages of the per-URL mean targets ("interpolated" age / gender).
# For every row the URL mean is queried with the row's own user passed as the
# contribution to exclude, and the result is averaged per user.
mean_mean_genders_calculator = feature_utils.KeyedMeanCalculator(n_users)
mean_mean_ages_calculator = feature_utils.KeyedMeanCalculator(n_users)

for part in feature_utils.read_dir(converted_data_path):
    # Total request count of each row's (user, url) pair over the whole dataset,
    # read from the accumulated sparse interaction matrix.
    global_interactions = interaction_counters[part['user_id'], part['url_host']].A.ravel()
    
    part_ages = user_ages[part['user_id'].values]
    # First meaning of valid_age_mask: the row's user has a known age label.
    valid_age_mask = part_ages >= 0
    # Unknown labels (-1) are clamped to 0 so they cannot contribute a negative value.
    age_contributions = np.maximum(0, part_ages)
    # Multiplying by the mask zeroes the excluded value/weight for unlabelled users,
    # who never contributed to the URL means in the first place.
    url_mean_ages = mean_age_calculator.get(
        part['url_host'], 
        vals_to_exclude=age_contributions * valid_age_mask, multipliers_to_exclude=global_interactions * valid_age_mask, 
        default=-1
    )
    # Second meaning (name reused): the URL mean is non-negative, i.e. it is not
    # the default=-1 value passed to get() above.
    valid_age_mask = url_mean_ages >= 0
    # global_interactions is passed as the third argument of update()
    # (presumably the weight — confirm in feature_utils).
    mean_mean_ages_calculator.update(
        part['user_id'][valid_age_mask], url_mean_ages[valid_age_mask], 
        global_interactions[valid_age_mask]
    )
    
    # Same procedure for gender.
    part_genders = user_genders[part['user_id'].values]
    valid_gender_mask = part_genders >= 0
    gender_contributions = np.maximum(0, part_genders)
    url_mean_genders = mean_gender_calculator.get(
        part['url_host'], 
        vals_to_exclude=gender_contributions * valid_gender_mask, multipliers_to_exclude=global_interactions * valid_gender_mask, 
        default=-1
    )
    # Name reused as above: now marks rows whose URL mean is non-negative.
    valid_gender_mask = url_mean_genders >= 0
    mean_mean_genders_calculator.update(
        part['user_id'][valid_gender_mask], url_mean_genders[valid_gender_mask], 
        global_interactions[valid_gender_mask]
    )

  0%|          | 0/10 [00:00<?, ?it/s]

In [53]:
# Store both per-user interpolated-target calculators under the features directory.
for calculator, file_stem in (
    (mean_mean_ages_calculator, 'interpolated_age'),
    (mean_mean_genders_calculator, 'interpolated_gender'),
):
    calculator.save(features_dir / file_stem)

## For interactions
Per-URL mean targets, with each labelled user's own contribution excluded to mitigate overfitting.

In [18]:
# Per-user interaction histories: two parallel arrays, indexed by user, holding
# the visited URLs and the matching counters.
# NOTE(review): allow_pickle=True unpickles object arrays; only load this
# archive from a trusted source.
interactions_counters_file = np.load('interactions/interactions_with_counters.npz', allow_pickle=True)
interactions_sets, interactions_counters = (
    interactions_counters_file[key] for key in ('unique_urls', 'interaction_counters')
)

In [43]:
# For every user, look up the mean age / gender of each URL in the user's
# history. For labelled users the user's own label and interaction counters are
# passed as the contribution to exclude, so the feature does not leak the target.
#
# The results are ragged (one array per user), so they are written straight into
# pre-allocated 1-D object arrays. Building them with
# np.array(list_of_arrays, dtype=object) would silently produce a 2-D array
# whenever all histories happen to have the same length.
n_histories = min(len(interactions_sets), len(interactions_counters))
interactions_mean_ages = np.empty(n_histories, dtype=object)
interactions_mean_genders = np.empty(n_histories, dtype=object)

for u, (int_set, int_counters) in enumerate(zip(interactions_sets, interactions_counters)):
    if user_ages[u] < 0:
        # Unlabelled user: nothing of theirs is in the URL means, plain lookup.
        hist_mean_ages = mean_age_calculator.get(int_set)
    else:
        hist_mean_ages = mean_age_calculator.get(
            int_set, vals_to_exclude=user_ages[u], multipliers_to_exclude=int_counters)
    interactions_mean_ages[u] = hist_mean_ages

    if user_genders[u] < 0:
        hist_mean_genders = mean_gender_calculator.get(int_set)
    else:
        hist_mean_genders = mean_gender_calculator.get(
            int_set, vals_to_exclude=user_genders[u], multipliers_to_exclude=int_counters)
    interactions_mean_genders[u] = hist_mean_genders

In [44]:
# Write the per-user ragged URL-mean arrays next to the interaction files.
for target_path, ragged_means in (
    ('interactions/url_mean_ages.npy', interactions_mean_ages),
    ('interactions/url_mean_genders.npy', interactions_mean_genders),
):
    np.save(target_path, ragged_means)

# Per user counters

In [90]:
# Per-user totals: summed request_cnt, and the number of data rows (each row adds 1).
users_session_counters = np.zeros(n_users, np.int64)
users_request_counters = np.zeros(n_users, np.int64)

for part in feature_utils.read_dir(converted_data_path):
    part_user_ids = part['user_id'].values
    # np.add.at accumulates correctly even when a user id repeats within the part.
    np.add.at(users_request_counters, part_user_ids, part['request_cnt'].values)
    np.add.at(users_session_counters, part_user_ids, 1)

  0%|          | 0/10 [00:00<?, ?it/s]

In [91]:
# Persist both per-user counters as .npy feature files.
for file_name, counters in (
    ('total_requests.npy', users_request_counters),
    ('sessions_number.npy', users_session_counters),
):
    np.save(features_dir / file_name, counters)

# Per user averages

In [12]:
# Collapse manufacturer names that share the same first word into one id:
# manufacturers_renumeration[i] is the index of the first manufacturer whose
# first space-separated word equals that of manufacturer i (i itself if unique).
manufacturers_short = [c.split(' ')[0] for c in cat_feature_enumerators['cpe_manufacturer_name'].cats]
manufacturers_renumeration = np.arange(len(manufacturers_short))

# Remember the first index of every short name in a dict, replacing the
# quadratic "slice membership test + list.index" scan with one linear pass.
first_index_by_short_name = {}
for i, mn in enumerate(manufacturers_short):
    manufacturers_renumeration[i] = first_index_by_short_name.setdefault(mn, i)

In [14]:
# Persist the manufacturer id remapping alongside the other feature files.
np.save(features_dir / 'manufacturers_renumeration.npy', manufacturers_renumeration)

In [42]:
# Number of part-of-day categories and the modulus applied to 'date' for the
# part-of-week buckets (assumes 'date' is an integer day number — TODO confirm).
N_POD, N_POW = 4, 7

# Mean price per user / device model / (renumbered) manufacturer.
user_price_mean_calc = feature_utils.KeyedMeanCalculator(n_users)
model_price_mean_calc = feature_utils.KeyedMeanCalculator(len(cat_feature_enumerators['cpe_model_name']))
manufacturer_price_mean_calc = feature_utils.KeyedMeanCalculator(len(cat_feature_enumerators['cpe_manufacturer_name']))
# Mean request_cnt per data row for every user.
rps_mean_calc = feature_utils.KeyedMeanCalculator(n_users)
# Per-user share of rows falling into each part of day / part of week.
parts_of_day_mean_calcs = {}
for bucket in range(N_POD):
    parts_of_day_mean_calcs[bucket] = feature_utils.KeyedMeanCalculator(n_users)
parts_of_week_mean_calcs = {}
for bucket in range(N_POW):
    parts_of_week_mean_calcs[bucket] = feature_utils.KeyedMeanCalculator(n_users)

for part in feature_utils.read_dir(converted_data_path):
    user_ids = part['user_id'].values

    # Price statistics use only the rows with a known (non-NaN) price.
    prices = part['price'].values
    prices_mask = ~np.isnan(prices)
    prices = prices[prices_mask]
    priced_models = part['cpe_model_name'].values[prices_mask]
    priced_manufacturers = manufacturers_renumeration[part['cpe_manufacturer_name'].values[prices_mask]]
    user_price_mean_calc.update(user_ids[prices_mask], prices)
    model_price_mean_calc.update(priced_models, prices)
    manufacturer_price_mean_calc.update(priced_manufacturers, prices)

    rps_mean_calc.update(user_ids, part['request_cnt'].values)

    # Averaging a boolean indicator per user yields the share of rows in that bucket.
    day_parts = part['part_of_day'].values
    for pod, calc in parts_of_day_mean_calcs.items():
        calc.update(user_ids, day_parts == pod)
    week_parts = part['date'].values % N_POW
    for pow_, calc in parts_of_week_mean_calcs.items():
        calc.update(user_ids, week_parts == pow_)

  0%|          | 0/10 [00:00<?, ?it/s]

In [34]:
# Persist the price / requests-per-row calculators, then one file per part of day.
for file_stem, calc in (
    ('user_price_mean', user_price_mean_calc),
    ('model_price_mean', model_price_mean_calc),
    ('manufacturer_price_mean', manufacturer_price_mean_calc),
    ('rps_mean', rps_mean_calc),
):
    calc.save(features_dir / file_stem)

for pod, calc in parts_of_day_mean_calcs.items():
    calc.save(features_dir / f'part_of_day_{pod}')

In [44]:
# One file per part-of-week bucket.
for pow_, week_calc in parts_of_week_mean_calcs.items():
    week_calc.save(features_dir / f'part_of_week_{pow_}')

In [20]:
# Reload the saved price calculators so the fallback chain below can run
# without recomputing them. The manufacturer-level calculator is loaded too:
# it is saved together with the other two and used by the later fallback steps,
# which would otherwise fail with a NameError on a fresh kernel.
user_price_mean_calc = feature_utils.KeyedMeanCalculator.load(features_dir / 'user_price_mean.npz')
model_price_mean_calc = feature_utils.KeyedMeanCalculator.load(features_dir / 'model_price_mean.npz')
manufacturer_price_mean_calc = feature_utils.KeyedMeanCalculator.load(features_dir / 'manufacturer_price_mean.npz')

In [36]:
# All user ids (0 .. n_users - 1), used to query per-user values in bulk.
user_ids = np.arange(n_users)

In [37]:
# Step 1 of the price fallback chain: the user's own mean price, with -1 as the
# default passed to get() for users that have none.
user_mean_prices = user_price_mean_calc.get(user_ids, default=-1)
# Users still at the -1 sentinel; the displayed value is their share.
unk_mask = user_mean_prices == -1
unk_mask.mean()

0.02577067637491362

In [38]:
# Step 2: for users without their own price, fall back to the mean price of the
# device model stored in their 'top' model entry.
unknown_top_models = cat_stats['cpe_model_name']['top'][user_ids[unk_mask]]
user_mean_prices[unk_mask] = model_price_mean_calc.get(unknown_top_models, default=-1)
# Recompute which users are still at the -1 sentinel and display their share.
unk_mask = user_mean_prices == -1
unk_mask.mean()

0.014776664571881237

In [39]:
# Step 3: fall back to the mean price of the user's 'top' manufacturer, mapped
# through the same renumeration that was used when the means were accumulated.
unknown_top_manufacturers = cat_stats['cpe_manufacturer_name']['top'][user_ids[unk_mask]]
user_mean_prices[unk_mask] = manufacturer_price_mean_calc.get(
    manufacturers_renumeration[unknown_top_manufacturers], default=-1)
# Recompute which users are still at the -1 sentinel and display their share.
unk_mask = user_mean_prices == -1
unk_mask.mean()

0.0007512333952137764

In [40]:
# Last resort: users still unresolved get the value returned by get_global()
# (presumably the overall mean price — confirm in feature_utils).
user_mean_prices[unk_mask] = manufacturer_price_mean_calc.get_global()

In [41]:
# Persist the fully imputed per-user mean prices.
np.save(features_dir / 'user_mean_prices.npy', user_mean_prices)

# Active days share

In [6]:
# Running extremes of the 'date' column, seeded so that any real value replaces them.
min_day = float('inf')
max_day = float('-inf')

In [7]:
# Scan every part once to find the first and last day present in the data.
for part in feature_utils.read_dir(converted_data_path):
    part_dates = part['date']
    min_day = min(min_day, part_dates.min())
    max_day = max(max_day, part_dates.max())

  0%|          | 0/10 [00:00<?, ?it/s]

In [8]:
# Length of the observed date range, both end days included.
n_days = max_day - min_day + 1

In [9]:
# activity_mask[u, d] becomes True when user u has at least one row on day min_day + d.
activity_mask = np.zeros((n_users, n_days), dtype=bool)

In [10]:
for part in feature_utils.read_dir(converted_data_path):
    activity_mask[part['user_id'], part['date'] - min_day] = True

  0%|          | 0/10 [00:00<?, ?it/s]

In [11]:
# 1-based day number on active days, 0 on inactive days.
day_numbers_row = np.arange(1, n_days + 1).astype(np.short)
days_numbers = activity_mask * day_numbers_row

In [12]:
# Largest 1-based day number on which each user was active (0 if never active).
last_active_days = days_numbers.max(axis=1)

In [13]:
# Inactive days are lifted to n_days + 1 so that the row minimum picks the
# earliest active day instead of an inactive day's 0.
inactive_day_fill = (n_days + 1) * ~activity_mask
first_active_days = (days_numbers + inactive_day_fill).min(axis=1)

In [14]:
# Number of distinct active days per user.
active_days_numbers = activity_mask.sum(axis=1)

In [19]:
# Share of active days within each user's own first-to-last activity window.
activity_window_lengths = last_active_days - first_active_days + 1
active_days_shares = active_days_numbers / activity_window_lengths

In [25]:
# Persist the per-user active-day shares.
np.save(features_dir / 'active_days_shares.npy', active_days_shares)

# Hosts stats

In [16]:
# Boolean (user x host) incidence matrix; LIL format is used while single entries are assigned.
n_hosts = len(cat_feature_enumerators['url_host'])
seen_hosts = sp.sparse.lil_matrix((n_users, n_hosts), dtype=bool)

In [17]:
# Flag every (user, host) pair that appears in the data.
for part in feature_utils.read_dir(converted_data_path):
    row_users, row_hosts = part['user_id'], part['url_host']
    seen_hosts[row_users, row_hosts] = True

  0%|          | 0/10 [00:00<?, ?it/s]

In [18]:
# Convert to CSR now that the matrix is fully populated and only read from.
seen_hosts = seen_hosts.tocsr()

In [31]:
# Number of distinct hosts per user: row sums of the incidence matrix, flattened to 1-D.
hosts_per_user = seen_hosts.sum(axis=1)
visited_hosts_numbers = hosts_per_user.A.ravel()

In [33]:
# Persist the per-user count of distinct visited hosts.
np.save(features_dir / 'visited_hosts_numbers.npy', visited_hosts_numbers)

# URL features

In [7]:
# Number of data rows per URL host (every row adds 1, regardless of request_cnt).
url_visit_counters = np.zeros(n_urls, np.int32)

for part in feature_utils.read_dir(converted_data_path):
    visited_urls = part['url_host'].values
    # np.add.at handles URL ids that repeat within the part.
    np.add.at(url_visit_counters, visited_urls, 1)

  0%|          | 0/10 [00:00<?, ?it/s]

In [11]:
# Persist the raw per-URL row counts.
np.save('url_features/visit_counts.npy', url_visit_counters)

In [9]:
# Log-scaled visit counts. Counts are clamped to at least 1 before the log so a
# URL id that never occurs in the data maps to 0.0 instead of -inf (np.log(0)
# also emits a RuntimeWarning). Values for counts >= 1 are unchanged.
# NOTE(review): a zero count becomes indistinguishable from a count of 1 here;
# the raw counts remain available in url_visit_counters.
url_visit_log_counters = np.log(np.maximum(url_visit_counters, 1))

In [10]:
# Persist the log-scaled per-URL counts.
np.save('url_features/visit_log_counts.npy', url_visit_log_counters)