In [1]:
# ================================================================================
# FEATURE ENGINEERING 
# ================================================================================

import pandas as pd
import numpy as np
import pickle
from collections import Counter, defaultdict
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

print("\n" + "="*80)
print(" FEATURE ENGINEERING")
print("="*80)

# ================================================================================
# 1. –ó–ê–ì–†–£–ó–ö–ê –î–ê–ù–ù–´–•
# ================================================================================

print("\n[1/6] –ó–∞–≥—Ä—É–∑–∫–∞ –¥–∞–Ω–Ω—ã—Ö –∏–∑ preprocessing...")

# –Ω–∞–∑–≤–∞–Ω–∏—è —Ñ–∞–π–ª–æ–≤
train_df = pd.read_csv('train_dataset.csv')
test_df = pd.read_csv('test_dataset.csv')
complete_df = pd.read_csv('complete_dataset.csv')

with open('holdout_dict.pkl', 'rb') as f:
    holdout_dict = pickle.load(f)

#  —Å—Ç—Ä—É–∫—Ç—É—Ä–∞ embeddings
with open('book_embeddings.pkl', 'rb') as f:
    book_embeddings_dict = pickle.load(f)
    # –ò—Å–ø–æ–ª—å–∑—É–µ–º –∫–æ–º–±–∏–Ω–∏—Ä–æ–≤–∞–Ω–Ω—ã–µ —ç–º–±–µ–¥–¥–∏–Ω–≥–∏ (768-dim)
    book_embeddings = book_embeddings_dict['book_combined']

with open('user_embeddings.pkl', 'rb') as f:
    user_embeddings_dict = pickle.load(f)
    user_embeddings_train = user_embeddings_dict['train']
    user_embeddings_test = user_embeddings_dict['test']

# –°–æ–∑–¥–∞–µ–º book_tags_clean –∏–∑ complete_dataset
print(" –°–æ–∑–¥–∞–Ω–∏–µ book_tags_clean...")
book_tags_clean = {}
for _, row in complete_df[['book_id', 'tags_text']].drop_duplicates('book_id').iterrows():
    if pd.notna(row['tags_text']) and row['tags_text'].strip() != '':
        book_tags_clean[row['book_id']] = row['tags_text'].split()
    else:
        book_tags_clean[row['book_id']] = []

print(f"  –ó–∞–≥—Ä—É–∂–µ–Ω–æ:")
print(f"     ‚Ä¢ Train: {len(train_df):,} –æ—Ü–µ–Ω–æ–∫")
print(f"     ‚Ä¢ Test: {len(test_df):,} –æ—Ü–µ–Ω–æ–∫")
print(f"     ‚Ä¢ Book embeddings: {len(book_embeddings):,} –∫–Ω–∏–≥ √ó 768")
print(f"     ‚Ä¢ User embeddings (train): {len(user_embeddings_train):,} √ó 768")
print(f"     ‚Ä¢ User embeddings (test): {len(user_embeddings_test):,} √ó 768")
print(f"     ‚Ä¢ Book tags: {len(book_tags_clean):,} –∫–Ω–∏–≥")
print(f"     ‚Ä¢ Holdout users: {len(holdout_dict):,}")

# ================================================================================
# 2. USER FEATURES (4 –ø—Ä–∏–∑–Ω–∞–∫–∞)
# ================================================================================

print("\n" + "="*80)
print("[2/6] –°–û–ó–î–ê–ù–ò–ï USER FEATURES")
print("="*80)

print("\n  –í—ã—á–∏—Å–ª–µ–Ω–∏–µ —Å—Ç–∞—Ç–∏—Å—Ç–∏–∫ –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª–µ–π...")

# 2.1. –ë–∞–∑–æ–≤–∞—è —Å—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞
user_stats = train_df.groupby('user_id').agg({
    'rating': ['mean', 'count'], 'book_id': 'nunique'
}).reset_index()

user_stats.columns = ['user_id', 'avg_user_rating', 'ratings_count', 'unique_books_count']  

print(f"     avg_user_rating, ratings_count, unique_books_count")

# 2.2. Tag vocabulary size (—Ä–∞–∑–Ω–æ–æ–±—Ä–∞–∑–∏–µ —Ç–µ–≥–æ–≤ –≤ –∏—Å—Ç–æ—Ä–∏–∏)
print("\n  –í—ã—á–∏—Å–ª–µ–Ω–∏–µ tag_vocab_size...")

user_tag_vocab = {}

for user_id, group in tqdm(train_df.groupby('user_id'), desc="     Processing users"):
    user_tag_set = set()
    
    for book_id in group['book_id']:
        if book_id in book_tags_clean:
            user_tag_set.update(book_tags_clean[book_id])
    
    user_tag_vocab[user_id] = len(user_tag_set)

user_stats['tag_vocab_size'] = user_stats['user_id'].map(user_tag_vocab).fillna(0)

print(f"     tag_vocab_size")

# 2.3. Activity score (–∫–æ–º–±–∏–Ω–∏—Ä–æ–≤–∞–Ω–Ω–∞—è –º–µ—Ç—Ä–∏–∫–∞ –∞–∫—Ç–∏–≤–Ω–æ—Å—Ç–∏)
print("\n  –í—ã—á–∏—Å–ª–µ–Ω–∏–µ activity_score...")

def minmax_scale(series):
    """Min-max –Ω–æ—Ä–º–∞–ª–∏–∑–∞—Ü–∏—è"""
    min_val = series.min()
    max_val = series.max()
    if max_val == min_val:
        return pd.Series(np.ones(len(series)), index=series.index)
    return (series - min_val) / (max_val - min_val)

# –ù–æ—Ä–º–∞–ª–∏–∑—É–µ–º ratings_count –∏ tag_vocab_size
norm_ratings = minmax_scale(user_stats['ratings_count'])
norm_vocab = minmax_scale(user_stats['tag_vocab_size'])

# Activity = 0.7 * norm(ratings) + 0.3 * norm(vocab)
user_stats['activity_score'] = 0.7 * norm_ratings + 0.3 * norm_vocab

print(f"      activity_score")

# 2.4. –°–µ–≥–º–µ–Ω—Ç–∞—Ü–∏—è –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª–µ–π
print("\n  –°–µ–≥–º–µ–Ω—Ç–∞—Ü–∏—è –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª–µ–π...")

def classify_user_segment(unique_books): 
    """–ö–ª–∞—Å—Å–∏—Ñ–∏—Ü–∏—Ä—É–µ—Ç –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—è –ø–æ –∞–∫—Ç–∏–≤–Ω–æ—Å—Ç–∏"""
    if unique_books < 5:
        return 'new'
    elif 5 <= unique_books <= 10:
        return 'inactive'
    elif 11 <= unique_books <= 60:
        return 'active'
    else:
        return 'very_active'

user_stats['segment'] = user_stats['unique_books_count'].apply(classify_user_segment)
# –£–¥–∞–ª—è–µ–º unique_books_count (–Ω—É–∂–µ–Ω –±—ã–ª —Ç–æ–ª—å–∫–æ –¥–ª—è —Å–µ–≥–º–µ–Ω—Ç–∞—Ü–∏–∏)
user_stats = user_stats.drop(columns=['unique_books_count'])

# –°—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞ –ø–æ —Å–µ–≥–º–µ–Ω—Ç–∞–º
print(f"\n   –†–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ –ø–æ —Å–µ–≥–º–µ–Ω—Ç–∞–º:")
segment_counts = user_stats['segment'].value_counts()
for segment, count in segment_counts.items():
    pct = count / len(user_stats) * 100
    print(f"     ‚Ä¢ {segment:15s}: {count:>6,} ({pct:>5.1f}%)")

# –°–æ—Ö—Ä–∞–Ω—è–µ–º user features
user_stats.to_csv('user_features.csv', index=False)
print(f"\n   User features —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã: user_features.csv")
print(f"     –ü—Ä–∏–∑–Ω–∞–∫–æ–≤: {len(user_stats.columns) - 1} (–∫—Ä–æ–º–µ user_id)")

# ================================================================================
# 3. BOOK FEATURES (3 –ø—Ä–∏–∑–Ω–∞–∫–∞)
# ================================================================================

print("\n" + "="*80)
print("[3/6] –°–û–ó–î–ê–ù–ò–ï BOOK FEATURES")
print("="*80)

print("\n  –í—ã—á–∏—Å–ª–µ–Ω–∏–µ —Å—Ç–∞—Ç–∏—Å—Ç–∏–∫ –∫–Ω–∏–≥...")

# 3.1. –ë–∞–∑–æ–≤–∞—è —Å—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞
book_stats = train_df.groupby('book_id').agg({
    'rating': ['mean', 'count']
}).reset_index()

book_stats.columns = ['book_id', 'book_avg_rating', 'book_ratings_count']

print(f"      book_avg_rating, book_ratings_count")

# 3.2. Book popularity (–∫–æ–º–±–∏–Ω–∏—Ä–æ–≤–∞–Ω–Ω–∞—è –º–µ—Ç—Ä–∏–∫–∞)
print("\n  –í—ã—á–∏—Å–ª–µ–Ω–∏–µ book_popularity...")

# –ù–æ—Ä–º–∞–ª–∏–∑—É–µ–º
norm_book_ratings = minmax_scale(book_stats['book_ratings_count'])
norm_book_avg = minmax_scale(book_stats['book_avg_rating'])

# Popularity = 0.7 * norm(ratings_count) + 0.3 * norm(avg_rating)
book_stats['book_popularity'] = 0.7 * norm_book_ratings + 0.3 * norm_book_avg

print(f"     book_popularity")

# –°–æ—Ö—Ä–∞–Ω—è–µ–º book features
book_stats.to_csv('book_features.csv', index=False)
print(f"\n  Book features —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã: book_features.csv")
print(f"     –ü—Ä–∏–∑–Ω–∞–∫–æ–≤: {len(book_stats.columns) - 1} (–∫—Ä–æ–º–µ book_id)")

# ================================================================================
# 4. INTERACTION FEATURES (6 –ø—Ä–∏–∑–Ω–∞–∫–æ–≤) 
# ================================================================================

print("\n" + "="*80)
print("[4/6] –°–û–ó–î–ê–ù–ò–ï INTERACTION FEATURES")
print("="*80)

print("\n –≠—Ç–æ –∑–∞–π–º–µ—Ç –≤—Ä–µ–º—è (–º–Ω–æ–≥–æ –ø–∞—Ä user-book)...")
print(f"  –ë—É–¥–µ—Ç —Å–æ–∑–¥–∞–Ω–æ –ø—Ä–∏–∑–Ω–∞–∫–æ–≤ –¥–ª—è: {len(train_df):,} train + {len(test_df):,} test")

# –†–∞–∑–º–µ—Ä–Ω–æ—Å—Ç—å —ç–º–±–µ–¥–¥–∏–Ω–≥–æ–≤ 768
EMB_DIM = 768

# –§—É–Ω–∫—Ü–∏—è –ë–ï–ó data leakage –¥–ª—è train
def create_interaction_features_train(idx, user_id, book_id, train_df, book_tags_clean, 
                                       user_embeddings, book_embeddings):
    """
    –°–æ–∑–¥–∞–µ—Ç 6 –ø—Ä–∏–∑–Ω–∞–∫–æ–≤ –≤–∑–∞–∏–º–æ–¥–µ–π—Å—Ç–≤–∏—è –¥–ª—è TRAIN
    –ò—Å–∫–ª—é—á–∞–µ—Ç —Ç–µ–∫—É—â—É—é –∫–Ω–∏–≥—É –∏–∑ –∏—Å—Ç–æ—Ä–∏–∏ –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—è!
    """
    features = {}
    
    # --- Tag-based features (–ë–ï–ó —Ç–µ–∫—É—â–µ–π –∫–Ω–∏–≥–∏ –≤ –∏—Å—Ç–æ—Ä–∏–∏) ---
    # –ü–æ–ª—É—á–∞–µ–º –∏—Å—Ç–æ—Ä–∏—é —é–∑–µ—Ä–∞ –î–û —Ç–µ–∫—É—â–µ–π —Å—Ç—Ä–æ–∫–∏ (–ø–æ –∏–Ω–¥–µ–∫—Å—É)
    user_history = train_df[(train_df['user_id'] == user_id) & (train_df.index < idx)]
    
    user_tags = set()
    for bid in user_history['book_id']:
        if bid in book_tags_clean:
            user_tags.update(book_tags_clean[bid])
    
    book_tags = set(book_tags_clean.get(book_id, []))
    
    # –ü–µ—Ä–µ—Å–µ—á–µ–Ω–∏–µ —Ç–µ–≥–æ–≤
    intersection = user_tags & book_tags
    union = user_tags | book_tags
    
    features['tag_overlap_count'] = len(intersection)
    features['tag_overlap_ratio'] = len(intersection) / len(book_tags) if book_tags else 0
    features['tag_jaccard'] = len(intersection) / len(union) if union else 0
    features['history_similarity'] = 0.6 * features['tag_overlap_ratio'] + 0.4 * features['tag_jaccard']
    
    # --- Embedding-based features ---
    user_emb = user_embeddings.get(user_id, np.zeros(EMB_DIM))
    book_emb = book_embeddings.get(book_id, np.zeros(EMB_DIM))
    
    # –û–±—Ä–∞–±–æ—Ç–∫–∞ –Ω—É–ª–µ–≤—ã—Ö –≤–µ–∫—Ç–æ—Ä–æ–≤
    user_norm = np.linalg.norm(user_emb)
    book_norm = np.linalg.norm(book_emb)
    
    if user_norm == 0 or book_norm == 0:
        features['embedding_cosine_sim'] = 0.0
        features['embedding_euclidean_dist'] = 1.0  # –º–∞–∫—Å–∏–º–∞–ª—å–Ω–æ–µ —Ä–∞—Å—Å—Ç–æ—è–Ω–∏–µ
    else:
        # –ö–æ—Å–∏–Ω—É—Å–Ω–∞—è –±–ª–∏–∑–æ—Å—Ç—å
        features['embedding_cosine_sim'] = np.dot(user_emb, book_emb) / (user_norm * book_norm)
        
        # –ò–°–ü–†–ê–í–õ–ï–ù–ò–ï 7: –ù–æ—Ä–º–∞–ª–∏–∑–æ–≤–∞–Ω–Ω–æ–µ –µ–≤–∫–ª–∏–¥–æ–≤–æ —Ä–∞—Å—Å—Ç–æ—è–Ω–∏–µ
        # –ù–æ—Ä–º–∞–ª–∏–∑—É–µ–º –≤–µ–∫—Ç–æ—Ä—ã
        user_emb_norm = user_emb / user_norm
        book_emb_norm = book_emb / book_norm
        features['embedding_euclidean_dist'] = np.linalg.norm(user_emb_norm - book_emb_norm) / np.sqrt(2)
    
    return features

# –§—É–Ω–∫—Ü–∏—è –¥–ª—è test (–∏—Å–ø–æ–ª—å–∑—É–µ–º –í–°–Æ –∏—Å—Ç–æ—Ä–∏—é train)
def create_interaction_features_test(user_id, book_id, user_tags_dict, book_tags_clean, 
                                      user_embeddings, book_embeddings):
    """
    –°–æ–∑–¥–∞–µ—Ç 6 –ø—Ä–∏–∑–Ω–∞–∫–æ–≤ –≤–∑–∞–∏–º–æ–¥–µ–π—Å—Ç–≤–∏—è –¥–ª—è TEST
    –ò—Å–ø–æ–ª—å–∑—É–µ—Ç –ø–æ–ª–Ω—É—é –∏—Å—Ç–æ—Ä–∏—é –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—è –∏–∑ train
    """
    features = {}
    
    # --- Tag-based features ---
    user_tags = user_tags_dict.get(user_id, set())
    book_tags = set(book_tags_clean.get(book_id, []))
    
    intersection = user_tags & book_tags
    union = user_tags | book_tags
    
    features['tag_overlap_count'] = len(intersection)
    features['tag_overlap_ratio'] = len(intersection) / len(book_tags) if book_tags else 0
    features['tag_jaccard'] = len(intersection) / len(union) if union else 0
    features['history_similarity'] = 0.6 * features['tag_overlap_ratio'] + 0.4 * features['tag_jaccard']
    
    # --- Embedding-based features ---
    user_emb = user_embeddings.get(user_id, np.zeros(EMB_DIM))
    book_emb = book_embeddings.get(book_id, np.zeros(EMB_DIM))
    
    user_norm = np.linalg.norm(user_emb)
    book_norm = np.linalg.norm(book_emb)
    
    if user_norm == 0 or book_norm == 0:
        features['embedding_cosine_sim'] = 0.0
        features['embedding_euclidean_dist'] = 1.0
    else:
        features['embedding_cosine_sim'] = np.dot(user_emb, book_emb) / (user_norm * book_norm)
        
        user_emb_norm = user_emb / user_norm
        book_emb_norm = book_emb / book_norm
        features['embedding_euclidean_dist'] = np.linalg.norm(user_emb_norm - book_emb_norm) / np.sqrt(2)
    
    return features

# –°–æ–∑–¥–∞–µ–º interaction features –¥–ª—è TRAIN
print(f"\n  –°–æ–∑–¥–∞–Ω–∏–µ interaction features –¥–ª—è train...")

train_interactions = []

for idx, row in tqdm(train_df.iterrows(), total=len(train_df), desc="     Train"):
    features = create_interaction_features_train(
        idx,
        row['user_id'], 
        row['book_id'],
        train_df,
        book_tags_clean,
        user_embeddings_train,
        book_embeddings
    )
    
    # –î–æ–±–∞–≤–ª—è–µ–º –æ—Å–Ω–æ–≤–Ω—ã–µ –ø–æ–ª—è
    features['user_id'] = row['user_id']
    features['book_id'] = row['book_id']
    features['rating'] = row['rating']
    
    train_interactions.append(features)

train_features_df = pd.DataFrame(train_interactions)

print(f"  Train interaction features: {len(train_features_df):,}")

# –°–æ–∑–¥–∞–µ–º —Å–ª–æ–≤–∞—Ä—å user_tags –¥–ª—è TEST (–ø–æ–ª–Ω–∞—è –∏—Å—Ç–æ—Ä–∏—è train)
print(f"\n  –ü–æ–¥–≥–æ—Ç–æ–≤–∫–∞ user tags –¥–ª—è test...")

user_tags_dict = {}

for user_id, group in tqdm(train_df.groupby('user_id'), desc="     User tags"):
    user_tags = set()
    for book_id in group['book_id']:
        if book_id in book_tags_clean:
            user_tags.update(book_tags_clean[book_id])
    user_tags_dict[user_id] = user_tags

print(f"  User tags –≥–æ—Ç–æ–≤—ã: {len(user_tags_dict):,} –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª–µ–π")

# –°–æ–∑–¥–∞–µ–º interaction features –¥–ª—è TEST
print(f"\n  –°–æ–∑–¥–∞–Ω–∏–µ interaction features –¥–ª—è test...")

test_interactions = []

for idx, row in tqdm(test_df.iterrows(), total=len(test_df), desc="     Test"):
    features = create_interaction_features_test(
        row['user_id'], 
        row['book_id'],
        user_tags_dict,
        book_tags_clean,
        user_embeddings_test,  # –ò—Å–ø–æ–ª—å–∑—É–µ–º test embeddings!
        book_embeddings
    )
    
    features['user_id'] = row['user_id']
    features['book_id'] = row['book_id']
    features['rating'] = row['rating']
    
    test_interactions.append(features)

test_features_df = pd.DataFrame(test_interactions)

print(f"  Test interaction features: {len(test_features_df):,}")



 FEATURE ENGINEERING

[1/6] –ó–∞–≥—Ä—É–∑–∫–∞ –¥–∞–Ω–Ω—ã—Ö –∏–∑ preprocessing...
 –°–æ–∑–¥–∞–Ω–∏–µ book_tags_clean...
  –ó–∞–≥—Ä—É–∂–µ–Ω–æ:
     ‚Ä¢ Train: 874,496 –æ—Ü–µ–Ω–æ–∫
     ‚Ä¢ Test: 107,260 –æ—Ü–µ–Ω–æ–∫
     ‚Ä¢ Book embeddings: 10,000 –∫–Ω–∏–≥ √ó 768
     ‚Ä¢ User embeddings (train): 53,424 √ó 768
     ‚Ä¢ User embeddings (test): 35,659 √ó 768
     ‚Ä¢ Book tags: 10,000 –∫–Ω–∏–≥
     ‚Ä¢ Holdout users: 35,659

[2/6] –°–û–ó–î–ê–ù–ò–ï USER FEATURES

  –í—ã—á–∏—Å–ª–µ–Ω–∏–µ —Å—Ç–∞—Ç–∏—Å—Ç–∏–∫ –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª–µ–π...
     avg_user_rating, ratings_count, unique_books_count

  –í—ã—á–∏—Å–ª–µ–Ω–∏–µ tag_vocab_size...


     Processing users: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 53424/53424 [00:04<00:00, 11887.91it/s]


     tag_vocab_size

  –í—ã—á–∏—Å–ª–µ–Ω–∏–µ activity_score...
      activity_score

  –°–µ–≥–º–µ–Ω—Ç–∞—Ü–∏—è –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª–µ–π...

   –†–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ –ø–æ —Å–µ–≥–º–µ–Ω—Ç–∞–º:
     ‚Ä¢ new            : 25,571 ( 47.9%)
     ‚Ä¢ active         : 16,240 ( 30.4%)
     ‚Ä¢ inactive       :  8,343 ( 15.6%)
     ‚Ä¢ very_active    :  3,270 (  6.1%)

   User features —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã: user_features.csv
     –ü—Ä–∏–∑–Ω–∞–∫–æ–≤: 5 (–∫—Ä–æ–º–µ user_id)

[3/6] –°–û–ó–î–ê–ù–ò–ï BOOK FEATURES

  –í—ã—á–∏—Å–ª–µ–Ω–∏–µ —Å—Ç–∞—Ç–∏—Å—Ç–∏–∫ –∫–Ω–∏–≥...
      book_avg_rating, book_ratings_count

  –í—ã—á–∏—Å–ª–µ–Ω–∏–µ book_popularity...
     book_popularity

  Book features —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã: book_features.csv
     –ü—Ä–∏–∑–Ω–∞–∫–æ–≤: 3 (–∫—Ä–æ–º–µ book_id)

[4/6] –°–û–ó–î–ê–ù–ò–ï INTERACTION FEATURES

 –≠—Ç–æ –∑–∞–π–º–µ—Ç –≤—Ä–µ–º—è (–º–Ω–æ–≥–æ –ø–∞—Ä user-book)...
  –ë—É–¥–µ—Ç —Å–æ–∑–¥–∞–Ω–æ –ø—Ä–∏–∑–Ω–∞–∫–æ–≤ –¥–ª—è: 874,496 train + 107,260 test

  –°–æ–∑–¥–∞–Ω–∏–µ interaction feat

     Train: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 874496/874496 [27:40<00:00, 526.64it/s]


  Train interaction features: 874,496

  –ü–æ–¥–≥–æ—Ç–æ–≤–∫–∞ user tags –¥–ª—è test...


     User tags: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 53424/53424 [00:06<00:00, 7790.42it/s]


  User tags –≥–æ—Ç–æ–≤—ã: 53,424 –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª–µ–π

  –°–æ–∑–¥–∞–Ω–∏–µ interaction features –¥–ª—è test...


     Test: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 107260/107260 [00:09<00:00, 10794.18it/s]


  Test interaction features: 107,260


In [2]:

# ================================================================================
# 5. –û–ë–™–ï–î–ò–ù–ï–ù–ò–ï –í–°–ï–• –ü–†–ò–ó–ù–ê–ö–û–í
# ================================================================================

print("\n" + "="*80)
print("[5/6] –û–ë–™–ï–î–ò–ù–ï–ù–ò–ï –í–°–ï–• –ü–†–ò–ó–ù–ê–ö–û–í")
print("="*80)

print("\n  –ú–µ—Ä–¥–∂ —Å user –∏ book features...")

# Train
train_full = train_features_df.merge(user_stats, on='user_id', how='left')
train_full = train_full.merge(book_stats, on='book_id', how='left')

# Test
test_full = test_features_df.merge(user_stats, on='user_id', how='left')
test_full = test_full.merge(book_stats, on='book_id', how='left')

print(f"     Train: {train_full.shape}")
print(f"     Test: {test_full.shape}")

# –î–æ–±–∞–≤–ª—è–µ–º –ø—Ä–∏–∑–Ω–∞–∫–∏ –∏–∑ preprocessing
print("\n  –î–æ–±–∞–≤–ª–µ–Ω–∏–µ –ø—Ä–∏–∑–Ω–∞–∫–æ–≤ –∏–∑ preprocessing...")

# –ü—Ä–∏–∑–Ω–∞–∫–∏ –∫–æ—Ç–æ—Ä—ã–µ –Ω—É–∂–Ω–æ –¥–æ–±–∞–≤–∏—Ç—å
extra_cols = ['language_code_encoded', 'year_normalized', 'publication_era', 'average_rating']

# –î–ª—è train
train_extra = train_df[['user_id', 'book_id'] + extra_cols].copy()
# –°–æ–∑–¥–∞–µ–º —É–Ω–∏–∫–∞–ª—å–Ω—ã–π –∫–ª—é—á (–¥–ª—è —Å–ª—É—á–∞—è –¥—É–±–ª–∏–∫–∞—Ç–æ–≤)
train_extra['row_num'] = train_extra.groupby(['user_id', 'book_id']).cumcount()
train_full['row_num'] = train_full.groupby(['user_id', 'book_id']).cumcount()

train_full = train_full.merge(train_extra, on=['user_id', 'book_id', 'row_num'], how='left')
train_full = train_full.drop(columns=['row_num'])

# –î–ª—è test
test_extra = test_df[['user_id', 'book_id'] + extra_cols].copy()
test_extra['row_num'] = test_extra.groupby(['user_id', 'book_id']).cumcount()
test_full['row_num'] = test_full.groupby(['user_id', 'book_id']).cumcount()

test_full = test_full.merge(test_extra, on=['user_id', 'book_id', 'row_num'], how='left')
test_full = test_full.drop(columns=['row_num'])

print(f"     –î–æ–±–∞–≤–ª–µ–Ω–æ {len(extra_cols)} –ø—Ä–∏–∑–Ω–∞–∫–æ–≤ –∏–∑ preprocessing")
print(f"      Train final: {train_full.shape}")
print(f"     Test final: {test_full.shape}")



[5/6] –û–ë–™–ï–î–ò–ù–ï–ù–ò–ï –í–°–ï–• –ü–†–ò–ó–ù–ê–ö–û–í

  –ú–µ—Ä–¥–∂ —Å user –∏ book features...
     Train: (874496, 17)
     Test: (107260, 17)

  –î–æ–±–∞–≤–ª–µ–Ω–∏–µ –ø—Ä–∏–∑–Ω–∞–∫–æ–≤ –∏–∑ preprocessing...
     –î–æ–±–∞–≤–ª–µ–Ω–æ 4 –ø—Ä–∏–∑–Ω–∞–∫–æ–≤ –∏–∑ preprocessing
      Train final: (874496, 21)
     Test final: (107260, 21)


In [3]:

# –ü—Ä–æ–≤–µ—Ä–∫–∞ –ø—Ä–æ–ø—É—Å–∫–æ–≤
print("\n  üîç –ü—Ä–æ–≤–µ—Ä–∫–∞ –ø—Ä–æ–ø—É—Å–∫–æ–≤...")
train_missing = train_full.isnull().sum()
test_missing = test_full.isnull().sum()

if train_missing.sum() > 0:
    print(f"       Train –ø—Ä–æ–ø—É—Å–∫–∏:")
    print(train_missing[train_missing > 0])
else:
    print(f"      Train: –ø—Ä–æ–ø—É—Å–∫–æ–≤ –Ω–µ—Ç")

if test_missing.sum() > 0:
    print(f"   Test –ø—Ä–æ–ø—É—Å–∫–∏ (cold start books):")
    print(test_missing[test_missing > 0])
    
    print(f"\n      –ó–∞–ø–æ–ª–Ω–µ–Ω–∏–µ –ø—Ä–æ–ø—É—Å–∫–æ–≤...")
    
    # 1. book_avg_rating: –∏—Å–ø–æ–ª—å–∑—É–µ–º average_rating –∏–∑ Goodreads
    mask = test_full['book_avg_rating'].isna()
    if mask.sum() > 0:
        test_full.loc[mask, 'book_avg_rating'] = test_full.loc[mask, 'average_rating']
    
    # 2. –ï—Å–ª–∏ –∏ Goodreads –Ω–µ—Ç - –≥–ª–æ–±–∞–ª—å–Ω–æ–µ —Å—Ä–µ–¥–Ω–µ–µ
    test_full['book_avg_rating'].fillna(train_full['book_avg_rating'].median(), inplace=True)
    
    # 3. book_ratings_count = –º–∏–Ω–∏–º—É–º (–Ω–æ–≤–∞—è –∫–Ω–∏–≥–∞)
    test_full['book_ratings_count'].fillna(1, inplace=True)
    
    # 4. book_popularity: –ø–µ—Ä–µ—Å—á–∏—Ç—ã–≤–∞–µ–º –¥–ª—è —Ö–æ–ª–æ–¥–Ω—ã—Ö –∫–Ω–∏–≥
    mask = test_full['book_popularity'].isna()
    if mask.sum() > 0:
        # –ù–æ—Ä–º–∞–ª–∏–∑—É–µ–º –æ—Ç–Ω–æ—Å–∏—Ç–µ–ª—å–Ω–æ train
        max_ratings = train_full['book_ratings_count'].max()
        min_rating = train_full['book_avg_rating'].min()
        max_rating = train_full['book_avg_rating'].max()
        
        norm_count = (test_full.loc[mask, 'book_ratings_count'] - 1) / max_ratings
        norm_rating = (test_full.loc[mask, 'book_avg_rating'] - min_rating) / (max_rating - min_rating)
        test_full.loc[mask, 'book_popularity'] = 0.7 * norm_count + 0.3 * norm_rating
    
    print(f"      –ó–∞–ø–æ–ª–Ω–µ–Ω–æ {test_missing[test_missing > 0].sum()} –ø—Ä–æ–ø—É—Å–∫–æ–≤")
    
    # –ü—Ä–æ–≤–µ—Ä—è–µ–º —á—Ç–æ –ø—Ä–æ–ø—É—Å–∫–æ–≤ –Ω–µ –æ—Å—Ç–∞–ª–æ—Å—å
    feature_cols = [
    # Interaction (6)
    'tag_overlap_count', 'tag_overlap_ratio', 'tag_jaccard', 
    'history_similarity', 'embedding_cosine_sim', 'embedding_euclidean_dist',
    # User (4) - –ë–ï–ó unique_books_count!
    'avg_user_rating', 'ratings_count', 'tag_vocab_size', 'activity_score',
    # Book (3)
    'book_avg_rating', 'book_ratings_count', 'book_popularity',
    # Preprocessing (4)
    'language_code_encoded', 'year_normalized', 'publication_era', 'average_rating'
]
    test_missing_after = test_full[feature_cols].isnull().sum()
    if test_missing_after.sum() > 0:
        print(f"       –û—Å—Ç–∞–ª–∏—Å—å –ø—Ä–æ–ø—É—Å–∫–∏:")
        print(test_missing_after[test_missing_after > 0])
    else:
        print(f"      –í—Å–µ –ø—Ä–æ–ø—É—Å–∫–∏ –∑–∞–ø–æ–ª–Ω–µ–Ω—ã!")
else:
    print(f"  Test: –ø—Ä–æ–ø—É—Å–∫–æ–≤ –Ω–µ—Ç")

# –°–æ—Ö—Ä–∞–Ω—è–µ–º
train_full.to_csv('train_features_full.csv', index=False)
test_full.to_csv('test_features_full.csv', index=False)

print(f"\n   –ü–æ–ª–Ω—ã–µ –¥–∞—Ç–∞—Å–µ—Ç—ã —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã:")
print(f"     ‚Ä¢ train_features_full.csv")
print(f"     ‚Ä¢ test_features_full.csv")



  üîç –ü—Ä–æ–≤–µ—Ä–∫–∞ –ø—Ä–æ–ø—É—Å–∫–æ–≤...
      Train: –ø—Ä–æ–ø—É—Å–∫–æ–≤ –Ω–µ—Ç
   Test –ø—Ä–æ–ø—É—Å–∫–∏ (cold start books):
book_avg_rating       400
book_ratings_count    400
book_popularity       400
dtype: int64

      –ó–∞–ø–æ–ª–Ω–µ–Ω–∏–µ –ø—Ä–æ–ø—É—Å–∫–æ–≤...
      –ó–∞–ø–æ–ª–Ω–µ–Ω–æ 1200 –ø—Ä–æ–ø—É—Å–∫–æ–≤
      –í—Å–µ –ø—Ä–æ–ø—É—Å–∫–∏ –∑–∞–ø–æ–ª–Ω–µ–Ω—ã!


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_full['book_avg_rating'].fillna(train_full['book_avg_rating'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_full['book_ratings_count'].fillna(1, inplace=True)



   –ü–æ–ª–Ω—ã–µ –¥–∞—Ç–∞—Å–µ—Ç—ã —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã:
     ‚Ä¢ train_features_full.csv
     ‚Ä¢ test_features_full.csv


In [4]:

# ================================================================================
# 6. –§–ò–ù–ê–õ–¨–ù–ê–Ø –°–¢–ê–¢–ò–°–¢–ò–ö–ê
# ================================================================================

print("\n" + "="*80)
print("[6/6] –§–ò–ù–ê–õ–¨–ù–ê–Ø –°–¢–ê–¢–ò–°–¢–ò–ö–ê")
print("="*80)


print(f"\n –°–æ–∑–¥–∞–Ω–Ω—ã–µ –ø—Ä–∏–∑–Ω–∞–∫–∏:")

print(f"\n   Interaction Features (6):")
print(f"     ‚Ä¢ tag_overlap_count - –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ –æ–±—â–∏—Ö —Ç–µ–≥–æ–≤")
print(f"     ‚Ä¢ tag_overlap_ratio - –¥–æ–ª—è –ø–µ—Ä–µ—Å–µ—á–µ–Ω–∏—è")
print(f"     ‚Ä¢ tag_jaccard - –∫–æ—ç—Ñ—Ñ–∏—Ü–∏–µ–Ω—Ç –ñ–∞–∫–∫–∞—Ä–∞")
print(f"     ‚Ä¢ history_similarity - –≤–∑–≤–µ—à–µ–Ω–Ω–∞—è —Å—Ö–æ–∂–µ—Å—Ç—å")
print(f"     ‚Ä¢ embedding_cosine_sim - –∫–æ—Å–∏–Ω—É—Å–Ω–∞—è –±–ª–∏–∑–æ—Å—Ç—å (768-dim)")
print(f"     ‚Ä¢ embedding_euclidean_dist - –µ–≤–∫–ª–∏–¥–æ–≤–æ —Ä–∞—Å—Å—Ç–æ—è–Ω–∏–µ (–Ω–æ—Ä–º–∞–ª–∏–∑–æ–≤–∞–Ω–æ)")

print(f"\n   User Features (4):")
print(f"     ‚Ä¢ avg_user_rating - —Å—Ä–µ–¥–Ω–∏–π —Ä–µ–π—Ç–∏–Ω–≥ –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—è")
print(f"     ‚Ä¢ ratings_count - –∞–∫—Ç–∏–≤–Ω–æ—Å—Ç—å")
print(f"     ‚Ä¢ tag_vocab_size - —Ä–∞–∑–Ω–æ–æ–±—Ä–∞–∑–∏–µ –∏–Ω—Ç–µ—Ä–µ—Å–æ–≤")
print(f"     ‚Ä¢ activity_score - –∫–æ–º–±–∏–Ω–∏—Ä–æ–≤–∞–Ω–Ω–∞—è –º–µ—Ç—Ä–∏–∫–∞")

print(f"\n   Book Features (3):")
print(f"     ‚Ä¢ book_avg_rating - —Å—Ä–µ–¥–Ω–∏–π —Ä–µ–π—Ç–∏–Ω–≥ –∫–Ω–∏–≥–∏")
print(f"     ‚Ä¢ book_ratings_count - –∏–∑–≤–µ—Å—Ç–Ω–æ—Å—Ç—å")
print(f"     ‚Ä¢ book_popularity - –∫–æ–º–±–∏–Ω–∏—Ä–æ–≤–∞–Ω–Ω–∞—è –ø–æ–ø—É–ª—è—Ä–Ω–æ—Å—Ç—å")

print(f"\n   From Preprocessing (4):")
print(f"     ‚Ä¢ language_code_encoded - —è–∑—ã–∫ –∫–Ω–∏–≥–∏ (–∑–∞–∫–æ–¥–∏—Ä–æ–≤–∞–Ω)")
print(f"     ‚Ä¢ year_normalized - –≥–æ–¥ –ø—É–±–ª–∏–∫–∞—Ü–∏–∏ (–Ω–æ—Ä–º–∞–ª–∏–∑–æ–≤–∞–Ω)")
print(f"     ‚Ä¢ publication_era - —ç–ø–æ—Ö–∞ (0-7)")
print(f"     ‚Ä¢ average_rating - —Å—Ä–µ–¥–Ω–∏–π —Ä–µ–π—Ç–∏–Ω–≥ –∏–∑ Goodreads")

print(f"\n   –î–æ–ø–æ–ª–Ω–∏—Ç–µ–ª—å–Ω–æ:")
print(f"     ‚Ä¢ segment - –∫–∞—Ç–µ–≥–æ—Ä–∏–∞–ª—å–Ω—ã–π (new/inactive/active/very_active)")

total_features = len(feature_cols)
print(f"\n   –í—Å–µ–≥–æ —á–∏—Å–ª–æ–≤—ã—Ö –ø—Ä–∏–∑–Ω–∞–∫–æ–≤: {total_features}")

# –°—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞ –ø–æ –ø—Ä–∏–∑–Ω–∞–∫–∞–º
print(f"\n –°—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞ –ø—Ä–∏–∑–Ω–∞–∫–æ–≤ (train):")
print("\n", train_full[feature_cols].describe())

# –°–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ –º–µ—Ç–∞–¥–∞–Ω–Ω—ã—Ö
metadata = {
    'num_users': len(user_stats),
    'num_books': len(book_stats),
    'train_size': len(train_full),
    'test_size': len(test_full),
    'num_features': total_features,
    'feature_names': feature_cols,
    'interaction_features': feature_cols[:6],
    'user_features': feature_cols[6:10],
    'book_features': feature_cols[10:13],
    'preprocessing_features': feature_cols[13:17],
    'segment_distribution': segment_counts.to_dict(),
    'embedding_dim': EMB_DIM
}

with open('features_metadata.pkl', 'wb') as f:
    pickle.dump(metadata, f)

print(f"\n   –ú–µ—Ç–∞–¥–∞–Ω–Ω—ã–µ —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã: features_metadata.pkl")

print("\n" + "="*80)
print(" FEATURE ENGINEERING –ó–ê–í–ï–†–®–ï–ù!")
print("="*80)

print(f"\n –°–æ–∑–¥–∞–Ω–Ω—ã–µ —Ñ–∞–π–ª—ã:")
print(f"  1. user_features.csv")
print(f"  2. book_features.csv")
print(f"  3. train_features_full.csv")
print(f"  4. test_features_full.csv")
print(f"  5. features_metadata.pkl")





[6/6] –§–ò–ù–ê–õ–¨–ù–ê–Ø –°–¢–ê–¢–ò–°–¢–ò–ö–ê

 –°–æ–∑–¥–∞–Ω–Ω—ã–µ –ø—Ä–∏–∑–Ω–∞–∫–∏:

   Interaction Features (6):
     ‚Ä¢ tag_overlap_count - –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ –æ–±—â–∏—Ö —Ç–µ–≥–æ–≤
     ‚Ä¢ tag_overlap_ratio - –¥–æ–ª—è –ø–µ—Ä–µ—Å–µ—á–µ–Ω–∏—è
     ‚Ä¢ tag_jaccard - –∫–æ—ç—Ñ—Ñ–∏—Ü–∏–µ–Ω—Ç –ñ–∞–∫–∫–∞—Ä–∞
     ‚Ä¢ history_similarity - –≤–∑–≤–µ—à–µ–Ω–Ω–∞—è —Å—Ö–æ–∂–µ—Å—Ç—å
     ‚Ä¢ embedding_cosine_sim - –∫–æ—Å–∏–Ω—É—Å–Ω–∞—è –±–ª–∏–∑–æ—Å—Ç—å (768-dim)
     ‚Ä¢ embedding_euclidean_dist - –µ–≤–∫–ª–∏–¥–æ–≤–æ —Ä–∞—Å—Å—Ç–æ—è–Ω–∏–µ (–Ω–æ—Ä–º–∞–ª–∏–∑–æ–≤–∞–Ω–æ)

   User Features (4):
     ‚Ä¢ avg_user_rating - —Å—Ä–µ–¥–Ω–∏–π —Ä–µ–π—Ç–∏–Ω–≥ –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—è
     ‚Ä¢ ratings_count - –∞–∫—Ç–∏–≤–Ω–æ—Å—Ç—å
     ‚Ä¢ tag_vocab_size - —Ä–∞–∑–Ω–æ–æ–±—Ä–∞–∑–∏–µ –∏–Ω—Ç–µ—Ä–µ—Å–æ–≤
     ‚Ä¢ activity_score - –∫–æ–º–±–∏–Ω–∏—Ä–æ–≤–∞–Ω–Ω–∞—è –º–µ—Ç—Ä–∏–∫–∞

   Book Features (3):
     ‚Ä¢ book_avg_rating - —Å—Ä–µ–¥–Ω–∏–π —Ä–µ–π—Ç–∏–Ω–≥ –∫–Ω–∏–≥–∏
     ‚Ä¢ book_ratings_count - –∏–∑–≤–µ—Å—Ç–Ω–æ—Å—Ç—å
