# Influencer Feature Normalization for Engagement Prediction

This notebook implements the normalization process for influencer features as described in the Feature Normalization Plan. It includes log transforms, engagement rate features, follower tiering, percentile calculations, peer-relative features, and robust scaling for use in the V4 engagement prediction model.

## 1. Import Libraries and Load Data

Import required libraries and load the feature-engineered dataset.

In [None]:
import json
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.preprocessing import RobustScaler

# Read the feature-engineered dataset from the sibling data/ directory
# (the path is relative to the current working directory).
data_path = os.path.join('..', 'data', 'feature_engineered_data.csv')
df = pd.read_csv(filepath_or_buffer=data_path)
df.head(n=5)

Unnamed: 0,name,slno,content,reactions,comments,time_spent,location,followers,media_type,num_hashtags,...,reactions_vs_influencer_avg,comments_vs_influencer_avg,reactions_per_word,comments_per_word,reactions_per_sentiment,media_x_optimal_length,video_x_optimal_length,hook_x_power_score,sentiment_x_readability,feature_density
0,Nicholas Wyman,0,Robert Lerman writes that achieving a healthy...,12.0,1,1 day ago,,6484.0,article,4,...,0.552941,0.598726,0.190476,0.015873,12.522175,0,0,0,37.475177,0.079365
1,Nicholas Wyman,1,"National disability advocate Sara Hart Weir, ...",11.0,0,1 week ago,,6484.0,,0,...,0.506863,0.0,0.785714,0.0,37.162162,0,0,0,12.299857,0.214286
2,Nicholas Wyman,3,Exploring in this months Talent Management & H...,44.0,0,2 months ago,,6484.0,article,4,...,2.027451,0.0,0.611111,0.0,45.90985,0,0,0,46.918472,0.152778
3,Nicholas Wyman,4,I count myself fortunate to have spent time wi...,22.0,2,2 months ago,,6484.0,article,3,...,1.013725,1.197452,0.196429,0.017857,22.741369,0,0,0,40.99561,0.142857
4,Nicholas Wyman,5,Online job platforms are a different way of wo...,21.0,1,2 months ago,,6484.0,article,5,...,0.967647,0.598726,0.4375,0.020833,58.414465,0,0,0,15.729383,0.0625


## 2. Log Transformation of Count-Based Features

Apply `np.log1p` to all specified count-based features and create new columns prefixed with 'log_'.

In [2]:
# Columns that receive a log(1 + x) companion column named 'log_<column>'
log_features = [
    'influencer_avg_engagement',
    'influencer_total_engagement',
    'influencer_avg_reactions',
    'influencer_avg_comments',
    'influencer_median_reactions',
    'influencer_median_comments',
    'followers'
]

# Transform only the listed columns that are actually present in the frame;
# missing ones are skipped silently.
available_sources = [name for name in log_features if name in df.columns]
for name in available_sources:
    df['log_' + name] = np.log1p(df[name])

# Preview every expected log_ column that now exists
log_preview_cols = ['log_' + name for name in log_features if 'log_' + name in df.columns]
df[log_preview_cols].head()

Unnamed: 0,log_influencer_avg_engagement,log_influencer_total_engagement,log_influencer_avg_reactions,log_influencer_avg_comments,log_influencer_median_reactions,log_influencer_median_comments,log_followers
0,3.193449,7.695303,3.122459,0.982158,2.995732,0.693147,8.777247
1,3.193449,7.695303,3.122459,0.982158,2.995732,0.693147,8.777247
2,3.193449,7.695303,3.122459,0.982158,2.995732,0.693147,8.777247
3,3.193449,7.695303,3.122459,0.982158,2.995732,0.693147,8.777247
4,3.193449,7.695303,3.122459,0.982158,2.995732,0.693147,8.777247


## 3. Calculate Engagement Rate Features

Create new features by dividing historical engagement metrics by follower count (with zero replaced by one to avoid division by zero).

In [3]:
# Denominator shared by every rate: the follower count with 0 replaced by 1 so
# the divisions below never divide by zero. Missing (NaN) follower counts are
# left as NaN and therefore produce NaN rates. When the frame has no
# 'followers' column the scalar 1 is used instead.
followers_safe = df['followers'].replace(0, 1) if 'followers' in df.columns else 1

# new rate column -> source metric that is divided by the follower count
rate_sources = {
    'influencer_engagement_rate': 'influencer_avg_engagement',
    'influencer_reactions_rate': 'influencer_avg_reactions',
    'influencer_comments_rate': 'influencer_avg_comments',
    'influencer_engagement_rate_consistency': 'influencer_std_reactions',
}
for rate_col, source_col in rate_sources.items():
    # A rate is only created when its source metric exists
    if source_col in df.columns:
        df[rate_col] = df[source_col] / followers_safe

# Preview only the rate columns that were actually created, so a missing
# source metric does not turn the preview into a KeyError.
rate_preview_cols = [
    rate_col
    for rate_col in ('influencer_engagement_rate', 'influencer_reactions_rate', 'influencer_comments_rate')
    if rate_col in df.columns
]
df[rate_preview_cols].head()

Unnamed: 0,influencer_engagement_rate,influencer_reactions_rate,influencer_comments_rate
0,0.003605,0.003347,0.000258
1,0.003605,0.003347,0.000258
2,0.003605,0.003347,0.000258
3,0.003605,0.003347,0.000258
4,0.003605,0.003347,0.000258


## 4. Assign Follower Tiers and One-Hot Encode

Define follower tier boundaries, assign each row to a tier, and one-hot encode the tier as separate columns (is_tier_micro, is_tier_small, etc.).

In [4]:
# Tier boundaries as (lower bound, upper bound) follower counts. The upper
# bound is exclusive and the tiers are listed in ascending order; assign_tier
# depends on that order.
tiers = {
    'micro': (0, 5000),
    'small': (5000, 20000),
    'medium': (20000, 100000),
    'large': (100000, 500000),
    'macro': (500000, float('inf'))
}

def assign_tier(followers):
    """Return the name of the follower tier that `followers` falls into.

    The thresholds come from the module-level `tiers` mapping, so the
    boundaries are defined in exactly one place.

    Parameters
    ----------
    followers : number
        Follower count. Values below the first upper bound (including
        negative values) map to the first tier.

    Returns
    -------
    str or float
        The tier name, or `np.nan` when `followers` is missing. Without the
        explicit missing-value check a NaN count fails every `<` comparison
        and would be labelled as the largest tier.
    """
    if pd.isna(followers):
        return np.nan
    for tier_name, (_, upper_bound) in tiers.items():
        if followers < upper_bound:
            return tier_name
    # Nothing was below the last upper bound (e.g. +inf): use the last tier.
    return list(tiers)[-1]

if 'followers' in df.columns:
    # Remove indicator columns left behind by an earlier run of this cell so
    # that re-running it does not append duplicate is_tier_* columns.
    stale_tier_cols = [col for col in df.columns if col.startswith('is_tier_')]
    df = df.drop(columns=stale_tier_cols)

    # Encode through a categorical that lists every tier name, so an indicator
    # column is produced for each tier even when no row falls into it. The
    # categories are sorted to keep the alphabetical column order that
    # get_dummies produces for plain string labels.
    df['follower_tier'] = pd.Categorical(
        df['followers'].apply(assign_tier),
        categories=sorted(tiers),
    )
    # get_dummies replaces 'follower_tier' with the is_tier_* indicator columns
    df = pd.get_dummies(df, columns=['follower_tier'], prefix='is_tier')
df[[col for col in df.columns if col.startswith('is_tier_')]].head()

Unnamed: 0,is_tier_large,is_tier_macro,is_tier_medium,is_tier_micro,is_tier_small
0,False,False,False,False,True
1,False,False,False,False,True
2,False,False,False,False,True
3,False,False,False,False,True
4,False,False,False,False,True


## 5. Calculate Percentile-Within-Tier Features

For each tier, calculate the percentile rank of influencer_avg_engagement, influencer_avg_reactions, and influencer_avg_comments within that tier.

In [5]:
# Work out which column identifies each row's follower tier.
tier_flag_cols = [f'is_tier_{t}' for t in tiers if f'is_tier_{t}' in df.columns]
if 'follower_tier' in df.columns:
    group_col = 'follower_tier'
elif tier_flag_cols:
    # The label column is not present, so rebuild it from the one-hot
    # indicators. Tiers are applied in reverse order so that, if several flags
    # were set on one row, the first tier listed in `tiers` wins. Rows with no
    # flag set keep NaN.
    tier_label = pd.Series(np.nan, index=df.index, dtype=object)
    for flag_col in reversed(tier_flag_cols):
        tier_label = tier_label.mask(df[flag_col] == 1, flag_col[len('is_tier_'):])
    df['tier_label'] = tier_label
    group_col = 'tier_label'
else:
    # No tier information available: nothing to group by
    group_col = None

# new percentile column -> metric ranked within each tier
percentile_sources = {
    'percentile_engagement_in_tier': 'influencer_avg_engagement',
    'percentile_reactions_in_tier': 'influencer_avg_reactions',
    'percentile_comments_in_tier': 'influencer_avg_comments',
}
if group_col is not None:
    # NOTE(review): the rank is taken over the rows of df, so a value repeated
    # on many rows counts once per row - confirm this weighting is intended.
    for percentile_col, source_col in percentile_sources.items():
        if source_col in df.columns:
            df[percentile_col] = df.groupby(group_col)[source_col].rank(pct=True)

# Preview only the percentile columns that were actually created
df[[col for col in percentile_sources if col in df.columns]].head()

Unnamed: 0,percentile_engagement_in_tier,percentile_reactions_in_tier,percentile_comments_in_tier
0,0.568171,0.568171,0.365039
1,0.568171,0.568171,0.365039
2,0.568171,0.568171,0.365039
3,0.568171,0.568171,0.365039
4,0.568171,0.568171,0.365039


## 6. Compute Peer-Relative Features

For each row, compute the ratio of influencer_avg_engagement to the median engagement for their tier (engagement_vs_tier_median).

In [6]:
# Compute engagement_vs_tier_median: each row's influencer_avg_engagement
# divided by the median of that metric over the rows in the same tier.
tier_key_candidates = [col for col in ('tier_label', 'follower_tier') if col in df.columns]
if 'influencer_avg_engagement' in df.columns and tier_key_candidates:
    # Prefer 'tier_label' and fall back to 'follower_tier'
    group_col = tier_key_candidates[0]
    tier_medians = df.groupby(group_col)['influencer_avg_engagement'].transform('median')
    # A tier whose median is 0 would turn the ratio into +/-inf; treat the
    # ratio as missing (NaN) for those rows instead.
    df['engagement_vs_tier_median'] = df['influencer_avg_engagement'] / tier_medians.replace(0, np.nan)

# Preview the new column only if it was created
df[[col for col in ['engagement_vs_tier_median'] if col in df.columns]].head()

Unnamed: 0,engagement_vs_tier_median
0,1.569047
1,1.569047
2,1.569047
3,1.569047
4,1.569047


## 7. Apply Robust Scaling to Log-Transformed Features

Fit a RobustScaler on the log-transformed features and overwrite those columns in place with the scaled values. Save the fitted scaler to `../models_v4/robust_scaler.pkl` for production use.

In [7]:
# Apply RobustScaler to every column whose name starts with 'log_'.
# NOTE: the scaled values overwrite the log_ columns in place. Re-running this
# cell without first re-running the log-transformation cell would fit (and
# save) a scaler on values that have already been scaled.
log_cols = [col for col in df.columns if col.startswith('log_')]
if not log_cols:
    # Fail with an actionable message rather than an error from inside
    # scikit-learn about an empty feature matrix.
    raise ValueError("No 'log_' columns found; run the log-transformation cell first.")

scaler = RobustScaler()
df[log_cols] = scaler.fit_transform(df[log_cols])

# Save scaler for production use (joblib is imported with the other
# dependencies at the top of the notebook)
os.makedirs(os.path.join('..', 'models_v4'), exist_ok=True)
scaler_path = os.path.join('..', 'models_v4', 'robust_scaler.pkl')
joblib.dump(scaler, scaler_path)
df[log_cols].head()

Unnamed: 0,log_influencer_avg_engagement,log_influencer_total_engagement,log_influencer_avg_reactions,log_influencer_avg_comments,log_influencer_median_reactions,log_influencer_median_comments,log_followers
0,-0.504589,-1.223896,-0.469815,-0.59176,-0.239628,-0.169092,-2.090323
1,-0.504589,-1.223896,-0.469815,-0.59176,-0.239628,-0.169092,-2.090323
2,-0.504589,-1.223896,-0.469815,-0.59176,-0.239628,-0.169092,-2.090323
3,-0.504589,-1.223896,-0.469815,-0.59176,-0.239628,-0.169092,-2.090323
4,-0.504589,-1.223896,-0.469815,-0.59176,-0.239628,-0.169092,-2.090323


## 8. Save Normalized Data and Metadata

Save the normalized dataframe to `../data/feature_engineered_normalized.csv`. Save the tier boundaries, per-tier median engagement, and transformation date to `../data/normalization_metadata.json`.

In [8]:
# Save normalized data
output_data_path = os.path.join('..', 'data', 'feature_engineered_normalized.csv')
df.to_csv(output_data_path, index=False)

# Per-tier median of influencer_avg_engagement. The tier column is resolved the
# same way as for engagement_vs_tier_median: 'tier_label' if present, otherwise
# 'follower_tier'. If neither exists, or the metric is missing, the medians
# are saved as an empty mapping.
tier_key_candidates = [col for col in ('tier_label', 'follower_tier') if col in df.columns]
if tier_key_candidates and 'influencer_avg_engagement' in df.columns:
    tier_median_map = (
        df.groupby(tier_key_candidates[0])['influencer_avg_engagement'].median().to_dict()
    )
else:
    tier_median_map = {}

# Save tier boundaries and medians for production use.
# NOTE(review): json.dump writes the tuples in `tiers` as lists and writes
# float('inf') as the token Infinity, which Python's json module reads back
# but which is not standard JSON - confirm every consumer of this file
# accepts it.
tier_metadata = {
    'tier_boundaries': tiers,
    'tier_medians': tier_median_map,
    # Date of this run (YYYY-MM-DD) rather than a fixed literal, so the
    # metadata stays accurate whenever the notebook is re-executed.
    'transformation_date': pd.Timestamp.now().strftime('%Y-%m-%d')
}
output_meta_path = os.path.join('..', 'data', 'normalization_metadata.json')
with open(output_meta_path, 'w') as f:
    json.dump(tier_metadata, f, indent=2)

print(f"Saved normalized data to {output_data_path}")
print(f"Saved normalization metadata to {output_meta_path}")

Saved normalized data to ..\data\feature_engineered_normalized.csv
Saved normalization metadata to ..\data\normalization_metadata.json
