# Feature Engineering

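This notebook merges monthly per-publisher topic shares and sentiment scores with external macroeconomic data (the Industrial Production Index, IPI, series `INDPRO`) and saves one feature table per topic model.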

In [1]:
import pandas as pd

def create_features(
    monthly_topic_shares_by_publisher_path: str,
    features_path: str,
    sentiment_path: str = "../data/processed/monthly_sentiment_all.csv",
    ipi_path: str = "../data/raw/INDPRO.csv",
) -> None:
    """Build the monthly per-publisher feature table and save it as CSV.

    Topic shares are left-joined with sentiment on (month, publication) and
    then with the industrial production series on month. Rows without an
    INDPRO value are dropped, the publisher is one-hot encoded (first level
    dropped), a preview of the first rows is displayed, and the full table is
    written to ``features_path``.

    Args:
        monthly_topic_shares_by_publisher_path: CSV with at least the columns
            ``month`` and ``publication`` (plus the topic share columns).
        features_path: Destination CSV for the merged feature table.
        sentiment_path: CSV with at least ``date``, ``publication`` and
            ``sentiment``. Defaults to the project's processed sentiment file.
        ipi_path: CSV with at least ``observation_date`` and ``INDPRO``.
            Defaults to the project's raw INDPRO file.

    Returns:
        None. The result is written to disk; nothing is returned so that a
        notebook cell ending in this call does not render the whole frame.

    Raises:
        KeyError: If one of the required columns is missing from an input.
    """

    def _to_month_start(values: pd.Series) -> pd.Series:
        # Parse through ``str`` so every source is handled identically (a
        # numeric date column would otherwise be read as epoch nanoseconds),
        # then snap to the first day of the month so the join keys line up.
        return pd.to_datetime(values.astype(str)).dt.to_period('M').dt.to_timestamp()

    # Load data
    topics = pd.read_csv(monthly_topic_shares_by_publisher_path)
    sentiment = pd.read_csv(sentiment_path)
    ipi = pd.read_csv(ipi_path)

    # Normalise all three sources to the same month-start key. Previously the
    # INDPRO dates were only parsed, not snapped, so any observation not dated
    # on the first of the month silently failed to match and was dropped below.
    topics['month'] = _to_month_start(topics['month'])
    sentiment['month'] = _to_month_start(sentiment['date'])
    ipi['month'] = _to_month_start(ipi['observation_date'])

    # Merge on month + publication, then attach the macro series by month.
    # NOTE: the source date columns (`date`, `observation_date`) are carried
    # through to the output unchanged, as before.
    df = topics.merge(sentiment, on=['month', 'publication'], how='left')
    df = df.merge(ipi, on='month', how='left')

    # Drop rows with missing target (INDPRO)
    df = df.dropna(subset=['INDPRO'])

    # One-hot encode publisher; drop_first avoids a redundant dummy column.
    df = pd.get_dummies(df, columns=['publication'], drop_first=True)

    # Preview with the key columns first; the saved file keeps df's own order.
    leading = ['month', 'INDPRO', 'sentiment']
    cols = leading + [c for c in df.columns if c not in leading]
    display(df[cols].head())

    # save features to csv
    df.to_csv(features_path, index=False)

LDA Features

In [11]:
# Build the LDA feature table: create_features reads the LDA monthly topic
# shares per publisher, merges them with sentiment and INDPRO, displays a
# preview of the first rows, and writes the result to features_lda_monthly.csv.
create_features(
    monthly_topic_shares_by_publisher_path="../data/processed/monthly_topic_shares_by_publisher.csv",
    features_path="../data/processed/features_lda_monthly.csv"
)

Unnamed: 0,month,INDPRO,sentiment,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,...,publication_TMZ,publication_TechCrunch,publication_The Hill,publication_The New York Times,publication_The Verge,publication_Vice,publication_Vice News,publication_Vox,publication_Washington Post,publication_Wired
0,2016-01-01,99.4391,-0.332366,0.047303,0.108757,0.0654,0.177487,0.078956,0.089427,0.147791,...,False,False,False,False,False,False,False,False,False,False
1,2016-01-01,99.4391,-0.35976,0.048579,0.03406,0.039285,0.08041,0.048981,0.047445,0.082206,...,False,False,False,False,False,False,False,False,False,False
2,2016-01-01,99.4391,-0.453634,0.076843,0.100812,0.062806,0.020819,0.084967,0.164064,0.143699,...,False,False,False,False,False,False,False,False,False,False
3,2016-01-01,99.4391,-0.449364,0.178754,0.065679,0.069355,0.092416,0.028589,0.1464,0.133294,...,False,False,False,False,False,False,False,False,False,False
4,2016-01-01,99.4391,-0.171123,0.027663,0.115164,0.163197,0.309055,0.035086,0.034965,0.18921,...,False,False,False,False,False,False,False,False,False,False


BERTopic Features

In [12]:
# Build the BERTopic feature table: same merge as for the other topic models,
# but fed with the BERTopic monthly topic shares; the result is written to
# features_bertopic_monthly.csv.
create_features(
    monthly_topic_shares_by_publisher_path="../data/processed/monthly_topic_shares_by_publisher_bertopic.csv",
    features_path="../data/processed/features_bertopic_monthly.csv"
)

Unnamed: 0,month,INDPRO,sentiment,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,...,publication_TMZ,publication_TechCrunch,publication_The Hill,publication_The New York Times,publication_The Verge,publication_Vice,publication_Vice News,publication_Vox,publication_Washington Post,publication_Wired
0,2016-01-01,99.4391,-0.332366,0.096154,0.192308,0.153846,0.019231,0.134615,0.25,0.0,...,False,False,False,False,False,False,False,False,False,False
1,2016-01-01,99.4391,-0.35976,0.016807,0.042017,0.134454,0.05042,0.058824,0.588235,0.008403,...,False,False,False,False,False,False,False,False,False,False
2,2016-01-01,99.4391,-0.453634,0.1,0.05,0.016667,0.411111,0.266667,0.027778,0.016667,...,False,False,False,False,False,False,False,False,False,False
3,2016-01-01,99.4391,-0.449364,0.061674,0.154185,0.07489,0.061674,0.273128,0.189427,0.008811,...,False,False,False,False,False,False,False,False,False,False
4,2016-01-01,99.4391,-0.171123,0.036145,0.328313,0.307229,0.006024,0.01506,0.021084,0.0,...,False,False,False,False,False,False,False,False,False,False


sLDA Features

In [13]:
# Build the sLDA feature table: same merge as for the other topic models, but
# fed with the sLDA monthly topic shares; the result is written to
# features_slda_monthly.csv.
create_features(
    monthly_topic_shares_by_publisher_path="../data/processed/monthly_topic_shares_by_publisher_sLDA.csv",
    features_path="../data/processed/features_slda_monthly.csv"
)

Unnamed: 0,month,INDPRO,sentiment,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,...,publication_TMZ,publication_TechCrunch,publication_The Hill,publication_The New York Times,publication_The Verge,publication_Vice,publication_Vice News,publication_Vox,publication_Washington Post,publication_Wired
0,2016-01-01,99.4391,-0.332366,0.113924,0.096957,0.073923,0.136457,0.097931,0.152317,0.141147,...,False,False,False,False,False,False,False,False,False,False
1,2016-01-01,99.4391,-0.35976,0.057679,0.0441,0.071563,0.0616,0.053359,0.114329,0.438747,...,False,False,False,False,False,False,False,False,False,False
2,2016-01-01,99.4391,-0.453634,0.13586,0.049119,0.15522,0.033329,0.082012,0.049961,0.070695,...,False,False,False,False,False,False,False,False,False,False
3,2016-01-01,99.4391,-0.449364,0.035915,0.062259,0.288431,0.050607,0.053273,0.057251,0.194292,...,False,False,False,False,False,False,False,False,False,False
4,2016-01-01,99.4391,-0.171123,0.074303,0.018143,0.032679,0.324294,0.211499,0.059117,0.056758,...,False,False,False,False,False,False,False,False,False,False


LLM Features

In [2]:
# Build the LLM feature table: same merge as for the other topic models, but
# fed with the TinyLlama monthly topic shares.
# NOTE(review): the input file is spelled `tinyllama` while the output file is
# spelled `tinyllm` — confirm downstream readers expect this output name.
create_features(
    monthly_topic_shares_by_publisher_path="../data/processed/monthly_topic_shares_by_publisher_tinyllama.csv",
    features_path="../data/processed/features_tinyllm_monthly.csv"
)

Unnamed: 0,month,INDPRO,sentiment,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,...,publication_Mashable,publication_New Republic,publication_New Yorker,publication_Politico,publication_Reuters,publication_The Hill,publication_The Verge,publication_Vice News,publication_Washington Post,publication_Wired
0,2016-01-01,99.4391,-0.35976,0.154829,0.185236,0.203361,0.250484,0.215624,0.409553,0.224828,...,False,False,False,False,False,False,False,False,False,False
1,2016-01-01,99.4391,-0.453634,0.308523,0.273096,0.355585,0.249958,0.360973,0.196365,0.292449,...,False,False,False,False,False,False,False,False,False,False
2,2016-01-01,99.4391,-0.449364,0.200034,0.211251,0.226313,0.232621,0.246716,0.253615,0.244922,...,False,False,False,False,False,False,False,False,False,False
3,2016-01-01,99.4391,-0.171123,0.201007,0.241645,0.210722,0.354607,0.192665,0.209123,0.312548,...,False,False,False,False,False,False,False,False,False,False
4,2016-01-01,99.4391,-0.297804,0.202023,0.306932,0.183948,0.308225,0.204185,0.240401,0.319681,...,False,False,False,False,False,False,False,False,False,False
