# Feature Engineering

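Merge the monthly topic shares per publisher with monthly sentiment scores and the external industrial production index (INDPRO, the prediction target), then one-hot encode the publisher.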

In [17]:
import pandas as pd

def _to_month_start(dates: pd.Series) -> pd.Series:
    """Parse date-like values and snap each one to the first day of its month."""
    # astype(str) keeps integer-coded dates (e.g. 20160101) from being
    # interpreted by pd.to_datetime as nanoseconds since the epoch.
    return pd.to_datetime(dates.astype(str)).dt.to_period('M').dt.to_timestamp()


def create_features(
    monthly_topic_shares_by_publisher_path: str,
    features_path: str,
    sentiment_path: str = "../data/processed/monthly_sentiment_all.csv",
    ipi_path: str = "../data/raw/INDPRO.csv",
) -> None:
    """Build the monthly feature table and write it to ``features_path``.

    Topic shares are left-joined with sentiment on (month, publication) and
    then with the industrial production series on month. Rows without an
    INDPRO value are dropped and ``publication`` is one-hot encoded with the
    first level dropped. A five-row preview is shown before saving.

    Parameters
    ----------
    monthly_topic_shares_by_publisher_path : str
        CSV with at least ``month`` and ``publication`` columns.
    features_path : str
        Destination CSV; written without the index.
    sentiment_path : str, optional
        CSV with at least ``date``, ``publication`` and ``sentiment`` columns.
    ipi_path : str, optional
        CSV with ``observation_date`` and ``INDPRO`` columns.

    Returns
    -------
    None
        The result is persisted to disk only, so calling this as the last
        statement of a cell does not echo the full frame.
    """
    # Load data
    topics = pd.read_csv(monthly_topic_shares_by_publisher_path)
    sentiment = pd.read_csv(sentiment_path)
    ipi = pd.read_csv(ipi_path)

    # Put all three join keys on the same month-start grid. The IPI key is
    # normalised as well, so a mid-month observation date still matches
    # instead of silently producing NaN and being dropped below.
    topics['month'] = _to_month_start(topics['month'])
    sentiment['month'] = _to_month_start(sentiment['date'])
    ipi['month'] = _to_month_start(ipi['observation_date'])

    # Merge on month + publication, then attach the target by month
    df = topics.merge(sentiment, on=['month', 'publication'], how='left')
    df = df.merge(ipi, on='month', how='left')

    # Drop rows with missing target (INDPRO)
    df = df.dropna(subset=['INDPRO'])

    # One-hot encode publisher
    df = pd.get_dummies(df, columns=['publication'], drop_first=True)

    # Preview with the key columns first; the saved file keeps df's own order.
    lead = ['month', 'INDPRO', 'sentiment']
    cols = lead + [c for c in df.columns if c not in lead]
    try:
        show = display  # injected by IPython/Jupyter
    except NameError:
        show = print  # plain-Python fallback so the function also runs in scripts
    show(df[cols].head())

    # save features to csv
    df.to_csv(features_path, index=False)

LDA Features

In [18]:
# LDA topic shares -> merged feature table
create_features(
    "../data/processed/monthly_topic_shares_by_publisher.csv",
    features_path="../data/processed/features_lda_monthly.csv",
)

Unnamed: 0,month,INDPRO,sentiment,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,...,publication_TMZ,publication_TechCrunch,publication_The Hill,publication_The New York Times,publication_The Verge,publication_Vice,publication_Vice News,publication_Vox,publication_Washington Post,publication_Wired
0,2016-01-01,99.4391,-0.332366,0.047303,0.108757,0.0654,0.177487,0.078956,0.089427,0.147791,...,False,False,False,False,False,False,False,False,False,False
1,2016-01-01,99.4391,-0.35976,0.048579,0.03406,0.039285,0.08041,0.048981,0.047445,0.082206,...,False,False,False,False,False,False,False,False,False,False
2,2016-01-01,99.4391,-0.453634,0.076843,0.100812,0.062806,0.020819,0.084967,0.164064,0.143699,...,False,False,False,False,False,False,False,False,False,False
3,2016-01-01,99.4391,-0.449364,0.178754,0.065679,0.069355,0.092416,0.028589,0.1464,0.133294,...,False,False,False,False,False,False,False,False,False,False
4,2016-01-01,99.4391,-0.171123,0.027663,0.115164,0.163197,0.309055,0.035086,0.034965,0.18921,...,False,False,False,False,False,False,False,False,False,False


BERTopic Features

In [19]:
# BERTopic topic shares -> merged feature table
create_features(
    "../data/processed/monthly_topic_shares_by_publisher_bertopic.csv",
    features_path="../data/processed/features_bertopic_monthly.csv",
)

Unnamed: 0,month,INDPRO,sentiment,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,...,publication_TMZ,publication_TechCrunch,publication_The Hill,publication_The New York Times,publication_The Verge,publication_Vice,publication_Vice News,publication_Vox,publication_Washington Post,publication_Wired
0,2016-01-01,99.4391,-0.332366,0.019231,0.153846,0.096154,0.153846,0.269231,0.076923,0.076923,...,False,False,False,False,False,False,False,False,False,False
1,2016-01-01,99.4391,-0.35976,0.05042,0.058824,0.016807,0.142857,0.579832,0.008403,0.067227,...,False,False,False,False,False,False,False,False,False,False
2,2016-01-01,99.4391,-0.453634,0.426966,0.073034,0.078652,0.005618,0.061798,0.123596,0.202247,...,False,False,False,False,False,False,False,False,False,False
3,2016-01-01,99.4391,-0.449364,0.072115,0.182692,0.043269,0.072115,0.240385,0.100962,0.206731,...,False,False,False,False,False,False,False,False,False,False
4,2016-01-01,99.4391,-0.171123,0.009091,0.372727,0.048485,0.3,0.042424,0.015152,0.006061,...,False,False,False,False,False,False,False,False,False,False


sLDA Features

In [20]:
# sLDA topic shares -> merged feature table
create_features(
    "../data/processed/monthly_topic_shares_by_publisher_sLDA.csv",
    features_path="../data/processed/features_slda_monthly.csv",
)

Unnamed: 0,month,INDPRO,sentiment,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,...,publication_TMZ,publication_TechCrunch,publication_The Hill,publication_The New York Times,publication_The Verge,publication_Vice,publication_Vice News,publication_Vox,publication_Washington Post,publication_Wired
0,2016-01-01,99.4391,-0.332366,0.011177,0.03337,0.079773,0.097302,0.074727,0.098848,0.103577,...,False,False,False,False,False,False,False,False,False,False
1,2016-01-01,99.4391,-0.35976,0.07762,0.006541,0.07078,0.080099,0.067432,0.057566,0.065173,...,False,False,False,False,False,False,False,False,False,False
2,2016-01-01,99.4391,-0.453634,0.246734,0.025729,0.148172,0.097992,0.076219,0.111124,0.125382,...,False,False,False,False,False,False,False,False,False,False
3,2016-01-01,99.4391,-0.449364,0.054624,0.092408,0.274799,0.152614,0.038277,0.093722,0.038525,...,False,False,False,False,False,False,False,False,False,False
4,2016-01-01,99.4391,-0.171123,0.030501,0.068285,0.028256,0.108532,0.118862,0.07473,0.098407,...,False,False,False,False,False,False,False,False,False,False


LLM Features

In [21]:
# TinyLlama topic shares -> merged feature table
create_features(
    "../data/processed/monthly_topic_shares_by_publisher_tinyllama.csv",
    features_path="../data/processed/features_tinyllm_monthly.csv",
)

Unnamed: 0,month,INDPRO,sentiment,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,...,publication_Mashable,publication_New Republic,publication_New Yorker,publication_Politico,publication_Reuters,publication_The Hill,publication_The Verge,publication_Vice News,publication_Washington Post,publication_Wired
0,2016-01-01,99.4391,-0.35976,0.154844,0.185296,0.203359,0.250485,0.215621,0.40955,0.224827,...,False,False,False,False,False,False,False,False,False,False
1,2016-01-01,99.4391,-0.453634,0.30854,0.273117,0.355583,0.249963,0.360973,0.196366,0.292444,...,False,False,False,False,False,False,False,False,False,False
2,2016-01-01,99.4391,-0.449364,0.200046,0.211304,0.226311,0.232623,0.246707,0.253609,0.244901,...,False,False,False,False,False,False,False,False,False,False
3,2016-01-01,99.4391,-0.171123,0.201018,0.241688,0.210722,0.354613,0.192661,0.209108,0.312546,...,False,False,False,False,False,False,False,False,False,False
4,2016-01-01,99.4391,-0.297804,0.202041,0.306991,0.183948,0.308218,0.204174,0.240398,0.319653,...,False,False,False,False,False,False,False,False,False,False
