# Feature Engineering

This notebook merges monthly topic shares, sentiment, and external data (e.g., the industrial production index, IPI) into one feature table per topic model.

## LDA features

In [1]:
import pandas as pd

# Load the three inputs: per-publisher monthly topic shares, monthly sentiment,
# and the INDPRO series that serves as the target.
topics = pd.read_csv("../data/processed/monthly_topic_shares_by_publisher.csv")
sentiment = pd.read_csv("../data/processed/monthly_sentiment_all.csv")
ipi = pd.read_csv("../data/raw/INDPRO.csv")

# Give every frame the same month-start timestamp key, parsed once per frame.
# The IPI key is normalised exactly like the other two so the join on `month`
# does not depend on the raw file already using first-of-month dates.
for frame, date_col in ((topics, 'month'), (sentiment, 'date'), (ipi, 'observation_date')):
    frame['month'] = (
        pd.to_datetime(frame[date_col].astype(str))
        .dt.to_period('M')
        .dt.to_timestamp()
    )

# NOTE(review): sentiment's `date` and ipi's `observation_date` columns are
# carried through the merges into `df` — confirm downstream consumers expect them.
df = (
    topics
    # Merge on month + publication; publishers without sentiment keep NaN.
    .merge(sentiment, on=['month', 'publication'], how='left')
    # One INDPRO value per month: `validate` raises if a month is duplicated
    # in `ipi`, which would otherwise silently multiply feature rows.
    .merge(ipi, on='month', how='left', validate='many_to_one')
    # Drop rows with missing target (INDPRO)
    .dropna(subset=['INDPRO'])
)

In [2]:
# One-hot encode publisher; the first level is dropped (drop_first=True).
# Guarded so that re-running this cell after `publication` has already been
# encoded is a no-op instead of raising KeyError on the missing column.
if 'publication' in df.columns:
    df = pd.get_dummies(df, columns=['publication'], drop_first=True)

In [5]:
# Preview the final DataFrame with the date, target, and sentiment columns
# moved to the front; every other column keeps its existing order.
leading = ['month', 'INDPRO', 'sentiment']
cols = leading + [name for name in df.columns if name not in leading]
display(df[cols].head())

Unnamed: 0,month,INDPRO,sentiment,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,...,publication_TMZ,publication_TechCrunch,publication_The Hill,publication_The New York Times,publication_The Verge,publication_Vice,publication_Vice News,publication_Vox,publication_Washington Post,publication_Wired
0,2016-01-01,99.4391,-0.332366,0.047303,0.108757,0.0654,0.177487,0.078956,0.089427,0.147791,...,False,False,False,False,False,False,False,False,False,False
1,2016-01-01,99.4391,-0.35976,0.048579,0.03406,0.039285,0.08041,0.048981,0.047445,0.082206,...,False,False,False,False,False,False,False,False,False,False
2,2016-01-01,99.4391,-0.453634,0.076843,0.100812,0.062806,0.020819,0.084967,0.164064,0.143699,...,False,False,False,False,False,False,False,False,False,False
3,2016-01-01,99.4391,-0.449364,0.178754,0.065679,0.069355,0.092416,0.028589,0.1464,0.133294,...,False,False,False,False,False,False,False,False,False,False
4,2016-01-01,99.4391,-0.171123,0.027663,0.115164,0.163197,0.309055,0.035086,0.034965,0.18921,...,False,False,False,False,False,False,False,False,False,False


In [6]:
# Write the LDA feature table to disk without the DataFrame index.
lda_features_path = "../data/processed/features_lda_monthly.csv"
df.to_csv(lda_features_path, index=False)

## BERTopic features

In [None]:
import pandas as pd

# Load the three inputs: per-publisher monthly BERTopic topic shares, monthly
# sentiment, and the INDPRO series that serves as the target.
topics = pd.read_csv("../data/processed/monthly_topic_shares_by_publisher_bertopic.csv")
sentiment = pd.read_csv("../data/processed/monthly_sentiment_all.csv")
ipi = pd.read_csv("../data/raw/INDPRO.csv")

# Give every frame the same month-start timestamp key, parsed once per frame.
# The IPI key is normalised exactly like the other two so the join on `month`
# does not depend on the raw file already using first-of-month dates.
for frame, date_col in ((topics, 'month'), (sentiment, 'date'), (ipi, 'observation_date')):
    frame['month'] = (
        pd.to_datetime(frame[date_col].astype(str))
        .dt.to_period('M')
        .dt.to_timestamp()
    )

# NOTE(review): sentiment's `date` and ipi's `observation_date` columns are
# carried through the merges into `df` — confirm downstream consumers expect them.
df = (
    topics
    # Merge on month + publication; publishers without sentiment keep NaN.
    .merge(sentiment, on=['month', 'publication'], how='left')
    # One INDPRO value per month: `validate` raises if a month is duplicated
    # in `ipi`, which would otherwise silently multiply feature rows.
    .merge(ipi, on='month', how='left', validate='many_to_one')
    # Drop rows with missing target (INDPRO)
    .dropna(subset=['INDPRO'])
)

# One-hot encode publisher; the first level is dropped (drop_first=True).
df = pd.get_dummies(df, columns=['publication'], drop_first=True)

# Preview the final DataFrame with the date, target, and sentiment columns
# moved to the front; every other column keeps its existing order.
cols = ['month', 'INDPRO', 'sentiment'] + [c for c in df.columns if c not in ('month', 'INDPRO', 'sentiment')]
display(df[cols].head())

In [None]:
# Write the BERTopic feature table to disk without the DataFrame index.
bertopic_features_path = "../data/processed/features_bertopic_monthly.csv"
df.to_csv(bertopic_features_path, index=False)