# Feature Engineering

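This notebook merges monthly per-publisher topic shares and sentiment with external data (here the industrial production index, INDPRO) into a single feature table.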

## sLDA features

In [1]:
import pandas as pd

# Load the three inputs: per-publisher topic shares, sentiment, and the INDPRO series.
topics = pd.read_csv("../data/processed/monthly_topic_shares_by_publisher_sLDA.csv")
sentiment = pd.read_csv("../data/processed/monthly_sentiment_all.csv")
ipi = pd.read_csv("../data/raw/INDPRO.csv")


def _month_start(values: pd.Series) -> pd.Series:
    """Parse date-like values and snap each one to the first day of its month.

    Every merge key goes through this single helper so that the three tables
    share exactly the same `month` representation.
    """
    parsed = pd.to_datetime(values.astype(str))
    return parsed.dt.to_period('M').dt.to_timestamp()


# Build the `month` join key once per table. The IPI key is normalised as well:
# otherwise an observation_date that is not the first of the month would never
# match, and the dropna below would silently discard those rows.
topics['month'] = _month_start(topics['month'])
sentiment['month'] = _month_start(sentiment['date'])
ipi['month'] = _month_start(ipi['observation_date'])

# Left joins keep every topic row: sentiment matches on month + publication,
# the IPI series on month only.
# NOTE(review): the source columns `date` and `observation_date` are carried
# into df by these merges; drop them downstream if they are not wanted as features.
df = topics.merge(sentiment, on=['month', 'publication'], how='left')
df = df.merge(ipi, on='month', how='left')

# Drop rows with missing target (INDPRO)
df = df.dropna(subset=['INDPRO'])

In [2]:
# Expand `publication` into indicator columns. drop_first=True omits the
# indicator for the first category level, so that publisher is represented
# by all remaining indicators being False.
df = pd.get_dummies(data=df, drop_first=True, columns=['publication'])

In [3]:
# Preview the first rows with the date, target and sentiment columns moved to
# the front; all other columns keep their existing relative order.
lead = ['month', 'INDPRO', 'sentiment']
cols = lead + [name for name in df.columns if name not in lead]
display(df[cols].head())

Unnamed: 0,month,INDPRO,sentiment,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,...,publication_TMZ,publication_TechCrunch,publication_The Hill,publication_The New York Times,publication_The Verge,publication_Vice,publication_Vice News,publication_Vox,publication_Washington Post,publication_Wired
0,2016-01-01,99.4391,-0.332366,0.113924,0.096957,0.073923,0.136457,0.097931,0.152317,0.141147,...,False,False,False,False,False,False,False,False,False,False
1,2016-01-01,99.4391,-0.35976,0.057679,0.0441,0.071563,0.0616,0.053359,0.114329,0.438747,...,False,False,False,False,False,False,False,False,False,False
2,2016-01-01,99.4391,-0.453634,0.13586,0.049119,0.15522,0.033329,0.082012,0.049961,0.070695,...,False,False,False,False,False,False,False,False,False,False
3,2016-01-01,99.4391,-0.449364,0.035915,0.062259,0.288431,0.050607,0.053273,0.057251,0.194292,...,False,False,False,False,False,False,False,False,False,False
4,2016-01-01,99.4391,-0.171123,0.074303,0.018143,0.032679,0.324294,0.211499,0.059117,0.056758,...,False,False,False,False,False,False,False,False,False,False


In [4]:
# Write the feature table to disk; index=False leaves the row index out of the file.
features_path = "../data/processed/features_slda_monthly.csv"
df.to_csv(features_path, index=False)