# Feature Engineering

This notebook merges monthly per-publisher topic shares with sentiment scores and external data (the IPI series, `INDPRO`) into one feature table per topic model.

In [33]:
import pandas as pd
def _month_start(values: pd.Series) -> pd.Series:
    """Parse date-like values and snap each one to the first day of its month.

    Values are cast to ``str`` before parsing, then round-tripped through a
    monthly period so that every frame shares an identical merge key.
    """
    return pd.to_datetime(values.astype(str)).dt.to_period('M').dt.to_timestamp()


def create_features(
    monthly_topic_shares_by_publisher_path: str,
    features_path: str,
    is_slda: bool = False,
    sentiment_path: str = "../data/processed/monthly_sentiment_all.csv",
    ipi_path: str = "../data/raw/INDPRO.csv",
) -> None:
    """Build a monthly, per-publication feature table and write it to CSV.

    Topic shares are left-joined with sentiment on ``month`` + ``publication``.
    In the default mode the IPI series is then left-joined on ``month`` and rows
    without an ``INDPRO`` value are dropped. In sLDA mode the input file is
    expected to carry the target already as ``actual_ipi``, which is renamed to
    ``INDPRO``; the IPI file is not read. Finally ``publication`` is one-hot
    encoded (first level dropped), a preview is displayed, and the frame is
    saved.

    Parameters
    ----------
    monthly_topic_shares_by_publisher_path : str
        CSV with at least ``month`` and ``publication`` columns (plus
        ``actual_ipi`` when ``is_slda`` is true).
    features_path : str
        Destination CSV; written without the index.
    is_slda : bool, default False
        Treat the input as the sLDA predictions file (see above).
    sentiment_path : str
        CSV with ``date``, ``publication`` and ``sentiment`` columns.
    ipi_path : str
        CSV with ``observation_date`` and ``INDPRO`` columns. Only read when
        ``is_slda`` is false.

    Returns
    -------
    None
        The result is written to ``features_path``; nothing is returned so that
        a notebook cell ending in this call does not render the frame twice.

    Raises
    ------
    KeyError
        If one of the columns named above is missing from its file.

    Notes
    -----
    The preview uses ``display``, which IPython makes available inside notebook
    kernels.
    """
    topics = pd.read_csv(monthly_topic_shares_by_publisher_path)
    sentiment = pd.read_csv(sentiment_path)

    # Normalise every merge key the same way so the joins line up exactly.
    topics = topics.assign(month=_month_start(topics['month']))
    sentiment = sentiment.assign(month=_month_start(sentiment['date']))

    if is_slda:
        # Only the sentiment score is pulled in; the target already lives in the file.
        df = topics.merge(
            sentiment[['month', 'publication', 'sentiment']],
            on=['month', 'publication'],
            how='left',
        )
        # Rename actual_ipi to INDPRO for consistency with the other feature files.
        # NOTE(review): unlike the branch below, rows with a missing INDPRO are
        # not dropped here - confirm that is intended.
        df = df.rename(columns={'actual_ipi': 'INDPRO'})
    else:
        ipi = pd.read_csv(ipi_path)
        # Snap the IPI dates to month start as well; otherwise an observation
        # date that is not the 1st would silently fail to join and the row
        # would then be removed by the dropna below.
        ipi = ipi.assign(month=_month_start(ipi['observation_date']))

        # The full sentiment and IPI frames are merged (not just the value
        # columns) so the saved schema stays what downstream readers expect.
        df = topics.merge(sentiment, on=['month', 'publication'], how='left')
        df = df.merge(ipi, on='month', how='left')

        # Drop rows with missing target (INDPRO)
        df = df.dropna(subset=['INDPRO'])

    # One-hot encode publisher
    df = pd.get_dummies(df, columns=['publication'], drop_first=True)

    # Preview with the key columns first; the saved file keeps the merge order.
    lead = ['month', 'INDPRO', 'sentiment']
    cols = lead + [c for c in df.columns if c not in lead]
    display(df[cols].head())

    # save features to csv
    df.to_csv(features_path, index=False)

LDA Features

In [34]:
# Build the LDA feature table from the per-publisher monthly topic shares.
create_features(
    "../data/processed/monthly_topic_shares_by_publisher.csv",
    "../data/processed/features_lda_monthly.csv",
)

Unnamed: 0,month,INDPRO,sentiment,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,...,publication_TMZ,publication_TechCrunch,publication_The Hill,publication_The New York Times,publication_The Verge,publication_Vice,publication_Vice News,publication_Vox,publication_Washington Post,publication_Wired
0,2016-01-01,99.4391,-0.332366,0.047303,0.108757,0.0654,0.177487,0.078956,0.089427,0.147791,...,False,False,False,False,False,False,False,False,False,False
1,2016-01-01,99.4391,-0.35976,0.048579,0.03406,0.039285,0.08041,0.048981,0.047445,0.082206,...,False,False,False,False,False,False,False,False,False,False
2,2016-01-01,99.4391,-0.453634,0.076843,0.100812,0.062806,0.020819,0.084967,0.164064,0.143699,...,False,False,False,False,False,False,False,False,False,False
3,2016-01-01,99.4391,-0.449364,0.178754,0.065679,0.069355,0.092416,0.028589,0.1464,0.133294,...,False,False,False,False,False,False,False,False,False,False
4,2016-01-01,99.4391,-0.171123,0.027663,0.115164,0.163197,0.309055,0.035086,0.034965,0.18921,...,False,False,False,False,False,False,False,False,False,False


BERTopic Features

In [35]:
# Build the BERTopic feature table from the per-publisher monthly topic shares.
create_features(
    "../data/processed/monthly_topic_shares_by_publisher_bertopic.csv",
    "../data/processed/features_bertopic_monthly.csv",
)

Unnamed: 0,month,INDPRO,sentiment,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,...,publication_TMZ,publication_TechCrunch,publication_The Hill,publication_The New York Times,publication_The Verge,publication_Vice,publication_Vice News,publication_Vox,publication_Washington Post,publication_Wired
0,2016-01-01,99.4391,-0.332366,0.019231,0.153846,0.153846,0.134615,0.25,0.076923,0.076923,...,False,False,False,False,False,False,False,False,False,False
1,2016-01-01,99.4391,-0.35976,0.05042,0.033613,0.067227,0.151261,0.571429,0.02521,0.058824,...,False,False,False,False,False,False,False,False,False,False
2,2016-01-01,99.4391,-0.453634,0.426966,0.073034,0.089888,0.016854,0.061798,0.129213,0.179775,...,False,False,False,False,False,False,False,False,False,False
3,2016-01-01,99.4391,-0.449364,0.085,0.05,0.195,0.07,0.265,0.11,0.19,...,False,False,False,False,False,False,False,False,False,False
4,2016-01-01,99.4391,-0.171123,0.006061,0.081818,0.330303,0.330303,0.051515,0.009091,0.006061,...,False,False,False,False,False,False,False,False,False,False


sLDA Features

In [36]:
# sLDA features are built from the predictions file (topics plus IPI values)
# rather than from monthly_topic_shares_by_publisher_sLDA.csv, hence is_slda=True.
create_features(
    "../data/processed/slda_with_ipi_preds.csv",
    "../data/processed/features_slda_monthly.csv",
    is_slda=True,
)

Unnamed: 0,month,INDPRO,sentiment,predicted_ipi,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,...,publication_TMZ,publication_TechCrunch,publication_The Hill,publication_The New York Times,publication_The Verge,publication_Vice,publication_Vice News,publication_Vox,publication_Washington Post,publication_Wired
0,2016-01-01,99.4391,-0.332366,101.013882,0.011177,0.03337,0.079773,0.097302,0.074727,0.098848,...,False,False,False,False,False,False,False,False,False,False
1,2016-01-01,99.4391,-0.35976,101.046797,0.07762,0.006541,0.07078,0.080099,0.067432,0.057566,...,False,False,False,False,False,False,False,False,False,False
2,2016-01-01,99.4391,-0.453634,101.018629,0.246734,0.025729,0.148172,0.097992,0.076219,0.111124,...,False,False,False,False,False,False,False,False,False,False
3,2016-01-01,99.4391,-0.449364,100.983623,0.054624,0.092408,0.274799,0.152614,0.038277,0.093722,...,False,False,False,False,False,False,False,False,False,False
4,2016-01-01,99.4391,-0.171123,100.931572,0.030501,0.068285,0.028256,0.108532,0.118862,0.07473,...,False,False,False,False,False,False,False,False,False,False


LLM Features

In [37]:
# Build the TinyLlama feature table from the per-publisher monthly topic shares.
create_features(
    "../data/processed/monthly_topic_shares_by_publisher_tinyllama.csv",
    "../data/processed/features_tinyllm_monthly.csv",
)

Unnamed: 0,month,INDPRO,sentiment,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,...,publication_Mashable,publication_New Republic,publication_New Yorker,publication_Politico,publication_Reuters,publication_The Hill,publication_The Verge,publication_Vice News,publication_Washington Post,publication_Wired
0,2016-01-01,99.4391,-0.35976,0.409554,0.250494,0.23655,0.185237,0.215628,0.18643,0.224834,...,False,False,False,False,False,False,False,False,False,False
1,2016-01-01,99.4391,-0.453634,0.19636,0.249957,0.330842,0.273089,0.360972,0.224268,0.292444,...,False,False,False,False,False,False,False,False,False,False
2,2016-01-01,99.4391,-0.449364,0.253608,0.232634,0.307011,0.21125,0.246717,0.250021,0.244911,...,False,False,False,False,False,False,False,False,False,False
3,2016-01-01,99.4391,-0.171123,0.20911,0.354614,0.287122,0.241639,0.192669,0.263806,0.312532,...,False,False,False,False,False,False,False,False,False,False
4,2016-01-01,99.4391,-0.297804,0.240403,0.308231,0.253115,0.306913,0.204185,0.583569,0.31966,...,False,False,False,False,False,False,False,False,False,False
