In [1]:

# Building a sentiment features and merge with weekly market data

import pandas as pd
from pathlib import Path

# Define project folders (relative to /analysis)
PROJECT_DIR = Path("..")  # goes up one level (Final Project DSAP/)
CLEAN_DIR   = PROJECT_DIR / "datasets" / "clean_data"
RAW_DIR     = PROJECT_DIR / "datasets" / "raw_data"

# Load weekly Google Trends and market data
gt_path      = CLEAN_DIR / "google_trends_weekly.csv"
market_path  = RAW_DIR   / "market_weekly_W-FRI.csv"

gt = pd.read_csv(gt_path, parse_dates=["Date"])
market = pd.read_csv(market_path, parse_dates=["Date"])

print("Google Trends weekly shape:", gt.shape)
print("Market weekly shape:", market.shape)

gt.head(), market.head()


Google Trends weekly shape: (270, 21)
Market weekly shape: (310, 6)


(        Date  banking_crisis  bear_market  bull_market  business_confidence  \
 0 2015-11-01              60            3           14                   47   
 1 2015-11-08              47            4           14                   38   
 2 2015-11-15              49            3           15                   64   
 3 2015-11-22              46            2            8                   34   
 4 2015-11-29              71            3           14                   61   
 
    consumer_confidence  debt_crisis  economic_growth  economic_recession  \
 0                   34           52               70                  29   
 1                   28           66               65                  31   
 2                   28           62               65                  31   
 3                   26           46               39                  17   
 4                   26           65               69                  27   
 
    economic_recovery  ...  inflation  job_creation  j

In [2]:
# Align Google Trends weekly dates to the Friday of the same week

# Make sure Date is datetime
gt["Date"] = pd.to_datetime(gt["Date"])

# Convert to weekly period ending on Friday
gt["week_FRI"] = gt["Date"].dt.to_period("W-FRI")

# Take the END of the period (Friday), not the start (Saturday)
gt["Date_FRI"] = gt["week_FRI"].dt.to_timestamp(how="end")

# Clean dataframe: keep only one Date column
gt_aligned = gt.drop(columns=["Date", "week_FRI"]).rename(columns={"Date_FRI": "Date"})

print("Aligned GT weekly dates (first 5):")
print(gt_aligned["Date"].head())


Aligned GT weekly dates (first 5):
0   2015-11-06 23:59:59.999999999
1   2015-11-13 23:59:59.999999999
2   2015-11-20 23:59:59.999999999
3   2015-11-27 23:59:59.999999999
4   2015-12-04 23:59:59.999999999
Name: Date, dtype: datetime64[ns]


In [3]:
import pandas as pd
from pathlib import Path

# Load weekly SP500/VIX data
PROJECT_DIR = Path("..")  
MARKET_PATH = PROJECT_DIR / "datasets" / "raw_data" / "market_weekly_W-FRI.csv"

market_weekly = pd.read_csv(MARKET_PATH)

print("Loaded market_weekly shape:", market_weekly.shape)
market_weekly.head()


Loaded market_weekly shape: (310, 6)


Unnamed: 0,Date,SP500,SP500_return_w,SP500_vol_w_4,VIX,VIX_change_w
0,2015-01-30,1994.98999,-0.028088,0.018237,20.969999,0.230082
1,2015-02-06,2055.469971,0.029865,0.026354,17.290001,-0.192964
2,2015-02-13,2096.98999,0.019998,0.025684,14.69,-0.162961
3,2015-02-20,2110.300049,0.006327,0.025321,14.3,-0.026907
4,2015-02-27,2104.5,-0.002752,0.01444,13.34,-0.069492


In [4]:
market_weekly["Date"] = pd.to_datetime(market_weekly["Date"])
market_weekly["week_FRI"] = market_weekly["Date"].dt.to_period("W-FRI")
market_weekly["Date_FRI"] = market_weekly["week_FRI"].dt.to_timestamp(how="end")

market_aligned = market_weekly.drop(columns=["Date", "week_FRI"]).rename(columns={"Date_FRI": "Date"})

print("Aligned market weekly dates (first 5):")
print(market_aligned["Date"].head())


Aligned market weekly dates (first 5):
0   2015-01-30 23:59:59.999999999
1   2015-02-06 23:59:59.999999999
2   2015-02-13 23:59:59.999999999
3   2015-02-20 23:59:59.999999999
4   2015-02-27 23:59:59.999999999
Name: Date, dtype: datetime64[ns]


In [5]:
print(market_aligned["Date"].head())
print(market_aligned["Date"].dt.day_name().head())
#check that it's on friday

0   2015-01-30 23:59:59.999999999
1   2015-02-06 23:59:59.999999999
2   2015-02-13 23:59:59.999999999
3   2015-02-20 23:59:59.999999999
4   2015-02-27 23:59:59.999999999
Name: Date, dtype: datetime64[ns]
0    Friday
1    Friday
2    Friday
3    Friday
4    Friday
Name: Date, dtype: object


In [6]:
merged = pd.merge(
    gt_aligned, 
    market_aligned, 
    on="Date", 
    how="inner"
)

print("Merged dataset shape:", merged.shape)
merged.head()



Merged dataset shape: (270, 26)


Unnamed: 0,banking_crisis,bear_market,bull_market,business_confidence,consumer_confidence,debt_crisis,economic_growth,economic_recession,economic_recovery,financial_crisis,...,stock_market_optimism,stock_market_rally,strong_economy,unemployement,Date,SP500,SP500_return_w,SP500_vol_w_4,VIX,VIX_change_w
0,60,3,14,47,34,52,70,29,24,42,...,0,0,44,5,2015-11-06 23:59:59.999999999,2099.199951,0.009496,0.007619,14.33,-0.050351
1,47,4,14,38,28,66,65,31,20,48,...,0,21,44,6,2015-11-13 23:59:59.999999999,2023.040039,-0.036955,0.024985,20.08,0.337369
2,49,3,15,64,28,62,65,31,25,45,...,0,0,36,6,2015-11-20 23:59:59.999999999,2089.169922,0.032165,0.028771,15.47,-0.260822
3,46,2,8,34,26,46,39,17,16,33,...,0,0,24,5,2015-11-27 23:59:59.999999999,2090.110107,0.00045,0.028775,15.12,-0.022884
4,71,3,14,61,26,65,69,27,28,48,...,0,0,41,6,2015-12-04 23:59:59.999999999,2091.689941,0.000756,0.028272,14.81,-0.020716


In [7]:
# Save the merged weekly dataset (Google Trends + Market Data)

from pathlib import Path

# Path to save the cleaned dataset
OUTPUT_PATH = Path("../datasets/clean_data/merged_weekly.csv")

# Save without index
merged.to_csv(OUTPUT_PATH, index=False)

print("Merged weekly dataset saved at:")
print(OUTPUT_PATH.resolve())


Merged weekly dataset saved at:
/files/Final Project DSAP/datasets/clean_data/merged_weekly.csv
