In [6]:
import pandas as pd
# Data downloaded as of 31 Mar 2024
# Focus on quarterly real data
# Excluded discontinued variables

def pad_columns_to_09Q3(df, name):
    """Prepend placeholder columns ``{name}65Q4`` .. ``{name}09Q2`` to ``df``.

    The placeholders are created empty and placed to the left of the columns
    already in ``df``.  Afterwards every missing cell of the combined frame
    (placeholder or not) is replaced by -1 and the row index is named ``DATE``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to pad.  Presumably its first real column is the 09Q3 vintage;
        this is not checked here.
    name : str
        Prefix used to build the placeholder column labels.

    Returns
    -------
    pandas.DataFrame
        Placeholder columns first, followed by the columns of ``df``.
    """
    # Two-digit year tags in column order: 66..99 followed by 00..08.
    year_tags = [str(yy) for yy in range(66, 100)] + [f'0{y}' for y in range(0, 9)]

    placeholder_labels = [f'{name}65Q4']
    for tag in year_tags:
        for quarter in range(1, 5):
            placeholder_labels.append(f'{name}{tag}Q{quarter}')
    placeholder_labels += [f'{name}09Q1', f'{name}09Q2']

    empty_block = pd.DataFrame(columns=placeholder_labels)
    padded = pd.concat([empty_block, df], axis=1)
    padded.index.names = ['DATE']
    return padded.fillna(-1)

def pad_columns_to_98Q4(df, name):
    """Prepend placeholder columns ``{name}65Q4`` .. ``{name}98Q3`` to ``df``.

    The placeholders are created empty and placed to the left of the columns
    already in ``df``.  Afterwards every missing cell of the combined frame
    (placeholder or not) is replaced by -1 and the row index is named ``DATE``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to pad.  Presumably its first real column is the 98Q4 vintage;
        this is not checked here.
    name : str
        Prefix used to build the placeholder column labels.

    Returns
    -------
    pandas.DataFrame
        Placeholder columns first, followed by the columns of ``df``.
    """
    placeholder_labels = [f'{name}65Q4']
    # Full years 66..97, four quarters each.
    for yy in range(66, 98):
        for quarter in range(1, 5):
            placeholder_labels.append(f'{name}{yy}Q{quarter}')
    # 1998 only needs its first three quarters.
    placeholder_labels += [f'{name}98Q1', f'{name}98Q2', f'{name}98Q3']

    empty_block = pd.DataFrame(columns=placeholder_labels)
    padded = pd.concat([empty_block, df], axis=1)
    padded.index.names = ["DATE"]
    return padded.fillna(-1)

def month_to_quarter(df, name):
    """Keep the middle-month columns of ``df`` and relabel them as quarters.

    A column is kept when the last two characters of its label are ``M2``,
    ``M5``, ``M8`` or ``11``; the month suffix is then replaced by ``Q1``,
    ``Q2``, ``Q3`` or ``Q4`` respectively (for ``11`` the three trailing
    characters are replaced).  All other columns are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are strings ending in a month tag.
    name : str
        Accepted for call-site symmetry with the other helpers; not used.

    Returns
    -------
    pandas.DataFrame
        The selected columns, in their original order, under quarter labels.
    """
    # Label suffix -> (quarter tag, number of trailing characters to strip).
    suffix_rules = {
        "M2": ("Q1", 2),
        "M5": ("Q2", 2),
        "M8": ("Q3", 2),
        "11": ("Q4", 3),
    }

    kept_labels = [label for label in df.columns if label[-2:] in suffix_rules]

    quarter_labels = []
    for label in kept_labels:
        quarter_tag, strip = suffix_rules[label[-2:]]
        quarter_labels.append(label[:-strip] + quarter_tag)

    selected = df[kept_labels]
    selected.columns = quarter_labels
    return selected

def rows_to_quarter(df, start_year=1947, end_year=2023):
    """Sample monthly rows down to one row per quarter.

    For every year in ``start_year..end_year`` (inclusive) the rows labelled
    ``YYYY:02``, ``YYYY:05``, ``YYYY:08`` and ``YYYY:11`` are selected, in that
    order, and relabelled ``YYYY:Q1`` .. ``YYYY:Q4``.  All other rows are
    dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by ``YYYY:MM`` string labels.
    start_year, end_year : int, optional
        First and last year to keep.  The defaults reproduce the previously
        hard-coded 1947-2023 window.

    Returns
    -------
    pandas.DataFrame
        A new frame with four rows per year.
        NOTE(review): the new index is a plain list of labels, so any index
        name carried by ``df`` (e.g. ``DATE``) is not present on the result.

    Raises
    ------
    KeyError
        If any expected middle-month label is absent from ``df.index``.
    """
    years = range(start_year, end_year + 1)
    middle_months = [f'{year}:{month:02d}' for year in years for month in (2, 5, 8, 11)]

    # Fail with the offending labels spelled out rather than pandas' generic message.
    absent = [label for label in middle_months if label not in df.index]
    if absent:
        raise KeyError(
            f'rows_to_quarter: {len(absent)} expected row label(s) missing '
            f'from the index, e.g. {absent[:4]}'
        )

    quarterly = df.loc[middle_months]
    quarterly.index = [f'{year}:Q{quarter}' for year in years for quarter in range(1, 5)]
    return quarterly

In [7]:
# Each workbook below is read with its DATE column as the row index.
# Where .fillna(-1) is chained directly, every missing cell becomes the -1 sentinel.
# NOTE(review): file names end in "QvQd" — presumably quarterly vintages of
# quarterly observations; confirm against the data source.
RCON = pd.read_excel("./data/project data/RCONQvQd.xlsx", index_col="DATE").fillna(-1)

In [8]:
# No inline fillna: pad_columns_to_09Q3 prepends the rcong65Q4..rcong09Q2
# placeholder columns and applies the -1 fill itself.
rcong = pd.read_excel("./data/project data/rcongQvQd.xlsx", index_col="DATE")
rcong = pad_columns_to_09Q3(rcong, "rcong")

In [9]:
RCONND = pd.read_excel("./data/project data/RCONNDQvQd.xlsx", index_col="DATE").fillna(-1)

In [10]:
RCOND = pd.read_excel("./data/project data/RCONDQvQd.xlsx", index_col="DATE").fillna(-1)

In [11]:
RCONS = pd.read_excel("./data/project data/RCONSQvQd.xlsx", index_col="DATE").fillna(-1)

In [12]:
# Padded on the left up to 09Q2 and filled with -1 by the helper.
rconshh = pd.read_excel("./data/project data/rconshhQvQd.xlsx", index_col="DATE")
rconshh = pad_columns_to_09Q3(rconshh, "rconshh")

In [13]:
# Padded on the left up to 09Q2 and filled with -1 by the helper.
rconsnp = pd.read_excel("./data/project data/rconsnpQvQd.xlsx", index_col="DATE")
rconsnp = pad_columns_to_09Q3(rconsnp, "rconsnp")

In [14]:
# Same loading pattern for each workbook: DATE column as row index, missing cells -> -1.
rinvbf = pd.read_excel("./data/project data/rinvbfQvQd.xlsx", index_col="DATE").fillna(-1)

In [15]:
rinvresid = pd.read_excel("./data/project data/rinvresidQvQd.xlsx", index_col="DATE").fillna(-1)

In [16]:
rinvchi = pd.read_excel("./data/project data/rinvchiQvQd.xlsx", index_col="DATE").fillna(-1)

In [17]:
RNX = pd.read_excel("./data/project data/RNXQvQd.xlsx", index_col="DATE").fillna(-1)

In [18]:
REX = pd.read_excel("./data/project data/REXQvQd.xlsx", index_col="DATE").fillna(-1)

In [19]:
RIMP = pd.read_excel("./data/project data/RIMPQvQd.xlsx", index_col="DATE").fillna(-1)

In [20]:
RG = pd.read_excel("./data/project data/RGQvQd.xlsx", index_col="DATE").fillna(-1)

In [21]:
RGF = pd.read_excel("./data/project data/RGFQvQd.xlsx", index_col="DATE").fillna(-1)

In [22]:
RGSL = pd.read_excel("./data/project data/RGSLQvQd.xlsx", index_col="DATE").fillna(-1)

In [23]:
# No inline fillna: pad_columns_to_09Q3 prepends the rconhh65Q4..rconhh09Q2
# placeholder columns and applies the -1 fill itself.
rconhh = pd.read_excel("./data/project data/rconhhQvQd.xlsx", index_col="DATE")
rconhh = pad_columns_to_09Q3(rconhh, "rconhh")

In [24]:
# Same loading pattern for each workbook: DATE column as row index, missing cells -> -1.
# Note that the variable names are upper-case while most file names here are lower-case.
WSD = pd.read_excel("./data/project data/wsdQvQd.xlsx", index_col="DATE").fillna(-1)

In [25]:
OLI = pd.read_excel("./data/project data/oliQvQd.xlsx", index_col="DATE").fillna(-1)

In [26]:
PROPI = pd.read_excel("./data/project data/propiQvQd.xlsx", index_col="DATE").fillna(-1)

In [27]:
RENTI = pd.read_excel("./data/project data/rentiQvQd.xlsx", index_col="DATE").fillna(-1)

In [28]:
DIV = pd.read_excel("./data/project data/divQvQd.xlsx", index_col="DATE").fillna(-1)

In [29]:
PINTI = pd.read_excel("./data/project data/pintiQvQd.xlsx", index_col="DATE").fillna(-1)

In [30]:
TRANR = pd.read_excel("./data/project data/tranrQvQd.xlsx", index_col="DATE").fillna(-1)

In [31]:
SSCONTRIB = pd.read_excel("./data/project data/sscontribQvQd.xlsx", index_col="DATE").fillna(-1)

In [32]:
NPI = pd.read_excel("./data/project data/npiQvQd.xlsx", index_col="DATE").fillna(-1)

In [33]:
PTAX = pd.read_excel("./data/project data/ptaxQvQd.xlsx", index_col="DATE").fillna(-1)

In [34]:
NDPI = pd.read_excel("./data/project data/ndpiQvQd.xlsx", index_col="DATE").fillna(-1)

In [35]:
NCON = pd.read_excel("./data/project data/nconQvQd.xlsx", index_col="DATE").fillna(-1)

In [36]:
PINTPAID = pd.read_excel("./data/project data/pintpaidQvQd.xlsx", index_col="DATE").fillna(-1)

In [37]:
TRANPF = pd.read_excel("./data/project data/tranpfQvQd.xlsx", index_col="DATE").fillna(-1)

In [38]:
NPSAV = pd.read_excel("./data/project data/npsavQvQd.xlsx", index_col="DATE").fillna(-1)

In [39]:
RATESAV = pd.read_excel("./data/project data/ratesavQvQd.xlsx", index_col="DATE").fillna(-1)

In [40]:
NCPROFAT = pd.read_excel("./data/project data/NCPROFATQvQd.xlsx", index_col="DATE").fillna(-1)

In [41]:
# Read the workbook with DATE as the row index and discard the four 1946 rows.
NCPROFATW = pd.read_excel(
    "./data/project data/NCPROFATWQvQd.xlsx", index_col="DATE"
).drop(index=["1946:Q1", "1946:Q2", "1946:Q3", "1946:Q4"])

# Placeholder columns NCPROFATW65Q4 .. NCPROFATW80Q4, placed ahead of the
# columns that came from the file.
na_cols_NCPROFATW = [f'NCPROFATW65Q4'] + [
    f'NCPROFATW{yy}Q{q}' for yy in range(66, 81) for q in range(1, 5)
]
na_df_NCPROFATW = pd.DataFrame(columns=na_cols_NCPROFATW)
NCPROFATW = pd.concat([na_df_NCPROFATW, NCPROFATW], axis=1)
NCPROFATW.index.names = ["DATE"]
# Placeholders and any other missing cells take the -1 sentinel.
NCPROFATW = NCPROFATW.fillna(-1)

In [42]:
# The frames in the next three cells go through rows_to_quarter, which keeps the
# rows labelled YYYY:02 / :05 / :08 / :11 for 1947-2023 and relabels them YYYY:Q1..Q4.
# Missing cells are set to -1 before the rows are sampled.
M1 = pd.read_excel("./data/project data/m1QvMd.xlsx", index_col="DATE").fillna(-1)
M1 = rows_to_quarter(M1)


In [43]:
M2 = pd.read_excel("./data/project data/m2QvMd.xlsx", index_col="DATE").fillna(-1)
M2 = rows_to_quarter(M2)

In [44]:
CPI = pd.read_excel("./data/project data/cpiQvMd.xlsx", index_col="DATE").fillna(-1)
CPI = rows_to_quarter(CPI)

In [45]:
# Three steps, in this order:
#   1. month_to_quarter keeps the columns ending in M2/M5/M8/M11 and relabels them Q1..Q4;
#   2. pad_columns_to_98Q4 prepends the 65Q4..98Q3 placeholders and fills missing cells with -1;
#   3. rows_to_quarter samples the monthly rows down to quarterly rows.
PCPIX = pd.read_excel("./data/project data/pcpixMvMd.xlsx", index_col="DATE")
PCPIX = month_to_quarter(PCPIX, "PCPIX")
PCPIX = pad_columns_to_98Q4(PCPIX, "PCPIX")
PCPIX = rows_to_quarter(PCPIX)

In [46]:
# Same three-step pipeline as the previous cell.
PPPI = pd.read_excel("./data/project data/pppiMvMd.xlsx", index_col="DATE")
PPPI = month_to_quarter(PPPI, "PPPI")
PPPI = pad_columns_to_98Q4(PPPI, "PPPI")
PPPI = rows_to_quarter(PPPI)

In [47]:
# Same three-step pipeline as the previous cell.
PPPIX = pd.read_excel("./data/project data/pppixMvMd.xlsx", index_col="DATE")
PPPIX = month_to_quarter(PPPIX, "PPPIX")
PPPIX = pad_columns_to_98Q4(PPPIX, "PPPIX")
PPPIX = rows_to_quarter(PPPIX)

In [48]:
# Read with DATE as the row index; missing cells become the -1 sentinel.
P = pd.read_excel("./data/project data/PQvQd.xlsx", index_col="DATE").fillna(-1)

In [49]:
PCON = pd.read_excel("./data/project data/pconQvQd.xlsx", index_col="DATE").fillna(-1)

In [50]:
# The next four frames are padded on the left with 65Q4..09Q2 placeholder
# columns; pad_columns_to_09Q3 also performs the -1 fill.
pcong = pd.read_excel("./data/project data/pcongQvQd.xlsx", index_col="DATE")
pcong = pad_columns_to_09Q3(pcong, "pcong")

In [51]:
pconshh = pd.read_excel("./data/project data/pconshhQvQd.xlsx", index_col="DATE")
pconshh = pad_columns_to_09Q3(pconshh, "pconshh")

In [52]:
pconsnp = pd.read_excel("./data/project data/pconsnpQvQd.xlsx", index_col="DATE")
pconsnp = pad_columns_to_09Q3(pconsnp, "pconsnp")

In [53]:
pconhh = pd.read_excel("./data/project data/pconhhQvQd.xlsx", index_col="DATE")
pconhh = pad_columns_to_09Q3(pconhh, "pconhh")

In [54]:
# Read the workbook with DATE as the row index.
PCONX = pd.read_excel("./data/project data/PCONXQvQd.xlsx", index_col="DATE")

# Placeholder columns PCONX65Q4 .. PCONX95Q4, placed ahead of the columns
# that came from the file.
na_cols_PCONX = ["PCONX65Q4"] + [
    f'PCONX{yy}Q{q}' for yy in range(66, 96) for q in range(1, 5)
]
na_df_PCONX = pd.DataFrame(columns=na_cols_PCONX)
PCONX = pd.concat([na_df_PCONX, PCONX], axis=1)
PCONX.index.names = ["DATE"]
# Placeholders and any other missing cells take the -1 sentinel.
PCONX = PCONX.fillna(-1)

In [55]:
# Read with DATE as the row index; missing cells become the -1 sentinel.
PIMP = pd.read_excel("./data/project data/pimpQvQd.xlsx", index_col="DATE").fillna(-1)

In [56]:
# Three steps, in this order:
#   1. month_to_quarter keeps the columns ending in M2/M5/M8/M11 and relabels them Q1..Q4;
#   2. pad_columns_to_98Q4 prepends the 65Q4..98Q3 placeholders and fills missing cells with -1;
#   3. rows_to_quarter samples the monthly rows down to quarterly rows.
POP = pd.read_excel("./data/project data/popMvMd.xlsx", index_col="DATE")
POP = month_to_quarter(POP, "POP")
POP = pad_columns_to_98Q4(POP, "POP")
POP = rows_to_quarter(POP)

In [57]:
# Same three-step pipeline as the previous cell.
LFC = pd.read_excel("./data/project data/lfcMvMd.xlsx", index_col="DATE")
LFC = month_to_quarter(LFC, "LFC")
LFC = pad_columns_to_98Q4(LFC, "LFC")
LFC = rows_to_quarter(LFC)

In [58]:
# Same three-step pipeline as the previous cell.
LFPART = pd.read_excel("./data/project data/lfpartMvMd.xlsx", index_col="DATE")
LFPART = month_to_quarter(LFPART, "LFPART")
LFPART = pad_columns_to_98Q4(LFPART, "LFPART")
LFPART = rows_to_quarter(LFPART)

In [59]:
# Columns are used as read; only the rows are sampled down to quarters.
RUC = pd.read_excel("./data/project data/rucQvMd.xlsx", index_col="DATE").fillna(-1)
RUC = rows_to_quarter(RUC)

In [60]:
# Read the workbook, then reduce its monthly columns to quarter labels.
EMPLOY = month_to_quarter(
    pd.read_excel("./data/project data/employMvMd.xlsx", index_col="DATE"),
    "EMPLOY",
)
# Remove the three columns EMPLOY65Q1..EMPLOY65Q3 so the first column kept is 65Q4.
EMPLOY = EMPLOY.drop(columns=["EMPLOY65Q1", "EMPLOY65Q2", "EMPLOY65Q3"])
# Missing cells take the -1 sentinel before the rows are sampled down to quarters.
EMPLOY = rows_to_quarter(EMPLOY.fillna(-1))

In [61]:
# Read the workbook, then reduce its monthly columns to quarter labels.
H = month_to_quarter(
    pd.read_excel("./data/project data/hMvMd.xlsx", index_col="DATE"), "H"
)

# Placeholder columns H65Q4 .. H71Q3, placed ahead of the columns from the file.
na_cols_H = (
    ["H65Q4"]
    + [f'H{yy}Q{q}' for yy in range(66, 71) for q in range(1, 5)]
    + ["H71Q1", "H71Q2", "H71Q3"]
)
na_df_H = pd.DataFrame(columns=na_cols_H)
H = pd.concat([na_df_H, H], axis=1)
H.index.names = ["DATE"]
# Missing cells take the -1 sentinel before the rows are sampled down to quarters.
H = rows_to_quarter(H.fillna(-1))

In [62]:
# Read the workbook, then reduce its monthly columns to quarter labels.
HG = month_to_quarter(
    pd.read_excel("./data/project data/hgMvMd.xlsx", index_col="DATE"), "HG"
)

# Placeholder columns HG65Q4 .. HG71Q3, placed ahead of the columns from the file.
na_cols_HG = (
    ["HG65Q4"]
    + [f'HG{yy}Q{q}' for yy in range(66, 71) for q in range(1, 5)]
    + ["HG71Q1", "HG71Q2", "HG71Q3"]
)
na_df_HG = pd.DataFrame(columns=na_cols_HG)
HG = pd.concat([na_df_HG, HG], axis=1)
HG.index.names = ["DATE"]
# Missing cells take the -1 sentinel before the rows are sampled down to quarters.
HG = rows_to_quarter(HG.fillna(-1))

In [63]:
# Read the workbook, then reduce its monthly columns to quarter labels.
HS = month_to_quarter(
    pd.read_excel("./data/project data/hsMvMd.xlsx", index_col="DATE"), "HS"
)

# Placeholder columns HS65Q4 .. HS71Q3, placed ahead of the columns from the file.
na_cols_HS = (
    ["HS65Q4"]
    + [f'HS{yy}Q{q}' for yy in range(66, 71) for q in range(1, 5)]
    + ["HS71Q1", "HS71Q2", "HS71Q3"]
)
na_df_HS = pd.DataFrame(columns=na_cols_HS)
HS = pd.concat([na_df_HS, HS], axis=1)
HS.index.names = ["DATE"]
# Missing cells take the -1 sentinel before the rows are sampled down to quarters.
HS = rows_to_quarter(HS.fillna(-1))

In [64]:
# Read with DATE as the row index, then let pad_columns_to_98Q4 prepend the
# OPH65Q4..OPH98Q3 placeholder columns and fill missing cells with -1.
OPH = pd.read_excel("./data/project data/OPHQvQd.xlsx", index_col="DATE")
OPH = pad_columns_to_98Q4(OPH, "OPH")

In [65]:
# Same treatment as OPH: left-padded up to 98Q3 and filled with -1.
ULC = pd.read_excel("./data/project data/ULCQvQd.xlsx", index_col="DATE")
ULC = pad_columns_to_98Q4(ULC, "ULC")

In [66]:
# Read the workbook, then reduce its monthly columns to quarter labels.
IPT = month_to_quarter(
    pd.read_excel("./data/project data/iptMvMd.xlsx", index_col="DATE"), "IPT"
)
# Remove the twelve columns IPT62Q4 .. IPT65Q3.
IPT = IPT.drop(columns=["IPT62Q4",
                        "IPT63Q1", "IPT63Q2", "IPT63Q3", "IPT63Q4",
                        "IPT64Q1", "IPT64Q2", "IPT64Q3", "IPT64Q4",
                        "IPT65Q1", "IPT65Q2", "IPT65Q3"])
# Missing cells take the -1 sentinel before the rows are sampled down to quarters.
IPT = rows_to_quarter(IPT.fillna(-1))

In [67]:
# Read the workbook, then reduce its monthly columns to quarter labels.
IPM = month_to_quarter(
    pd.read_excel("./data/project data/ipmMvMd.xlsx", index_col="DATE"), "IPM"
)
# Remove the twelve columns IPM62Q4 .. IPM65Q3.
IPM = IPM.drop(columns=["IPM62Q4",
                        "IPM63Q1", "IPM63Q2", "IPM63Q3", "IPM63Q4",
                        "IPM64Q1", "IPM64Q2", "IPM64Q3", "IPM64Q4",
                        "IPM65Q1", "IPM65Q2", "IPM65Q3"])
# Missing cells take the -1 sentinel before the rows are sampled down to quarters.
IPM = rows_to_quarter(IPM.fillna(-1))

In [68]:
CUT = pd.read_excel("./data/project data/cutMvMd.xlsx", index_col="DATE")

# Add four empty rows for the 1947 middle months on top of the data;
# rows_to_quarter looks these labels up and would fail without them.
na_rows_CUT = pd.DataFrame(
    columns=CUT.columns,
    index=["1947:02", "1947:05", "1947:08", "1947:11"],
)
CUT = month_to_quarter(pd.concat([na_rows_CUT, CUT], axis=0), "CUT")

# Placeholder columns CUT65Q4 .. CUT83Q2, placed ahead of the columns from the file.
na_cols_CUT = (
    ["CUT65Q4"]
    + [f'CUT{yy}Q{q}' for yy in range(66, 83) for q in range(1, 5)]
    + ["CUT83Q1", "CUT83Q2"]
)
na_df_CUT = pd.DataFrame(columns=na_cols_CUT)
CUT = pd.concat([na_df_CUT, CUT], axis=1)
CUT.index.names = ["DATE"]
# Missing cells take the -1 sentinel before the rows are sampled down to quarters.
CUT = rows_to_quarter(CUT.fillna(-1))

In [69]:
CUM = pd.read_excel("./data/project data/cumMvMd.xlsx", index_col="DATE")

# Add four empty rows for the 1947 middle months on top of the data;
# rows_to_quarter looks these labels up and would fail without them.
na_rows_CUM = pd.DataFrame(
    columns=CUM.columns,
    index=["1947:02", "1947:05", "1947:08", "1947:11"],
)
CUM = month_to_quarter(pd.concat([na_rows_CUM, CUM], axis=0), "CUM")

# Placeholder columns CUM65Q4 .. CUM79Q2, placed ahead of the columns from the file.
na_cols_CUM = (
    ["CUM65Q4"]
    + [f'CUM{yy}Q{q}' for yy in range(66, 79) for q in range(1, 5)]
    + ["CUM79Q1", "CUM79Q2"]
)
na_df_CUM = pd.DataFrame(columns=na_cols_CUM)
CUM = pd.concat([na_df_CUM, CUM], axis=1)
CUM.index.names = ["DATE"]
# Missing cells take the -1 sentinel before the rows are sampled down to quarters.
CUM = rows_to_quarter(CUM.fillna(-1))

In [70]:
# Read the workbook, then reduce its monthly columns to quarter labels.
HSTARTS = month_to_quarter(
    pd.read_excel("./data/project data/hstartsMvMd.xlsx", index_col="DATE"),
    "HSTARTS",
)

# Placeholder columns HSTARTS65Q4 .. HSTARTS67Q4, placed ahead of the columns from the file.
na_cols_HSTARTS = [
    "HSTARTS65Q4",
    "HSTARTS66Q1", "HSTARTS66Q2", "HSTARTS66Q3", "HSTARTS66Q4",
    "HSTARTS67Q1", "HSTARTS67Q2", "HSTARTS67Q3", "HSTARTS67Q4",
]
na_df_HSTARTS = pd.DataFrame(columns=na_cols_HSTARTS)
HSTARTS = pd.concat([na_df_HSTARTS, HSTARTS], axis=1)
HSTARTS.index.names = ["DATE"]
# Missing cells take the -1 sentinel before the rows are sampled down to quarters.
HSTARTS = rows_to_quarter(HSTARTS.fillna(-1))

In [None]:
# Real GNP/GDP (ROUTPUT)
# Source: https://www.philadelphiafed.org/surveys-and-data/real-time-data-research/routput
# Read with DATE as the row index; missing cells become the -1 sentinel.
ROUTPUT = pd.read_excel("./data/project data/ROUTPUTQvQd.xlsx", index_col="DATE").fillna(-1)

In [None]:
# pickle is not imported by any other visible cell, so import it here;
# without it the dump below raises NameError.
import pickle

# All prepared frames in a fixed order.  The pickle stores a plain list, so a
# frame can only be identified by its position in this list.
macro_variables = [RCON, rcong, RCONND, RCOND, RCONS, rconshh, rconsnp, rinvbf, rinvresid,
                   rinvchi, RNX, REX, RIMP, RG, RGF, RGSL, rconhh, WSD, OLI, PROPI, RENTI,
                   DIV, PINTI, TRANR, SSCONTRIB, NPI, PTAX, NDPI, NCON, PINTPAID, TRANPF,
                   NPSAV, RATESAV, NCPROFAT, NCPROFATW, M1, M2, CPI, PCPIX, PPPI, PPPIX,
                   P, PCON, pcong, pconshh, pconsnp, pconhh, PCONX, PIMP, POP, LFC, LFPART,
                   RUC, EMPLOY, H, HG, HS, OPH, ULC, IPT, IPM, CUT, CUM, HSTARTS, ROUTPUT]

# Written to the current working directory in binary mode.
with open('preprocessed_data.pkl', 'wb') as f:
    pickle.dump(macro_variables, f)