# Financial Data Integration

In this notebook we integrate financial data for public traded companies.

For this purpose we keep the legal entities, sum the values of the brands holded by them, and finally drop the brands.


In [83]:
import pandas as pd
import yfinance as yf

In [84]:
# Clean data from previous notebook and ticks for public companies
clean_df = pd.read_csv("../clean_data/clean_data.csv")
companies = pd.read_csv("../scraped_data/public-companies.csv")

In [85]:
clean_df.head(1)

Unnamed: 0,date,brand,legal_entity,followers,pictures,videos,comments,likes
0,2017-09-23,24S,LVMH Moet Hennessy Louis Vuitton SE,19140.0,75.0,10.0,196.0,18853.0


In [86]:
companies.head(1)

Unnamed: 0,company_name,ticker
0,Capri Holdings,CPRI


In [87]:
# Merge public companies with data
public_companies = pd.merge(companies, clean_df, left_on='company_name', right_on='legal_entity', how='inner')

# Using an aggregation dictionary to include both sum for numeric data and 'first' for the ticker
df = public_companies.groupby(["legal_entity", 'date'], as_index=False).agg({
    'ticker': 'first',  # Keeps the first instance of 'ticker' for each group

    # Add other numeric fields here and specify 'sum' or other appropriate aggregation methods
    'followers': 'sum',
    'pictures': 'sum',
    'videos': 'sum',
    'comments': 'sum',
    'likes': 'sum'
})

# Display the first row to check the output
df.head(1)

Unnamed: 0,legal_entity,date,ticker,followers,pictures,videos,comments,likes
0,Abercrombie & Fitch,2015-01-03,ANF,1669930.0,5.0,1.0,908.0,43976.0


In [88]:
# Date to datetime
df['date'] = pd.to_datetime(df['date'])

In [89]:
def fetch_yahoo_finance_data(ticker, start_date, end_date):
    stock = yf.Ticker(ticker)
    # Fetch historical data within the specified date range and interval
    hist = stock.history(start=start_date, end=end_date, interval="1wk")
    hist.reset_index(inplace=True)
    hist["Ticker"] = ticker  # Add a column to identify the ticker
    return hist

In [90]:
# Fetch data for each ticker from Yahoo Finance

tickers = df["ticker"].unique()
finance_data = []

for ticker in tickers:
    dates_of_interest = df[df["ticker"]==ticker]["date"]
    yahoo_data = fetch_yahoo_finance_data(ticker, min(dates_of_interest), max(dates_of_interest) + pd.Timedelta(days=3))
    finance_data.append(yahoo_data)

finance_data = pd.concat(finance_data, ignore_index=True)
finance_data.head(5)

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Ticker,Capital Gains
0,2015-01-01 00:00:00-05:00,22.424204,23.613186,22.130921,23.248564,9830400,0.0,0.0,ANF,
1,2015-01-08 00:00:00-05:00,23.589407,23.858909,21.893126,22.471764,9245200,0.0,0.0,ANF,
2,2015-01-15 00:00:00-05:00,22.400424,22.590661,20.18099,20.799261,8034600,0.0,0.0,ANF,
3,2015-01-22 00:00:00-05:00,20.838899,21.671186,20.577321,20.704147,7314100,0.0,0.0,ANF,
4,2015-01-29 00:00:00-05:00,20.783413,20.870605,18.960307,20.212702,13216200,0.0,0.0,ANF,


In [91]:
 # Let´s only keep the Closing price (of "next monday") for predicting purposes and Date, with some transformations
finance_data["Date"] = finance_data["Date"].map(lambda x: str(x).split(" ")[0])
finance_data.head(1)

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Ticker,Capital Gains
0,2015-01-01,22.424204,23.613186,22.130921,23.248564,9830400,0.0,0.0,ANF,


In [92]:
finance_data = finance_data[["Date", "Close", "Ticker"]]
finance_data.head(1)

Unnamed: 0,Date,Close,Ticker
0,2015-01-01,23.248564,ANF


In [93]:
finance_data['Date'] = pd.to_datetime(finance_data['Date'])

In [94]:
# Add 2 days to 'period_end_date' in cleaned_df to account for the delay
df['date_adjusted'] = df['date'] + pd.Timedelta(days=9)

In [111]:
# Merge df with finance_data using the adjusted dates and ticker matching
# The 'inner' join means the result will only include rows that have matching values in both DataFrames
merged_df = pd.merge(df, finance_data, left_on=['ticker', 'date_adjusted'], right_on=['Ticker', 'Date'], how='inner')

In [112]:
merged_df.head(1)

Unnamed: 0,legal_entity,date,ticker,followers,pictures,videos,comments,likes,date_adjusted,Date_x,Close_x,Ticker_x,Date_y,Close_y,Ticker_y
0,Academy Sports + Outdoors,2020-09-19,ASO,168956.0,8.0,5.0,485.0,7592.0,2020-09-28,2020-09-28,12.789742,ASO,2020-09-28,12.789742,ASO


In [113]:
# rename Close_x to close:
merged_df.rename(columns={'Close_x': 'close'}, inplace=True)
cols_to_drop = ['ticker', 'date_adjusted', "Date_x", "Ticker_x", "Date_y", "Close_y", "Ticker_y"]
# Drop columns that are not needed for the model
merged_df.drop(columns=cols_to_drop, inplace=True)
merged_df.head(1)

Unnamed: 0,legal_entity,date,followers,pictures,videos,comments,likes,close
0,Academy Sports + Outdoors,2020-09-19,168956.0,8.0,5.0,485.0,7592.0,12.789742


In [116]:
# save data to csv
merged_df.to_csv("../clean_data/financial_data_integrated.csv", index=False)