In [None]:
import glob
import json
import os
import re
import time
from multiprocessing import Pool

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm.auto import tqdm

In [None]:
# # !python -m spacy download en_core_web_sm
# import spacy

# nlp = spacy.load('en_core_web_sm')

In [None]:
def make_request(url, params=None, headers=None, max_retries = 5, timeout=None):
    """
    GET `url`, retrying until a 200 response arrives or the attempts run out.

    Parameters
    ----------
    url, params, headers : forwarded unchanged to `requests.get`.
    max_retries : total number of attempts; at least one request is always made.
    timeout : optional timeout in seconds forwarded to `requests.get`.
        The default None waits indefinitely, as before.

    Returns
    -------
    requests.Response
        The first response with status 200, otherwise the response of the
        final attempt whatever its status code.

    Raises
    ------
    requests.RequestException
        If the final attempt fails at the transport level.
    """
    attempts = max(max_retries, 1)

    for attempt in range(1, attempts + 1):
        is_last = attempt == attempts
        try:
            response = requests.get(url=url, params=params, headers=headers, timeout=timeout)
        except requests.RequestException:
            # Network errors are retried like non-200 responses;
            # only the last failure is surfaced to the caller.
            if is_last:
                raise
        else:
            if response.status_code == 200 or is_last:
                return response

        # Pause between attempts
        time.sleep(1)

# Companies by SIC 

In [None]:
def SIC_search(search_params):
    """
    Search the EDGAR company browser by SIC code.

    Example request:
    https://www.sec.gov/cgi-bin/browse-edgar?company=&match=&filenum=&State=&Country=&SIC=1000&myowner=exclude&action=getcompany

    Parameters
    ----------
    search_params : dict
        Query-string parameters; must contain the key 'SIC'.

    Returns
    -------
    pandas.DataFrame or None
        The first table of the result page with an added 'SIC' column, or
        None when the page has no table or the table cannot be parsed.
    """
    from io import StringIO  # local import: read_html is given a file-like object

    url = "https://www.sec.gov/cgi-bin/browse-edgar"

    headers = {
        'User-Agent': 'kuleuven amin.tavakkolnia@kuleuven.be',
        'Host': 'www.sec.gov'
    }

    res = make_request(url, params=search_params, headers=headers, max_retries=10)

    search = BeautifulSoup(res.content, 'html.parser')

    # No <table> on the page: nothing to parse
    if search.table is None:
        return None

    try:
        search_df = pd.read_html(StringIO(str(search.table)))[0]
        search_df['SIC'] = search_params['SIC']
        return search_df
    except Exception:
        # Best effort: an unparsable page counts as "no results".
        # `Exception` rather than a bare except so Ctrl-C still interrupts.
        return None

In [None]:
# Retrieve the list of all SIC codes from www.sec.gov and cache it as an Excel file
res = make_request(
    url='https://www.sec.gov/search-filings/standard-industrial-classification-sic-code-list',
    headers = {
        'User-Agent': 'kuleuven amin.tavakkolnia@kuleuven.be',
        'Host': 'www.sec.gov'
    }
)
# The first table found on the page is taken as the SIC list
SIC_list = pd.read_html(res.content)[0]
SIC_list.to_excel('SIC_list.xlsx', index=False)

In [None]:
# Reload the cached SIC list from disk and preview it
SIC_list = pd.read_excel('SIC_list.xlsx')
SIC_list.head()

In [None]:
# Page through the EDGAR company browser (100 rows per request) for every SIC code.
# Pages are collected in a list and concatenated once, instead of growing a
# DataFrame inside the loop.
company_pages = []
all_companies = pd.DataFrame()

for i, row_label in enumerate(SIC_list.index):
    # Send the actual SIC code, not the row label of SIC_list (which is 0, 1, 2, ...).
    # NOTE(review): assumes the SEC table names this column 'SIC Code' — confirm with SIC_list.head()
    sic = SIC_list.loc[row_label, 'SIC Code']
    start_cnt = 0

    while True:
        com = SIC_search({
            'action': 'getcompany',
            'SIC': sic,
            'start': start_cnt,
            'count': 100,
            'owner': 'include'
        })
        # SIC_search returns None when no further result table is available
        if com is None:
            break

        com['Industry'] = SIC_list.loc[row_label, 'Office']
        company_pages.append(com)
        start_cnt += 100

    # Periodic checkpoint so a crash does not lose everything
    if i%50 == 0:
        print(f"{i} SICs added")
        if company_pages:
            all_companies = pd.concat(company_pages)
        all_companies.to_csv('all_companies.csv', index=False)
    time.sleep(1)

if company_pages:
    all_companies = pd.concat(company_pages)
all_companies.to_excel('all_companies.xlsx', index=False)

# Get list of filings per company

In [None]:
# Reload the scraped company list and show a random sample.
# NOTE(review): the file is written with index=False, so index_col=0 turns the
# first data column into the index — confirm this is intended.
all_companies = pd.read_excel('all_companies.xlsx', index_col=0)
all_companies.sample(5)

In [None]:
def get_filings(CIK):
    """
    Download the filing index of one company from data.sec.gov.

    Parameters
    ----------
    CIK : int
        Central Index Key; it is zero-padded to 10 digits in the URL.

    Returns
    -------
    pandas.DataFrame
        The 'recent' filings plus the contents of every additional file listed
        under filings['files'], with a 'CIK' column added.
    """
    headers = {
    'User-Agent': 'kuleuven amin.tavakkolnia@kuleuven.be'
    }
    res = make_request(f"https://data.sec.gov/submissions/CIK{CIK:010d}.json", headers=headers)
    filings = res.json()['filings']

    frames = [pd.DataFrame(filings['recent'])]

    # Fetch every additional file, not only the first one
    for extra in filings.get('files') or []:
        extra_json = make_request(
            f"https://data.sec.gov/submissions/{extra['name']}", headers=headers
        ).json()
        frames.append(pd.DataFrame(extra_json))

    temp = pd.concat(frames)
    temp["CIK"] = CIK

    return temp

Example of a 10-K primary document URL: https://www.sec.gov/Archives/edgar/data/1632053/000107997419000277/apotheca10k_1312019.htm

In [None]:
# Download one example 10-K document and parse it with BeautifulSoup.
# NOTE(review): `CIK` is not assigned in any cell visible above this one, so the
# cell only runs if the kernel still holds it from elsewhere — define it here.
accessionNumber = '000110465906084288'
doc_name = 'a06-25759_210k.htm'
url = "https://www.sec.gov/Archives/edgar/data/" + str(CIK) + "/" + accessionNumber + "/" + doc_name

report = make_request(url, max_retries=10)

html = BeautifulSoup(report.content, 'html.parser')

print(url)

In [None]:
import string
import nltk
from nltk.corpus import stopwords

# Lookup sets used when cleaning filing text
punc = set(string.punctuation)
# A set gives O(1) membership tests when filtering long documents word by word
stop = set(stopwords.words('english'))

In [None]:
# Load the filings table and build the URL of each primary document:
# <archive root>/<cik>/<accession number without dashes>/<primaryDocument>
filings_df = pd.read_csv("Data/filings_df.csv")
filings_df['url'] = (
    "https://www.sec.gov/Archives/edgar/data/" + 
    filings_df["cik"].astype(str) + "/" + 
    filings_df['accessionNumber'].str.replace('-', '') + "/" + 
    filings_df["primaryDocument"])

In [None]:
# Fetch the first filing and extract the text of its <body>.
# `headers` is a notebook-level global that get_length() reads.
# NOTE(review): this User-Agent has no e-mail address, unlike the other requests
# in this notebook — confirm it is accepted by sec.gov.
headers = {
'User-Agent': 'kuleuven amin.tavakkolnia'
}
res = make_request(filings_df['url'][0], max_retries=10, headers=headers)
# NOTE(review): the name `html` is also used by `import html` in a later cell
html = BeautifulSoup(res.content, 'html.parser')
text = html.body.text

In [None]:
def get_length(url):
    """
    Count the words of a filing after removing non-letters and stop words.

    The document at `url` is downloaded with the notebook-level `headers`,
    lower-cased, stripped of every character that is neither a letter nor
    whitespace, split on whitespace and filtered against the notebook-level
    `stop` collection.

    Returns
    -------
    int
        Number of remaining words.
    """
    res = make_request(url=url, max_retries=10, headers=headers)
    html = BeautifulSoup(res.content, 'html.parser')

    # Documents without a <body> tag would make `html.body` None;
    # fall back to the text of the whole document.
    container = html.body if html.body is not None else html
    text = container.text

    lower = text.lower()
    alpha = ''.join(x for x in lower if x.isalpha() or x.isspace())

    # Build the lookup set once per call so membership tests are O(1)
    stop_words = set(stop)

    return sum(1 for word in alpha.split() if word not in stop_words)

In [None]:
# Smoke test: word counts for the first ten filing URLs
filings_df['url'][:10].apply(get_length)

In [None]:
# regex to get "Item 1A"

# item = 'Item 1A risk factors'.lower()
# pattern = "(item[\-_\s]?1a)?[\s\-_:]*(risk factor[s])?"

# re.fullmatch(pattern=pattern, string=item)

# def find_risks(tag):
#     pattern = "(item[\-_\s]?1a[\.]?)[\s\-_:]*(risk factor[s]?)"
#     if re.fullmatch(pattern=pattern, string=str(tag.string).lower()):
#         return True
#     else:
#         return False
    
# tags = html.find_all(find_risks)

# SEC API

In [None]:
from sec_api import QueryApi, ExtractorApi, MappingApi
import html
import gc
import pandas as pd
from tqdm.auto import tqdm
import time
import re

# Read the sec-api key from the environment instead of committing it to the notebook.
# The key that used to be hard-coded here should be treated as leaked and rotated.
SEC_API_KEY = os.environ["SEC_API_KEY"]

extractorApi = ExtractorApi(SEC_API_KEY)
queryApi = QueryApi(api_key=SEC_API_KEY)
mappingApi = MappingApi(api_key=SEC_API_KEY)

In [None]:
# Load the 10-K URL list, keeping only rows that have both a filing link and a report period
urls10K_df = pd.read_csv("Data/10Kurls.csv").dropna(subset=['linkToFilingDetails', 'periodOfReport'])
# The 7th '/'-separated piece of the link is stored as the filer CIK
# NOTE(review): assumes links look like https://www.sec.gov/Archives/edgar/data/<CIK>/... — confirm
urls10K_df['filerCIK'] = urls10K_df['linkToFilingDetails'].apply(lambda x: x.split('/')[6])

In [None]:
import re
# Read a job log and pull out every https URL ending in .htm or .txt
with open('SEC_api.o56288177', 'r') as f:
    errors = f.read()

missed_urls = re.findall(pattern=r"https://[^\s]*\.(?:htm|txt)", string=errors)

# Keep only the filings whose link appears in that log
urls10K_df = urls10K_df[urls10K_df['linkToFilingDetails'].isin(missed_urls)]

In [None]:
# Query: 13F-HR filings (amendments excluded) filed 2014-01-01 to 2014-02-01,
# newest first, first 20 results
query = {
  "query": { "query_string": { 
      "query": "formType:\"13F-HR\" " + 
               "AND NOT formType:\"13F-HR/A\" " +
               "AND filedAt:[2014-01-01 TO 2014-02-01]",
      "time_zone": "America/New_York"
  } },
  "from": "0",
  "size": "20",
  "sort": [{ "filedAt": { "order": "desc" } }]
}

response = queryApi.get_filings(query)

In [None]:
# Number of filings returned by the last query
len(response['filings'])

In [None]:
def Flatten_holdings(filing):
    """
    Turn one filing dict into a list of flat per-holding records.

    Each record repeats the filing-level fields ('formType', 'cik', 'filedAt',
    'periodOfReport') and adds the holding-level fields. The holding's own
    'cik' is stored as 'holding_cik' so it does not overwrite the filer's.
    Missing keys become None.

    Parameters
    ----------
    filing : dict
        Filing record; 'holdings' may be absent, None or empty.

    Returns
    -------
    list of dict
        One record per holding; an empty list when there are no holdings.
    """
    # Filing-level fields shared by every holding
    general_data = {key: filing.get(key) for key in ['formType', 'cik', 'filedAt', 'periodOfReport']}

    flattened_holdings = []
    # `or []` covers both a missing 'holdings' key and an explicit None
    for holding in filing.get('holdings') or []:
        flattened_holdings.append({
            **general_data,
            'cusip': holding.get('cusip'),
            'holding_cik': holding.get('cik'),
            'otherManager': holding.get('otherManager'),
            'investmentDiscretion': holding.get('investmentDiscretion'),
            'value': holding.get('value'),
            'titleOfClass': holding.get('titleOfClass')
        })

    return flattened_holdings

# Flatten every filing of the last query into one list of holding records
holdings_list = []

for filing in response['filings']:
    holdings_list.extend(Flatten_holdings(filing))

# One row per holding
holdings_df = pd.DataFrame(holdings_list)

In [None]:
urls10K_df = pd.read_csv("Data/10Kurls.csv").dropna(subset=['linkToFilingDetails', 'periodOfReport'])
CIKs = urls10K_df['cik'].unique()

def cik_tic_map(cik, retries=3):
    """
    Map a CIK to ticker, CUSIP and company details via `mappingApi.resolve`.

    Returns whatever `mappingApi.resolve('cik', ...)` returns, or None when
    every attempt failed (the last error is printed).
    """
    for attempt in range(retries):
        try:
            return mappingApi.resolve('cik', str(cik))
        except Exception as e:
            if attempt == retries - 1:
                # Out of attempts: report the error, do not wait again
                print(e)
            else:
                time.sleep(5)
    return None

# Only the first ten CIKs are resolved here
output = [cik_tic_map(cik) for cik in CIKs[:10]]

In [None]:
# Flatten the per-CIK result lists. `if X` skips lookups that failed
# (cik_tic_map returns None) or came back empty, which would otherwise raise.
cik_ticker_df = pd.DataFrame(
    [x for X in output if X for x in X]
).drop_duplicates(subset=['cik', 'ticker'])
cik_ticker_df['cik'] = cik_ticker_df['cik'].astype(int)

# Keep only rows whose CIK is in the 10-K universe
cik_ticker_df = cik_ticker_df[cik_ticker_df['cik'].isin(CIKs)].reset_index(drop=True)

In [None]:
# Persist the CIK -> ticker/CUSIP mapping
cik_ticker_df.to_csv('Data/CIK_Ticker_CUSIP.csv', index=False)

In [None]:
# Query template; the "query" string is filled in by the next cell
base_query = {
  "query": { 
      "query_string": { 
          "query": "PLACEHOLDER", # this will be set during runtime 
          "time_zone": "America/New_York"
      } 
  },
  "from": "0", # starting point in the list of urls
  "size": "200", # number of data points returned in every call
  # sort by filedAt
  "sort": [{ "filedAt": { "order": "desc" } }]
}

In [None]:
# 10-K filings filed in January 2022.
# 'NT 10-K' = Non-Timely 10-K, which is not an annual report, so it is excluded.
# '10-K/A' is a filing amendment.
universe_query = (
    "formType:\"10-K\" "
    + "AND NOT formType:\"NT 10-K\" "
    + "AND filedAt:[2022-01-01 TO 2022-01-31]"
)
base_query["query"]["query_string"]["query"] = universe_query
response = queryApi.get_filings(base_query)

# One row per filing, holding the fields of interest (None when a field is absent)
urls_list = [
    [filing.get(field) for field in ['linkToFilingDetails', 'cik', 'ticker',
                                     'filedAt', 'periodOfReport', 'formType']]
    for filing in response["filings"]
]

urls10K_df = pd.DataFrame(
    urls_list,
    columns=['linkToFilingDetails', 'cik', 'ticker',
             'filedAt', 'periodOfReport', 'formType'],
)

# Keep only the part of the timestamp before 'T'
urls10K_df['filedAt'] = urls10K_df['filedAt'].apply(lambda stamp: stamp.split('T')[0])

In [None]:
# Request section '1A' of the first filing in the list, returned as HTML
section_text = extractorApi.get_section(filing_url=urls10K_df['linkToFilingDetails'][0], section='1A', return_type='html')

### Submissions

In [None]:
# Get all submissions (forward slashes: works on every OS and avoids the
# invalid "\s" / "\*" escape sequences of the old backslash path)
submissions = glob.glob("Data/submissions/*")

# "tickers" is included because the next cell reads firm_info_df['tickers']
firm_keys = ["cik", "entityType", "sic", "category", "tickers"]

firm_info = []
for file in tqdm(submissions):
    with open(file, 'rb') as f:
        content = json.load(f)
    try:
        firm_info.append([content.get(key) for key in firm_keys])
    except AttributeError:
        # The file did not contain a JSON object (no .get); skip it
        continue

firm_info_df = pd.DataFrame(
    firm_info, 
    columns=["CIK", "entityType", "SIC", "category", "tickers"]
)

firm_info_df.to_csv("firm_info.csv", index=False)

In [None]:
firm_info_df = pd.read_csv("firm_info.csv")
firm_info_df["CIK"] = firm_info_df["CIK"].astype(int)

# The CSV stores each ticker list as text such as "['A', 'B']".
# Strip brackets and quotes, drop empty entries, then emit one row per ticker.
tickers = (
    firm_info_df.set_index(["CIK", "SIC"])['tickers']
    .str.strip('[]').str.replace("'", "")
    .replace(r'^\s*$', np.nan, regex=True)
    .str.split(",").dropna()
    .explode()
    .str.strip()  # "A, B" splits into "A" and " B": remove the leading blank
).reset_index()

# Disabled export of the ticker list, kept for reference:
# tik_txt = '\n'.join(tickers.str.replace("\s*'*[*]*", ""))
#
# with open('tickers.txt', "w") as f:
#     f.write(tik_txt)

tickers

In [None]:
# Write the unique CIKs, one per line, to CIK.txt.
# Note: this rebinds `CIKs` to one newline-joined string
# (an earlier cell binds the same name to an array of CIKs).
all_files = pd.read_csv("Data/all_files.csv")
CIKs = "\n".join(all_files.CIK.astype(str).unique().tolist())
with open('CIK.txt', "w") as f:
    f.write(CIKs)

# EIKON

In [1]:
import numpy as np
import pandas as pd
from dateutil.relativedelta import relativedelta
from tqdm.auto import tqdm
import time
import gc

In [2]:
# import refinitiv.data as rd
# rd.open_session()

# rd.close_session()

### Get data

In [3]:
# filings_df = pd.read_csv("Data\clean_docs_3.csv", index_col=0)
# Load CIK, report date and filing date only, rename the date columns to
# lower case and drop duplicate rows
T2V_df = pd.read_csv(
    filepath_or_buffer="Top2Vec/T2V_df_5.csv", 
    parse_dates=['Report_dt', 'Filing_dt'], 
    usecols=['CIK', 'Report_dt', 'Filing_dt']
).rename(columns={'Report_dt': 'report_dt', 'Filing_dt': 'filing_dt'}).drop_duplicates()

In [4]:
# Keep reports dated strictly between 2006-01-01 and 2024-01-01
T2V_df = T2V_df[(T2V_df['report_dt']>'2006-01-01')&(T2V_df['report_dt']<'2024-01-01')]
# Per firm: earliest report date and latest filing date
obs_df = T2V_df.groupby("CIK").agg({"report_dt": "min", "filing_dt": "max"}).reset_index()

In [5]:
# obs_df = T2V_df[["CIK", "filing_dt"]].groupby("CIK")['filing_dt'].agg(["min", 'max']).reset_index()
# obs_df["SDate"] = (
#     obs_df["min"].dt.date - relativedelta(months=6)
#     ).astype(str)

# obs_df["EDate"] = (
#     obs_df["max"].dt.date + relativedelta(months=6)
#     ).astype(str)
    
# Request window per firm as date strings: earliest report date to latest filing date
obs_df["SDate"] = obs_df["report_dt"].dt.date.astype(str)

obs_df["EDate"] = obs_df["filing_dt"].dt.date.astype(str)

# Zero-pad the CIK to 10 digits; it is used as the instrument identifier below
obs_df["CIK"] = obs_df["CIK"].apply(lambda cik: f"{cik:010d}")

# Record array so the download loops can index fields by name
obs = obs_df.to_records()

In [6]:
import eikon as ek
# Read the Eikon app key from the environment instead of committing it to the notebook.
# The keys that used to be listed here should be treated as leaked and regenerated.
ek.set_app_key(os.environ["EIKON_APP_KEY"])

### Prices

In [None]:
# Download prices and volume in chunks of instruments.
# NOTE(review): `rd` is only imported in a commented-out cell; on a fresh kernel
# every call raises NameError, which the except below catches and retries forever.
data = []

# obs is split into 500 chunks; only the first 50 are processed here
for o in tqdm(np.array_split(obs, 500)[:50]):
    # One window per chunk: earliest start to latest end among its firms
    SDate = o["SDate"].min()
    EDate = o["EDate"].max()

    # Retry the chunk until the request succeeds
    while True:
        try:
            df = rd.get_data(
                universe=o["CIK"].tolist(), 
                fields = [
                    'TR.CLOSEPRICE(Adjusted=1)',
                    'TR.CLOSEPRICE.date',
                    'TR.OPENPRICE(Adjusted=1)',
                    # 'TR.OPENPRICE.date',
                    # 'TR.PRICECLOSE',
                    # 'TR.PRICECLOSEDATE', 
                    'TR.Volume',
                    'TR.Volume.date',
                    # 'TR.TtlCmnSharesOut(Period=FQ0)',
                    # 'TR.TtlCmnSharesOut(Period=FQ0).date'
                ], 
                parameters={'SDate': SDate, 'EDate': EDate},
                # use_field_names_in_headers=True
            )
            if df is not None:
                # Drop rows without a date and exact duplicate rows
                data.append(df.dropna(subset=['Date'], how='all').drop_duplicates())
        
            gc.collect()
            break
        
        except Exception as e:
            print(f"E: {e}")
            time.sleep(5)
            continue

In [None]:
# Stack the per-chunk frames into one table
data_df = pd.concat(data)

In [None]:
rd.close_session()
# `data` is a list of frames and has no .to_csv; save the concatenated frame.
# Forward slash: portable, no invalid "\E" escape sequence.
data_df.to_csv("Data/EIKON_prices.csv", index=False)

### Free Float

In [None]:
# Download free-float fields in chunks of instruments.
# NOTE(review): `rd` is only imported in a commented-out cell; on a fresh kernel
# every call raises NameError, which the except below catches and retries forever.
data = []

for o in tqdm(np.array_split(obs, 500)):
    # One window per chunk: earliest start to latest end among its firms
    SDate = o["SDate"].min()
    EDate = o["EDate"].max()

    # Retry the chunk until the request succeeds
    while True:
        try:
            df = rd.get_data(
                universe=o["CIK"].tolist(), 
                fields = [
                    'TR.SharesFreeFloat',
                    'TR.SharesFreeFloat.date',
                    'TR.FreeFloat',
                    'TR.FreeFloat.date'
                ], 
                parameters={'SDate': SDate, 'EDate': EDate},
            )

            if df is not None:
                # Drop rows without a date and exact duplicate rows
                data.append(df.dropna(subset=['Date'], how='all').drop_duplicates())
        
            gc.collect()
            break
        
        except Exception as e:
            print(f"E: {e}")
            time.sleep(5)
            continue

In [None]:
# `data` is a list of per-chunk frames and has no .to_csv; concatenate before saving.
# Forward slash: portable, no invalid "\E" escape sequence.
pd.concat(data).to_csv("Data/EIKON_FreeFloat.csv", index=False)

### Bid-Ask

In [None]:
# Download high/low/bid/ask prices, one instrument per request.
# Frames are collected in a list and concatenated once after the loop.
bidask_frames = []

for o in tqdm(obs):
    SDate = o["SDate"]
    EDate = o["EDate"]

    # Retry the instrument until the request stops raising
    while True:
        try:
            df, err = ek.get_data(
                instruments=o["CIK"], 
                fields = [
                    'TR.HIGHPRICE(Adjusted=1)',
                    'TR.HIGHPRICE.date',
                    'TR.LOWPRICE(Adjusted=1)',
                    'TR.LOWPRICE.date',
                    'TR.BIDPRICE(Adjusted=1)',
                    'TR.BIDPRICE.date',
                    'TR.ASKPRICE(Adjusted=1)',
                    'TR.ASKPRICE.date'
                ], 
                parameters={'SDate': SDate, 'EDate': EDate},
                field_name=True
            )
            # A response that carries errors is discarded, as before
            if not err:
                bidask_frames.append(df)
            break
        except Exception:
            # `Exception`, not a bare except, so Ctrl-C can still stop the loop
            time.sleep(20)
            continue

data = pd.concat(bidask_frames) if bidask_frames else pd.DataFrame()

In [None]:
# Forward slash: portable, no invalid "\E" escape sequence
data.to_csv("Data/EIKON_bidask.csv", index=False)

### Beta

In [None]:
# Download beta and WACC fields, one instrument per request.
# Frames are collected in a list and concatenated once after the loop.
beta_frames = []

for o in tqdm(obs):
    SDate = o["SDate"]
    EDate = o["EDate"]

    # Retry the instrument until the request stops raising
    while True:
        try:
            df, err = ek.get_data(
                instruments=o["CIK"], 
                fields = [
                    'TR.WACCBeta',
                    'TR.WACCBeta.date',
                    'TR.BetaDaily180D',
                    'TR.BetaDaily180D.date',
                    'TR.BetaDaily90D',
                    'TR.BetaDaily90D.date',
                    'TR.WACC',
                    'TR.WACC.date',
                ], 
                parameters={'SDate': SDate, 'EDate': EDate},
                field_name=True
            )
            # A response that carries errors is discarded, as before
            if not err:
                beta_frames.append(df)
            break
        except Exception:
            # `Exception`, not a bare except, so Ctrl-C can still stop the loop
            time.sleep(30)
            continue

data = pd.concat(beta_frames) if beta_frames else pd.DataFrame()

In [None]:
# Forward slash: portable, no invalid "\E" escape sequence
data.to_csv("Data/EIKON_beta.csv", index=False)

In [None]:
# Scratch cell: print the get_data help text and try one symbology lookup
help(ek.get_data)
ek.get_symbology("MSFC", from_symbol_type='ticker', to_symbol_type=['RIC', 'ISIN'])

### Analysts

In [None]:
# Download the number of analysts in chunks of instruments
data = []

for o in tqdm(np.array_split(obs, 500)):
    # One window per chunk: earliest start to latest end among its firms
    SDate = o["SDate"].min()
    EDate = o["EDate"].max()
    
    while True:
        try:
            df, err = ek.get_data(
                instruments=o["CIK"].tolist(), 
                fields = [
                    'TR.NumberOfAnalysts',
                    'TR.NumberOfAnalysts.date',
                    # 'TR.ARMIntraCountryScore',
                    # 'TR.ARMIntraIndustryScore',
                    # 'TR.SIInstitutionalOwn',
                ], 
                parameters={'SDate': SDate, 'EDate': EDate},
                field_name=True
            )
            
            if df is not None:
                data.append(df.drop_duplicates())

                gc.collect()
                break

            # No frame came back: wait before retrying instead of spinning
            print(f"E: {err}")
            time.sleep(1)
                
        except Exception as e:
            print(f"E: {e}")
            time.sleep(1)
            continue

In [None]:
# Stack the chunks and rename the four columns by position
# NOTE(review): assumes the response has exactly four columns in this order — confirm
data_df = pd.concat(data)
data_df.columns = ['Instrument', 'NUMBEROFANALYSTS', 'Date', 'None']
data_df.drop(columns="None", inplace=True)
# Unparsable dates become NaT, time zone info is removed, rows without a date are dropped
data_df['Date'] = pd.to_datetime(data_df['Date'], errors='coerce').dt.tz_localize(None)
data_df.dropna(subset=['Date'], inplace=True)

In [None]:
# Rows and columns of the cleaned frame
data_df.shape

In [None]:
# Persist the cleaned analyst counts
data_df.to_csv("Data/Analysts.csv", index=False)

### Financials

In [None]:
# Download annual (FY0) financial statement items in chunks of instruments.
# Frames are collected in a list and concatenated once after the loop.
financials_frames = []

for o in tqdm(np.array_split(obs, 200)):
    # One window per chunk: earliest start to latest end among its firms
    SDate = o["SDate"].min()
    EDate = o["EDate"].max()

    # Retry the chunk until the request succeeds
    while True:
        try:
            df, err = ek.get_data(
                instruments=o["CIK"].tolist(), 
                fields = [
                    'TR.TotalAssetsReported(Period=FY0).date',
                    'TR.TotalAssetsReported(Period=FY0)',
                    'TR.TotalDebtOutstanding(Period=FY0).date',
                    'TR.TotalDebtOutstanding(Period=FY0)',
                    'TR.NetIncome(Period=FY0).date',
                    'TR.NetIncome(Period=FY0)',
                    'TR.TotalRevenue(Period=FY0).date',
                    'TR.TotalRevenue(Period=FY0)',
                    'TR.TotalEquity(Period=FY0).date',
                    'TR.TotalEquity(Period=FY0)',
                    'TR.IntangiblesNet(Period=FY0).date',
                    'TR.IntangiblesNet(Period=FY0)',
                    'TR.ResearchAndDevelopment(Period=FY0).date',
                    'TR.ResearchAndDevelopment(Period=FY0)',
                    'TR.TotalOperatingExpense(Period=FY0).date',
                    'TR.TotalOperatingExpense(Period=FY0)',
                    'TR.OperatingExpActual(Period=FY0).date',
                    'TR.OperatingExpActual(Period=FY0)',
                    'TR.TotalCurrentAssets(Period=FY0).date',
                    'TR.TotalCurrentAssets(Period=FY0)',
                    'TR.TotalCurrLiabilities(Period=FY0).date',
                    'TR.TotalCurrLiabilities(Period=FY0)',
                ], 
                parameters={'SDate': SDate, 'EDate': EDate},
                field_name=True
            )
            
            # A None frame raises here and is retried like any other failure
            financials_frames.append(df.drop_duplicates())
            break

        except Exception:
            # `Exception`, not a bare except, so Ctrl-C can still stop the loop
            time.sleep(20)
            continue

data = pd.concat(financials_frames) if financials_frames else pd.DataFrame()

In [None]:
# Forward slash: portable, no invalid "\E" escape sequence
data.drop_duplicates().to_csv("Data/EIKON_Financials.csv", index=False)

### Earnings

In [None]:
# Download analyst EPS estimates one fiscal year at a time
data = []

# for yr in range(2007, 2024):
for yr in range(2007, 2010):
# for yr in range(2013, 2019):
# for yr in range(2019, 2024):
    print(yr)
    for o in tqdm(np.array_split(obs, 200)):
        # Keep firms whose window starts before and ends after calendar year `yr`
        o = o[(o['SDate']<f"{yr}-01-01")&(o['EDate']>f"{yr}-12-31")]

        # Nothing left in this chunk: skip it instead of requesting an empty
        # instrument list, which could fail on every retry and never finish
        if len(o) == 0:
            continue

        while True:
            try:
                df, err = ek.get_data(
                    instruments=o["CIK"].tolist(), 
                    fields = [
                        'TR.EPSEstValue().date',
                        'TR.EPSEstValue().periodenddate',
                        'TR.EPSEstValue().analystcode',
                        'TR.EPSEstValue()',
                    ], 
                    parameters={'SDate': f"FY{yr-1}", 'EDate': f'FY{yr}', 'Period': f'FY{yr}'},
                    field_name=True
                )

                if df is not None:
                    data.append(df.dropna(subset='TR.EPSESTVALUE()').drop_duplicates())

                    gc.collect()
                    break

                # No frame came back: wait before retrying instead of spinning
                print(f"E: {err}")
                time.sleep(1)

            except Exception as e:
                print(f"E: {e}")
                time.sleep(1)
                continue

In [10]:
# Number of downloaded chunks
len(data)

113

In [12]:
# Display the estimates frame
# NOTE(review): this cell's execution count (12) is later than that of the cell
# below (11) that builds data_df, so the cells were run out of order
data_df

Unnamed: 0,Instrument,TR.EPSESTVALUE().DATE,TR.EPSESTVALUE().periodenddate,TR.EPSESTVALUE().analystcode,TR.EPSESTVALUE()
2,1750,2006-02-15T17:20:00Z,2007-05-31,3VBW,1.39895
1,1750,2006-03-17T14:21:00Z,2007-05-31,3PTB,1.61879
4,1750,2006-03-20T18:11:00Z,2007-05-31,11ZW,1.39895
5,1750,2006-06-12T16:11:00Z,2007-05-31,51BZ,1.34899
10,1750,2006-07-13T08:29:00Z,2007-05-31,51BZ,1.39895
...,...,...,...,...,...
1705,1382696,2007-12-03T07:25:00Z,2007-12-31,2XLX,0.63
1704,1382696,2007-12-03T08:41:00Z,2007-12-31,42AG,0.65
1721,1382696,2007-12-13T07:18:00Z,2007-12-31,2XLX,0.66
1812,1382696,2008-02-21T12:29:00Z,2007-12-31,2XLX,0.67


In [11]:
# Stack the chunks and drop rows without an estimate date or value
data_df = pd.concat(data)

data_df.dropna(subset=['TR.EPSESTVALUE().DATE', 'TR.EPSESTVALUE()'], inplace=True)

# Sort by instrument, estimate date and period end, then keep the first row of
# each (instrument, period end, analyst, value) combination
data_df = data_df.sort_values(['Instrument', 'TR.EPSESTVALUE().DATE', 'TR.EPSESTVALUE().periodenddate'])\
    .drop_duplicates(['Instrument', 'TR.EPSESTVALUE().periodenddate', 
                      'TR.EPSESTVALUE().analystcode', 'TR.EPSESTVALUE()'], keep='first')

In [None]:
# Forward slash: portable, no invalid "\E" escape sequence
data_df.to_csv("Data/EIKON_EPSforecast.csv", index=False)

In [None]:
# Download actual EPS values in chunks of instruments
data = []

for o in tqdm(np.array_split(obs, 300)):
    # One window per chunk: earliest start to latest end among its firms
    SDate = o["SDate"].min()
    EDate = o["EDate"].max()
    
    while True:
        try:
            df, err = ek.get_data(
                instruments=o["CIK"].tolist(), 
                fields = [
                    "TR.EPSActValue.date",
                    'TR.EPSActValue.announcedate',
                    'TR.EPSActValue.periodenddate',
                    'TR.EPSActValue'
                ], 
                parameters={'SDate': SDate, 'EDate': EDate},
                field_name=True
            )
            
            if df is not None:
                data.append(df.drop_duplicates())

                gc.collect()
                break

            # No frame came back: wait before retrying instead of spinning
            print(f"E: {err}")
            time.sleep(1)
                
        except Exception as e:
            print(f"E: {e}")
            time.sleep(1)
            continue

In [None]:
# Stack the chunks, drop duplicate rows and sort by instrument and date
data_df = pd.concat(data)

data_df = data_df.drop_duplicates().sort_values(['Instrument', 'Date']).reset_index(drop=True)

# NOTE(review): `.dt` needs datetime columns, and 'Date' / 'Report Date' must
# exist in the response; nothing in this cell converts or renames them — confirm
data_df['Date'] = data_df['Date'].dt.date
data_df['Report Date'] = data_df['Report Date'].dt.date

In [None]:
rd.close_session()
# Forward slash: portable, no invalid "\E" escape sequence
data_df.to_csv("Data/EIKON_EPSActual.csv", index=False)

### Ownership

In [None]:
# Download ownership percentages by category (StatType '1') in chunks of instruments
data = []

for o in tqdm(np.array_split(obs, 400)):
    # One window per chunk: earliest start to latest end among its firms
    SDate = o["SDate"].min()
    EDate = o["EDate"].max()
    
    while True:
        try:
            df, err = ek.get_data(
                instruments=o["CIK"].tolist(), 
                fields = [
                    'TR.CategoryOwnershipPct().date',
                    'TR.CategoryOwnershipPct().categoryvalue',
                    'TR.CategoryOwnershipPct()',
                ], 
                parameters={'SDate': SDate, 'EDate': EDate, 'StatType': '1'},
                field_name=True
            )
            
            if df is not None:
                data.append(df.drop_duplicates())

                gc.collect()
                break

            # No frame came back: wait before retrying instead of spinning
            print(f"E: {err}")
            time.sleep(1)
                
        except Exception as e:
            print(f"E: {e}")
            time.sleep(1)
            continue

In [None]:
# Stack the chunks and rename the four columns by position
# NOTE(review): assumes the response has exactly four columns in this order — confirm
data_df = pd.concat(data)
data_df.columns = ['Instrument', 'Date', 'Category Value', 'Percent Of Traded Share']
# Unparsable dates become NaT, time zone info is removed, rows without a date are dropped
data_df['Date'] = pd.to_datetime(data_df['Date'], errors='coerce').dt.tz_localize(None)
data_df.dropna(subset=['Date'], inplace=True)
data_df = data_df.drop_duplicates().sort_values(['Instrument', 'Date']).reset_index(drop=True)

In [None]:
# Forward slash: portable, no invalid "\E" escape sequence
data_df.to_csv("Data/EIKON_Ownership1.csv", index=False)

### Clean Data

In [None]:
# Import liberaries and functions
import pandas as pd
import numpy as np

In [None]:
# Load ownership parts 1 and 4 (forward slash: portable, no invalid "\E" escape)
Owner = []
for i in [1,4]:
    Owner.append(pd.read_csv(f"Data/EIKON_Ownership{i}.csv", parse_dates=['Date']))

In [None]:
# Combine the parts, drop rows with any missing value and sum the remaining
# columns per (instrument, date, category)
Owner = pd.concat(Owner).dropna().groupby(['Instrument', 'Date', 'Category Value']).sum().reset_index()

In [None]:
# Distinct ownership categories present in the data
Owner['Category Value'].unique()

In [None]:
# Forward slash: portable, no invalid "\E" escape sequence
Owner.to_csv("Data/EIKON_Ownership.csv", index=False)

In [None]:
# Load the five EPS forecast parts (forward slash: portable, no invalid "\E" escape)
EPS = []
for i in [1,2,3,4,5]:
    EPS.append(pd.read_csv(
        f"Data/EIKON_EPSforecast{i}.csv", header=0, 
               parse_dates=['Date', 'Period End Date']
    ))

In [None]:
# Reduce each part's timestamp to a calendar date so that rows differing only
# in time of day can be dropped as duplicates after stacking
for i in [0,1,2,3,4]:
    EPS[i]['Date'] = EPS[i]['Date'].dt.date

# From here on `EPS` is one DataFrame, no longer a list of frames
EPS = pd.concat(EPS).drop_duplicates()

EPS = EPS.sort_values(['Instrument', 'Date']).reset_index(drop=True)

In [None]:
# Forward slash: portable, no invalid "\E" escape; the f-prefix had no placeholders
df1 = pd.read_csv("Data/EIKON_EPSforecast(1).csv", header=0, parse_dates=['Date', 'Period End Date'])

In [None]:
# Forward slash: portable, no invalid "\E" escape; the f-prefix had no placeholders
df2 = pd.read_csv("Data/EIKON_EPSforecast(2).csv", header=0, parse_dates=['Date', 'Period End Date'])

In [None]:
# Remove the "Calc Date" column from df1 before it is stacked with df2
# (in place: running this cell twice raises KeyError)
df1.drop(columns="Calc Date", inplace=True)

In [None]:
# Stack both files and drop exact duplicate rows (rebinds `EPS`)
EPS = pd.concat([df1,df2]).drop_duplicates()

In [None]:
# Forward slash: portable, no invalid "\E" escape sequence
EPS.to_csv("Data/EIKON_EPSforecast.csv", index=False)

In [None]:
# Load the raw financials (forward slash: portable, no invalid "\E" escape),
# drop exact duplicates and count missing values per column
financials = pd.read_csv("Data/EIKON_Financials.csv")
financials.drop_duplicates(inplace=True)
financials.isna().sum()

In [None]:
# Drop NAs: remove the column named '0' and rows without an instrument id
# NOTE(review): the '0' column is not created in this notebook — presumably a
# leftover from how the raw file was written; confirm
financials = financials.drop(columns=['0']).dropna(subset=["Instrument"])

# Drop rows where every column after the first is missing
financials.dropna(
    subset=financials.columns[1:], how='all', inplace=True)

financials["Instrument"] = financials["Instrument"].astype(int)

In [None]:
# Columns after 'Instrument' are read as (date, value) pairs:
# odd positions are taken as dates, even positions as values
dates = financials.columns[1::2]
values = financials.columns[2::2]

# One frame per pair, indexed by (Instrument, date), keeping rows where both are present
dfs = [(
    financials[['Instrument', a, b]]
    .dropna(subset=[a, b])
    .drop_duplicates()
    .set_index(['Instrument', a])
) for a, b in zip(dates, values)]

# Align all items side by side on the (Instrument, date) index
financials_df = pd.concat(dfs, axis=1).reset_index()
financials_df.drop_duplicates(inplace=True)

In [None]:
# Fill missing total operating expense from the "actual" operating expense series
financials_df.fillna({"TR.TOTALOPERATINGEXPENSE(PERIOD=FY0)": financials_df["TR.OPERATINGEXPACTUAL(PERIOD=FY0)"]}, inplace=True)
# Drop the helper series and name the date level 'Date'
financials_df = financials_df.drop(columns='TR.OPERATINGEXPACTUAL(PERIOD=FY0)').rename(columns={'level_1': 'Date'}).drop_duplicates()
# Forward slash: portable, no invalid "\F" escape sequence
financials_df.to_csv("Data/Financials2.csv", index=False)

In [None]:
# Same forward-slash path the file is written with ("Data/Analysts.csv")
analysts = pd.read_csv("Data/Analysts.csv")
# Display only: the result of dropna() is not assigned back
analysts.dropna()

In [None]:
# Load the raw free-float data (forward slash: portable, no invalid "\E" escape),
# drop exact duplicates and count missing values per column
FreeFloat = pd.read_csv("Data/EIKON_FreeFloat.csv")
FreeFloat.drop_duplicates(inplace=True)
FreeFloat.isna().sum()

In [None]:
# Drop NAs: remove the column named '0' and rows without an instrument id
# NOTE(review): the '0' column is not created in this notebook — presumably a
# leftover from how the raw file was written; confirm
FreeFloat = FreeFloat.drop(columns=['0']).dropna(subset=["Instrument"])

# Drop rows where all four free-float columns are missing
FreeFloat.dropna(
    subset=['TR.SHARESFREEFLOAT', 'TR.SHARESFREEFLOAT.DATE',
            'TR.FREEFLOAT', 'TR.FREEFLOAT.DATE'], how='all', inplace=True)

In [None]:
# Columns after 'Instrument' are read as (value, date) pairs:
# odd positions are taken as values, even positions as dates
dates = FreeFloat.columns[2::2]
values = FreeFloat.columns[1::2]

# One frame per pair, indexed by (Instrument, date), keeping rows that have a date
dfs = [(
    FreeFloat[['Instrument', a, b]]
    .dropna(subset=[a])
    .drop_duplicates()
    .set_index(['Instrument', a])
) for a, b in zip(dates, values)]

# Align both series side by side on the (Instrument, date) index
FreeFloat_df = pd.concat(dfs, axis=1).reset_index()
FreeFloat_df.drop_duplicates(inplace=True)

In [None]:
# Fill missing free-float shares from the TR.FREEFLOAT series.
# Assigned back to the column: an in-place fillna on a selected column is not
# guaranteed to write through to the frame.
FreeFloat_df["TR.SHARESFREEFLOAT"] = FreeFloat_df["TR.SHARESFREEFLOAT"].fillna(FreeFloat_df["TR.FREEFLOAT"])
# Drop the helper series and name the date level 'Date'
FreeFloat_df = FreeFloat_df.drop(columns='TR.FREEFLOAT').rename(columns={'level_1': 'Date'}).drop_duplicates()
# Forward slash: portable, no invalid "\F" escape sequence
FreeFloat_df.to_csv("Data/FreeFloat.csv", index=False)

In [None]:
# Load the raw prices (forward slash: portable, no invalid "\E" escape),
# drop exact duplicates and list the columns
prices = pd.read_csv("Data/EIKON_prices.csv")
prices.drop_duplicates(inplace=True)
prices.columns

In [None]:
# Columns after 'Instrument' are read as (value, date) pairs:
# odd positions are taken as values, even positions as dates
dates = prices.columns[2::2]
values = prices.columns[1::2]

# One frame per pair, indexed by (Instrument, date), keeping rows that have a date
dfs = [(
    prices[['Instrument', a, b]]
    .dropna(subset=[a])
    .drop_duplicates()
    .set_index(['Instrument', a])
) for a, b in zip(dates, values)]

# Handling duplicates for shares outstanding
# Keeps rows of the sixth pair whose index occurs exactly once AND whose value
# is present; every copy of a duplicated index is removed (keep=False).
# NOTE(review): requires at least six (value, date) pairs in the file — confirm
dfs[5] = dfs[5][
    ~(dfs[5].index.duplicated(keep=False))&(dfs[5]['TR.TTLCMNSHARESOUT(PERIOD=FQ0)'].notna())
]

In [None]:
# Align all price series side by side on the (Instrument, date) index
prices_df = pd.concat(dfs, axis=1).reset_index()

# Release the raw frame
del prices

In [None]:
# Parse the date level into a timezone-naive 'Date' column (appended last)
prices_df['Date'] = pd.to_datetime(prices_df["level_1"]).dt.tz_localize(None)
prices_df.drop(columns=['level_1'], inplace=True)

# Rename by position; 'Date' is last because it was appended above
# NOTE(review): assumes exactly eight columns in this order — confirm against prices.columns
prices_df.columns = [
    'Instrument', 'CLOSEPRICE', 'OPENPRICE', 'PRICECLOSE', 'VOLUME', 
    'COMPANYMARKETCAP', 'TTLCMNSHARESOUT', 'Date'
]

In [None]:
# Replace missing close prices with PRICECLOSE first, then with the open price
# of the same day. Assigned back to the column: an in-place fillna on a
# selected column is not guaranteed to write through to the frame.
prices_df["CLOSEPRICE"] = (
    prices_df["CLOSEPRICE"]
    .fillna(prices_df['PRICECLOSE'])
    .fillna(prices_df["OPENPRICE"])
)

prices_df.drop(columns=['OPENPRICE', 'PRICECLOSE'], inplace=True)

In [None]:
# Replace missing TTLCMNSHARESOUT with the previous value of the same
# instrument, then with the next value for rows that are still missing.
# .ffill() / .bfill() replace the deprecated fillna(method=...).
# NOTE(review): "previous" follows the current row order — confirm rows are sorted by Date
prices_df["TTLCMNSHARESOUT"] = prices_df.groupby(["Instrument"])["TTLCMNSHARESOUT"].ffill()
prices_df["TTLCMNSHARESOUT"] = prices_df.groupby(["Instrument"])["TTLCMNSHARESOUT"].bfill()

# Missing volume is set to zero (assigned back instead of an in-place fillna on a column)
prices_df["VOLUME"] = prices_df["VOLUME"].fillna(0)

In [None]:
# Remaining missing values per column
prices_df.isna().sum()

In [None]:
# Forward slash: portable, no invalid "\P" escape sequence
prices_df.to_csv("Data/Prices.csv", index=False)

In [None]:
# Load the raw bid/ask data (forward slash: portable, no invalid "\E" escape),
# drop exact duplicates once and list the columns
BidAsk = pd.read_csv("Data/EIKON_bidask.csv").drop_duplicates()
BidAsk.columns

In [None]:
# Columns after 'Instrument' are read as (value, date) pairs:
# odd positions are taken as values, even positions as dates
dates = BidAsk.columns[2::2]
values = BidAsk.columns[1::2]

# One frame per pair, indexed by (Instrument, date), keeping rows that have a date
dfs = [(
    BidAsk[['Instrument', a, b]]
    .dropna(subset=[a])
    .drop_duplicates()
    .set_index(['Instrument', a])
) for a, b in zip(dates, values)]

In [None]:
# Align all series side by side on the (Instrument, date) index
BidAsk_df = pd.concat(dfs, axis=1).reset_index()

# Release the raw frame
del BidAsk

In [None]:
# Parse the date level into a timezone-naive 'Date' column (appended last)
BidAsk_df['Date'] = pd.to_datetime(BidAsk_df["level_1"]).dt.tz_localize(None)
BidAsk_df.drop(columns=['level_1'], inplace=True)

# Rename by position; 'Date' is last because it was appended above
# NOTE(review): assumes exactly six columns in this order — confirm against BidAsk.columns
BidAsk_df.columns = [
    'Instrument', 'HIGHPRICE', 'LOWPRICE', 'BIDPRICE', 'ASKPRICE', 'Date'
]

In [None]:
# Replace missing values: bid falls back to the day's low, ask to the day's high.
# Assigned back to the column: an in-place fillna on a selected column is not
# guaranteed to write through to the frame.
BidAsk_df["BIDPRICE"] = BidAsk_df["BIDPRICE"].fillna(BidAsk_df["LOWPRICE"])
BidAsk_df["ASKPRICE"] = BidAsk_df["ASKPRICE"].fillna(BidAsk_df["HIGHPRICE"])

In [None]:
# Forward slash: portable, no invalid "\B" escape sequence
BidAsk_df.to_csv("Data/BidAsk.csv", index=False)

In [None]:
# Scratch notes: two field names kept as bare strings.
# They are not used by any code; the cell only displays the last string.
"TR.ARMIntraGlobalScore"
"TR.NumberOfAnalysts(Period=FY1)"