In [None]:
import time
import os
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys


def write_page(url, file_path):
    """Takes in the URL and writes the html file to the path specified."""
    driver = webdriver.Chrome()
    driver.get(url)
    
    try:
        xpath_convert_to_html = r'//*[@id="form-information-html"]'
        xpath_button_click =r'//*[@id="menu-dropdown-link"]'
        
        driver.find_element("xpath", xpath_button_click).click()
        correct_url =  driver.find_element("xpath", xpath_convert_to_html).get_attribute('href')
        
        driver.quit()
        driver = webdriver.Chrome()
        driver.get(correct_url)
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(driver.page_source)
        driver.quit()
    except:
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(driver.page_source)
        driver.quit()

def download_files_10k(ticker, dest_folder):
    """Downloads all the html 10-k files for the given ticker into the destination folder."""
    # Create the destination folder if it doesn't exist
    if not os.path.exists(dest_folder):
        os.makedirs(dest_folder)

    # Construct the URL to search for the ticker's filings
    url = r'https://www.sec.gov/edgar/searchedgar/companysearch'

    # Open the search page and enter the ticker in the search box
    driver = webdriver.Chrome()
    driver.get(url)
    xpath_search_box = r'//*[@id="edgar-company-person"]'
    driver.find_element("xpath", xpath_search_box).send_keys(ticker,Keys.ENTER)

    # Wait for page to load and expand 10-K dropdown
    time.sleep(2)
    xpath_expand_selected = r'//*[@id="filingsStart"]/div[2]/div[3]/h5/a'
    driver.find_element("xpath", xpath_expand_selected).click()

    # views only 10-K and 10-Q data
    time.sleep(1)
    xpath_obtain_all_data = r'//*[@id="filingsStart"]/div[2]/div[3]/div/button[1]'
    driver.find_element("xpath", xpath_obtain_all_data).click()

    # Searches 10-K to only show the relevant filings
    xpath_search_10K = r'//*[@id="searchbox"]'
    driver.find_element("xpath", xpath_search_10K).send_keys('10-K',Keys.ENTER)

    # Download each 10-K filing
    
    # Download each 10-K filing
    table_xpath = r'//*[@id="filingsTable"]'
    wait = WebDriverWait(driver, 1)
    wait.until(EC.presence_of_element_located((By.XPATH, table_xpath)))
    table = driver.find_element(By.XPATH, table_xpath)
    rows = table.find_elements(By.XPATH, './tbody/tr')

    counter = 1
    for row in rows:
        cells = row.find_elements(By.XPATH, './td')
        filing_text = cells[2].text      
        xpath_url = '//*[@id="filingsTable"]/tbody/tr['+str(counter)+']/td[2]/div/a[1]'
        filing_url = driver.find_element("xpath", xpath_url).get_attribute('href')
        file_name = f"{ticker}_10-K_{filing_text}.html"
        file_path = os.path.join(dest_folder, file_name)
        counter += 1
        write_page(filing_url, file_path)
        
    driver.quit()

In [None]:
from bs4 import BeautifulSoup
import re
import os
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

def clean_html_text(html_text):
    '''
    Function uses BeautifulSoup to parse html text and replaces any tags
    or punctuation with a space. '\W' refers to matching any non-word 
    defined by regex and '+' refers to matching more than one non-words
    '''
    soup = BeautifulSoup(html_text, 'html.parser')
    stopword_list = stopwords.words('english')
    paragraphs = soup.find_all('p')
    return_list = []
    
    for p in paragraphs:
        
        if not p.find_previous('h1'):
            
            text = p.get_text(strip=True)
            cleaned = re.sub(r'[^A-Za-z0-9]', ' ', text.lower())
            clean_text_tokenized = word_tokenize(cleaned)
            
            for i in stopword_list:
                if i in clean_text_tokenized:
                    clean_text_tokenized.remove(i)
                    
            lemmatizer = WordNetLemmatizer()
            clean_text_lemmatized = [lemmatizer.lemmatize(j) for j in clean_text_tokenized]
            regex = re.compile(r'\d+')
            
            no_numbers = [item for item in clean_text_lemmatized if not regex.search(item)]
            no_letters = [word for word in no_numbers if len(word) != 1]
            
            if len(no_letters) > 8:
                return_list.append(no_letters)
    
    result = '\n'.join([' '.join(inner_list) for inner_list in return_list])
    
    return result   

def write_clean_html_text_files(input_folder, dest_folder):
    '''
    Function reads the files in the input folder and and calls the clean_html_text function
    and stores a text file in the destination directory.
    
    '''
    
    if not os.path.exists(dest_folder):
        os.makedirs(dest_folder)
    
    for filename in os.listdir(input_folder):
        input_file_path = os.path.join(input_folder, filename)
        new_name = filename.split('.')
        
        with open(input_file_path,'r' ,encoding="utf8") as f:
            html_text = f.read()
            cleaned_text = clean_html_text(html_text)
            
        dest_filename = f'{new_name[0]}.txt' 
        dest_file_path = os.path.join(dest_folder, dest_filename)
    
        
        with open(dest_file_path, 'w') as f:
            f.write(cleaned_text)


In [None]:
folder = r'D:\Users\Arian\Documents\Kubrick\Week 10\EDGAR Project\data\10k_filings_raw'
ticker = 'AAPL'
download_files_10k(ticker, folder)

In [None]:
input_folder = r'D:\Users\Arian\Documents\Kubrick\Week 10\EDGAR Project\data\10k_filings_raw'
output_folder = r'D:\Users\Arian\Documents\Kubrick\Week 10\EDGAR Project\data\10k_filings_clean'
write_clean_html_text_files(input_folder, output_folder)

In [1]:
input_folder = r'D:\Users\Arian\Documents\Kubrick\Week 10\EDGAR Project\data\10k_filings_raw\AAPL_10-K_2012-10-31.html'
import requests
import bs4 as BeautifulSoup
# with open(input_folder,'r' ,encoding="utf8") as f:
#     html_text = f.read()

html_text = r'https://www.sec.gov/Archives/edgar/data/320193/000119312515356351/d17062d10k.htm'
user_agent = r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36'

response = requests.get(html_text, headers={'User-Agent':user_agent})
print(response.text)
soup = BeautifulSoup(response.content, 'html.parser')
for tag in soup.find_all():
    print(tag.name)

stopword_list = stopwords.words('english')

paragraphs = soup.find_all('p')
return_list = []
for p in paragraphs:
    if not p.find_previous('h1'):
        text = p.get_text(strip=True)
        clean_text = re.sub(r'[^A-Za-z0-9]', ' ', text.lower())
        clean_text_tokenized = word_tokenize(clean_text)
        for i in stopword_list:
            if i in clean_text_tokenized:
                clean_text_tokenized.remove(i)
        lemmatizer = WordNetLemmatizer()
        clean_text_lemmatized = [lemmatizer.lemmatize(j) for j in clean_text_tokenized]
        
        regex = re.compile(r'\d+')
        
        no_numbers = [item for item in clean_text_lemmatized if not regex.search(item)]
        
        no_letters = [word for word in no_numbers if len(word) != 1]
        if len(no_letters) > 8:
            return_list.append(no_letters)
        
result = '\n'.join([' '.join(inner_list) for inner_list in return_list])
print(result)
        
#         # if len(text.strip()) != 0:
#         #     print(p.text.lower())


# # Finds what the html contains:
# divs = soup.find_all('div')
# for div in divs:
#     spans = div.find_all('span')
#     for span in spans:
#         print(span.text)

<DOCUMENT>
<TYPE>10-K
<SEQUENCE>1
<FILENAME>d17062d10k.htm
<DESCRIPTION>FORM 10-K
<TEXT>
<HTML><HEAD>
<TITLE>Form 10-K</TITLE>
<script >bazadebezolkohpepadr="739232188"</script><script type="text/javascript" src="https://www.sec.gov/akam/13/2c0fca09"  defer></script></HEAD>
 <BODY BGCOLOR="WHITE">
<h5 align="left"><a href="#toc">Table of Contents</a></h5>

 <P STYLE="line-height:1.0pt;margin-top:0pt;margin-bottom:0pt;border-bottom:1px solid #000000">&nbsp;</P>
<P STYLE="line-height:3.0pt;margin-top:0pt;margin-bottom:2pt;border-bottom:1px solid #000000">&nbsp;</P> <P STYLE="margin-top:0pt; margin-bottom:0pt; font-size:13pt; font-family:Arial" ALIGN="center"><B>UNITED STATES </B></P>
<P STYLE="margin-top:0pt; margin-bottom:0pt; font-size:13pt; font-family:Arial" ALIGN="center"><B>SECURITIES AND EXCHANGE COMMISSION </B></P>
<P STYLE="margin-top:0pt; margin-bottom:0pt; font-size:11pt; font-family:Arial" ALIGN="center"><B>Washington, D.C. 20549 </B></P> <P STYLE="font-size:2pt;margin-top:0p

TypeError: 'module' object is not callable

In [1]:
# Part 3A:
import ref_data as rf

df_returns = rf.get_yahoo_data('2000-01-01', '2020-08-01', 'MMM')
display(df_returns)

Unnamed: 0,high,low,volume,formatted_date,1daily_return,2daily_return,3daily_return,5daily_return,10daily_return
0,25.53125,23.87500,2493200,2000-01-03,-0.002488,,,,
1,24.87500,24.09375,1527000,2000-01-04,0.008861,0.026119,,,
2,25.00000,24.12500,1755400,2000-01-05,-0.015267,-0.010127,0.007463,,
3,25.00000,24.15625,1663200,2000-01-06,0.011250,-0.006361,-0.001266,,
4,25.50000,24.53125,2078400,2000-01-07,-0.031726,-0.016250,-0.034351,-0.011194,
...,...,...,...,...,...,...,...,...,...
5173,6.44000,6.12000,26162900,2020-07-27,0.035714,0.051908,0.025118,0.041667,0.038700
5174,6.55000,6.15000,23990700,2020-07-28,-0.037097,0.001553,0.018321,0.009245,0.062682
5175,6.72000,6.42000,22529400,2020-07-29,-0.004644,-0.046774,-0.007764,-0.018838,0.064842
5176,6.45000,6.18000,18802400,2020-07-30,0.020472,0.037152,-0.003226,0.050382,0.095930


In [1]:
import ref_data as rf
rf.get_sentiment_word_dict()['Negative']

['ABANDON',
 'ABANDONED',
 'ABANDONING',
 'ABANDONMENT',
 'ABANDONMENTS',
 'ABANDONS',
 'ABDICATED',
 'ABDICATES',
 'ABDICATING',
 'ABDICATION',
 'ABDICATIONS',
 'ABERRANT',
 'ABERRATION',
 'ABERRATIONAL',
 'ABERRATIONS',
 'ABETTING',
 'ABNORMAL',
 'ABNORMALITIES',
 'ABNORMALITY',
 'ABNORMALLY',
 'ABOLISH',
 'ABOLISHED',
 'ABOLISHES',
 'ABOLISHING',
 'ABROGATE',
 'ABROGATED',
 'ABROGATES',
 'ABROGATING',
 'ABROGATION',
 'ABROGATIONS',
 'ABRUPT',
 'ABRUPTLY',
 'ABRUPTNESS',
 'ABSENCE',
 'ABSENCES',
 'ABSENTEEISM',
 'ABUSE',
 'ABUSED',
 'ABUSES',
 'ABUSING',
 'ABUSIVE',
 'ABUSIVELY',
 'ABUSIVENESS',
 'ACCIDENT',
 'ACCIDENTAL',
 'ACCIDENTALLY',
 'ACCIDENTS',
 'ACCUSATION',
 'ACCUSATIONS',
 'ACCUSE',
 'ACCUSED',
 'ACCUSES',
 'ACCUSING',
 'ACQUIESCE',
 'ACQUIESCED',
 'ACQUIESCES',
 'ACQUIESCING',
 'ACQUIT',
 'ACQUITS',
 'ACQUITTAL',
 'ACQUITTALS',
 'ACQUITTED',
 'ACQUITTING',
 'ADULTERATE',
 'ADULTERATED',
 'ADULTERATING',
 'ADULTERATION',
 'ADULTERATIONS',
 'ADVERSARIAL',
 'ADVERSARIES',
 

In [2]:
def get_sentiment_word_dict():
    df1 = pd.read_csv('LM-dictionary-2021.csv')
    sentiment_dict = {}
    sentiment_words = ['Negative', 'Positive', 'Uncertainty', 'Litigious', 'Strong_Modal', 'Weak_Modal', 'Constraining']
    
    for n in sentiment_words:
        sentiment_dict[n] = []
        for i in range(0, len(df1.index)):
            if df1.loc[i, n] != 0:
                sentiment_dict[n].append(df1.loc[i, 'Word'])

    return sentiment_dict

In [4]:
import csv
import pandas as pd
import yahoofinancials
from yahoofinancials import YahooFinancials

In [13]:
data = YahooFinancials('MMM').get_historical_price_data('2000-01-01', '2020-08-01', 'daily')
data

{'MMM': {'eventsData': {'dividends': {'2000-02-23': {'amount': 0.29,
     'date': 951316200,
     'formatted_date': '2000-02-23'},
    '2000-05-17': {'amount': 0.29,
     'date': 958570200,
     'formatted_date': '2000-05-17'},
    '2000-08-23': {'amount': 0.29,
     'date': 967037400,
     'formatted_date': '2000-08-23'},
    '2000-11-21': {'amount': 0.29,
     'date': 974817000,
     'formatted_date': '2000-11-21'},
    '2001-02-21': {'amount': 0.3,
     'date': 982765800,
     'formatted_date': '2001-02-21'},
    '2001-05-16': {'amount': 0.3,
     'date': 990019800,
     'formatted_date': '2001-05-16'},
    '2001-08-22': {'amount': 0.3,
     'date': 998487000,
     'formatted_date': '2001-08-22'},
    '2001-11-20': {'amount': 0.3,
     'date': 1006266600,
     'formatted_date': '2001-11-20'},
    '2002-02-20': {'amount': 0.31,
     'date': 1014215400,
     'formatted_date': '2002-02-20'},
    '2002-05-22': {'amount': 0.31,
     'date': 1022074200,
     'formatted_date': '2002-05-22'

In [5]:
df1 = pd.read_csv('LM-dictionary-2021.csv')
df1.head()

Unnamed: 0,Word,Seq_num,Word Count,Word Proportion,Average Proportion,Std Dev,Doc Count,Negative,Positive,Uncertainty,Litigious,Strong_Modal,Weak_Modal,Constraining,Syllables,Source
0,AARDVARK,1,354,1.55e-08,1.42e-08,3.82e-06,99,0,0,0,0,0,0,0,2,12of12inf
1,AARDVARKS,2,3,1.31e-10,8.65e-12,9.24e-09,1,0,0,0,0,0,0,0,2,12of12inf
2,ABACI,3,9,3.94e-10,1.17e-10,5.29e-08,7,0,0,0,0,0,0,0,3,12of12inf
3,ABACK,4,29,1.27e-09,6.65e-10,1.6e-07,28,0,0,0,0,0,0,0,2,12of12inf
4,ABACUS,5,8570,3.75e-07,3.81e-07,3.53e-05,1108,0,0,0,0,0,0,0,3,12of12inf


In [6]:
df1.index

RangeIndex(start=0, stop=86531, step=1)

In [None]:
def get_yahoo_data(start_date,end_date,tickers):


    for ticker in tickers:
        data = YahooFinancials(ticker).get_historical_price_data(start_date, end_date, 'daily')
        prices = pd.DataFrame(data[ticker]['prices'])
        prices['1daily_return'] = (prices['open'] - prices['close'])/prices['open']
        prices['2daily_return'] = (prices['open'].shift(1) - prices['close'])/prices['open'].shift(1)
        prices['3daily_return'] = (prices['open'].shift(2) - prices['close'])/prices['open'].shift(2)
        prices['5daily_return'] = (prices['open'].shift(4) - prices['close'])/prices['open'].shift(4)
        prices['10daily_return'] = (prices['open'].shift(9) - prices['close'])/prices['open'].shift(9)
        prices.drop(columns = ['date','adjclose','open','close'],inplace=True)
        return prices

In [None]:
data = YahooFinancials("MMM").get_historical_price_data(start_date, end_date, 'daily')

In [1]:
import pandas as pd 
from ref_data import get_sentiment_word_dict
import os

def write_document_sentiments(input_folder, output_file):
    
    sentiment_df = pd.DataFrame(columns=['Symbol', 'ReportType', 'FilingDate'] + list(get_sentiment_word_dict().keys()))
    sentiment_dict = get_sentiment_word_dict()
    
    for filename in os.listdir(input_folder):
        input_file_path = os.path.join(input_folder, filename)
        
        with open(input_file_path, 'r') as f:
            text = f.read()
            new_name = filename.split('_')
            symbol, report_type, filing_date = new_name[:3]
            row = {'Symbol': symbol, 'ReportType': report_type, 'FilingDate': filing_date.split('.')[0]}
            
            for word in text.split(): 
                for key, value in sentiment_dict.items():         
                    if word in sentiment_dict[key]:
                        row[key] = row.get(key, 0) + 1
            
            sentiment_df.loc[len(sentiment_df)] = row
    
    sentiment_df.to_csv(output_file, index=False)

In [4]:
sentiment_df = pd.DataFrame(columns=['Symbol', 'ReportType', 'FilingDate'] + list(get_sentiment_word_dict().keys()))
sentiment_dict = get_sentiment_word_dict()
sentiment_dict

{'Negative': ['abandon',
  'abandoned',
  'abandoning',
  'abandonment',
  'abandonments',
  'abandons',
  'abdicated',
  'abdicates',
  'abdicating',
  'abdication',
  'abdications',
  'aberrant',
  'aberration',
  'aberrational',
  'aberrations',
  'abetting',
  'abnormal',
  'abnormalities',
  'abnormality',
  'abnormally',
  'abolish',
  'abolished',
  'abolishes',
  'abolishing',
  'abrogate',
  'abrogated',
  'abrogates',
  'abrogating',
  'abrogation',
  'abrogations',
  'abrupt',
  'abruptly',
  'abruptness',
  'absence',
  'absences',
  'absenteeism',
  'abuse',
  'abused',
  'abuses',
  'abusing',
  'abusive',
  'abusively',
  'abusiveness',
  'accident',
  'accidental',
  'accidentally',
  'accidents',
  'accusation',
  'accusations',
  'accuse',
  'accused',
  'accuses',
  'accusing',
  'acquiesce',
  'acquiesced',
  'acquiesces',
  'acquiescing',
  'acquit',
  'acquits',
  'acquittal',
  'acquittals',
  'acquitted',
  'acquitting',
  'adulterate',
  'adulterated',
  'adult

In [7]:
import csv
import pandas as pd
import yahoofinancials
from yahoofinancials import YahooFinancials

In [5]:
def get_yahoo_data(start_date,end_date,tickers):


    for ticker in tickers:
        data = YahooFinancials(ticker).get_historical_price_data(start_date, end_date, 'daily')
        prices = pd.DataFrame(data[ticker]['prices'])
        prices['1daily_return'] = (prices['open'] - prices['close'])/prices['open']
        prices['2daily_return'] = (prices['open'].shift(1) - prices['close'])/prices['open'].shift(1)
        prices['3daily_return'] = (prices['open'].shift(2) - prices['close'])/prices['open'].shift(2)
        prices['5daily_return'] = (prices['open'].shift(4) - prices['close'])/prices['open'].shift(4)
        prices['10daily_return'] = (prices['open'].shift(9) - prices['close'])/prices['open'].shift(9)
        prices.drop(columns = ['date','adjclose','open','close'],inplace=True)
        return prices

In [8]:
get_yahoo_data('2000-01-01', '2020-08-01', 'MMM')

Unnamed: 0,high,low,volume,formatted_date,1daily_return,2daily_return,3daily_return,5daily_return,10daily_return
0,25.53125,23.87500,2493200,2000-01-03,-0.002488,,,,
1,24.87500,24.09375,1527000,2000-01-04,0.008861,0.026119,,,
2,25.00000,24.12500,1755400,2000-01-05,-0.015267,-0.010127,0.007463,,
3,25.00000,24.15625,1663200,2000-01-06,0.011250,-0.006361,-0.001266,,
4,25.50000,24.53125,2078400,2000-01-07,-0.031726,-0.016250,-0.034351,-0.011194,
...,...,...,...,...,...,...,...,...,...
5173,6.44000,6.12000,26162900,2020-07-27,0.035714,0.051908,0.025118,0.041667,0.038700
5174,6.55000,6.15000,23990700,2020-07-28,-0.037097,0.001553,0.018321,0.009245,0.062682
5175,6.72000,6.42000,22529400,2020-07-29,-0.004644,-0.046774,-0.007764,-0.018838,0.064842
5176,6.45000,6.18000,18802400,2020-07-30,0.020472,0.037152,-0.003226,0.050382,0.095930


In [3]:
import pandas as pd
import ref_data as rf
tickers=rf.get_sp100()
list=[]
for i in tickers[:10]:
    print(i)
    try:
        list.append(rf.get_yahoo_data('2019-01-01','2020-08-01',i))
    except:
        continue

final_df=pd.concat(list)
final_df

AAPL
ABBV
ABT
ACN
ADBE
AIG
AMD
AMGN
AMT
AMZN


Unnamed: 0_level_0,high,low,volume,price,1daily_return,2daily_return,3daily_return,5daily_return,10daily_return,Symbol
formatted_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2019-01-02,39.712502,38.557499,148158800,38.047047,-0.099607,-0.061170,-0.063260,-0.029192,-0.018870,AAPL
2019-01-03,36.430000,35.500000,365248800,34.257282,0.042689,0.040368,0.060201,0.081651,0.096139,AAPL
2019-01-04,37.137501,35.950001,234428400,35.719688,-0.002226,0.016795,0.034062,0.027182,0.057736,AAPL
2019-01-07,37.207500,36.474998,219111200,35.640202,0.019063,0.036369,0.039681,0.013993,0.036301,AAPL
2019-01-08,37.955002,37.130001,164101200,36.319611,0.016982,0.020232,0.010216,0.015390,0.021028,AAPL
...,...,...,...,...,...,...,...,...,...,...
2020-07-27,154.899994,150.788498,83410000,152.760498,-0.017963,-0.007096,-0.001090,,,AMZN
2020-07-28,153.854507,149.787994,62534000,150.016495,0.011065,0.017181,0.054777,,,AMZN
2020-07-29,151.957993,149.838501,59482000,151.676498,0.006049,0.043233,,,,AMZN
2020-07-30,154.600006,150.250000,122566000,152.593994,0.036961,,,,,AMZN


In [2]:
tickers

['AAPL',
 'ABBV',
 'ABT',
 'ACN',
 'ADBE',
 'AIG',
 'AMD',
 'AMGN',
 'AMT',
 'AMZN',
 'AVGO',
 'AXP',
 'BA',
 'BAC',
 'BK',
 'BKNG',
 'BLK',
 'BMY',
 'BRK.B',
 'C',
 'CAT',
 'CHTR',
 'CL',
 'CMCSA',
 'COF',
 'COP',
 'COST',
 'CRM',
 'CSCO',
 'CVS',
 'CVX',
 'DHR',
 'DIS',
 'DOW',
 'DUK',
 'EMR',
 'EXC',
 'F',
 'FDX',
 'GD',
 'GE',
 'GILD',
 'GM',
 'GOOG',
 'GOOGL',
 'GS',
 'HD',
 'HON',
 'IBM',
 'INTC',
 'JNJ',
 'JPM',
 'KHC',
 'KO',
 'LIN',
 'LLY',
 'LMT',
 'LOW',
 'MA',
 'MCD',
 'MDLZ',
 'MDT',
 'MET',
 'META',
 'MMM',
 'MO',
 'MRK',
 'MS',
 'MSFT',
 'NEE',
 'NFLX',
 'NKE',
 'NVDA',
 'ORCL',
 'PEP',
 'PFE',
 'PG',
 'PM',
 'PYPL',
 'QCOM',
 'RTX',
 'SBUX',
 'SCHW',
 'SO',
 'SPG',
 'T',
 'TGT',
 'TMO',
 'TMUS',
 'TSLA',
 'TXN',
 'UNH',
 'UNP',
 'UPS',
 'USB',
 'V',
 'VZ',
 'WBA',
 'WFC',
 'WMT',
 'XOM',
 'Downloaded from Barchart.com as of 03-07-2023 05:25am CST']

In [4]:
# importing csv module
import csv
 
# csv file name
filename = "merged_data.csv"
 
# initializing the titles and rows list
fields = []
rows = []
 
# reading csv file
with open(filename, 'r') as csvfile:
    # creating a csv reader object
    csvreader = csv.reader(csvfile)
     
    # extracting field names through first row
    fields = next(csvreader)
 
    # extracting each data row one by one
    for row in csvreader:
        rows.append(row)
 
    # get total number of rows
    print("Total no. of rows: %d"%(csvreader.line_num))
 
# printing the field names
print('Field names are:' + ', '.join(field for field in fields))
 
# printing first 5 rows
print('\nFirst 5 rows are:\n')
for row in rows[:5]:
    # parsing each column of a row
    for col in row:
        print("%10s"%col,end=" "),
    print('\n')

Total no. of rows: 788
Field names are:, Symbol, ReportType, FilingDate, Negative, Positive, Uncertainty, Litigious, Strong_Modal, Weak_Modal, Constraining, high, low, volume, price, 1daily_return, 2daily_return, 3daily_return, 5daily_return, 10daily_return, Negative_Diff, Positive_Diff, Uncertainty_Diff, Litigous_Diff, Strong_Modal_Diff, Weak_Modal_Diff, Contraining_Diff

First 5 rows are:

         0       AAPL       10-K 31/10/2012        602        294        649        313         69        283        271 21.4985714 20.98928642  510003200 18.17368698 0.00204932 -0.031109313 -0.017973534 -0.062688973 -0.098165695          0          0          0          0          0          0          0 

         1       AAPL       10-K 30/10/2013        651        281        661        333         69        289        279 18.84000015 18.46500015  354163600 16.40571785 -0.00419137 -0.009277992 0.003524434 -0.007582417 -0.008134891 8.139534884 -4.421768707 1.848998459 6.389776358          0 2.120

In [7]:
fields

['',
 'Symbol',
 'ReportType',
 'FilingDate',
 'Negative',
 'Positive',
 'Uncertainty',
 'Litigious',
 'Strong_Modal',
 'Weak_Modal',
 'Constraining',
 'high',
 'low',
 'volume',
 'price',
 '1daily_return',
 '2daily_return',
 '3daily_return',
 '5daily_return',
 '10daily_return',
 'Negative_Diff',
 'Positive_Diff',
 'Uncertainty_Diff',
 'Litigous_Diff',
 'Strong_Modal_Diff',
 'Weak_Modal_Diff',
 'Contraining_Diff']

In [17]:
rows[100]

['110',
 'BKNG',
 '10-K',
 '19/02/2015',
 '1292',
 '540',
 '1235',
 '998',
 '144',
 '646',
 '398',
 '1222.880005',
 '1195.359985',
 '2671500',
 '1218.050049',
 '-0.001494248',
 '-0.008595782',
 '0.001428505',
 '0.019046796',
 '0.011370613',
 '4.630969609',
 '8.293838863',
 '4.16',
 '7.867132867',
 '8.653846154',
 '0.386100386',
 '10.12987013']

In [29]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

df= pd.DataFrame([i for i in rows],columns=fields)
cols = fields
sns.heatmap(df[cols].corr(), cmap = 'BuGn', annot = True)
plt.show()


  sns.heatmap(df[cols].corr(), cmap = 'BuGn', annot = True)


ValueError: zero-size array to reduction operation fmin which has no identity