In [115]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import word_tokenize
import time
import syllables
import re
from nltk.corpus import stopwords
import yfinance as yf
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import numpy as np
from scipy.stats import linregress

## Way a

Calculate a bunch of textual analysis (Uncertain, Positive, Negative, FOG) on the
10-Ks for all the companies, but for only a 3-year period (not the entire sample).
You can pick whichever 3-year period you want, say for example 2019-2021 to
see how covid affected things if you want. Then do parts 8-10 based on just this
3-year window

In [94]:
# Read the project10K_wtic.xlsx file
file_path = 'project10K_wtic.xlsx'
df = pd.read_excel(file_path)
df.head()

Unnamed: 0,CIK,Company Name,Form Type,Date Filed,Filename,Year,Ticker Symbol
0,320193,APPLE INC,10-K,2010-10-27,edgar/data/320193/0001193125-10-238044.txt,2010,AAPL
1,320193,APPLE INC,10-K,2011-10-26,edgar/data/320193/0001193125-11-282113.txt,2011,AAPL
2,320193,APPLE INC,10-K,2012-10-31,edgar/data/320193/0001193125-12-444068.txt,2012,AAPL
3,320193,APPLE INC,10-K,2013-10-30,edgar/data/320193/0001193125-13-416534.txt,2013,AAPL
4,320193,APPLE INC,10-K,2014-10-27,edgar/data/320193/0001193125-14-383437.txt,2014,AAPL


In [95]:
# Filter the DataFrame to only include 10-K filings for the chosen 3-year period
df = df[df['Year'].isin([2019, 2020, 2021])]
df.head(9)

Unnamed: 0,CIK,Company Name,Form Type,Date Filed,Filename,Year,Ticker Symbol
9,320193,Apple Inc.,10-K,2019-10-31,edgar/data/320193/0000320193-19-000119.txt,2019,AAPL
10,320193,Apple Inc.,10-K,2020-10-30,edgar/data/320193/0000320193-20-000096.txt,2020,AAPL
11,320193,Apple Inc.,10-K,2021-10-29,edgar/data/320193/0000320193-21-000105.txt,2021,AAPL
22,318154,AMGEN INC,10-K,2019-02-13,edgar/data/318154/0000318154-19-000008.txt,2019,AMGN
23,318154,AMGEN INC,10-K,2020-02-12,edgar/data/318154/0000318154-20-000017.txt,2020,AMGN
24,318154,AMGEN INC,10-K,2021-02-09,edgar/data/318154/0000318154-21-000010.txt,2021,AMGN
35,4962,AMERICAN EXPRESS CO,10-K,2019-02-13,edgar/data/4962/0000004962-19-000018.txt,2019,AXP
36,4962,AMERICAN EXPRESS CO,10-K,2020-02-13,edgar/data/4962/0000004962-20-000030.txt,2020,AXP
37,4962,AMERICAN EXPRESS CO,10-K,2021-02-12,edgar/data/4962/0000004962-21-000013.txt,2021,AXP


In [96]:
# Load the Loughran-McDonald dictionaries
with open('LM_Negative.txt', 'r') as f:
    negative_words = set(word.lower() for word in f.read().splitlines())

with open('LM_Positive.txt', 'r') as f:
    positive_words = set(word.lower() for word in f.read().splitlines())

with open('LM_Uncertainty.txt', 'r') as f:
    uncertain_words = set(word.lower() for word in f.read().splitlines())

uncertain_words.remove('')
negative_words.remove('')
positive_words.remove('')

In [97]:
nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to C:\Users\donxing/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\donxing/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [98]:
def count_words(text, words_list):
    tokens = word_tokenize(text)
    return sum(1 for token in tokens if token.lower() in words_list)

# Function to calculate FOG index
def fog_index(text):
    words = word_tokenize(text)
    num_words = len(words)
    num_sentences = len(nltk.sent_tokenize(text))
    num_syllables = sum(syllables.estimate(word) for word in words)
    fog_index = 0.4 * ((num_words / num_sentences) + 100 * (num_syllables / num_words))
    return fog_index

def clean_text(text):
    # Remove HTML tags and other non-alphanumeric characters
    clean_text = re.sub('<[^>]*>', ' ', text)  # Remove HTML tags
    clean_text = re.sub('[^0-9a-zA-Z]+', ' ', clean_text)  # Remove non-alphanumeric characters

    tokens = word_tokenize(clean_text)
    # Remove stop words
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
    # Reconstruct the text without stop words
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text

In [99]:
# Iterate through each 10-K filing
BASE_LINK = 'https://www.sec.gov/Archives/'
File_Path = 'data/way_a/'

counter = 0
num_exist_files = 9

for index, row in df.iterrows():
    if counter >= num_exist_files:
        break
    
    counter += 1
    real_filename = row['Filename'].split('/')[-1]
    with open(File_Path + real_filename, 'r') as file:
        text = file.read()

    # It doesn't work beacuse network error!!! so I download the files manually
    # Download and parse the 10-K filing
    # response = requests.get(BASE_LINK + row['Filename'], headers=headers)
    # soup = BeautifulSoup(response.text, 'html.parser')
    # text = soup.get_text()
    
    filtered_text = clean_text(text)
    short_text = filtered_text[-20000:]
    
    # Calculate the word counts
    positive_count = count_words(short_text, positive_words)
    negative_count = count_words(short_text, negative_words)
    uncertain_count = count_words(short_text, uncertain_words)

    # Calculate the FOG index
    fog = fog_index(short_text)

    # Store the results in the DataFrame
    df.loc[index, 'Positive'] = positive_count
    df.loc[index, 'Negative'] = negative_count
    df.loc[index, 'Uncertain'] = uncertain_count
    df.loc[index, 'FOG'] = fog


Because the text is so long, it has 7,572,091 words per text, so I just use the last 20,000 words to do the analysis.

In [75]:
df.head(9)

Unnamed: 0,CIK,Company Name,Form Type,Date Filed,Filename,Year,Ticker Symbol,Positive,Negative,Uncertain,FOG
9,320193,Apple Inc.,10-K,2019-10-31,edgar/data/320193/0000320193-19-000119.txt,2019,AAPL,11.0,10.0,0.0,1230.943704
10,320193,Apple Inc.,10-K,2020-10-30,edgar/data/320193/0000320193-20-000096.txt,2020,AAPL,0.0,0.0,1.0,2392.981187
11,320193,Apple Inc.,10-K,2021-10-29,edgar/data/320193/0000320193-21-000105.txt,2021,AAPL,0.0,0.0,0.0,2389.003987
22,318154,AMGEN INC,10-K,2019-02-13,edgar/data/318154/0000318154-19-000008.txt,2019,AMGN,0.0,0.0,0.0,2340.591788
23,318154,AMGEN INC,10-K,2020-02-12,edgar/data/318154/0000318154-20-000017.txt,2020,AMGN,7.0,21.0,4.0,1365.491922
24,318154,AMGEN INC,10-K,2021-02-09,edgar/data/318154/0000318154-21-000010.txt,2021,AMGN,0.0,1.0,0.0,2407.142857
35,4962,AMERICAN EXPRESS CO,10-K,2019-02-13,edgar/data/4962/0000004962-19-000018.txt,2019,AXP,0.0,0.0,0.0,2429.458449
36,4962,AMERICAN EXPRESS CO,10-K,2020-02-13,edgar/data/4962/0000004962-20-000030.txt,2020,AXP,16.0,46.0,30.0,1335.704514
37,4962,AMERICAN EXPRESS CO,10-K,2021-02-12,edgar/data/4962/0000004962-21-000013.txt,2021,AXP,0.0,0.0,0.0,2349.511265


In [None]:
# Add a new column to the DataFrame to store the start date for downloading stock price data
df['price_start_date'] = pd.to_datetime(row['Date Filed']) + pd.DateOffset(days=1)

# Function to download stock price data
def get_stock_prices(ticker, start_date):
    end_date = start_date + pd.DateOffset(days=60)
    stock_data = yf.download(ticker, start=start_date, end=end_date)
    return stock_data

# Iterate through each 10-K filing
for index, row in df.iterrows():
    ticker = row['Ticker Symbol']
    start_date = row['price_start_date']
    stock_prices = get_stock_prices(ticker, start_date)
    
    # Save the stock price data to a CSV file
    stock_prices.to_csv(f'{ticker}_{start_date.date()}_stock_prices.csv')


In [None]:
# Add a new column to the DataFrame to store the start date for downloading stock price data
df['price_start_date'] = df['date'] + pd.DateOffset(days=1)

# Function to download stock price data
def get_stock_prices(ticker, start_date):
    end_date = start_date + pd.DateOffset(days=60)
    stock_data = yf.download(ticker, start=start_date, end=end_date)
    return stock_data

# Iterate through each 10-K filing
for index, row in df.iterrows():
    ticker = row['ticker']
    start_date = row['price_start_date']
    stock_prices = get_stock_prices(ticker, start_date)
    
    # Save the stock price data to a CSV file
    stock_prices.to_csv(f'{ticker}_{start_date.date()}_stock_prices.csv')

In [None]:
# Function to calculate daily returns and their variance
def calculate_volatility(stock_prices):
    stock_prices['daily_returns'] = stock_prices['Adj Close'].pct_change()
    volatility = stock_prices['daily_returns'].var()
    return volatility

# Iterate through each 10-K filing and calculate the volatility
for index, row in df.iterrows():
    ticker = row['ticker']
    start_date = row['price_start_date']
    
    # Load the stock price data from the CSV file
    stock_prices = pd.read_csv(f'{ticker}_{start_date.date()}_stock_prices.csv', index_col='Date', parse_dates=True)
    
    volatility = calculate_volatility(stock_prices)
    df.loc[index, 'Volatility'] = volatility


In [None]:


# Scatterplot and linear regression
def scatterplot_and_regression(df, x_col, y_col):
    sns.regplot(x=x_col, y=y_col, data=df)
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.show()

    X = df[x_col]
    y = df[y_col]
    X = sm.add_constant(X)
    model = sm.OLS(y, X).fit()
    print(model.summary())

# Analyze the correlation between word content and volatility
scatterplot_and_regression(df, 'Positive', 'Volatility')
scatterplot_and_regression(df, 'Negative', 'Volatility')
scatterplot_and_regression(df, 'Uncertain', 'Volatility')


## way b

Calculate a bunch of textual analysis (Uncertain, Positive, Negative, FOG) on the
10-Ks for just ONE company, but for the entire sample period (2010-2022). You
can pick whichever company you want. Then do parts 8-10 based on just this one
company.

In [84]:
# Read the project10K_wtic.xlsx file
file_path = 'project10K_wtic.xlsx'
df = pd.read_excel(file_path)

# Filter the DataFrame to only include 10-K filings for the chosen company
chosen_company = 'AAPL'  # Replace 'AAPL' with the desired company symbol
df = df[df['Ticker Symbol'] == chosen_company]

In [91]:
File_Path = 'data/way_b/'

for index, row in df.iterrows():
    real_filename = row['Filename'].split('/')[-1]
    with open(File_Path + real_filename, 'r') as file:
        text = file.read()

    # Calculate the word counts
    positive_count = count_words(text, positive_words)
    negative_count = count_words(text, negative_words)
    uncertain_count = count_words(text, uncertain_words)

    # Calculate the FOG index
    fog = fog_index(text)

    # Store the results in the DataFrame
    df.loc[index, 'Positive'] = positive_count
    df.loc[index, 'Negative'] = negative_count
    df.loc[index, 'Uncertain'] = uncertain_count
    df.loc[index, 'FOG'] = fog


In [93]:
df.head()

Unnamed: 0,CIK,Company Name,Form Type,Date Filed,Filename,Year,Ticker Symbol,Positive,Negative,Uncertain,FOG
0,320193,APPLE INC,10-K,2010-10-27,edgar/data/320193/0001193125-10-238044.txt,2010,AAPL,0.0,0.0,0.0,2349.511265
1,320193,APPLE INC,10-K,2011-10-26,edgar/data/320193/0001193125-11-282113.txt,2011,AAPL,0.0,0.0,0.0,2349.511265
2,320193,APPLE INC,10-K,2012-10-31,edgar/data/320193/0001193125-12-444068.txt,2012,AAPL,0.0,0.0,0.0,2349.511265
3,320193,APPLE INC,10-K,2013-10-30,edgar/data/320193/0001193125-13-416534.txt,2013,AAPL,0.0,0.0,0.0,2349.511265
4,320193,APPLE INC,10-K,2014-10-27,edgar/data/320193/0001193125-14-383437.txt,2014,AAPL,0.0,0.0,0.0,2349.511265


In [None]:
def get_stock_data(ticker, start_date, days=60):
    end_date = start_date + datetime.timedelta(days=days)
    stock_data = yf.download(ticker, start=start_date, end=end_date)
    return stock_data

ticker = 'AAPL'  
report_date = '2018-11-01'
stock_data = get_stock_data(ticker, report_date)


In [None]:
def calculate_volatility(stock_data):
    stock_data['Returns'] = stock_data['Close'].pct_change()
    volatility = stock_data['Returns'].var()
    return volatility

volatility = calculate_volatility(stock_data)
