<a href="https://colab.research.google.com/github/Crystal-Reshea/FinBert-Albert-nlp/blob/main/Sentiment_Analysis_with_Finbert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sentiment Analysis on Item 7 of a 10-K Form

In [None]:
pip install transformers

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive (Colab only); the filing loaded later is
# addressed through a /content/drive/... path.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import re
import nltk
from nltk.tokenize import sent_tokenize
# Download the 'punkt' data package.
# NOTE(review): presumably this is the model sent_tokenize relies on - confirm
# the package name against the installed NLTK version.
nltk.download('punkt')

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline
# Load the pretrained 'yiyanghkust/finbert-tone' checkpoint as a sequence
# classifier with a 3-label head
finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3) 
# Tokenizer loaded from the same checkpoint so the vocabulary matches the model
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone') 
# Wrap model + tokenizer in a "sentiment-analysis" pipeline; `nlp` is the
# callable that sentiment_score_df invokes for every sentence
nlp = pipeline("sentiment-analysis", model=finbert, tokenizer=tokenizer) 


Downloading:   0%|          | 0.00/533 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/419M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/221k [00:00<?, ?B/s]

# Extract Text from the 10-K Form

In [None]:
import re
def process_text(file_name):
  """Read a 10-K text file and split it into its ITEM sections.

  Chains the three stages defined in this notebook:
  line_collection -> find_toc -> extract_text.

  Args:
    file_name: path of the plain-text filing to read.

  Returns:
    The value returned by extract_text: a tuple ``(toc, items)`` where
    ``toc`` maps each discovered heading to the text that follows it and
    ``items`` lists the headings in the order they were discovered.
  """
  # keep only the lines worth analysing (headings get upper-cased there)
  data = line_collection(file_name)
  # one flat string of the filing plus a heading -> "" placeholder dict
  text, toc = find_toc(data)
  # fill each placeholder with the text between consecutive headings
  return extract_text(text, toc)

In [None]:
def line_collection(file_name):
  """Read a filing line by line and keep only the lines worth analysing.

  Skipped lines: "Table of Contents" markers (compared with all whitespace
  removed, case-insensitively), lines of at most 3 characters including the
  newline, and lines starting with "PART".

  A kept line is treated as a heading when, after its newline is replaced by
  a space, it is 8 to 49 characters long, starts with an upper-case letter,
  contains no "." and consists only of letters and whitespace. Headings are
  stored upper-cased and keep their trailing newline; every other line is
  stored with the newline replaced by a space. Code further down the notebook
  splits section text on '\n', which works because only headings keep one.

  Args:
    file_name: path of the text file to read.

  Returns:
    List of the kept lines in file order.
  """
  data = []
  # the with-block closes the file; no explicit close() is needed afterwards
  with open(file_name, 'r') as file:
    for line in file:
      # skip lines that are not needed
      if (re.sub(r"\s+", "", line).lower() == "tableofcontents"
          or len(line) <= 3
          or line.startswith("PART")):
        continue
      new_line = line.replace('\n', ' ')
      # headings within Items: short, capitalised, purely alphabetic, no period
      is_heading = (
          8 <= len(new_line) < 50
          and new_line[0].isupper()
          and "." not in new_line
          and re.sub(r"\s+", "", new_line).isalpha()
      )
      data.append(line.upper() if is_heading else new_line)
  return data

In [None]:
def find_toc(data):
  """Collect section headings and flatten the collected lines into one string.

  Args:
    data: list of line strings.

  Returns:
    Tuple ``(text, toc)``. ``text`` is every entry of ``data`` concatenated
    without a separator. ``toc`` maps each entry that starts with "ITEM", or
    is exactly equal to 'SIGNATURES', to an empty-string placeholder, in
    order of first appearance.
  """
  def is_header(entry):
    # the 'SIGNATURES' comparison is exact: trailing whitespace prevents a match
    return entry.startswith("ITEM") or entry == 'SIGNATURES'

  headers = {entry: "" for entry in data if is_header(entry)}
  joined = "".join(data)
  return joined, headers

In [None]:
def extract_text(text,toc):
  """Fill ``toc`` with the text found between consecutive headings.

  For every pair of neighbouring headings (in dict order) the content is the
  span of ``text`` that begins right after the first occurrence of the
  earlier heading and runs up to the last occurrence of the later heading
  (the match is greedy and "." also matches newlines). The last heading has
  no successor, so its existing value is left untouched.

  Args:
    text: the full document as a single string.
    toc: dict whose keys are the headings; it is updated in place.

  Returns:
    Tuple ``(toc, items)``: the same dict object, now filled, and the list of
    its headings.

  Raises:
    TypeError: if no span is found for a pair of headings (``re.search``
      returns None, which is then subscripted).
  """
  items = list(toc.keys())
  # Collecting text between headers and adding them to dictionary
  for start, end in zip(items, items[1:]):
    # Headings are literal text taken from the document, so they are escaped:
    # otherwise "." matches any character and "(", ")", "+" etc. change or
    # break the pattern.
    pattern = r'((?<=' + re.escape(start) + r').*(?=' + re.escape(end) + r'))'
    toc[start] = re.search(pattern, text, re.S | re.M)[0]
  return toc, items

# Extract paragraphs from items

In [None]:
from pprint import pprint as pp

In [None]:
def fill_item_dict(arr_split):
  """Pair each chunk of text with the heading that closes the previous chunk.

  The heading for chunk ``i`` is the last run of upper-case words (letters
  A-Z separated by whitespace) found in chunk ``i - 1``. The chunk itself is
  stored unchanged as the content. If the same heading occurs more than
  once, the later chunk overwrites the earlier one.

  Args:
    arr_split: list of text chunks.

  Returns:
    Dict mapping heading -> content; empty when fewer than two chunks are
    given.

  Raises:
    IndexError: if a chunk that should supply a heading contains no
      upper-case word run.
  """
  heading_pattern = re.compile(r'\b[A-Z]+(?:\s+[A-Z]+)*\b')
  # named `sections` rather than `dict` so the builtin is not shadowed
  sections = {}
  for previous, content in zip(arr_split, arr_split[1:]):
    heading = heading_pattern.findall(previous)[-1]
    sections[heading] = content
  return sections

In [None]:
# process 10-K data
file_name = '/content/drive/MyDrive/NLP_POC/bby-202110k.txt'
text_dict, toc_list = process_text(file_name)

In [None]:
# string of all relevant item 7 content
item7 = text_dict[toc_list[7]]
# list of item 7 content split by new lines
item7_split = item7.split('\n')
# dictionary of all item 7 content organized by headingd
item7_dict = fill_item_dict(item7_split)

In [None]:
# headings found inside Item 7, in the order fill_item_dict inserted them
item7_headings = list(item7_dict.keys())
pp(item7_headings)

['OF OPERATIONS',
 'OVERVIEW',
 'RESTRUCTURING AND BUSINESS TRANSFORMATION',
 'SUMMARY OF FINANCIAL PERFORMANCE',
 'RESULTS OF OPERATIONS',
 'FISCAL YEAR ENDED',
 'NET SALES',
 'PERCENTAGE',
 'PERCENTAGE CHANGE',
 'COST OF SALES',
 'GROSS PROFIT',
 'GOODWILL AND OTHER IMPAIRMENTS',
 'GAIN ON EXTINGUISHMENT OF DEBT',
 'LOSS BEFORE PROVISION FOR INCOME TAXES',
 'BENEFIT FROM INCOME TAXES',
 'NET LOSS',
 'OPERATING LOSS',
 'INCOME TAXES',
 'TRANSFORMATION',
 'LIQUIDITY AND CAPITAL RESOURCES',
 'TOTAL CONTRACTUAL OBLIGATIONS',
 'SEASONALITY',
 'INFLATION',
 'CRITICAL ACCOUNTING POLICIES']


In [None]:
# text stored under the 'OVERVIEW' heading of Item 7
overview =item7_dict['OVERVIEW']
# split it into sentences with NLTK so each one can be scored separately
overview_sent = sent_tokenize(overview)

In [None]:
# NOTE(review): disabled experiment, nothing in this cell is executed.
# It would delete every sentence longer than 512 *characters* from
# `joined_paragraph_sentences`, a name that is not defined in the visible part
# of this notebook. Presumably it was meant to respect the model's input
# limit - TODO confirm, then delete this cell or rewrite it for `overview_sent`.
# def remove_large_sent(): 
#   index = 0
#   to_remove = []
#   for sentence in joined_paragraph_sentences: 
#     if len(sentence) > 512: 
#       to_remove.append(index)
#     index+=1
#   for i in range(len(to_remove)): 
#     del joined_paragraph_sentences[(to_remove[i]-i)]
# remove_large_sent()

# Using the FinBERT Model on Item 7 for Sentiment Analysis

In [None]:
import pandas as pd
# Creating a dataframe of the collected sentiment scores for each sentence
def sentiment_score_df(arr, classifier=None):
  """Score every sentence and collect the per-class scores in a DataFrame.

  Args:
    arr: iterable of sentence strings.
    classifier: callable invoked as ``classifier(sentence,
      return_all_scores=True)``; its result is indexed as
      ``result[0][k]['score']`` for k = 0, 1, 2. Defaults to the
      notebook-level pipeline ``nlp``.

  Returns:
    DataFrame with one row per sentence and the columns 'sentence',
    'neutral_score', 'positive_score', 'negative_score' (an empty frame with
    these columns when ``arr`` is empty).
  """
  if classifier is None:
    # fall back to the pipeline created earlier in the notebook
    classifier = nlp
  columns = ['sentence', 'neutral_score', 'positive_score', 'negative_score']
  rows = []
  for sentence in arr:
    # a single string input yields a one-element list holding the class scores
    scores = classifier(sentence, return_all_scores=True)[0]
    # NOTE(review): scores are read by position, which assumes the model
    # reports its labels as neutral, positive, negative - confirm against the
    # checkpoint's id2label mapping.
    rows.append((sentence, scores[0]['score'], scores[1]['score'], scores[2]['score']))
  # build the frame once from the collected records
  return pd.DataFrame(rows, columns=columns)

In [None]:
# score every overview sentence (sentiment_score_df calls the pipeline once per sentence)
overview_df = sentiment_score_df(overview_sent)

In [None]:
# row-wise sum of the three class scores, added as a sanity-check column
overview_df["total"] =(overview_df['neutral_score'] + overview_df['positive_score'] + overview_df['negative_score'])


In [None]:
# preview the first 10 scored sentences
overview_df.head(10)

Unnamed: 0,sentence,neutral_score,positive_score,negative_score,total
0,We are an omnichannel retailer that makes it e...,0.004513546,0.9954774,9.0876e-06,1.0
1,We sell a wide assortment of merchandise in th...,0.9999969,9.625683e-07,2.197084e-06,1.0
2,"We also operate Decorist, an online interior d...",0.9999437,3.784461e-05,1.845245e-05,1.0
3,"In addition, we are a partner in a joint ventu...",0.9999572,9.241641e-06,3.350702e-05,1.0
4,"For fiscal 2020, 2019 and 2018, we accounted f...",0.9999901,2.239554e-06,7.680431e-06,1.0
5,The Institutional Sales operating segment was ...,0.9999588,6.05757e-07,4.059985e-05,1.0
6,We will continue to account for our operations...,0.9999963,2.530398e-07,3.426756e-06,1.0
7,We have undertaken significant changes over th...,0.9978219,0.001868782,0.0003093244,1.0
8,"During the past year, as the world responded t...",6.427469e-05,0.9993356,0.0006000464,1.0
9,"Similar to many other businesses, the COVID-19...",9.211252e-09,1.0,5.56191e-09,1.0


In [None]:
# average score per class over all overview sentences, printed one per line
# in the order neutral, positive, negative
print(
    *(overview_df[column].mean()
      for column in ('neutral_score', 'positive_score', 'negative_score')),
    sep='\n',
)

0.7026676284015575
0.2954278086262601
0.001904567517599735
