# Sentiment Analysis with Stopwords Removed
Testing sentiment analysis with the stopwords removed from the going concern reports.
The file `going-concern-198/audit_analytics_data.ipynb` contains the model and results for training without removing stopwords.

In [1]:
import pandas as pd
import numpy as np

col_list = ['COMPANY_FKEY','GOING_CONCERN','OPINION_TEXT1','OPINION_TEXT2','OPINION_TEXT3']
bank_data = pd.read_csv('data/Audit Analytics 01.2010.csv', usecols=col_list)
# GOING_CONCERN

  exec(code_obj, self.user_global_ns, self.user_ns)


## Remove stopwords & Save to folders based on label
GOING_CONCERN: 0=no 1=yes

Indicates the auditor's opinion contains an explanatory paragraph regarding the going concern assumption.

Auditors include an explanitory paragraph when they conclude there is substantial doubt in a company as a 'going concern.'
For this reason, this will be used as the label for a supervised learning model.

In [33]:
import os
import random
import shutil

import pandas as pd

import tensorflow as tf

import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    tokens = nltk.word_tokenize(text)
    filtered = [word for word in tokens if word.lower() not in stop_words]
    return ' '.join(filtered)

# Load data from CSV file
data = pd.read_csv('data/going_concerns.csv')

# Shuffle data
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

# Split data into training and testing sets
train_ratio = 0.85
test_ratio = 0.15

num_samples = len(data)
train_cutoff = int(num_samples * train_ratio)
# test_cutoff = int((num_samples * test_ratio))

train_data = data.iloc[:train_cutoff]
test_data = data.iloc[train_cutoff:]
# val_data = data.iloc[test_cutoff:]

# Create output directories if they don't exist
output_dir = './data/concern_reports_nostopwords'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

train_dir = os.path.join(output_dir, 'train')
if not os.path.exists(train_dir):
    os.makedirs(train_dir)

test_dir = os.path.join(output_dir, 'test')
if not os.path.exists(test_dir):
    os.makedirs(test_dir)

# val_dir = os.path.join(output_dir, 'val')
# if not os.path.exists(val_dir):
#     os.makedirs(val_dir)

# Convert data to the required directory structure
def convert_data(data, dirname):
    for label in [0, 1]:
        label_data = data[data['GOING_CONCERN'] == label]
        label_dir = os.path.join(dirname, str(label))
        # create directory for label if it doesn't exist
        if not os.path.exists(label_dir):
            os.makedirs(label_dir)
        
        
        # create a new text file for each report titled by company id + index
        for idx, row in label_data.iterrows():
            filename = f"{row['COMPANY_FKEY']}_{idx}.txt"
            filepath = os.path.join(label_dir, filename)
            text = remove_stopwords(row['OPINION_TEXT1'])

            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(text)

convert_data(train_data, train_dir)
convert_data(test_data, test_dir)
# convert_data(val_data, val_dir)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/evanaholevas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [1]:
import os

# folder path
dir_path2 = 'data/concern_reports_nostopwords/train'
count2 = 0
for root_dir, cur_dir, files in os.walk(dir_path2):
    count2 += len(files)
print('Train count:', count2)

# folder path
dir_path = 'data/concern_reports_nostopwords/test'
count = 0
for root_dir, cur_dir, files in os.walk(dir_path):
    count += len(files)
print('test count:', count)

Train count: 141611
test count: 24991


## Set training and validation data

In [2]:
import tensorflow as tf

# create validation set
batch_size = 32
seed = 42

raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    'data/concern_reports_nostopwords/train', 
    batch_size=batch_size, 
    validation_split=0.17, 
    subset='training', 
    seed=seed)

Found 141611 files belonging to 2 classes.
Using 117538 files for training.


In [5]:
for text_batch, label_batch in raw_train_ds.take(1):
  for i in range(3):
    print("Review", text_batch.numpy()[i])
    print("Label", label_batch.numpy()[i])

Review b"Report Independent Registered Public Accounting Firm Board Directors Stockholders NCR Corporation Opinions Financial Statements Internal Control Financial Reporting audited accompanying consolidated balance sheets NCR Corporation subsidiaries ( \xe2\x80\x9c Company \xe2\x80\x9d ) December 31 , 2020 2019 , related consolidated statements operations , comprehensive income ( loss ) , changes stockholders \xe2\x80\x99 equity cash flows three years period ended December 31 , 2020 , including related notes financial statement schedule listed index appearing Item 15 ( ) ( 2 ) ( collectively referred \xe2\x80\x9c consolidated financial statements \xe2\x80\x9d ) . also audited Company 's internal control financial reporting December 31 , 2020 , based criteria established Internal Control - Integrated Framework ( 2013 ) issued Committee Sponsoring Organizations Treadway Commission ( COSO ) . opinion , consolidated financial statements referred present fairly , material respects , financ

In [3]:
raw_val_ds = tf.keras.utils.text_dataset_from_directory(
    'data/going_concern_reports/train', 
    batch_size=batch_size, 
    validation_split=0.17,
    subset='validation', 
    seed=seed)

Found 141611 files belonging to 2 classes.
Using 24073 files for validation.


In [4]:
raw_test_ds = tf.keras.utils.text_dataset_from_directory(
    'data/going_concern_reports/test', 
    batch_size=batch_size)

Found 24991 files belonging to 2 classes.


## Preprocess & clean data

In [5]:
import string 
import regex as re

# remove HTML tags from the text, remove punctuation, and convert to lowercase
def custom_standardization(input_data):
  lowercase = tf.strings.lower(input_data)
  stripped_html = tf.strings.regex_replace(lowercase, '<p>', ' ')
  return tf.strings.regex_replace(stripped_html,
                                  '[%s]' % re.escape(string.punctuation),
                                  '')

In [6]:
max_features = 10000
sequence_length = 250

vectorize_layer = tf.keras.layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length)

In [7]:
# convert text to numbers without labels, then adapt to the data
train_text = raw_train_ds.map(lambda x, y: x)
vectorize_layer.adapt(train_text)

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


In [8]:
# preview the preprocessed data
def vectorize_text(text, label):
  text = tf.expand_dims(text, -1)
  return vectorize_layer(text), label

In [17]:
print("1287 ---> ",vectorize_layer.get_vocabulary()[9999])
print(" 313 ---> ",vectorize_layer.get_vocabulary()[2000])
print('Vocabulary size: {}'.format(len(vectorize_layer.get_vocabulary())))

1287 --->  •determining
 313 --->  met
Vocabulary size: 10000


In [9]:
# apply text vectorization layer

train_ds = raw_train_ds.map(vectorize_text)
val_ds = raw_val_ds.map(vectorize_text)
test_ds = raw_test_ds.map(vectorize_text)

In [10]:
embedding_dim = 16
# define the model
model = tf.keras.Sequential([
  tf.keras.layers.Embedding(max_features + 1, embedding_dim),
  tf.keras.layers.Dropout(0.2),
  tf.keras.layers.GlobalAveragePooling1D(),
  tf.keras.layers.Dropout(0.2),
  tf.keras.layers.Dense(1)])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 16)          160016    
                                                                 
 dropout (Dropout)           (None, None, 16)          0         
                                                                 
 global_average_pooling1d (G  (None, 16)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dropout_1 (Dropout)         (None, 16)                0         
                                                                 
 dense (Dense)               (None, 1)                 17        
                                                                 
Total params: 160,033
Trainable params: 160,033
Non-trainable params: 0
__________________________________________________

In [11]:
# binary cross entropy loss function for binary classification of text
# optimizer = tf.keras.optimizers.Adam(learning_rate=0.00008)

model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer='adam',
              metrics=tf.metrics.BinaryAccuracy(threshold=0.5))

In [18]:
# train the model
# epochs = 10
# history = model.fit(
#     train_ds,
#     validation_data=val_ds,
#     epochs=epochs)

history = model.fit(train_ds,
          validation_data=val_ds,
          epochs=20,
          batch_size=32,
          callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_binary_accuracy', patience=3)]
          )

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20


In [19]:
loss, accuracy = model.evaluate(test_ds)

print("Loss: ", loss)
print("Accuracy: ", accuracy)

Loss:  0.19345645606517792
Accuracy:  0.9165299534797668


## Export the model

In [20]:
export_model = tf.keras.Sequential([
  vectorize_layer,
  model,
  tf.keras.layers.Activation('sigmoid')
])

export_model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=False), optimizer="adam", metrics=['accuracy']
)

# Testing with the raw input `raw_test_ds`
loss, accuracy = export_model.evaluate(raw_test_ds)
print(accuracy)

0.9272137880325317


In [21]:
import tensorflow as tf
from sklearn.metrics import confusion_matrix

# Set up the test data
test_dir = 'data/concern_reports_nostopwords/test'
batch_size = 32
test_data = tf.keras.preprocessing.text_dataset_from_directory(
    test_dir,
    batch_size=batch_size)

# Make predictions on the test set
y_pred = export_model.predict(raw_test_ds)

# Get the true labels
y_true = []
for _, labels in test_data:
    y_true.extend(labels.numpy())

# Convert probabilities to binary labels
y_pred = [1 if p >= 0.5 else 0 for p in y_pred]

# Create confusion matrix
cm = confusion_matrix(y_true, y_pred)

# Calculate precision, recall, and f1 score
TN, FP, FN, TP = cm.ravel()
precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1 = 2 * precision * recall / (precision + recall)

print("Precision:", precision)
print("Recall:", recall)
print("F1 score:", f1)

Found 24991 files belonging to 2 classes.
Precision: 0.1696085955487337
Recall: 0.10783117833617956
F1 score: 0.13184190902311707
