## Combine the bankruptcy and sentiment analysis models

In [1]:
# Import dependencies
import tensorflow as tf 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
# Model improvement and Evaluation 
from sklearn import metrics 
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score, precision_score, recall_score

In [5]:
# import data
# create validation set
batch_size = 32
seed = 42

raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    'data/concern_reports_nostopwords/train', 
    batch_size=batch_size, 
    validation_split=0.17, 
    subset='training', 
    seed=seed)

raw_val_ds = tf.keras.utils.text_dataset_from_directory(
    'data/going_concern_reports/train', 
    batch_size=batch_size, 
    validation_split=0.17,
    subset='validation', 
    seed=seed)

raw_test_ds = tf.keras.utils.text_dataset_from_directory(
    'data/going_concern_reports/test', 
    batch_size=batch_size)

Found 141611 files belonging to 2 classes.
Using 117538 files for training.
Found 141611 files belonging to 2 classes.
Using 24073 files for validation.
Found 24991 files belonging to 2 classes.


In [22]:
# import bankruptcy and sentiment analysis models
bankruptcy_model = tf.keras.models.load_model('model_checkpoints/model4/ckpt_200')

tf.keras.utils.get_custom_objects()["custom_standardization"] = custom_standardization
going_concern_model = tf.keras.models.load_model('trained_models/going_concern_model')

### preprocess data for sentiment analysis model

In [7]:
import string 
import regex as re

# remove HTML tags from the text, remove punctuation, and convert to lowercase
def custom_standardization(input_data):
  lowercase = tf.strings.lower(input_data)
  stripped_html = tf.strings.regex_replace(lowercase, '<p>', ' ')
  return tf.strings.regex_replace(stripped_html,
                                  '[%s]' % re.escape(string.punctuation),
                                  '')

In [25]:
max_features = 10000
sequence_length = 250

vectorize_layer = tf.keras.layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length)

In [26]:
# convert text to numbers without labels, then adapt to the data
train_text = raw_train_ds.map(lambda x, y: x)
vectorize_layer.adapt(train_text)

In [27]:
def vectorize_text(text, label):
  text = tf.expand_dims(text, -1)
  return vectorize_layer(text), label

In [28]:
train_ds = raw_train_ds.map(vectorize_text)
val_ds = raw_val_ds.map(vectorize_text)
test_ds = raw_test_ds.map(vectorize_text)

### Numerical Bank Data

In [10]:
# read bank data for bankruptcy model
training = pd.read_csv('data/train_american_bankruptcy.csv')
testing = pd.read_csv('data/test_american_bankruptcy.csv')
validation = pd.read_csv('data/validate_american_bankruptcy.csv')

In [15]:
X_test = testing.drop(['status'], axis=1)

In [39]:
# Load and preprocess the data
# X_text = preprocess_text(going_concern_report_text)
# X_numeric = preprocess_numeric(numeric_data)
# X_numeric = X_numeric.reshape(1, -1)  # Reshape to match expected input shape of bankruptcy model

# Get the output probabilities from the two models
y_bankruptcy = bankruptcy_model.predict(X_test)
y_going_concern = going_concern_model.predict(raw_test_ds)



In [43]:
# Concatenate the two outputs into a single input vector
X_combined = tf.concat([y_bankruptcy, y_going_concern], axis=1)
print(X_combined)

# Create a new model to predict the impact of the going-concern report
impact_model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(2,)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Compile the impact model
impact_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Make predictions using the impact model
impact_probability = impact_model.predict(X_combined)

# Print the predicted impact probability
print('Impact probability:', impact_probability[0])

InvalidArgumentError: {{function_node __wrapped__ConcatV2_N_2_device_/job:localhost/replica:0/task:0/device:CPU:0}} ConcatOp : Dimension 0 in both shapes must be equal: shape[0] = [3132,1] vs. shape[1] = [1566,1] [Op:ConcatV2] name: concat

In [37]:
y_bankruptcy

array([[1.00000000e+00, 5.46639637e-38],
       [0.00000000e+00, 1.00000000e+00],
       [7.38122361e-03, 9.92618740e-01],
       ...,
       [1.00000000e+00, 0.00000000e+00],
       [1.00000000e+00, 0.00000000e+00],
       [8.86165738e-01, 1.13834225e-01]], dtype=float32)

0=no 1=yes	
Indicates the auditor's opinion contains an explanatory paragraph regarding the going concern assumption.

Auditors include an explanitory paragraph when they conclude there is substantial doubt in a company as a 'going concern.'

In [36]:
y_going_concern

array([[0.03828335],
       [0.44369203],
       [0.9488943 ],
       ...,
       [0.15050896],
       [0.07220834],
       [0.99812067]], dtype=float32)