## Combine the bankruptcy and sentiment analysis models

In [1]:
# Import dependencies
import tensorflow as tf 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
# Model improvement and Evaluation 
from sklearn import metrics 
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score, precision_score, recall_score

In [2]:
import string 
import regex as re

# remove HTML tags from the text, remove punctuation, and convert to lowercase
def custom_standardization(input_data):
  lowercase = tf.strings.lower(input_data)
  stripped_html = tf.strings.regex_replace(lowercase, '<p>', ' ')
  return tf.strings.regex_replace(stripped_html,
                                  '[%s]' % re.escape(string.punctuation),
                                  '')

In [3]:
# Register the custom_standardization function as a custom object
tf.keras.utils.get_custom_objects()["custom_standardization"] = custom_standardization

In [4]:
# import bankruptcy and sentiment analysis models
bankruptcy_model = tf.keras.models.load_model('model_checkpoints/model4/ckpt_200')

tf.keras.utils.get_custom_objects()["custom_standardization"] = custom_standardization
going_concern_model = tf.keras.models.load_model('data/trained_models/going_concern_model')

In [6]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, Concatenate
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Assuming you have a dataset with features, bankruptcy probabilities, and sentiment analysis scores
# X_features: Features from the first neural network
# y_bankruptcy: Categorical probability of bankruptcy
# sentiment_scores: Sentiment analysis scores from the second neural network

bank_data = pd.read_csv("data/matched_data.csv")
numerical_only = bank_data.select_dtypes(include=['float64','int64'])
X_bonk = numerical_only.drop(['status','COMPANY_FKEY','GOING_CONCERN','FILE_DATE','fyear'], axis=1)
y_bonk = bank_data['status']

X_go = bank_data['OPINION_TEXT1']
y_go = bank_data['GOING_CONCERN']

In [7]:
X_bonk.head()

Unnamed: 0,cik,current_assets,total_assets,cost_of_goods_sold,total_long_term_debt,depreciation_and_amortization,ebit,ebitda,gross_profit,inventory,total_current_liabilities,net_income,retained_earnings,total_receivables,total_revenue,market_value,total_liabilities,net_sales,total_operating_expenses
0,1750,913.985,1703.727,1408.071,329.802,59.296,137.016,196.312,367.711,507.274,416.01,69.826,467.485,296.489,1775.782,1049.8206,868.438,1775.782,1579.47
1,1750,1063.272,2195.653,1662.408,669.489,80.333,142.36,222.693,412.09,599.752,473.226,67.723,486.582,324.879,2074.498,485.2897,1329.631,2074.498,1851.805
2,1750,1033.7,2136.9,1714.5,622.2,108.6,136.6,245.2,452.6,582.9,389.0,55.0,542.4,315.4,2167.1,790.0029,1217.4,2167.1,1921.9
3,1750,1116.9,2199.5,1581.4,564.3,113.4,142.6,256.0,453.6,632.9,402.1,72.9,616.7,297.9,2035.0,961.308,1198.8,2035.0,1779.0
4,1750,954.1,1515.0,1342.7,85.0,92.3,-8.6,83.7,251.6,566.7,412.0,10.2,603.9,231.1,1594.3,1046.3954,669.9,1594.3,1510.6


In [8]:
sentiment_scores = going_concern_model.predict(X_go)



In [18]:
sentiment_scores

array([[0.9704408 ],
       [0.9436229 ],
       [0.9542874 ],
       ...,
       [0.9835624 ],
       [0.9804181 ],
       [0.81047916]], dtype=float32)

In [15]:
bonk_results = bankruptcy_model.predict(X_bonk)



In [19]:
bonk_results

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [0., 1.],
       [0., 1.],
       [0., 1.]], dtype=float32)

In [21]:
import numpy as np

# Assuming you have the categorical results in the 'categorical_results' array

# Convert categorical results to binary results
binary_results = np.argmax(bonk_results, axis=1)
binary_results = np.expand_dims(binary_results, axis=1)

print(binary_results)

[[0]
 [0]
 [0]
 ...
 [1]
 [1]
 [1]]


In [40]:
my_man = list(zip(binary_results, sentiment_scores))

In [47]:
X_combined = tf.keras.layers.Concatenate(axis=1)([binary_results.astype(float), sentiment_scores])

In [53]:
X_combined = X_combined.numpy()

In [55]:
X_combined

array([[0.        , 0.9704408 ],
       [0.        , 0.9436229 ],
       [0.        , 0.9542874 ],
       ...,
       [1.        , 0.9835624 ],
       [1.        , 0.9804181 ],
       [1.        , 0.81047916]], dtype=float32)

In [56]:
from sklearn.model_selection import train_test_split

# Assuming X_combined contains the input features and y_bonk contains the binary labels

# Split the data into train and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X_combined, y_bonk, test_size=0.2, random_state=42)

# Further split the train set into train and validation sets (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Print the shapes of the resulting datasets
print("Train set shapes:", X_train.shape, y_train.shape)
print("Validation set shapes:", X_val.shape, y_val.shape)
print("Test set shapes:", X_test.shape, y_test.shape)

Train set shapes: (17207, 2) (17207,)
Validation set shapes: (4302, 2) (4302,)
Test set shapes: (5378, 2) (5378,)


In [74]:
y_hot_train = tf.one_hot(y_train,2)
y_hot_val = tf.one_hot(y_val,2)
y_hot_test = tf.one_hot(y_test,2)

In [27]:
# from sklearn.model_selection import train_test_split

# # split data into training (70%), validation (15%), and testing (15%) sets 
# X_, X_test, y_, y_test = train_test_split(X_combined, y_bonk, train_size=0.8, test_size=0.15, random_state=42, shuffle=True)
# X_train, X_validate, y_train, y_validate = train_test_split(X_, y_, train_size=0.82, test_size=0.18, random_state=42, shuffle=True)

TypeError: Only integers, slices (`:`), ellipsis (`...`), tf.newaxis (`None`) and scalar tf.int32/tf.int64 tensors are valid indices, got array([18524,  2290, 26625, ..., 12312, 12951, 17589])

In [91]:
# Create a new model to predict the impact of the going-concern report
impact_model = tf.keras.Sequential([
    tf.keras.layers.Dense(6, activation='relu', input_shape=(2,)),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(3, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(2, activation='softmax')
])

# Compile the impact model
impact_model.compile(loss='binary_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=0.00001), metrics=['accuracy'])
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
# # Make predictions using the impact model
# impact_probability = impact_model.predict(X_combined)

# # Print the predicted impact probability
# print('Impact probability:', impact_probability[0])

SyntaxError: invalid syntax (2461498177.py, line 3)

In [90]:
mod = impact_model.fit(X_train, y_hot_train, epochs=20, batch_size=2, validation_data=(X_val, y_hot_val), callbacks=[callback])

Epoch 1/20

KeyboardInterrupt: 

In [82]:
resulto = impact_model.evaluate(X_test, y_hot_test)



In [55]:
# Perform feature scaling if necessary
scaler = StandardScaler()
X_features_scaled = scaler.fit_transform(X_features)

# Split the data into training and test sets
X_train, X_test, y_train, y_test, sentiment_train, sentiment_test = train_test_split(
    X_features_scaled, y_bankruptcy, sentiment_scores, test_size=0.2, random_state=42
)

# Create the model architecture
input_features = Input(shape=(X_train.shape[1],), name='features')
input_sentiment = Input(shape=(1,), name='sentiment')

# Neural network for features
hidden_layer = Dense(32, activation='relu')(input_features)
output_features = Dense(1, activation='sigmoid', name='features_output')(hidden_layer)

# Neural network for sentiment scores
output_sentiment = Dense(1, activation='sigmoid', name='sentiment_output')(input_sentiment)

# Concatenate the outputs of both neural networks
concatenated = Concatenate()([output_features, output_sentiment])

# Final output layer
output = Dense(1, activation='sigmoid', name='final_output')(concatenated)

# Create the model
model = Model(inputs=[input_features, input_sentiment], outputs=output)

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit([X_train, sentiment_train], y_train, epochs=10, batch_size=32)

# Evaluate the model
loss, accuracy = model.evaluate([X_test, sentiment_test], y_test)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

# Extract feature importances
feature_importances = model.get_weights()[0]  # Get the weights of the first layer

# Perform analysis on feature importances
# ...


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 0.16139346361160278
Test Accuracy: 0.9570472240447998


In [60]:
import numpy as np

# Assuming the model has been trained and evaluated on the test set

# Extract feature importances from the first layer's weights
feature_importances = np.abs(model.get_weights()[0])  # Get the weights of the first layer and take absolute values

# Normalize feature importances
normalized_importances = feature_importances / np.sum(feature_importances)

# Define the feature names
feature_names = [
    'current_assets', 'cost_of_goods_sold', 'depreciation_and_amortization',
    'ebitda', 'inventory', 'net_income', 'total_receivables', 'market_value',
    'net_sales', 'total_assets', 'total_long_term_debt', 'ebit', 'gross_profit',
    'total_current_liabilities', 'retained_earnings', 'total_revenue', 'total_liabilities',
    'total_operating_expenses'
]

# Sort and print the feature importances in descending order
sorted_importances_indices = np.argsort(normalized_importances)[::-1]
for index in sorted_importances_indices:
    feature_name = feature_names[index]
    importance = normalized_importances[index]
    print(f"{feature_name}: {importance}")

# Perform further analysis on feature importances
# ...


TypeError: only integer scalar arrays can be converted to a scalar index

In [2]:
# import data
# create validation set
batch_size = 32
seed = 42

raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    'data/aligned_concerns_nostop/train', 
    batch_size=batch_size, 
    validation_split=0.17, 
    subset='training', 
    seed=seed)

raw_val_ds = tf.keras.utils.text_dataset_from_directory(
    'data/aligned_concerns_nostop/train', 
    batch_size=batch_size, 
    validation_split=0.17,
    subset='validation', 
    seed=seed)

raw_test_ds = tf.keras.utils.text_dataset_from_directory(
    'data/aligned_concerns_nostop/test', 
    batch_size=batch_size)

Found 22614 files belonging to 2 classes.
Using 18770 files for training.
Found 22614 files belonging to 2 classes.
Using 3844 files for validation.
Found 4019 files belonging to 2 classes.


### preprocess data for sentiment analysis model

In [10]:
import string 
import regex as re

# remove HTML tags from the text, remove punctuation, and convert to lowercase
def custom_standardization(input_data):
  lowercase = tf.strings.lower(input_data)
  stripped_html = tf.strings.regex_replace(lowercase, '<p>', ' ')
  return tf.strings.regex_replace(stripped_html,
                                  '[%s]' % re.escape(string.punctuation),
                                  '')

In [11]:
max_features = 10000
sequence_length = 250

vectorize_layer = tf.keras.layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length)

In [12]:
# convert text to numbers without labels, then adapt to the data
train_text = raw_train_ds.map(lambda x, y: x)
vectorize_layer.adapt(train_text)

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


In [13]:
def vectorize_text(text, label):
  text = tf.expand_dims(text, -1)
  return vectorize_layer(text), label

In [14]:
train_ds = raw_train_ds.map(vectorize_text)
val_ds = raw_val_ds.map(vectorize_text)
test_ds = raw_test_ds.map(vectorize_text)

### Numerical Bank Data

In [18]:
import pandas as pd
import numpy as np

# import data
bank_data = pd.read_csv("data/matched_data.csv")

#### Split: train, test, validate

In [23]:
numerical_only = bank_data.select_dtypes(include=['float64','int64'])
concerns = bank_data['OPINION_TEXT1','GOING_CONCERN']
X = numerical_only.drop(['status','COMPANY_FKEY','FILE_DATE','fyear'], axis=1)
y = numerical_only['status']

X.head()

Unnamed: 0,cik,current_assets,total_assets,cost_of_goods_sold,total_long_term_debt,depreciation_and_amortization,ebit,ebitda,gross_profit,inventory,total_current_liabilities,net_income,retained_earnings,total_receivables,total_revenue,market_value,total_liabilities,net_sales,total_operating_expenses
0,1750,913.985,1703.727,1408.071,329.802,59.296,137.016,196.312,367.711,507.274,416.01,69.826,467.485,296.489,1775.782,1049.8206,868.438,1775.782,1579.47
1,1750,1063.272,2195.653,1662.408,669.489,80.333,142.36,222.693,412.09,599.752,473.226,67.723,486.582,324.879,2074.498,485.2897,1329.631,2074.498,1851.805
2,1750,1033.7,2136.9,1714.5,622.2,108.6,136.6,245.2,452.6,582.9,389.0,55.0,542.4,315.4,2167.1,790.0029,1217.4,2167.1,1921.9
3,1750,1116.9,2199.5,1581.4,564.3,113.4,142.6,256.0,453.6,632.9,402.1,72.9,616.7,297.9,2035.0,961.308,1198.8,2035.0,1779.0
4,1750,954.1,1515.0,1342.7,85.0,92.3,-8.6,83.7,251.6,566.7,412.0,10.2,603.9,231.1,1594.3,1046.3954,669.9,1594.3,1510.6


In [10]:
# read bank data for bankruptcy model
training = pd.read_csv('data/train_american_bankruptcy.csv')
testing = pd.read_csv('data/test_american_bankruptcy.csv')
validation = pd.read_csv('data/validate_american_bankruptcy.csv')

In [15]:
X_test = testing.drop(['status'], axis=1)

In [24]:
from sklearn.model_selection import train_test_split

# split data into training (70%), validation (15%), and testing (15%) sets 
X_, X_test, y_, y_test = train_test_split(X, y, train_size=0.8, test_size=0.15, random_state=42, shuffle=True)
X_train, X_validate, y_train, y_validate = train_test_split(X_, y_, train_size=0.82, test_size=0.18, random_state=42, shuffle=True)

print("y_train data distribution:\n", y_train.value_counts())
print("y_validate data distribution:\n", y_validate.value_counts())
print("y_test data distribution:\n", y_test.value_counts())

input_shape = (X.shape[1],)

y_train data distribution:
 0    16998
1      639
Name: status, dtype: int64
y_validate data distribution:
 0    3710
1     162
Name: status, dtype: int64
y_test data distribution:
 0    3866
1     168
Name: status, dtype: int64


In [25]:
# Load and preprocess the data
# X_text = preprocess_text(going_concern_report_text)
# X_numeric = preprocess_numeric(numeric_data)
# X_numeric = X_numeric.reshape(1, -1)  # Reshape to match expected input shape of bankruptcy model

# Get the output probabilities from the two models
y_bankruptcy = bankruptcy_model.predict(X_train)
y_going_concern = going_concern_model.predict(raw_train_ds)



In [26]:
y_predict_bank = np.round(y_bankruptcy)# Convert probabilities to binary predictions using a threshold of 0.5

In [28]:
bonkrupt = np.argmax(y_predict_bank, axis=1)

In [None]:
combined_inputs = tf.keras.layers.concatenate([y_predict_bank, output2], axis=1)

In [29]:
# Concatenate the two outputs into a single input vector
X_combined = tf.concat([bonkrupt, y_going_concern], axis=1)
print(X_combined)

# Create a new model to predict the impact of the going-concern report
impact_model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(2,)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Compile the impact model
impact_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Make predictions using the impact model
impact_probability = impact_model.predict(X_combined)

# Print the predicted impact probability
print('Impact probability:', impact_probability[0])

InvalidArgumentError: {{function_node __wrapped__ConcatV2_N_2_device_/job:localhost/replica:0/task:0/device:CPU:0}} ConcatOp : Expected concatenating dimensions in the range [-1, 1), but got 1 [Op:ConcatV2] name: concat

In [37]:
y_bankruptcy

array([[1.00000000e+00, 5.46639637e-38],
       [0.00000000e+00, 1.00000000e+00],
       [7.38122361e-03, 9.92618740e-01],
       ...,
       [1.00000000e+00, 0.00000000e+00],
       [1.00000000e+00, 0.00000000e+00],
       [8.86165738e-01, 1.13834225e-01]], dtype=float32)

0=no 1=yes	
Indicates the auditor's opinion contains an explanatory paragraph regarding the going concern assumption.

Auditors include an explanitory paragraph when they conclude there is substantial doubt in a company as a 'going concern.'

In [36]:
y_going_concern

array([[0.03828335],
       [0.44369203],
       [0.9488943 ],
       ...,
       [0.15050896],
       [0.07220834],
       [0.99812067]], dtype=float32)