In [2]:
from nltk.tokenize import sent_tokenize

from lib.config.config_loader import ConfigLoader

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier


from tqdm import tqdm

# Load the project configuration via the project-local ConfigLoader; the notebook
# only reads config['models']['input_dim'] from it.
config = ConfigLoader().load_config()
# Register tqdm with pandas so Series/DataFrame gain `progress_apply`.
tqdm.pandas()

In [3]:
# Feature dimension from the config; used as `max_features` of the TF-IDF vectorizer.
input_dim = config['models']['input_dim']

In [4]:
# Read the labelled reports from the processed-data CSV.
df = pd.read_csv('./data/processed/reports_labeled.csv')
# Optional filters for quick experiments (left disabled):
# df = df.loc[:100].copy()
# df = df[df['form'] == '10-K'].copy()
df = df.reset_index(drop=True)
# Split every `mda` text into a list of sentences, showing a tqdm progress bar.
df['sentences'] = df['mda'].progress_apply(sent_tokenize)

# Temporal hold-out: rows with year <= 2019 are used for training, later years for testing.
LAST_TRAIN_YEAR = 2019
train_df = df.loc[df['year'] <= LAST_TRAIN_YEAR].copy()
test_df = df.loc[df['year'] > LAST_TRAIN_YEAR].copy()

# train_df = df.loc[:80].copy().reset_index(drop=True)
# test_df = df.iloc[80:].copy().reset_index(drop=True)

100%|██████████| 13017/13017 [00:36<00:00, 361.33it/s]


In [5]:
# Distinct years present in the full dataset.
df['year'].unique()

array([2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020,
       2021])

In [6]:
# Class balance of the training split (count of rows per label value).
train_df['label'].value_counts()

label
0    9145
1    1874
Name: count, dtype: int64

In [7]:
# Distinct years in the training split (expected: nothing after 2019).
train_df['year'].unique()

array([2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019])

In [8]:
# Class balance of the test split (count of rows per label value).
test_df['label'].value_counts()

label
0    1696
1     302
Name: count, dtype: int64

In [9]:
# Distinct years in the test split (expected: only years after 2019).
test_df['year'].unique()

array([2020, 2021])

In [10]:
# Flatten the per-document sentence lists of the training split into one flat corpus.
train_corpus = []
for doc_sentences in train_df['sentences']:
    train_corpus.extend(doc_sentences)

# Fit TF-IDF on training sentences only; the vocabulary is capped at `input_dim` terms.
# NOTE(review): the vectorizer is fitted on individual sentences but is later applied
# to whole `mda` documents, so the IDF weights are sentence-level — confirm intended.
vectorizer = TfidfVectorizer(stop_words='english', max_features=input_dim).fit(train_corpus)

def get_tfidf_embeddings(sentence_list):
    """Transform text into TF-IDF features using the fitted module-level ``vectorizer``.

    Args:
        sentence_list: Either a single document (``str`` or ``bytes``) or an
            iterable of documents (list, tuple, Series, ...). A single document
            is wrapped in a one-element list before being transformed.

    Returns:
        The result of ``vectorizer.transform`` for the given document(s).
    """
    # Only a bare document needs wrapping. The previous `type(...) == list` test
    # also wrapped tuples and other iterables, which made them look like one document.
    if isinstance(sentence_list, (str, bytes)):
        sentence_list = [sentence_list]
    return vectorizer.transform(sentence_list)

# print("Train Sentence: ")
# train_df['tfidf_sentence'] = train_df['sentences'].progress_apply(get_tfidf_embeddings)
# print("Test Sentence: ")
# test_df['tfidf_sentence'] = test_df['sentences'].progress_apply(get_tfidf_embeddings)

# Vectorise the full `mda` text of every row in both splits and keep the result
# in a new `tfidf_mda` column of the corresponding frame.
for split_name, split_df in (("Train", train_df), ("Test", test_df)):
    print(f"{split_name} MDA: ")
    split_df['tfidf_mda'] = split_df['mda'].progress_apply(get_tfidf_embeddings)


Train MDA: 


100%|██████████| 11019/11019 [00:33<00:00, 324.72it/s]


Test MDA: 


100%|██████████| 1998/1998 [00:05<00:00, 345.56it/s]


In [11]:
# Targets as plain NumPy arrays.
y_train = train_df['label'].to_numpy()
y_test = test_df['label'].to_numpy()

# Each `tfidf_mda` cell is the transform of a single document; densify it and
# take its only row so every sample becomes a 1-D feature vector.
X_train = [doc_matrix.toarray()[0] for doc_matrix in train_df['tfidf_mda']]
X_test = [doc_matrix.toarray()[0] for doc_matrix in test_df['tfidf_mda']]

In [11]:
# Fixed seed so the stochastic learners give the same metrics on every run.
SEED = 42

# Baseline classifiers, keyed by display name; all are trained on the same features.
models = {
    "Gaussian NB": GaussianNB(),
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(random_state=SEED),
    "XGBoost": XGBClassifier(random_state=SEED),
}

# Filled by the training loop: model name -> {'y_pred': ..., 'y_test': ...}.
predictions = {}

In [12]:
# Fit every model on the training features and store its test-set predictions
# together with the ground-truth labels under the model's name.
for name, model in models.items():
    print(f"Training {name}")
    y_pred = model.fit(X_train, y_train).predict(X_test)
    predictions[name] = dict(y_pred=y_pred, y_test=y_test)


Training Gaussian NB
Training Logistic Regression
Training Random Forest
Training XGBoost


In [13]:
def evaluate(y_true, y_pred):
    """Compute binary-classification metrics and confusion-matrix counts.

    Args:
        y_true: Ground-truth labels (0 = negative, 1 = positive).
        y_pred: Predicted labels, same length as ``y_true``.

    Returns:
        Tuple ``(accuracy, precision, recall, f1, tp, tn, fp, fn)``. The four
        scores are rounded to 4 decimals; precision, recall and F1 are reported
        as 0 when undefined (``zero_division=0``).
    """
    accuracy = round(accuracy_score(y_true, y_pred), 4)
    precision = round(precision_score(y_true, y_pred, zero_division=0), 4)
    recall = round(recall_score(y_true, y_pred, zero_division=0), 4)
    f1 = round(f1_score(y_true, y_pred, zero_division=0), 4)

    # Pin the label set so the matrix is always 2x2. Without it, inputs that
    # contain a single class yield a 1x1 matrix and the unpacking below fails.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    return accuracy, precision, recall, f1, tp, tn, fp, fn

In [18]:
# Column order of the final table; the metric columns follow the order of the
# tuple returned by `evaluate`.
metric_columns = ["Accuracy", "Precision", "Recall", "F1", "TP", "TN", "FP", "FN"]
results = {column: [] for column in ["Model", *metric_columns]}

for name, pred_dict in predictions.items():
    print(f"Evaluating {name}")

    metrics = evaluate(pred_dict['y_test'], pred_dict['y_pred'])
    results["Model"].append(name)
    for column, value in zip(metric_columns, metrics):
        results[column].append(value)

    accuracy, precision, recall, f1, tp, tn, fp, fn = metrics
    print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1: {f1} | TP: {tp}, TN: {tn}, FP: {fp}, FN: {fn}")
    print("#"*100)

# One row per model.
results = pd.DataFrame(results)

Evaluating Gaussian NB
Accuracy: 0.5195, Precision: 0.1998, Recall: 0.7252, F1: 0.3133 | TP: 219, TN: 819, FP: 877, FN: 83
####################################################################################################
Evaluating Logistic Regression
Accuracy: 0.8493, Precision: 0.6667, Recall: 0.0066, F1: 0.0131 | TP: 2, TN: 1695, FP: 1, FN: 300
####################################################################################################
Evaluating Random Forest
Accuracy: 0.8544, Precision: 0.6667, Recall: 0.0728, F1: 0.1313 | TP: 22, TN: 1685, FP: 11, FN: 280
####################################################################################################
Evaluating XGBoost
Accuracy: 0.8438, Precision: 0.4242, Recall: 0.0927, F1: 0.1522 | TP: 28, TN: 1658, FP: 38, FN: 274
####################################################################################################


In [19]:
# Display the per-model metrics table.
results

Unnamed: 0,Model,Accuracy,Precision,Recall,F1,TP,TN,FP,FN
0,Gaussian NB,0.5195,0.1998,0.7252,0.3133,219,819,877,83
1,Logistic Regression,0.8493,0.6667,0.0066,0.0131,2,1695,1,300
2,Random Forest,0.8544,0.6667,0.0728,0.1313,22,1685,11,280
3,XGBoost,0.8438,0.4242,0.0927,0.1522,28,1658,38,274


In [20]:
# Persist the metrics table as CSV, without the DataFrame index column.
results.to_csv('./outputs/ml_results.csv', index=False)