In [1]:
from lib.database.database_connector import DatabaseConnector
import json
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

from tqdm import tqdm


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

import re
import nltk

nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/cagatay/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Date that separates training filings from test filings (ISO "YYYY-MM-DD" string).
split_date = "2020-01-01"

# Open the project database and keep its cursor around for the queries in later cells.
database_connector = DatabaseConnector("./data/database.db")
cursor = database_connector.cursor

In [3]:
# Temporal train/test split on filing_date: rows strictly before split_date
# are used for training, rows on or after it for evaluation.
#
# The cutoff is bound as a query parameter instead of being repeated as a
# literal, so changing split_date in the configuration cell actually moves
# the split (previously split_date was defined but never used).
#
# NOTE(review): "?" is the sqlite3 "qmark" placeholder style; this assumes
# DatabaseConnector.cursor is a sqlite3 cursor (suggested by the .db path and
# by execute() being chainable) -- confirm against DatabaseConnector.
train_data = cursor.execute(
    "SELECT embeddings, label FROM embeddings WHERE filing_date < ?;",
    (split_date,),
).fetchall()

test_data = cursor.execute(
    "SELECT embeddings, label FROM embeddings WHERE filing_date >= ?;",
    (split_date,),
).fetchall()

In [None]:
def _mean_pool(rows, desc):
    """Reduce each row's JSON-encoded embeddings to their mean along axis 0.

    Args:
        rows: iterable of ``(embeddings_json, label)`` pairs, where
            ``embeddings_json`` is a JSON document that ``json.loads`` can parse.
        desc: caption shown on the tqdm progress bar.

    Returns:
        A list of ``(np.mean(parsed_embeddings, axis=0), label)`` tuples, in
        the same order as ``rows``.
    """
    pooled = []
    for raw_embeddings, label in tqdm(rows, desc=desc):
        vectors = json.loads(raw_embeddings)
        pooled.append((np.mean(vectors, axis=0), label))
    return pooled


# NOTE: both names are overwritten with the pooled result, so this cell can
# only be run once per query result (a second run would hand arrays, not JSON
# strings, to json.loads).
train_data = _mean_pool(train_data, "Training")
test_data = _mean_pool(test_data, "Testing")

In [None]:
# Split the (feature_vector, label) pairs into parallel tuples of features
# and labels for scikit-learn.
#
# zip(*[]) yields nothing, so an empty split would otherwise fail with the
# opaque "not enough values to unpack". Raise the same exception type with a
# message that points at the actual cause instead.
if not train_data or not test_data:
    raise ValueError(
        "Empty split: got {} training rows and {} test rows; "
        "check the filing_date cutoff and the contents of the embeddings table.".format(
            len(train_data), len(test_data)
        )
    )

X_train, y_train = zip(*train_data)
X_test, y_test = zip(*test_data)

In [None]:
# Logistic-regression baseline: fit on the training split, score the test split.
lr = LogisticRegression(max_iter=1000, n_jobs=-1, random_state=42)
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
# Keep only column 1 of the probability matrix (the second entry of lr.classes_).
y_proba_lr = lr.predict_proba(X_test)[:, 1]

print("=== Logistic Regression ===")
for caption, metric in (
    ("Accuracy:       ", accuracy_score),
    ("Precision:      ", precision_score),
    ("Recall:         ", recall_score),
    ("F1-score:       ", f1_score),
):
    print(caption, metric(y_test, y_pred_lr))

# Unpacking four values requires a 2x2 (binary) confusion matrix; for binary
# labels scikit-learn documents the flattened order as tn, fp, fn, tp.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_lr).ravel()
for template, count in (
    ("TP: {:.3f}", tp),
    ("TN: {:.3f}", tn),
    ("FP: {:.3f}", fp),
    ("FN: {:.3f}", fn),
):
    print(template.format(count))

In [None]:
# Random-forest baseline: fit on the training split, score the test split.
rf = RandomForestClassifier(
    n_estimators=10,   # small forest of 10 trees
    max_depth=None,    # no explicit depth limit on the individual trees
    random_state=42,   # fixed seed so reruns give the same forest
    n_jobs=-1          # use all available cores
)
rf.fit(X_train, y_train)
y_pred_rf  = rf.predict(X_test)
# Keep only column 1 of the probability matrix (the second entry of rf.classes_).
y_proba_rf = rf.predict_proba(X_test)[:,1]

# The model is a classifier, so the heading says "Classifier" (it previously
# read "Regression"), matching the heading style of the KNN cell.
print("=== Random Forest Classifier ===")
print("Accuracy:       ", accuracy_score(y_test, y_pred_rf))
print("Precision:      ", precision_score(y_test, y_pred_rf))
print("Recall:         ", recall_score(y_test, y_pred_rf))
print("F1-score:       ", f1_score(y_test, y_pred_rf))
# Unpacking four values requires a 2x2 (binary) confusion matrix; for binary
# labels scikit-learn documents the flattened order as tn, fp, fn, tp.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_rf).ravel()
print("TP: {:.3f}".format(tp))
print("TN: {:.3f}".format(tn))
print("FP: {:.3f}".format(fp))
print("FN: {:.3f}".format(fn))

In [None]:
# k-nearest-neighbours baseline (k = 3): fit on the training split, score the test split.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)
# Keep only column 1 of the probability matrix (the second entry of knn.classes_).
y_proba_knn = knn.predict_proba(X_test)[:, 1]

print("=== KNN Classifier ===")
knn_metrics = (
    ("Accuracy:       ", accuracy_score),
    ("Precision:      ", precision_score),
    ("Recall:         ", recall_score),
    ("F1-score:       ", f1_score),
)
for caption, metric in knn_metrics:
    print(caption, metric(y_test, y_pred_knn))

# Unpacking four values requires a 2x2 (binary) confusion matrix; for binary
# labels scikit-learn documents the flattened order as tn, fp, fn, tp.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_knn).ravel()
knn_counts = (
    ("TP: {:.3f}", tp),
    ("TN: {:.3f}", tn),
    ("FP: {:.3f}", fp),
    ("FN: {:.3f}", fn),
)
for template, count in knn_counts:
    print(template.format(count))