In [1]:
from lib.database.database_connector import DatabaseConnector
import json
import numpy as np
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

from tqdm import tqdm

In [2]:
database_connector = DatabaseConnector("./data/database.db")
cursor = database_connector.cursor

split_date = "2020-01-01"

In [3]:
train_data = cursor.execute(
    "SELECT sentences, label FROM embeddings WHERE filing_date < '2020-01-01';"
).fetchall()

test_data = cursor.execute(
    "SELECT sentences, label FROM embeddings WHERE filing_date >= '2020-01-01';"
).fetchall()

In [4]:
train_data = [(json.loads(sentences), label) for sentences, label in train_data]
test_data = [(json.loads(sentences), label) for sentences, label in test_data]

In [5]:
# 1) Word2Vec için tüm cümleleri tokenize edip corpus oluşturun
corpus = []
for dataset in (train_data, test_data):
    for sentences, _ in dataset:
        for s in sentences:
            corpus.append(simple_preprocess(s))

In [6]:
# 2) Word2Vec modelini eğitin
w2v_model = Word2Vec(
    sentences=corpus,
    vector_size=768,   # kelime gömme boyutu
    window=5,          # bağlam penceresi
    min_count=1,       # en az 1 defa geçen kelimeleri alın
    workers=8          # paralel iş parçacığı sayısı
)

In [7]:
# 3) Her örneği (birden fazla cümle) tek vektöre indirgeyen fonksiyon
def vectorize_dataset(data, model):
    X, y = [], []
    for sentences, label in data:
        sent_vecs = []
        for s in sentences:
            tokens = simple_preprocess(s)
            # modele gömmesi olan kelimeleri alın
            vecs = [model.wv[t] for t in tokens if t in model.wv]
            if len(vecs) > 0:
                sent_vecs.append(np.mean(vecs, axis=0))
        # örnek vektörü, cümle vektörlerinin ortalaması
        if len(sent_vecs) > 0:
            X.append(np.mean(sent_vecs, axis=0))
        else:
            X.append(np.zeros(model.vector_size))
        y.append(label)
    return np.vstack(X), np.array(y)

In [8]:
# 4) Eğitim ve test matrislerini oluşturun
X_train, y_train = vectorize_dataset(train_data, w2v_model)
X_test, y_test   = vectorize_dataset(test_data,  w2v_model)

In [9]:
# ---- Logistic Regression ----
lr = LogisticRegression(max_iter=1000, n_jobs=-1, random_state=42)
lr.fit(X_train, y_train)
y_pred_lr   = lr.predict(X_test)
y_proba_lr  = lr.predict_proba(X_test)[:,1]

print("=== Logistic Regression ===")
print("Accuracy:       ", accuracy_score(y_test, y_pred_lr))
print("Precision:      ", precision_score(y_test, y_pred_lr))
print("Recall:         ", recall_score(y_test, y_pred_lr))
print("F1-score:       ", f1_score(y_test, y_pred_lr))
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_lr).ravel()
print("TP: {:.3f}".format(tp))
print("TN: {:.3f}".format(tn))
print("FP: {:.3f}".format(fp))
print("FN: {:.3f}".format(fn))

=== Logistic Regression ===
Accuracy:        0.8529698149951315
Precision:       0.5
Recall:          0.039735099337748346
F1-score:        0.0736196319018405
TP: 6.000
TN: 870.000
FP: 6.000
FN: 145.000


In [10]:
rf = RandomForestClassifier(
    n_estimators=10,
    max_depth=None,
    random_state=42,
    n_jobs=-1
)
rf.fit(X_train, y_train)
y_pred_rf  = rf.predict(X_test)
y_proba_rf = rf.predict_proba(X_test)[:,1]

print("=== Random Forest Regression ===")
print("Accuracy:       ", accuracy_score(y_test, y_pred_rf))
print("Precision:      ", precision_score(y_test, y_pred_rf))
print("Recall:         ", recall_score(y_test, y_pred_rf))
print("F1-score:       ", f1_score(y_test, y_pred_rf))
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_rf).ravel()
print("TP: {:.3f}".format(tp))
print("TN: {:.3f}".format(tn))
print("FP: {:.3f}".format(fp))
print("FN: {:.3f}".format(fn))

=== Random Forest Regression ===
Accuracy:        0.8412852969814996
Precision:       0.3
Recall:          0.059602649006622516
F1-score:        0.09944751381215469
TP: 9.000
TN: 855.000
FP: 21.000
FN: 142.000


In [11]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)
y_proba_knn = knn.predict_proba(X_test)[:,1]

print("=== KNN Regression ===")
print("Accuracy:       ", accuracy_score(y_test, y_pred_knn))
print("Precision:      ", precision_score(y_test, y_pred_knn))
print("Recall:         ", recall_score(y_test, y_pred_knn))
print("F1-score:       ", f1_score(y_test, y_pred_knn))
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_knn).ravel()
print("TP: {:.3f}".format(tp))
print("TN: {:.3f}".format(tn))
print("FP: {:.3f}".format(fp))
print("FN: {:.3f}".format(fn))


=== KNN Regression ===
Accuracy:        0.7877312560856865
Precision:       0.2184873949579832
Recall:          0.17218543046357615
F1-score:        0.1925925925925926
TP: 26.000
TN: 783.000
FP: 93.000
FN: 125.000
