In [1]:
from lib.database.database_connector import DatabaseConnector
import json
import numpy as np
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

from tqdm import tqdm

In [2]:
# Date separating the training period from the test period ("YYYY-MM-DD").
split_date = "2020-01-01"

# Project database handle; `cursor` is the object the queries are executed on.
database_connector = DatabaseConnector("./data/database.db")
cursor = database_connector.cursor

In [3]:
# Temporal split: rows filed before `split_date` become the training set and
# rows filed on or after it become the test set. The cut-off is bound as a
# query parameter so both queries always use the one value held in
# `split_date` instead of two hard-coded copies of the date.
# NOTE(review): assumes `cursor` accepts sqlite3-style "?" placeholders — confirm.
train_data = cursor.execute(
    "SELECT sentences, label FROM embeddings WHERE filing_date < ?;",
    (split_date,),
).fetchall()

test_data = cursor.execute(
    "SELECT sentences, label FROM embeddings WHERE filing_date >= ?;",
    (split_date,),
).fetchall()

In [5]:
# Sum of the training labels (second field of every row).
# NOTE(review): this equals the positive-class count only if labels are 0/1 — confirm.
num_merge = sum(label for _, label in train_data)

In [6]:
# Show the label sum computed for the training rows.
num_merge

1339

In [7]:
# Remaining training rows: total row count minus the label sum.
num_not_merge = len(train_data) - num_merge

In [8]:
# Show the number of training rows not counted in `num_merge`.
num_not_merge

6539

In [4]:
# train_data = train_data[:100]
# test_data = test_data[:10]

In [5]:
def _parse_sentences(raw):
    """Decode a JSON-encoded sentence list, passing through already-decoded values.

    This cell overwrites `train_data` / `test_data` in place, so running it a
    second time would otherwise hand an already-decoded list to `json.loads`
    and raise `TypeError`. Only raw JSON text (str/bytes/bytearray) is decoded.
    """
    if isinstance(raw, (str, bytes, bytearray)):
        return json.loads(raw)
    return raw


# Replace the JSON text in each (sentences, label) row with the decoded object.
train_data = [(_parse_sentences(sentences), label) for sentences, label in train_data]
test_data = [(_parse_sentences(sentences), label) for sentences, label in test_data]

In [6]:
# 1) Tokenize every sentence to build the Word2Vec training corpus.
# NOTE(review): the corpus includes the test split's sentences as well as the
# training split's, so the embeddings see test text — confirm this is intended.
corpus = [
    simple_preprocess(sentence)
    for split in (train_data, test_data)
    for sentence_list, _label in split
    for sentence in sentence_list
]

In [7]:
# 2) Train the Word2Vec model on the tokenized corpus.
# NOTE(review): with workers > 1, gensim training is not exactly reproducible
# from run to run (thread scheduling) — confirm whether that matters here.
w2v_model = Word2Vec(
    sentences=corpus,
    vector_size=768,   # word embedding dimensionality
    window=5,          # context window size
    min_count=1,       # keep words that occur at least once
    workers=8          # number of parallel worker threads
)

In [8]:
# 3) Reduce every sample (a list of sentences) to a single feature vector.
def vectorize_dataset(data, model):
    """Turn (sentences, label) samples into a feature matrix and a label array.

    Each sentence is tokenized with `simple_preprocess` and represented by the
    mean of the vectors of its tokens that are present in `model.wv`; a sample
    is the mean of its sentence vectors.

    Args:
        data: Iterable of `(sentences, label)` pairs, where `sentences` is an
            iterable of strings.
        model: Object exposing `wv` (supports `token in wv` and `wv[token]`)
            and `vector_size`, e.g. the Word2Vec model trained above.

    Returns:
        `(X, y)`: `X` has one row per sample and `model.vector_size` columns,
        `y` holds the labels in the same order. For empty `data`, `X` has
        shape `(0, model.vector_size)` and `y` is empty.
    """
    X, y = [], []
    for sentences, label in data:
        sent_vecs = []
        for s in sentences:
            tokens = simple_preprocess(s)
            # Keep only the tokens the model has an embedding for.
            vecs = [model.wv[t] for t in tokens if t in model.wv]
            if vecs:
                sent_vecs.append(np.mean(vecs, axis=0))
        if sent_vecs:
            # Sample vector = mean of its sentence vectors.
            X.append(np.mean(sent_vecs, axis=0))
        else:
            # No known token in any sentence: fall back to a zero vector so
            # the sample still gets a row aligned with its label.
            X.append(np.zeros(model.vector_size))
        y.append(label)
    if not X:
        # np.vstack raises ValueError on an empty list; return an empty matrix
        # with the right number of columns instead.
        return np.empty((0, model.vector_size)), np.array(y)
    return np.vstack(X), np.array(y)

In [9]:
# 4) Build the training and test feature matrices with the trained embeddings.
X_train, y_train = vectorize_dataset(data=train_data, model=w2v_model)
X_test, y_test = vectorize_dataset(data=test_data, model=w2v_model)

In [10]:
# 5) Fit a plain logistic-regression classifier and predict on the test set.
# `fit` returns the estimator itself, so construction and fitting are chained.
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [11]:
# Headline classification metrics for the current `y_pred`.
acc, prec, rec, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Flatten the confusion matrix into its four cells.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

In [12]:
# Report the metrics, one per line, rounded to three decimals.
print(
    "Accuracy: {:.3f}".format(acc),
    "Precision: {:.3f}".format(prec),
    "Recall: {:.3f}".format(rec),
    "F1: {:.3f}".format(f1),
    sep="\n",
)

Accuracy: 0.851
Precision: 0.429
Recall: 0.040
F1: 0.073


In [13]:
# Report the confusion-matrix cells, one per line.
print(
    "TP: {:.3f}".format(tp),
    "TN: {:.3f}".format(tn),
    "FP: {:.3f}".format(fp),
    "FN: {:.3f}".format(fn),
    sep="\n",
)

TP: 6.000
TN: 868.000
FP: 8.000
FN: 145.000


In [14]:
from sklearn.ensemble import RandomForestClassifier

In [15]:
# Random-forest baseline on the same features. `random_state` is pinned so
# that re-running the notebook reproduces the same forest and the same scores;
# without it every run draws different bootstrap samples and feature subsets.
clf = RandomForestClassifier(n_estimators=200, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [16]:
# Recompute the headline metrics for the random-forest predictions.
acc, prec, rec, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Flatten the confusion matrix into its four cells.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

In [17]:
# Report the random-forest metrics, one per line, rounded to three decimals.
print(
    "Accuracy: {:.3f}".format(acc),
    "Precision: {:.3f}".format(prec),
    "Recall: {:.3f}".format(rec),
    "F1: {:.3f}".format(f1),
    sep="\n",
)

Accuracy: 0.852
Precision: 0.467
Recall: 0.046
F1: 0.084


In [18]:
# Report the random-forest confusion-matrix cells, one per line.
print(
    "TP: {:.3f}".format(tp),
    "TN: {:.3f}".format(tn),
    "FP: {:.3f}".format(fp),
    "FN: {:.3f}".format(fn),
    sep="\n",
)

TP: 7.000
TN: 868.000
FP: 8.000
FN: 144.000
