In [1]:
from lib.database.database_connector import DatabaseConnector
import json
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

from tqdm import tqdm


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/cagatay/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
database_connector = DatabaseConnector("./data/database.db")
cursor = database_connector.cursor

split_date = "2020-01-01"

In [3]:
train_data = cursor.execute(
    "SELECT sentences, label FROM embeddings WHERE filing_date < '2020-01-01';"
).fetchall()

test_data = cursor.execute(
    "SELECT sentences, label FROM embeddings WHERE filing_date >= '2020-01-01';"
).fetchall()

In [4]:
train_data = [(json.loads(sentences), label) for sentences, label in train_data]
test_data = [(json.loads(sentences), label) for sentences, label in test_data]

In [5]:
train_data = [(" ".join(texts), label) for texts, label in train_data]
test_data = [(" ".join(texts), label) for texts, label in test_data]

In [6]:
stop_words = set(stopwords.words('english'))
def preprocess(text):
    # Küçük harf, sadece harf ve boşluk
    text = text.lower()
    text = re.sub(r'[^a-z\s]', ' ', text)
    tokens = text.split()
    tokens = [t for t in tokens if t not in stop_words and len(t)>2]
    return ' '.join(tokens)

In [7]:
train_data = [(preprocess(text), label) for text, label in train_data]
test_data = [(preprocess(text), label) for text, label in test_data]

In [8]:
# 5. Bag-of-Words özelliklerini çıkarma
vectorizer = CountVectorizer(
    max_features=768,       # en sık 768 kelimeyi al
    ngram_range=(1,2),       # unigram + bigram
    min_df=5,                # en az 5 dokümanda geçsin
    max_df=0.8               # en fazla %80 dokümanda geçsin
)
X_train = vectorizer.fit_transform([text for text, label in train_data])
y_train = np.array([        label for text, label in train_data])
X_test  = vectorizer.transform([text for text, label in test_data])
y_test  = np.array([        label for text, label in test_data])

In [9]:
# ---- Logistic Regression ----
lr = LogisticRegression(max_iter=1000, n_jobs=-1, random_state=42)
lr.fit(X_train, y_train)
y_pred_lr   = lr.predict(X_test)
y_proba_lr  = lr.predict_proba(X_test)[:,1]

print("=== Logistic Regression ===")
print("Accuracy:       ", accuracy_score(y_test, y_pred_lr))
print("Precision:      ", precision_score(y_test, y_pred_lr))
print("Recall:         ", recall_score(y_test, y_pred_lr))
print("F1-score:       ", f1_score(y_test, y_pred_lr))
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_lr).ravel()
print("TP: {:.3f}".format(tp))
print("TN: {:.3f}".format(tn))
print("FP: {:.3f}".format(fp))
print("FN: {:.3f}".format(fn))

=== Logistic Regression ===
Accuracy:        0.7964946445959105
Precision:       0.23148148148148148
Recall:          0.16556291390728478
F1-score:        0.19305019305019305
TP: 25.000
TN: 793.000
FP: 83.000
FN: 126.000


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [10]:
# feature isimleri ve katsayılar
feature_names = vectorizer.get_feature_names_out()
coefs = lr.coef_[0]

# Pozitif sınıf lehine en güçlü 10 n-gram
top_pos_idx = np.argsort(coefs)[-10:][::-1]
print("Top 10 pozitife dönen n-gram’lar (label=1 lehine):")
for i in top_pos_idx:
    print(f"  {feature_names[i]:<20} coef = {coefs[i]:.4f}")

print()

# Negatif sınıf lehine en güçlü 10 n-gram
top_neg_idx = np.argsort(coefs)[:10]
print("Top 10 negatife dönen n-gram’lar (label=0 lehine):")
for i in top_neg_idx:
    print(f"  {feature_names[i]:<20} coef = {coefs[i]:.4f}")

Top 10 pozitife dönen n-gram’lar (label=1 lehine):
  balance sheet        coef = 0.2770
  estate               coef = 0.1999
  weighted average     coef = 0.1892
  actual results       coef = 0.1867
  equivalents          coef = 0.1647
  either               coef = 0.1369
  directors            coef = 0.1125
  cash flow            coef = 0.1073
  forward looking      coef = 0.0936
  next                 coef = 0.0930

Top 10 negatife dönen n-gram’lar (label=0 lehine):
  sheet                coef = -0.2658
  real estate          coef = -0.2371
  weighted             coef = -0.1636
  board directors      coef = -0.1508
  cash equivalents     coef = -0.1423
  characters           coef = -0.1355
  numbers              coef = -0.1281
  looking              coef = -0.1255
  flow                 coef = -0.1254
  accrued              coef = -0.1013


In [11]:
rf = RandomForestClassifier(
    n_estimators=10,
    max_depth=None,
    random_state=42,
    n_jobs=-1
)
rf.fit(X_train, y_train)
y_pred_rf  = rf.predict(X_test)
y_proba_rf = rf.predict_proba(X_test)[:,1]

print("=== Random Forest Regression ===")
print("Accuracy:       ", accuracy_score(y_test, y_pred_rf))
print("Precision:      ", precision_score(y_test, y_pred_rf))
print("Recall:         ", recall_score(y_test, y_pred_rf))
print("F1-score:       ", f1_score(y_test, y_pred_rf))
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_rf).ravel()
print("TP: {:.3f}".format(tp))
print("TN: {:.3f}".format(tn))
print("FP: {:.3f}".format(fp))
print("FN: {:.3f}".format(fn))

=== Random Forest Regression ===
Accuracy:        0.8364167478091529
Precision:       0.2571428571428571
Recall:          0.059602649006622516
F1-score:        0.0967741935483871
TP: 9.000
TN: 850.000
FP: 26.000
FN: 142.000


In [12]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)
y_proba_knn = knn.predict_proba(X_test)[:,1]

print("=== KNN Regression ===")
print("Accuracy:       ", accuracy_score(y_test, y_pred_knn))
print("Precision:      ", precision_score(y_test, y_pred_knn))
print("Recall:         ", recall_score(y_test, y_pred_knn))
print("F1-score:       ", f1_score(y_test, y_pred_knn))
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_knn).ravel()
print("TP: {:.3f}".format(tp))
print("TN: {:.3f}".format(tn))
print("FP: {:.3f}".format(fp))
print("FN: {:.3f}".format(fn))


=== KNN Regression ===
Accuracy:        0.7867575462512172
Precision:       0.19090909090909092
Recall:          0.1390728476821192
F1-score:        0.16091954022988506
TP: 21.000
TN: 787.000
FP: 89.000
FN: 130.000
