In [1]:
from lib.database.database_connector import DatabaseConnector
import json
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

from tqdm import tqdm


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/cagatay/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
# Date used as the train/test boundary; it matches the test-set cut-off in the query cell.
split_date = "2020-01-01"

# Open the project database and keep its cursor for the queries that follow.
database_connector = DatabaseConnector("./data/database.db")
cursor = database_connector.cursor

In [22]:
# Chronological split of the `embeddings` table on `filing_date`.
# The cut-offs are bound as query parameters rather than repeated as literals
# inside the SQL, so the test boundary follows `split_date` instead of a copy of it.
# NOTE(review): rows with train_end_date <= filing_date < split_date land in
# neither set - confirm that this gap is intentional.
train_end_date = "2019-01-01"

# Training rows: everything filed strictly before train_end_date.
train_data = cursor.execute(
    "SELECT sentences, label FROM embeddings WHERE filing_date < ?;",
    (train_end_date,),
).fetchall()

# Test rows: everything filed on or after split_date.
test_data = cursor.execute(
    "SELECT sentences, label FROM embeddings WHERE filing_date >= ?;",
    (split_date,),
).fetchall()

In [23]:
# Decode the JSON-encoded `sentences` column of every row; labels pass through unchanged.
# This rebinds train_data/test_data, so the cell is meant to run once after the query cell.
train_data = [(json.loads(raw_json), target) for raw_json, target in train_data]
test_data = [(json.loads(raw_json), target) for raw_json, target in test_data]

In [24]:
# Collapse each document's decoded sentences into one space-separated string.
train_data = [(" ".join(sentence_list), target) for sentence_list, target in train_data]
test_data = [(" ".join(sentence_list), target) for sentence_list, target in test_data]

In [25]:
# NLTK's English stop-word list, kept as a set for fast membership tests in preprocess().
stop_words = set(stopwords.words("english"))
def preprocess(text, stop_word_set=None, min_token_len=3):
    """Normalise raw text into a space-separated string of content tokens.

    The text is lower-cased, every character other than ASCII ``a-z`` and
    whitespace is replaced by a space, and the result is split on whitespace.
    Tokens that are stop words or shorter than ``min_token_len`` are dropped.

    Args:
        text: Input string.
        stop_word_set: Container of tokens to discard. When omitted, the
            module-level ``stop_words`` is used (looked up at call time).
        min_token_len: Minimum number of characters a token needs in order to
            be kept. The default of 3 matches the original ``len(t) > 2`` rule.

    Returns:
        The surviving tokens joined by single spaces; ``""`` if none survive.
    """
    if stop_word_set is None:
        stop_word_set = stop_words
    # Lower-case first so the a-z character class below also covers capitals.
    cleaned = re.sub(r'[^a-z\s]', ' ', text.lower())
    kept = [
        token
        for token in cleaned.split()
        if len(token) >= min_token_len and token not in stop_word_set
    ]
    return ' '.join(kept)

In [26]:
# Run every document through preprocess(); labels are carried along unchanged.
train_data = [(preprocess(document), target) for document, target in train_data]
test_data = [(preprocess(document), target) for document, target in test_data]

In [27]:
# 5. Extract bag-of-words features
vectorizer = CountVectorizer(
    max_features=768,    # keep the 768 most frequent terms
    ngram_range=(1, 2),  # unigrams + bigrams
    min_df=5,            # term must appear in at least 5 documents
    max_df=0.8,          # term may appear in at most 80% of documents
)

# Separate the (text, label) pairs into texts and labels for each split.
train_texts = [text for text, _ in train_data]
test_texts = [text for text, _ in test_data]

# The vocabulary is learned from the training texts only and then reused on the test texts.
X_train = vectorizer.fit_transform(train_texts)
y_train = np.array([label for _, label in train_data])
X_test = vectorizer.transform(test_texts)
y_test = np.array([label for _, label in test_data])

In [30]:
# ---- Logistic Regression ----
# Fit on the training bag-of-words counts, then score the held-out split.
lr = LogisticRegression(max_iter=1000, n_jobs=-1, random_state=42)
lr.fit(X_train, y_train)
y_pred_lr   = lr.predict(X_test)
# Column 1 of predict_proba is the probability of lr.classes_[1].
y_proba_lr  = lr.predict_proba(X_test)[:,1]

print("=== Logistic Regression ===")
print("Accuracy:       ", accuracy_score(y_test, y_pred_lr))
print("Precision:      ", precision_score(y_test, y_pred_lr))
print("Recall:         ", recall_score(y_test, y_pred_lr))
print("F1-score:       ", f1_score(y_test, y_pred_lr))
# Pin the label order so the matrix is always 2x2; without `labels`, a split
# in which one class never occurs yields a smaller matrix and the four-way
# unpacking below raises. Assumes labels are encoded as 0/1, as the
# "label=0"/"label=1" wording in the coefficient cell indicates.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_lr, labels=[0, 1]).ravel()
print("TP: {:.3f}".format(tp))
print("TN: {:.3f}".format(tn))
print("FP: {:.3f}".format(fp))
print("FN: {:.3f}".format(fn))

=== Logistic Regression ===
Accuracy:        0.7702044790652386
Precision:       0.1897810218978102
Recall:          0.17218543046357615
F1-score:        0.18055555555555555
TP: 26.000
TN: 765.000
FP: 111.000
FN: 125.000


In [42]:
# Feature names and their logistic-regression coefficients
feature_names = vectorizer.get_feature_names_out()
coefs = lr.coef_[0]

# Sort the coefficients once (ascending) and take both ends of the ordering.
coef_order = np.argsort(coefs)
top_pos_idx = coef_order[-10:][::-1]  # 10 largest, strongest first
top_neg_idx = coef_order[:10]         # 10 smallest, most negative first

# Pad names to the longest displayed n-gram so the "coef =" column stays
# aligned; never narrower than the previous fixed width of 20.
name_width = max(
    20,
    max((len(feature_names[i]) for i in np.concatenate([top_pos_idx, top_neg_idx])), default=0),
)

# NOTE(review): with ngram_range=(1, 2) a bigram and the unigrams inside it are
# both features, so their coefficients can offset each other (the recorded
# output shows 'balance sheet' positive and 'sheet' negative at similar
# magnitude) - read individual coefficients with care.

# 10 strongest n-grams in favour of the positive class
print("Top 10 pozitife dönen n-gram’lar (label=1 lehine):")
for i in top_pos_idx:
    print(f"  {feature_names[i]:<{name_width}} coef = {coefs[i]:.4f}")

print()

# 10 strongest n-grams in favour of the negative class
print("Top 10 negatife dönen n-gram’lar (label=0 lehine):")
for i in top_neg_idx:
    print(f"  {feature_names[i]:<{name_width}} coef = {coefs[i]:.4f}")

Top 10 pozitife dönen n-gram’lar (label=1 lehine):
  balance sheet        coef = 0.2551
  estate               coef = 0.2311
  actual results       coef = 0.2290
  weighted average     coef = 0.1677
  forward looking      coef = 0.1587
  directors            coef = 0.1556
  revolving credit     coef = 0.1434
  discontinued         coef = 0.1338
  equivalents          coef = 0.1230
  next                 coef = 0.1202

Top 10 negatife dönen n-gram’lar (label=0 lehine):
  sheet                coef = -0.2445
  real estate          coef = -0.2337
  board directors      coef = -0.2184
  looking              coef = -0.1659
  numbers              coef = -0.1627
  weighted             coef = -0.1518
  flow                 coef = -0.1478
  working capital      coef = -0.1465
  discontinued operations coef = -0.1331
  characters           coef = -0.1308


In [37]:
# ---- Random Forest ----
# A 10-tree forest with no depth limit, fitted on the same features as the
# logistic regression above.
rf = RandomForestClassifier(
    n_estimators=10,
    max_depth=None,
    random_state=42,
    n_jobs=-1,
)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
# Column 1 of predict_proba is the probability of rf.classes_[1].
y_proba_rf = rf.predict_proba(X_test)[:, 1]

print("=== Random Forest ===")
# Report the metrics in a fixed order, one per line.
for caption, scorer in (
    ("Accuracy:       ", accuracy_score),
    ("Precision:      ", precision_score),
    ("Recall:         ", recall_score),
    ("F1-score:       ", f1_score),
):
    print(caption, scorer(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))

=== Random Forest ===
Accuracy:        0.8422590068159689
Precision:       0.28
Recall:          0.046357615894039736
F1-score:        0.07954545454545454
Confusion Matrix:
 [[858  18]
 [144   7]]
