In [18]:
from lib.database.database_connector import DatabaseConnector
import json
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from tqdm import tqdm
import nltk

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/cagatay/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
database_connector = DatabaseConnector("./data/database.db")
cursor = database_connector.cursor

In [3]:
train_data = cursor.execute(
    "SELECT sentences, embeddings, label FROM embeddings WHERE filing_date < '2019-01-01';"
).fetchall()

train_data = [(json.loads(sentences), np.array(json.loads(embeddings), dtype=np.float32), label)
              for sentences, embeddings, label in tqdm(train_data, desc="Training")]

test_data = cursor.execute(
    "SELECT embeddings, label FROM embeddings WHERE filing_date >= '2020-01-01';"
).fetchall()

test_data = [(np.array(json.loads(embeddings), dtype=np.float32), label)
             for embeddings, label in tqdm(test_data, desc="Testing")]

Training: 100%|██████████| 6959/6959 [02:38<00:00, 44.03it/s]
Testing: 100%|██████████| 1027/1027 [00:20<00:00, 50.72it/s]


In [4]:
train_flatten = [(sentence, x, y) for sentences_list, x_list, y in train_data for sentence, x in zip(sentences_list, x_list)]
sentences, X_train, y_train = zip(*train_flatten)
X_train = np.array(X_train)
y_train = np.array(y_train)

In [5]:
del train_data

In [6]:
lda = LinearDiscriminantAnalysis(n_components=1, solver='eigen')  # veya 'eigen'
X_proj = lda.fit_transform(X_train, y_train)

In [7]:
proj_score = X_proj  # LDA projection

In [8]:
# Skorları sınıfa göre sıralayıp en ayırt edici 10'ar cümleyi seç
class_0_idx = np.where(y_train == 0)[0]
class_1_idx = np.where(y_train == 1)[0]

# Negatif sınıf cümlelerinde en negatif projeksiyonu olanlar
top_10_neg = sorted(class_0_idx, key=lambda i: proj_score[i])[:10]
top_10_pos = sorted(class_1_idx, key=lambda i: -proj_score[i])[:10]

print("\nTop 10 Discriminative Sentences for Class 0:")
for i in top_10_neg:
    print(f"[Score={float(proj_score[i]):.4f}] {sentences[i]}")

print("\nTop 10 Discriminative Sentences for Class 1:")
for i in top_10_pos:
    print(f"[Score={float(proj_score[i]):.4f}] {sentences[i]}")



Top 10 Discriminative Sentences for Class 0:
[Score=10428.7136] The increase in service revenues in 2012 was primarily driven by strong contract renewals compared to 2011 for certain edge routing, switching and security products.The increase in product revenues in 2011, compared to 2010, was primarily due to an increase in sales of our edge routing and switching products, partially offset by decreases in core routing and high-end firewall products.
[Score=10428.7448] Infrastructure Platforms revenue increased by 2%, or $491 million, with strength across the portfolio with the exception of routing.
[Score=10429.0426] This decrease was partially offset by growth in Strategic services revenues, primarily due to growth in advanced services, such as IP communications, our cloud and data center offerings, contact center solutions, security services and professional services as well as our telematics offerings.
[Score=10429.2064] On Demand: On Demand includes our Oracle On Demand and Advance

  print(f"[Score={float(proj_score[i]):.4f}] {sentences[i]}")
  print(f"[Score={float(proj_score[i]):.4f}] {sentences[i]}")


In [9]:
N = 100_000

top_n_neg = sorted(class_0_idx, key=lambda i: proj_score[i])[:N]
top_n_pos = sorted(class_1_idx, key=lambda i: -proj_score[i])[:N]

# Seçilen en ayırt edici embedding'leri al
X_selected = np.vstack([X_train[top_n_pos], X_train[top_n_neg]])
y_selected = np.array([1]*N + [0]*N)

In [10]:
len(X_train), len(X_selected)

(2201613, 200000)

In [19]:
# Basit bir logistic regression ile sınıflandırıcı kur
# clf = LogisticRegression()
# clf = KNeighborsClassifier(n_neighbors=5)
clf = RandomForestClassifier()
clf.fit(X_selected, y_selected)

In [22]:
y_true = []
y_pred = []

for vec_list, label in tqdm(test_data, desc="Testing"):
    vec_list = np.array(vec_list)

    # Her vektör için probability tahmini: shape = [n_vectors, n_classes]
    probs = clf.predict_proba(vec_list)

    # Sınıf 1 (pozitif) olasılıklarının ortalaması
    mean_prob = np.mean(probs, axis=0)  # shape: [2]

    # En yüksek ortalama olasılığı olan sınıfı seç
    pred_label = np.argmax(mean_prob)

    y_true.append(label)
    y_pred.append(pred_label)


Testing: 100%|██████████| 1027/1027 [00:12<00:00, 83.77it/s]


In [23]:
print("=== Classifier ===")
print("Accuracy:       ", accuracy_score(y_true, y_pred))
print("Precision:      ", precision_score(y_true, y_pred))
print("Recall:         ", recall_score(y_true, y_pred))
print("F1-score:       ", f1_score(y_true, y_pred))
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
print("TP: {:.3f}".format(tp))
print("TN: {:.3f}".format(tn))
print("FP: {:.3f}".format(fp))
print("FN: {:.3f}".format(fn))

=== Classifier ===
Accuracy:        0.19961051606621227
Precision:       0.1411764705882353
Recall:          0.8741721854304636
F1-score:        0.2430939226519337
TP: 132.000
TN: 73.000
FP: 803.000
FN: 19.000


In [None]:
# RF resılts
#
# === Classifier ===
# Accuracy:        0.19961051606621227
# Precision:       0.1411764705882353
# Recall:          0.8741721854304636
# F1-score:        0.2430939226519337
# TP: 132.000
# TN: 73.000
# FP: 803.000
# FN: 19.000