In [1]:
from nltk.tokenize import sent_tokenize

from lib.util.custom_data_loader import CustomDataset
from lib.model.graph_classifer import GraphClassifier
from lib.model.sentence_classifer import SentenceClassifier
from lib.config.config_loader import ConfigLoader


import torch
import torch.nn as nn
import torch.optim as optim
from torch_geometric.loader import DataLoader
from torch.nn.utils.rnn import pad_sequence

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from tqdm import tqdm

config = ConfigLoader().load_config()
tqdm.pandas()

In [2]:
input_dim = config['models']['input_dim']
hidden_dim_1 = config['models']['hidden_dim_1']
hidden_dim_2 = config['models']['hidden_dim_2']
hidden_dim_3 = config['models']['hidden_dim_3']

In [3]:
df = pd.read_csv('./data/processed/reports_labeled.csv')
#df = df[df['form'] == '10-K'].copy()
df.reset_index(drop=True, inplace=True)
df['sentences'] = df['mda'].progress_apply(lambda x: sent_tokenize(x))

train_df = df[df['year'] <= 2019].copy()
test_df = df[df['year'] >= 2019].copy()

100%|██████████| 13017/13017 [01:10<00:00, 184.79it/s]


In [None]:
train_corpus = [sentence for sentence in train_df['mda']]

vectorizer = TfidfVectorizer(max_features=input_dim, stop_words='english')
vectorizer.fit(train_corpus)

def get_tfidf_embeddings(sentence_list):
    if not type(sentence_list) == list:
        sentence_list = [sentence_list]
    embeddings = vectorizer.transform(sentence_list)
    return embeddings

print("Train Sentence: ")
train_df['tfidf_sentence'] = train_df['sentences'].progress_apply(get_tfidf_embeddings)
print("Test Sentence: ")
test_df['tfidf_sentence'] = test_df['sentences'].progress_apply(get_tfidf_embeddings)

print("Train MDA: ")
train_df['tfidf_mda'] = train_df['mda'].progress_apply(get_tfidf_embeddings)
print("Test MDA: ")
test_df['tfidf_mda'] = test_df['mda'].progress_apply(get_tfidf_embeddings)


Train Sentence: 


 59%|█████▉    | 6499/11019 [00:49<00:32, 140.96it/s]

In [5]:
print(f"""
Length of training set: {len(train_df)}
Length of test set: {len(test_df)}
""")


Length of training set: 11019
Length of test set: 3326



In [6]:
train_dataset = CustomDataset(train_df)
test_dataset = CustomDataset(test_df)

In [7]:
def custom_collate(batch):
    # Batch içindeki tensor'ları ve etiketleri ayırın
    tensors = [item[0] for item in batch]
    labels = [item[1] for item in batch]
    # pad_sequence ile tüm tensor'ları aynı uzunlukta olacak şekilde doldurun (padding_value isteğe bağlı)
    padded_tensors = pad_sequence(tensors, batch_first=True, padding_value=0)
    return padded_tensors, torch.tensor(labels)

In [8]:
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=custom_collate)
# test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False, collate_fn=custom_collate)


In [9]:
# Model, optimizer ve loss function tanımlanıyor
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#model = GraphClassifier(input_dim=input_dim, hidden_dim_1=hidden_dim_1, hidden_dim_2=hidden_dim_2, lstm_hidden_dim=hidden_dim_3, output_dim=2).to(device)
model = SentenceClassifier(input_dim=input_dim, lstm_hidden_dim=hidden_dim_1, fc1_dim=hidden_dim_2, fc2_dim=hidden_dim_3, output_dim=2).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.0002)
criterion = nn.CrossEntropyLoss()

In [10]:
def evaluate(y_true, y_pred):
    accuracy = round(accuracy_score(y_true, y_pred), 4)
    precision = round(precision_score(y_true, y_pred, zero_division=0), 4)
    recall = round(recall_score(y_true, y_pred, zero_division=0), 4)
    f1 = round(f1_score(y_true, y_pred, zero_division=0), 4)

    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

    return accuracy, precision, recall, f1, tp, tn, fp, fn

def train():
    model.train()
    total_loss = 0
    # train_loader üzerinden geçerken progress bar ekleniyor.
    for data, label in tqdm(train_loader, desc="Training", leave=False):
        data = data.to(device)
        optimizer.zero_grad()
        out = model(data)  # Modelin çıktısı (logits)
        loss = criterion(out, label)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(train_loader)

# Test fonksiyonu
def test(loader):
    model.eval()
    correct = 0

    y_pred = []
    y_true = []

    for data, label in tqdm(loader, desc="Testing", leave=False):
        data = data.to(device)
        with torch.no_grad():
            out = model(data)
            pred = out.argmax(dim=1)  # En yüksek logit değerine sahip sınıf

            y_true.extend(label.cpu().numpy())
            y_pred.extend(pred.cpu().numpy())
    return y_true, y_pred

In [11]:
x = iter(train_loader)
x, y = next(x)

TypeError: stack(): argument 'tensors' (position 1) must be tuple of Tensors, not numpy.ndarray

In [None]:
# Eğitim döngüsü
num_epochs = 10
for epoch in range(1, num_epochs + 1):
    loss = train()
    y_true_test, y_pred_test  = test(test_loader)
    accuracy, precision, recall, f1, tp, tn, fp, fn = evaluate(y_true_test, y_pred_test)

    # if epoch % 5 == 0 or epoch == num_epochs or epoch == 1:
    print(f"Epoch: {epoch:02d}, Loss: {loss:.4f} | Test | Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1: {f1} | TP: {tp}, TN: {tn}, FP: {fp}, FN: {fn}")

In [None]:
result_dict = {
    "Accuracy": [accuracy],
    "Precision": [precision],
    "Recall": [recall],
    "F1": [f1],
    "TP": [tp],
    "TN": [tn],
    "FP": [fp],
    "FN": [fn]
}

result_df = pd.DataFrame(data=result_dict)
result_df.to_csv('./output/SentenceClassifier.csv', index=False)

In [None]:
torch.save(model.state_dict(), f'./output/SentenceClassifier.pth')