In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader, TensorDataset

from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.utils import simple_preprocess
from nltk.tokenize import word_tokenize

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Veri setini yükleyin (örnek: emails.csv)
df = pd.read_csv('Data/enron4.csv')  # CSV dosyasını oku

# Eğitim, doğrulama ve test setlerine ayırma
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    df['text'], df['label'], test_size=0.2, random_state=42
)

val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts, temp_labels, test_size=0.5, random_state=42
)


In [5]:
def distil_bert_embedding(data: list):
    print("distil-bert-running")
    embedder = SentenceTransformer("distilbert-base-nli-mean-tokens")
    res = embedder.encode(data)
    res = [i for  i in res] 
    return res

In [6]:
# TF-IDF vektörleştiricisini oluştur
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # 5000 kelimeye kadar vektörleştirme
X_train = distil_bert_embedding(train_texts.tolist())
X_val = distil_bert_embedding(val_texts.tolist())
X_test = distil_bert_embedding(test_texts.tolist())


y_train_label = distil_bert_embedding(train_labels.tolist())
y_val_label = distil_bert_embedding(val_labels.tolist())
y_test_label = distil_bert_embedding(test_labels.tolist())


distil-bert-running
distil-bert-running
distil-bert-running
distil-bert-running
distil-bert-running
distil-bert-running


In [7]:
class ANNModel(nn.Module):
    def __init__(self, input_dim):
        super(ANNModel, self).__init__()
        
        # Gizli katmanlar
        self.fc1 = nn.Linear(input_dim, 512)  
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(0.25)  

        self.fc2 = nn.Linear(512, 256) 
        self.relu2 = nn.ReLU()

        self.fc3 = nn.Linear(256, 64)  
        self.relu3 = nn.ReLU()
        self.dropout2 = nn.Dropout(0.25) 

        self.fc4 = nn.Linear(64, 16) 
        self.relu4 = nn.ReLU()

        self.fc5 = nn.Linear(16, 1)  
        self.sigmoid = nn.Sigmoid()  

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.dropout1(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.fc3(x)
        x = self.relu3(x)
        x = self.dropout2(x)
        x = self.fc4(x)
        x = self.relu4(x)
        x = self.fc5(x)
        x = self.sigmoid(x)
        return x


In [8]:
# One-hot encoded hedefleri düz hale getirin
y_train_label = np.argmax(y_train_label, axis=1)
y_val_label = np.argmax(y_val_label, axis=1)
y_test_label = np.argmax(y_test_label, axis=1)


In [9]:
X_train = torch.tensor(X_train, dtype=torch.float32)
X_val = torch.tensor(X_val, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)

y_train_label = torch.tensor(y_train_label, dtype=torch.float32)
y_val_label = torch.tensor(y_val_label, dtype=torch.float32)
y_test_label = torch.tensor(y_test_label, dtype=torch.float32)

  X_train = torch.tensor(X_train, dtype=torch.float32)


In [10]:
# PyTorch TensorDataset oluştur
train_data = TensorDataset(X_train, y_train_label)
val_data = TensorDataset(X_val, y_val_label)
test_data = TensorDataset(X_test, y_test_label)

# DataLoader ile batch işlemi
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
val_loader = DataLoader(val_data, batch_size=32, shuffle=False)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False)


In [11]:
import torch
import torch.nn as nn
import torch.optim as optim

# Cihazı belirle (GPU var mı kontrol et)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Modeli oluştur ve GPU'ya taşı
model = ANNModel(input_dim=X_train.shape[1]).to(device)

# Loss ve optimizer
criterion = nn.CrossEntropyLoss()  # Binary Cross-Entropy loss
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Eğitim fonksiyonu
def train_model(model, train_loader, val_loader, epochs=10):
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        correct_preds = 0
        total_preds = 0

        for inputs, labels in train_loader:
            # Verileri GPU'ya taşı
            inputs, labels = inputs.to(device).float(), labels.to(device).float()
            
            if labels.dim() == 0:  # Eğer boyutsuzsa
                labels = labels.unsqueeze(0)
            labels = labels.view(-1)  # Etiketlerin boyutunu düzleştir

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)  # Loss hesapla
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            predicted = (outputs > 0.5).float()
            correct_preds += (predicted == labels).sum().item()
            total_preds += labels.size(0)

        train_accuracy = correct_preds / total_preds
        avg_loss = running_loss / len(train_loader)

        # Validation
        model.eval()
        val_correct_preds = 0
        val_total_preds = 0
        val_running_loss = 0.0

        with torch.no_grad():
            for inputs, labels in val_loader:
                # Verileri GPU'ya taşı
                inputs, labels = inputs.to(device).float(), labels.to(device).float()
                labels = labels.view(-1)  # Etiketlerin boyutunu düzleştir

                outputs = model(inputs)
                loss = criterion(outputs, labels)
                val_running_loss += loss.item()

                predicted = (outputs > 0.5).float()
                val_correct_preds += (predicted == labels).sum().item()
                val_total_preds += labels.size(0)

        val_accuracy = val_correct_preds / val_total_preds
        val_avg_loss = val_running_loss / len(val_loader)

        print(f"Epoch [{epoch+1}/{epochs}], "
              f"Train Loss: {avg_loss:.4f}, Train Acc: {train_accuracy*100:.2f}%, "
              f"Val Loss: {val_avg_loss:.4f}, Val Acc: {val_accuracy*100:.2f}%")

# Eğitim başlat
train_model(model, train_data, val_data)


Epoch [1/10], Train Loss: 0.0000, Train Acc: 0.00%, Val Loss: 0.0000, Val Acc: 0.00%
Epoch [2/10], Train Loss: 0.0000, Train Acc: 0.00%, Val Loss: 0.0000, Val Acc: 0.00%
Epoch [3/10], Train Loss: 0.0000, Train Acc: 0.00%, Val Loss: 0.0000, Val Acc: 0.00%
Epoch [4/10], Train Loss: 0.0000, Train Acc: 0.00%, Val Loss: 0.0000, Val Acc: 0.00%
Epoch [5/10], Train Loss: 0.0000, Train Acc: 0.00%, Val Loss: 0.0000, Val Acc: 0.00%
Epoch [6/10], Train Loss: 0.0000, Train Acc: 0.00%, Val Loss: 0.0000, Val Acc: 0.00%
Epoch [7/10], Train Loss: 0.0000, Train Acc: 0.00%, Val Loss: 0.0000, Val Acc: 0.00%
Epoch [8/10], Train Loss: 0.0000, Train Acc: 0.00%, Val Loss: 0.0000, Val Acc: 0.00%
Epoch [9/10], Train Loss: 0.0000, Train Acc: 0.00%, Val Loss: 0.0000, Val Acc: 0.00%
Epoch [10/10], Train Loss: 0.0000, Train Acc: 0.00%, Val Loss: 0.0000, Val Acc: 0.00%


In [13]:
import torch
from sklearn.metrics import classification_report

# Cihazı belirle (GPU var mı kontrol et)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Test fonksiyonu
def evaluate_model(model, test_loader):
    model.eval()  # Modeli değerlendirme moduna al
    correct_preds = 0
    total_preds = 0

    # Modeli GPU'ya taşı
    model = model.to(device)

    with torch.no_grad():
        for inputs, labels in test_loader:
            # Verileri GPU'ya taşı
            inputs, labels = inputs.to(device).float(), labels.to(device).float()

            # Modelin çıktısını al
            outputs = model(inputs).squeeze()

            # Tahminler
            predicted = (outputs > 0.5).float()
            correct_preds += (predicted == labels).sum().item()
            total_preds += labels.size(0)

    test_accuracy = correct_preds / total_preds
    print(f"Test Accuracy: {test_accuracy*100:.2f}%")

    # Sınıflandırma raporu
    y_true = []
    y_pred = []
    with torch.no_grad():
        for inputs, labels in test_loader:
            # Verileri GPU'ya taşı
            inputs, labels = inputs.to(device).float(), labels.to(device).float()

            # Modelin çıktısını al
            outputs = model(inputs).squeeze()

            # Tahminleri topla
            predicted = (outputs > 0.5).float()
            y_true.extend(labels.cpu().numpy())
            y_pred.extend(predicted.cpu().numpy())

    print("\nTest Classification Report:")
    print(classification_report(y_true, y_pred))

# Test üzerinde modelin başarısını ölç
evaluate_model(model, test_loader)


Test Accuracy: 0.00%

Test Classification Report:
              precision    recall  f1-score   support

         1.0       0.00      0.00      0.00       0.0
        70.0       0.00      0.00      0.00     448.0
       445.0       0.00      0.00      0.00     152.0

    accuracy                           0.00     600.0
   macro avg       0.00      0.00      0.00     600.0
weighted avg       0.00      0.00      0.00     600.0



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [14]:
# Test üzerinde modelin başarısını ölç
evaluate_model(model, test_loader)


Test Accuracy: 0.00%

Test Classification Report:
              precision    recall  f1-score   support

         1.0       0.00      0.00      0.00       0.0
        70.0       0.00      0.00      0.00     448.0
       445.0       0.00      0.00      0.00     152.0

    accuracy                           0.00     600.0
   macro avg       0.00      0.00      0.00     600.0
weighted avg       0.00      0.00      0.00     600.0



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [18]:
torch.save(model.state_dict(), "model_weights_larger.pth")