In [1]:
from utils import prepare_dataframe, load_embeddings_from_directory_for_df
import time
import csv
import pandas as pd
import torch
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Veri hazırlığı
embedding_directory = 'embeddings/GCB_Pooler_embeddings_patch_before_largewithcodedata_Xlimit3label'
df = prepare_dataframe("merged.csv", False, False)
resulting_label_count = df['labels'].nunique()

# Sadece apache repoları seçiliyor
df = df[df['full_repo_name'].str.startswith('apache', na=False)]

# Label frekanslarını görüntüleme
label_counts = df['labels'].value_counts()
print(label_counts)

# Test için veri setini dengeleme
label_enh = df[df['labels'] == 'enhancement']
label_bug = df[df['labels'] == 'bug']
label_cln = df[df['labels'] == 'clean']

min_samples = min(len(label_enh), len(label_bug), len(label_cln))
sampled_enh = label_enh.sample(n=min_samples, random_state=1)
sampled_bug = label_bug.sample(n=min_samples, random_state=1)
sampled_cln = label_cln.sample(n=min_samples, random_state=1)
df = pd.concat([sampled_enh, sampled_bug, sampled_cln])

# One-hot encoding işlemi
df_encoded_labels = pd.get_dummies(df, columns=['labels'])
df_encoded_repo = pd.get_dummies(df, columns=['full_repo_name'])

# Özellikler ve etiketler
X_repo = df_encoded_repo.drop(columns=['labels', 'patch', 'index']).astype(int)
y = df_encoded_labels.drop(columns=['patch', 'full_repo_name', 'index']).astype(int)

# Tüm embedding ve repo_name_tensor verilerini yükleyip birleştirme
df['label_tensor'] = [torch.tensor(row, dtype=torch.float32) for row in y.values]
df['repo_name_tensor'] = [torch.tensor(row, dtype=torch.float32) for row in X_repo.values]
df = df.drop(columns=['full_repo_name', 'labels', 'patch'])

# Embedding'leri yükleme
df = load_embeddings_from_directory_for_df(directory_path=embedding_directory, df=df)
df['embedding'] = df['embedding'].apply(lambda x: x.numpy() if isinstance(x, torch.Tensor) else x)
df['repo_name_tensor'] = df['repo_name_tensor'].apply(lambda x: x.numpy() if isinstance(x, torch.Tensor) else x)

# Özellikleri birleştirme ve normalize etme
X = np.vstack(df.apply(lambda row: np.concatenate([row['embedding'], row['repo_name_tensor']]), axis=1))
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Label tensor'leri argmax ile sınıf etiketine çevirme
y = df['label_tensor'].apply(lambda x: x.numpy().argmax() if isinstance(x, torch.Tensor) else x).values

# Veriyi eğitim ve test setlerine ayırma
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

# Modelleri tanımlıyoruz
models = [
    ("XGBoost", xgb.XGBClassifier(objective="multi:softmax", num_class=3, eval_metric="mlogloss")),
    ("RandomForest", RandomForestClassifier(n_estimators=100, random_state=42)),
    ("CatBoost", CatBoostClassifier(iterations=100, learning_rate=0.1, depth=6, verbose=0)),
    ("LightGBM", LGBMClassifier(num_class=3)),
    ("AdaBoost", AdaBoostClassifier(n_estimators=100, random_state=42)),
    ("SVM", SVC(decision_function_shape='ovo', kernel='linear', random_state=42)),
    ("KNN", KNeighborsClassifier(n_neighbors=5)),
    ("NaiveBayes", GaussianNB()),
    ("LogisticRegression", LogisticRegression(max_iter=1000, random_state=42))
]

# Modellerin doğruluk, precision, recall ve f1 skorlarını saklamak için bir sözlük
metrics = {}

# Modelleri eğitip tahmin sürelerini hesapla
for model_name, model in models:
    # Modeli eğit
    model.fit(X_train, y_train)

    # Tahmin süresini ölç
    start_time = time.time()
    y_pred = model.predict(X_test)  # Tahmin yap
    end_time = time.time()
    prediction_time = end_time - start_time  # Tahmin süresi

    # Metrikleri hesapla
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')

    # Metrikleri ve tahmin süresini sakla
    metrics[model_name] = {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1,
        "Prediction Time (s)": prediction_time
    }

    # Sonuçları yazdır
    print(f"Metrics for {model_name}:")
    print(f"  Accuracy: {accuracy}")
    print(f"  Precision: {precision}")
    print(f"  Recall: {recall}")
    print(f"  F1 Score: {f1}")
    print(f"  Prediction Time (s): {prediction_time:.4f}")

# Tüm sonuçları görüntüleme
print("\nAll Model Metrics:")
for model_name, model_metrics in metrics.items():
    print(f"{model_name}: {model_metrics}")
    

# Metrikleri CSV dosyasına kaydetme
output_file = "model_metrics.csv"
with open(output_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # Başlıkları yaz
    writer.writerow(["Model Name", "Accuracy", "Precision", "Recall", "F1 Score", "Prediction Time (s)"])
    # Her modelin metriklerini yaz
    for model_name, model_metrics in metrics.items():
        writer.writerow([
            model_name,
            model_metrics["Accuracy"],
            model_metrics["Precision"],
            model_metrics["Recall"],
            model_metrics["F1 Score"],
            model_metrics["Prediction Time (s)"]
        ])

print(f"\nMetrics saved to {output_file}")

labels
enhancement    37586
bug            32864
clean          17263
performance     5392
refactor        4596
Name: count, dtype: int64
labels
enhancement    37586
bug            32864
clean          17263
Name: count, dtype: int64
labels
bug            17934
enhancement    16113
clean          15336
Name: count, dtype: int64


  0%|          | 0/46008 [00:00<?, ?it/s]

Metrics for XGBoost:
  Accuracy: 0.8016735492284286
  Precision: 0.8031687857152253
  Recall: 0.8002905411588283
  F1 Score: 0.8008573382120886
Metrics for RandomForest:
  Accuracy: 0.7663551401869159
  Precision: 0.770280482587046
  Recall: 0.7653462344278464
  F1 Score: 0.7668120306411925
Metrics for CatBoost:
  Accuracy: 0.7475548793740491
  Precision: 0.7582962695078966
  Recall: 0.7453700971733247
  F1 Score: 0.745897571672729
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.094223 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 195990
[LightGBM] [Info] Number of data points in the train set: 36806, number of used features: 818
[LightGBM] [Info] Start training from score -1.091762
[LightGBM] [Info] Start training from score -1.103487
[LightGBM] [Info] Start training from score -1.100625
Metrics for LightGBM:
  Accuracy: 0.8010215170615084
  Precision: 0.8036682091889573
  Recall: 0.7993068517771

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import pickle

# Eğitilmiş modeller ile confusion matrix'i sonradan hesaplama ve görselleştirme
confusion_matrices = {}
for model_name, model in models.items():
    y_pred = model.predict(X_test)  # Test verisi üzerinde tahmin yap
    cm = confusion_matrix(y_test, y_pred)
    confusion_matrices[model_name] = cm  # Confusion matrix'i sakla

# Confusion matrix'leri görselleştirme
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(15, 15))
fig.suptitle("Confusion Matrices for Trained Models", fontsize=20)

for ax, (model_name, cm) in zip(axes.flatten(), confusion_matrices.items()):
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)
    ax.set_title(f"{model_name} Confusion Matrix")
    ax.set_xlabel("Predicted Label")
    ax.set_ylabel("True Label")

plt.tight_layout(rect=[0, 0.03, 1, 0.95])  # Adjust layout to fit titles
plt.show()

AttributeError: 'list' object has no attribute 'items'