In [None]:
import numpy as np
import pandas as pd
import os
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, recall_score, f1_score, make_scorer
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from imblearn.over_sampling import SMOTE, BorderlineSMOTE
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_validate, RepeatedStratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from imblearn.pipeline import Pipeline

In [103]:
folder_path = "dataPROMISE/"

# Only the CSV datasets are relevant; anything else in the folder (checkpoints,
# hidden files, ...) would otherwise be treated as a dataset further down.
files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]

# NOTE(review): the prefix is empty, so this matches every CSV file, not only
# the "ant" releases - confirm whether startswith('ant') was intended.
ant_files = sorted([f for f in files if f.startswith('')])

# os.listdir returns entries in arbitrary order. The cross-version experiments
# train on release i and test on release i + 1, so the list must be sorted for
# consecutive entries of a project to be consecutive releases.
file_list = sorted(files)
print(file_list)

['ant-1.3.csv', 'ant-1.4.csv', 'ant-1.5.csv', 'ant-1.6.csv', 'ant-1.7.csv', 'arc.csv', 'camel-1.0.csv', 'camel-1.2.csv', 'camel-1.4.csv', 'camel-1.6.csv', 'ivy-1.0.csv', 'ivy-1.1.csv', 'ivy-1.2.csv', 'jedit-3.2.csv', 'jedit-4.0.csv', 'jedit-4.1.csv', 'jedit-4.2.csv', 'jedit-4.3.csv', 'log4j-1.0.csv', 'log4j-1.1.csv', 'log4j-1.2.csv', 'lucene-2.0.csv', 'lucene-2.2.csv', 'lucene-2.4.csv', 'poi-1.5.csv', 'poi-2.0.csv', 'poi-2.5.csv', 'poi-3.0.csv', 'prop-6.csv', 'redaktor.csv', 'synapse-1.0.csv', 'synapse-1.1.csv', 'synapse-1.2.csv', 'tomcat.csv', 'velocity-1.4.csv', 'velocity-1.5.csv', 'velocity-1.6.csv', 'xalan-2.4.csv', 'xalan-2.5.csv', 'xalan-2.6.csv', 'xalan-2.7.csv', 'xerces-1.1.csv', 'xerces-1.2.csv', 'xerces-1.3.csv', 'xerces-1.4.4.csv']


In [104]:
# Group the dataset files by project. The project name is everything before
# the last "-" (i.e. the file name without its version suffix); files that
# contain no "-" fall back to the part before the first ".".
projects = {}
for file in file_list:
    project_name = file.rpartition("-")[0] or file.split('.')[0]
    projects.setdefault(project_name, []).append(file)
print(projects)

{'ant': ['ant-1.3.csv', 'ant-1.4.csv', 'ant-1.5.csv', 'ant-1.6.csv', 'ant-1.7.csv'], 'arc': ['arc.csv'], 'camel': ['camel-1.0.csv', 'camel-1.2.csv', 'camel-1.4.csv', 'camel-1.6.csv'], 'ivy': ['ivy-1.0.csv', 'ivy-1.1.csv', 'ivy-1.2.csv'], 'jedit': ['jedit-3.2.csv', 'jedit-4.0.csv', 'jedit-4.1.csv', 'jedit-4.2.csv', 'jedit-4.3.csv'], 'log4j': ['log4j-1.0.csv', 'log4j-1.1.csv', 'log4j-1.2.csv'], 'lucene': ['lucene-2.0.csv', 'lucene-2.2.csv', 'lucene-2.4.csv'], 'poi': ['poi-1.5.csv', 'poi-2.0.csv', 'poi-2.5.csv', 'poi-3.0.csv'], 'prop': ['prop-6.csv'], 'redaktor': ['redaktor.csv'], 'synapse': ['synapse-1.0.csv', 'synapse-1.1.csv', 'synapse-1.2.csv'], 'tomcat': ['tomcat.csv'], 'velocity': ['velocity-1.4.csv', 'velocity-1.5.csv', 'velocity-1.6.csv'], 'xalan': ['xalan-2.4.csv', 'xalan-2.5.csv', 'xalan-2.6.csv', 'xalan-2.7.csv'], 'xerces': ['xerces-1.1.csv', 'xerces-1.2.csv', 'xerces-1.3.csv', 'xerces-1.4.4.csv']}


In [105]:
# Load the first dataset only to inspect which columns are available.
first_dataset_path = os.path.join(folder_path, file_list[0])
data = pd.read_csv(first_dataset_path)
data.columns

Index(['name', 'wmc', 'dit', 'noc', 'cbo', 'rfc', 'lcom', 'ca', 'ce', 'npm',
       'lcom3', 'loc', 'dam', 'moa', 'mfa', 'cam', 'ic', 'cbm', 'amc',
       'max_cc', 'avg_cc', 'bug'],
      dtype='object')

In [106]:
def pre_processing(file_list, folder_path, index):
    """Load two consecutive files as a train/test pair for cross-version prediction.

    ``file_list[index]`` is used for training and ``file_list[index + 1]`` for
    testing. The ``name`` and ``bug`` columns are removed from the features,
    the ``bug`` column is binarised (0 stays 0, anything else becomes 1) and
    missing feature values are replaced by the per-column median of the
    training file.

    Parameters
    ----------
    file_list : list of str
        File names inside ``folder_path``.
    folder_path : str
        Directory that contains the CSV files.
    index : int
        Position of the training file; the test file is the next entry.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)``. The feature matrices are the
        outputs of ``SimpleImputer``; the labels are pandas Series of 0/1.
    """
    train_path = os.path.join(folder_path, file_list[index])
    test_path = os.path.join(folder_path, file_list[index + 1])

    # Read both releases
    train_frame = pd.read_csv(train_path)
    test_frame = pd.read_csv(test_path)

    def _to_defect_flag(bug_count):
        # Any non-zero bug count marks the module as defective.
        return 0 if bug_count == 0 else 1

    # Separate the features from the (binarised) labels
    non_feature_columns = ["name", "bug"]
    X_train = train_frame.drop(columns=non_feature_columns)
    X_test = test_frame.drop(columns=non_feature_columns)
    y_train = train_frame["bug"].apply(_to_defect_flag)
    y_test = test_frame["bug"].apply(_to_defect_flag)

    # The imputer is fitted on the training release only and then reused for
    # the test release.
    median_imputer = SimpleImputer(strategy='median')
    X_train = median_imputer.fit_transform(X_train)
    X_test = median_imputer.transform(X_test)

    return X_train, y_train, X_test, y_test

In [123]:
def pre_processing_cross(dataset, folder_path):
    """Split a raw defect dataset into a feature frame and binary labels.

    The schema is chosen from ``folder_path``:

    * ``"dataPROMISE/"`` - the label is the ``bug`` column (0 stays 0, any
      other value becomes 1); every non-numeric column is dropped from the
      features.
    * ``"dataNASA/"`` - the label is the ``Defective`` column, mapped
      ``"N" -> 0`` and ``"Y" -> 1``; all other columns are kept as features.

    The input frame is never modified, so the function can safely be called
    more than once on the same DataFrame.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Raw table as read from the CSV file.
    folder_path : str
        ``"dataPROMISE/"`` or ``"dataNASA/"``.

    Returns
    -------
    tuple
        ``(X, y)`` - feature DataFrame and label Series.

    Raises
    ------
    ValueError
        If ``folder_path`` is not one of the two supported folders.
    """
    if folder_path == "dataPROMISE/":
        bug = dataset["bug"]
        if not pd.api.types.is_numeric_dtype(bug):
            # Labels read as text are converted before the comparison with 0.
            bug = bug.astype(float)
        y = bug.apply(lambda x: 1 if x != 0 else 0)

        # Drop the label and every non-numeric column (e.g. the class name).
        # A dtype-kind check is used instead of `dtype == "object"` so that
        # dedicated string dtypes are handled as well.
        dropped = [
            col for col in dataset.columns
            if col == "bug" or not pd.api.types.is_numeric_dtype(dataset[col])
        ]
        X = dataset.drop(columns=dropped)
    elif folder_path == "dataNASA/":
        label_map = {"N": 0, "Y": 1}
        y = dataset["Defective"].map(label_map)
        # NOTE(review): the NASA files also contain an `id` column, which is
        # kept as a feature here - confirm that this is intended.
        X = dataset.drop(columns=["Defective"])
    else:
        raise ValueError(
            f"Unsupported folder_path {folder_path!r}; "
            "expected 'dataPROMISE/' or 'dataNASA/'."
        )
    return X, y

In [108]:
def Train(file_list, folder_path, index):
    """Cross-version evaluation of the complete approach.

    Trains on ``file_list[index]`` and tests on ``file_list[index + 1]`` using
    feature selection, Borderline-SMOTE oversampling, PCA and a stacking
    ensemble whose class-weight-capable base learners use
    ``class_weight='balanced'``.

    Parameters
    ----------
    file_list : list of str
        Release files of one project, in the order they should be paired.
    folder_path : str
        Directory that contains the files.
    index : int
        Position of the training release.

    Returns
    -------
    tuple
        ``(accuracy, f1, recall)`` measured on the test release.
    """
    X_train, y_train, X_test, y_test = pre_processing(file_list, folder_path, index)

    def _seeded_mutual_info(features, labels):
        # mutual_info_classif is randomised unless random_state is given;
        # fixing it keeps the selected features identical between runs.
        return mutual_info_classif(features, labels, random_state=42)

    # Feature selection (never ask for more features than are available)
    selector = SelectKBest(score_func=_seeded_mutual_info, k=min(10, X_train.shape[1]))
    X_train = selector.fit_transform(X_train, y_train)
    X_test = selector.transform(X_test)

    # Handle class imbalance with Borderline-SMOTE (training data only)
    smote = BorderlineSMOTE(random_state=42)
    X_train, y_train = smote.fit_resample(X_train, y_train)

    # Dimensionality reduction with PCA
    pca = PCA(n_components=min(10, X_train.shape[1]), random_state=42)
    X_train = pca.fit_transform(X_train)
    X_test = pca.transform(X_test)

    base_models = [
        ('logreg', LogisticRegression(class_weight='balanced', random_state=42)),
        ('svc', SVC(class_weight='balanced', probability=True, random_state=42)),
        ('rf', RandomForestClassifier(class_weight='balanced', random_state=42)),
        ('gb', GradientBoostingClassifier(random_state=42)),
        ('dt', DecisionTreeClassifier(class_weight='balanced', random_state=42)),
        ('knn', KNeighborsClassifier()),
        ('ada', AdaBoostClassifier(random_state=42)),
        ('xgb', XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')),
        ('lgbm', LGBMClassifier(random_state=42)),
        ('nb', GaussianNB())
    ]

    # Meta-learner that combines the base models' predictions
    meta_model = LogisticRegression()

    stacking_clf = StackingClassifier(
        estimators=base_models,
        final_estimator=meta_model,
        n_jobs=-1)

    # Fit the stacking ensemble and predict the next release
    stacking_clf.fit(X_train, y_train)
    y_pred = stacking_clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    return accuracy, f1, recall

In [109]:
def Train_Without_Feature_Selection(file_list, folder_path, index):
    """Cross-version ablation: the full approach minus feature selection.

    Trains on ``file_list[index]`` and tests on ``file_list[index + 1]`` with
    Borderline-SMOTE oversampling, PCA and the class-weighted stacking
    ensemble, but without a ``SelectKBest`` step.

    Returns
    -------
    tuple
        ``(accuracy, f1, recall)`` measured on the test release.
    """
    train_X, train_y, test_X, test_y = pre_processing(file_list, folder_path, index)

    # Balance the classes of the training release with Borderline-SMOTE
    train_X, train_y = BorderlineSMOTE(random_state=42).fit_resample(train_X, train_y)

    # Reduce dimensionality with PCA (at most 10 components)
    n_components = min(10, train_X.shape[1])
    reducer = PCA(n_components=n_components, random_state=42)
    train_X = reducer.fit_transform(train_X)
    test_X = reducer.transform(test_X)

    # Keyword sets shared by the base learners
    balanced = {"class_weight": 'balanced', "random_state": 42}
    seeded = {"random_state": 42}

    # Stacking ensemble with a logistic-regression meta-learner
    ensemble = StackingClassifier(
        estimators=[
            ('logreg', LogisticRegression(**balanced)),
            ('svc', SVC(probability=True, **balanced)),
            ('rf', RandomForestClassifier(**balanced)),
            ('gb', GradientBoostingClassifier(**seeded)),
            ('dt', DecisionTreeClassifier(**balanced)),
            ('knn', KNeighborsClassifier()),
            ('ada', AdaBoostClassifier(**seeded)),
            ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', **seeded)),
            ('lgbm', LGBMClassifier(**seeded)),
            ('nb', GaussianNB()),
        ],
        final_estimator=LogisticRegression(),
        n_jobs=-1,
    )

    # Fit on the training release, predict the following one
    ensemble.fit(train_X, train_y)
    predictions = ensemble.predict(test_X)

    return (
        accuracy_score(test_y, predictions),
        f1_score(test_y, predictions),
        recall_score(test_y, predictions),
    )

In [110]:
def Train_Without_Sampling(file_list, folder_path, index):
    """Cross-version ablation: the full approach minus oversampling.

    Trains on ``file_list[index]`` and tests on ``file_list[index + 1]`` with
    feature selection, PCA and the class-weighted stacking ensemble, but
    without any resampling of the training data.

    Parameters
    ----------
    file_list : list of str
        Release files of one project, in the order they should be paired.
    folder_path : str
        Directory that contains the files.
    index : int
        Position of the training release.

    Returns
    -------
    tuple
        ``(accuracy, f1, recall)`` measured on the test release.
    """
    X_train, y_train, X_test, y_test = pre_processing(file_list, folder_path, index)

    def _seeded_mutual_info(features, labels):
        # mutual_info_classif is randomised unless random_state is given;
        # fixing it keeps the selected features identical between runs.
        return mutual_info_classif(features, labels, random_state=42)

    # Feature selection (never ask for more features than are available)
    selector = SelectKBest(score_func=_seeded_mutual_info, k=min(10, X_train.shape[1]))
    X_train = selector.fit_transform(X_train, y_train)
    X_test = selector.transform(X_test)

    # Dimensionality reduction with PCA
    pca = PCA(n_components=min(10, X_train.shape[1]), random_state=42)
    X_train = pca.fit_transform(X_train)
    X_test = pca.transform(X_test)

    base_models = [
        ('logreg', LogisticRegression(class_weight='balanced', random_state=42)),
        ('svc', SVC(class_weight='balanced', probability=True, random_state=42)),
        ('rf', RandomForestClassifier(class_weight='balanced', random_state=42)),
        ('gb', GradientBoostingClassifier(random_state=42)),
        ('dt', DecisionTreeClassifier(class_weight='balanced', random_state=42)),
        ('knn', KNeighborsClassifier()),
        ('ada', AdaBoostClassifier(random_state=42)),
        ('xgb', XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')),
        ('lgbm', LGBMClassifier(random_state=42)),
        ('nb', GaussianNB())
    ]

    # Meta-learner that combines the base models' predictions
    meta_model = LogisticRegression()

    stacking_clf = StackingClassifier(
        estimators=base_models,
        final_estimator=meta_model,
        n_jobs=-1)

    # Fit the stacking ensemble and predict the next release
    stacking_clf.fit(X_train, y_train)
    y_pred = stacking_clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    return accuracy, f1, recall

In [111]:
def Train_Without_Weighted_Learning(file_list, folder_path, index):
    """Cross-version ablation: the full approach minus class weighting.

    Trains on ``file_list[index]`` and tests on ``file_list[index + 1]`` with
    feature selection, Borderline-SMOTE oversampling and PCA, but every base
    learner is created without ``class_weight='balanced'``.

    Parameters
    ----------
    file_list : list of str
        Release files of one project, in the order they should be paired.
    folder_path : str
        Directory that contains the files.
    index : int
        Position of the training release.

    Returns
    -------
    tuple
        ``(accuracy, f1, recall)`` measured on the test release.
    """
    X_train, y_train, X_test, y_test = pre_processing(file_list, folder_path, index)

    def _seeded_mutual_info(features, labels):
        # mutual_info_classif is randomised unless random_state is given;
        # fixing it keeps the selected features identical between runs.
        return mutual_info_classif(features, labels, random_state=42)

    # Feature selection (never ask for more features than are available)
    selector = SelectKBest(score_func=_seeded_mutual_info, k=min(10, X_train.shape[1]))
    X_train = selector.fit_transform(X_train, y_train)
    X_test = selector.transform(X_test)

    # Handle class imbalance with Borderline-SMOTE (training data only)
    smote = BorderlineSMOTE(random_state=42)
    X_train, y_train = smote.fit_resample(X_train, y_train)

    # Dimensionality reduction with PCA
    pca = PCA(n_components=min(10, X_train.shape[1]), random_state=42)
    X_train = pca.fit_transform(X_train)
    X_test = pca.transform(X_test)

    # Base learners without class weights (this is the ablated component)
    base_models = [
        ('logreg', LogisticRegression(random_state=42)),
        ('svc', SVC(probability=True, random_state=42)),
        ('rf', RandomForestClassifier(random_state=42)),
        ('gb', GradientBoostingClassifier(random_state=42)),
        ('dt', DecisionTreeClassifier(random_state=42)),
        ('knn', KNeighborsClassifier()),
        ('ada', AdaBoostClassifier(random_state=42)),
        ('xgb', XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')),
        ('lgbm', LGBMClassifier(random_state=42)),
        ('nb', GaussianNB())
    ]

    # Meta-learner that combines the base models' predictions
    meta_model = LogisticRegression()

    stacking_clf = StackingClassifier(
        estimators=base_models,
        final_estimator=meta_model,
        n_jobs=-1)

    # Fit the stacking ensemble and predict the next release
    stacking_clf.fit(X_train, y_train)
    y_pred = stacking_clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    return accuracy, f1, recall

In [112]:
def Train_Cross(file_list, folder_path):
    """Within-dataset evaluation of the complete approach by repeated stratified CV.

    Reads ``file_list[0]`` and evaluates SMOTE -> MinMax scaling -> feature
    selection -> PCA -> class-weighted stacking ensemble with 5-fold
    stratified cross-validation repeated 3 times.

    All preprocessing steps are part of one imbalanced-learn ``Pipeline`` that
    is handed to ``cross_validate``. They are therefore re-fitted on the
    training part of every fold, and SMOTE only resamples training data.
    Fitting them on the whole dataset before splitting would leak held-out
    samples (and synthetic points derived from them) into training and
    inflate the scores.

    Parameters
    ----------
    file_list : list of str
        Only the first entry is used.
    folder_path : str
        Directory of the file; also selects the schema in
        ``pre_processing_cross``.

    Returns
    -------
    tuple
        ``(acc_mean, f1_mean, recall_mean)`` averaged over all folds.
    """
    data_file = file_list[0]
    dataset = pd.read_csv(os.path.join(folder_path, data_file))

    # Split into features and binary labels
    X, y = pre_processing_cross(dataset, folder_path)

    def _seeded_mutual_info(features, labels):
        # mutual_info_classif is randomised unless random_state is given;
        # fixing it keeps the selected features identical between runs.
        return mutual_info_classif(features, labels, random_state=42)

    # Base learners of the stacking ensemble
    base_models = [
        ('logreg', LogisticRegression(class_weight='balanced', random_state=42)),
        ('svc', SVC(class_weight='balanced', probability=True, random_state=42)),
        ('rf', RandomForestClassifier(class_weight='balanced', random_state=42)),
        ('gb', GradientBoostingClassifier(random_state=42)),
        ('dt', DecisionTreeClassifier(class_weight='balanced', random_state=42)),
        ('knn', KNeighborsClassifier()),
        ('ada', AdaBoostClassifier(random_state=42)),
        ('xgb', XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')),
        ('lgbm', LGBMClassifier(random_state=42)),
        ('nb', GaussianNB())
    ]

    meta_model = LogisticRegression(random_state=42)

    stacking_clf = StackingClassifier(
        estimators=base_models,
        final_estimator=meta_model,
        n_jobs=-1
    )

    # Resampling, preprocessing and the classifier as one estimator so that
    # every step is fitted inside each cross-validation fold.
    model = Pipeline([
        ('smote', SMOTE(random_state=42)),
        ('scaler', MinMaxScaler()),
        ('feature_selection', SelectKBest(score_func=_seeded_mutual_info, k=min(10, X.shape[1]))),
        ('pca', PCA(random_state=42)),
        ('stacking', stacking_clf)
    ])

    kf = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)
    scoring = {
        'accuracy': make_scorer(accuracy_score),
        'f1': make_scorer(f1_score),
        'recall': make_scorer(recall_score)
    }

    # Scores are computed on the untouched (non-resampled) held-out folds
    scores = cross_validate(model, X, y, cv=kf, scoring=scoring, n_jobs=-1)

    acc_mean = scores['test_accuracy'].mean()
    f1_mean = scores['test_f1'].mean()
    recall_mean = scores['test_recall'].mean()

    return acc_mean, f1_mean, recall_mean

In [113]:
def Train_Cross_Without_Feature_Selection(file_list, folder_path):
    """Within-dataset ablation (no feature selection) by repeated stratified CV.

    Reads ``file_list[0]`` and evaluates SMOTE -> MinMax scaling -> PCA ->
    class-weighted stacking ensemble with 5-fold stratified cross-validation
    repeated 3 times.

    All preprocessing steps are part of one imbalanced-learn ``Pipeline`` that
    is handed to ``cross_validate``. They are therefore re-fitted on the
    training part of every fold, and SMOTE only resamples training data.
    Fitting them on the whole dataset before splitting would leak held-out
    samples (and synthetic points derived from them) into training and
    inflate the scores.

    Parameters
    ----------
    file_list : list of str
        Only the first entry is used.
    folder_path : str
        Directory of the file; also selects the schema in
        ``pre_processing_cross``.

    Returns
    -------
    tuple
        ``(acc_mean, f1_mean, recall_mean)`` averaged over all folds.
    """
    data_file = file_list[0]
    dataset = pd.read_csv(os.path.join(folder_path, data_file))

    # Split into features and binary labels
    X, y = pre_processing_cross(dataset, folder_path)

    # Base learners of the stacking ensemble
    base_models = [
        ('logreg', LogisticRegression(class_weight='balanced', random_state=42)),
        ('svc', SVC(class_weight='balanced', probability=True, random_state=42)),
        ('rf', RandomForestClassifier(class_weight='balanced', random_state=42)),
        ('gb', GradientBoostingClassifier(random_state=42)),
        ('dt', DecisionTreeClassifier(class_weight='balanced', random_state=42)),
        ('knn', KNeighborsClassifier()),
        ('ada', AdaBoostClassifier(random_state=42)),
        ('xgb', XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')),
        ('lgbm', LGBMClassifier(random_state=42)),
        ('nb', GaussianNB())
    ]

    meta_model = LogisticRegression()

    stacking_clf = StackingClassifier(
        estimators=base_models,
        final_estimator=meta_model,
        n_jobs=-1
    )

    # Resampling, preprocessing and the classifier as one estimator so that
    # every step is fitted inside each cross-validation fold.
    model = Pipeline([
        ('smote', SMOTE(random_state=42)),
        ('scaler', MinMaxScaler()),
        ('pca', PCA(random_state=42)),
        ('stacking', stacking_clf)
    ])

    # Cross-validation
    kf = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)

    scoring = {
        'accuracy': make_scorer(accuracy_score),
        'f1': make_scorer(f1_score),
        'recall': make_scorer(recall_score)
    }

    # Scores are computed on the untouched (non-resampled) held-out folds
    scores = cross_validate(model, X, y, cv=kf, scoring=scoring, n_jobs=-1)

    acc_mean = scores['test_accuracy'].mean()
    f1_mean = scores['test_f1'].mean()
    recall_mean = scores['test_recall'].mean()

    return acc_mean, f1_mean, recall_mean

In [114]:
def Train_Cross_Without_Sampling(file_list, folder_path):
    """Within-dataset ablation (no oversampling) by repeated stratified CV.

    Reads ``file_list[0]`` and evaluates MinMax scaling -> feature selection
    -> PCA -> class-weighted stacking ensemble with 5-fold stratified
    cross-validation repeated 3 times.

    The preprocessing steps are part of the ``Pipeline`` that is handed to
    ``cross_validate``, so the scaler, the label-based feature selection and
    PCA are re-fitted on the training part of every fold. Fitting them on the
    whole dataset before splitting would let the held-out labels influence
    which features are selected and inflate the scores.

    Parameters
    ----------
    file_list : list of str
        Only the first entry is used.
    folder_path : str
        Directory of the file; also selects the schema in
        ``pre_processing_cross``.

    Returns
    -------
    tuple
        ``(acc_mean, f1_mean, recall_mean)`` averaged over all folds.
    """
    data_file = file_list[0]
    dataset = pd.read_csv(os.path.join(folder_path, data_file))

    # Split into features and binary labels
    X, y = pre_processing_cross(dataset, folder_path)

    def _seeded_mutual_info(features, labels):
        # mutual_info_classif is randomised unless random_state is given;
        # fixing it keeps the selected features identical between runs.
        return mutual_info_classif(features, labels, random_state=42)

    # Base learners of the stacking ensemble
    base_models = [
        ('logreg', LogisticRegression(class_weight='balanced', random_state=42)),
        ('svc', SVC(class_weight='balanced', probability=True, random_state=42)),
        ('rf', RandomForestClassifier(class_weight='balanced', random_state=42)),
        ('gb', GradientBoostingClassifier(random_state=42)),
        ('dt', DecisionTreeClassifier(class_weight='balanced', random_state=42)),
        ('knn', KNeighborsClassifier()),
        ('ada', AdaBoostClassifier(random_state=42)),
        ('xgb', XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')),
        ('lgbm', LGBMClassifier(random_state=42)),
        ('nb', GaussianNB())
    ]

    meta_model = LogisticRegression()

    stacking_clf = StackingClassifier(
        estimators=base_models,
        final_estimator=meta_model,
        n_jobs=-1
    )

    # Preprocessing and the classifier as one estimator so that every step is
    # fitted inside each cross-validation fold.
    model = Pipeline([
        ('scaler', MinMaxScaler()),
        ('feature_selection', SelectKBest(score_func=_seeded_mutual_info, k=min(10, X.shape[1]))),
        ('pca', PCA(random_state=42)),
        ('stacking', stacking_clf)
    ])

    # Cross-validation setup
    kf = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)
    scoring = {
        'accuracy': make_scorer(accuracy_score),
        'f1': make_scorer(f1_score),
        'recall': make_scorer(recall_score)
    }

    # Compute the evaluation metrics on the held-out folds
    scores = cross_validate(model, X, y, cv=kf, scoring=scoring, n_jobs=-1)

    acc_mean = scores['test_accuracy'].mean()
    f1_mean = scores['test_f1'].mean()
    recall_mean = scores['test_recall'].mean()

    return acc_mean, f1_mean, recall_mean

In [115]:
def Train_Cross_Without_Weighted_Learning(file_list, folder_path):
    """Within-dataset ablation (no class weighting) by repeated stratified CV.

    Reads ``file_list[0]`` and evaluates SMOTE -> MinMax scaling -> feature
    selection -> PCA -> stacking ensemble with 5-fold stratified
    cross-validation repeated 3 times. Every base learner is created without
    ``class_weight='balanced'``.

    All preprocessing steps are part of one imbalanced-learn ``Pipeline`` that
    is handed to ``cross_validate``. They are therefore re-fitted on the
    training part of every fold, and SMOTE only resamples training data.
    Fitting them on the whole dataset before splitting would leak held-out
    samples (and synthetic points derived from them) into training and
    inflate the scores.

    Parameters
    ----------
    file_list : list of str
        Only the first entry is used.
    folder_path : str
        Directory of the file; also selects the schema in
        ``pre_processing_cross``.

    Returns
    -------
    tuple
        ``(acc_mean, f1_mean, recall_mean)`` averaged over all folds.
    """
    data_file = file_list[0]
    dataset = pd.read_csv(os.path.join(folder_path, data_file))

    # Split into features and binary labels
    X, y = pre_processing_cross(dataset, folder_path)

    def _seeded_mutual_info(features, labels):
        # mutual_info_classif is randomised unless random_state is given;
        # fixing it keeps the selected features identical between runs.
        return mutual_info_classif(features, labels, random_state=42)

    # Base learners without class weights (this is the ablated component)
    base_models = [
        ('logreg', LogisticRegression(random_state=42)),
        ('svc', SVC(probability=True, random_state=42)),
        ('rf', RandomForestClassifier(random_state=42)),
        ('gb', GradientBoostingClassifier(random_state=42)),
        ('dt', DecisionTreeClassifier(random_state=42)),
        ('knn', KNeighborsClassifier()),
        ('ada', AdaBoostClassifier(random_state=42)),
        ('xgb', XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')),
        ('lgbm', LGBMClassifier(random_state=42)),
        ('nb', GaussianNB())
    ]

    meta_model = LogisticRegression()

    stacking_clf = StackingClassifier(
        estimators=base_models,
        final_estimator=meta_model,
        n_jobs=-1
    )

    # Resampling, preprocessing and the classifier as one estimator so that
    # every step is fitted inside each cross-validation fold.
    model = Pipeline([
        ('smote', SMOTE(random_state=42)),
        ('scaler', MinMaxScaler()),
        ('feature_selection', SelectKBest(score_func=_seeded_mutual_info, k=min(10, X.shape[1]))),
        ('pca', PCA(random_state=42)),
        ('stacking', stacking_clf)
    ])

    # Cross-validation setup
    kf = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)
    scoring = {
        'accuracy': make_scorer(accuracy_score),
        'f1': make_scorer(f1_score),
        'recall': make_scorer(recall_score)
    }

    # Scores are computed on the untouched (non-resampled) held-out folds
    scores = cross_validate(model, X, y, cv=kf, scoring=scoring, n_jobs=-1)

    acc_mean = scores['test_accuracy'].mean()
    f1_mean = scores['test_f1'].mean()
    recall_mean = scores['test_recall'].mean()

    return acc_mean, f1_mean, recall_mean

In [116]:
# Run the four experiment variants (full approach and three ablations) on
# every PROMISE project and collect one row of averaged metrics per project.
def _mean_metrics(rows):
    """Average each metric over a list of per-project result rows."""
    return {key: np.mean([row[key] for row in rows]) for key in ["accuracy", "f1", "recall"]}


results = []
results_without_feature_selection = []
results_without_sampling = []
results_without_weighted_learning = []

for project, versions in projects.items():
    project_scores = {"accuracy": [], "f1": [], "recall": []}
    project_scores_without_feature_selection = {"accuracy": [], "f1": [], "recall": []}
    project_scores_without_sampling = {"accuracy": [], "f1": [], "recall": []}
    project_scores_without_weighted_learning = {"accuracy": [], "f1": [], "recall": []}

    if len(versions) == 1:
        # Only one release: there is no later version to test on, so the
        # project is evaluated with cross-validation on that single file.
        avg_scores = dict(zip(["accuracy", "f1", "recall"], Train_Cross(versions, folder_path)))
        avg_scores_without_feature_selection = dict(zip(["accuracy", "f1", "recall"], Train_Cross_Without_Feature_Selection(versions, folder_path)))
        avg_scores_without_sampling = dict(zip(["accuracy", "f1", "recall"], Train_Cross_Without_Sampling(versions, folder_path)))
        avg_scores_without_weighted_learning = dict(zip(["accuracy", "f1", "recall"], Train_Cross_Without_Weighted_Learning(versions, folder_path)))
    else:
        # Several releases: train on release i, test on release i + 1, and
        # average over all consecutive pairs.
        variants = [
            (project_scores, Train),
            (project_scores_without_feature_selection, Train_Without_Feature_Selection),
            (project_scores_without_sampling, Train_Without_Sampling),
            (project_scores_without_weighted_learning, Train_Without_Weighted_Learning),
        ]
        for i in range(len(versions) - 1):
            for scores, train_fn in variants:
                acc, f1, rec = train_fn(versions, folder_path, i)
                scores["accuracy"].append(acc)
                scores["f1"].append(f1)
                scores["recall"].append(rec)

        avg_scores = {key: np.mean(values) for key, values in project_scores.items()}
        avg_scores_without_feature_selection = {key: np.mean(values) for key, values in project_scores_without_feature_selection.items()}
        avg_scores_without_sampling = {key: np.mean(values) for key, values in project_scores_without_sampling.items()}
        avg_scores_without_weighted_learning = {key: np.mean(values) for key, values in project_scores_without_weighted_learning.items()}

    results.append({"Project": project, **avg_scores})
    results_without_feature_selection.append({"Project": project, **avg_scores_without_feature_selection})
    results_without_sampling.append({"Project": project, **avg_scores_without_sampling})
    results_without_weighted_learning.append({"Project": project, **avg_scores_without_weighted_learning})

# Overall averages across projects, computed before the "Avg" rows are added
avg_results = _mean_metrics(results)
avg_results_without_feature_selection = _mean_metrics(results_without_feature_selection)
avg_results_without_sampling = _mean_metrics(results_without_sampling)
avg_results_without_weighted_learning = _mean_metrics(results_without_weighted_learning)

results.append({"Project": "Avg", **avg_results})
results_without_feature_selection.append({"Project": "Avg", **avg_results_without_feature_selection})
results_without_sampling.append({"Project": "Avg", **avg_results_without_sampling})
results_without_weighted_learning.append({"Project": "Avg", **avg_results_without_weighted_learning})

In [117]:
# Export the PROMISE results: one worksheet per experiment variant.
df_results = pd.DataFrame(results)
df_results_without_feature_selection = pd.DataFrame(results_without_feature_selection)
df_results_without_sampling = pd.DataFrame(results_without_sampling)
df_results_without_weighted_learning = pd.DataFrame(results_without_weighted_learning)

output_file = 'dataPROMISE_results.xlsx'

# Excel limits worksheet names to 31 characters. The former names
# 'Results_Without_Feature_Selection' and 'Results_Without_Weighted_Learning'
# are 33 characters long, so those two sheets use a shorter "No" form.
sheets = {
    'Results': df_results,
    'Results_No_Feature_Selection': df_results_without_feature_selection,
    'Results_Without_Sampling': df_results_without_sampling,
    'Results_No_Weighted_Learning': df_results_without_weighted_learning,
}

with pd.ExcelWriter(output_file) as writer:
    for sheet_name, frame in sheets.items():
        frame.to_excel(writer, sheet_name=sheet_name, index=False)



In [118]:
# Switch the working folder to the NASA datasets and list its CSV files in
# sorted order.
folder_path = "dataNASA/"
file_list_nasa = sorted(name for name in os.listdir(folder_path) if name.endswith('.csv'))
print(file_list_nasa)

['CM1.csv', 'KC1.csv', 'KC3.csv', 'MC1.csv', 'MC2.csv', 'MW1.csv', 'PC1.csv', 'PC3.csv', 'PC4.csv', 'PC5.csv']


In [119]:
# Load the first NASA dataset only to inspect which columns are available.
first_nasa_path = os.path.join(folder_path, file_list_nasa[0])
data = pd.read_csv(first_nasa_path)
data.columns

Index(['id', 'LOC_BLANK', 'BRANCH_COUNT', 'CALL_PAIRS', 'LOC_CODE_AND_COMMENT',
       'LOC_COMMENTS', 'CONDITION_COUNT', 'CYCLOMATIC_COMPLEXITY',
       'CYCLOMATIC_DENSITY', 'DECISION_COUNT', 'DECISION_DENSITY',
       'DESIGN_COMPLEXITY', 'DESIGN_DENSITY', 'EDGE_COUNT',
       'ESSENTIAL_COMPLEXITY', 'ESSENTIAL_DENSITY', 'LOC_EXECUTABLE',
       'PARAMETER_COUNT', 'HALSTEAD_CONTENT', 'HALSTEAD_DIFFICULTY',
       'HALSTEAD_EFFORT', 'HALSTEAD_ERROR_EST', 'HALSTEAD_LENGTH',
       'HALSTEAD_LEVEL', 'HALSTEAD_PROG_TIME', 'HALSTEAD_VOLUME',
       'MAINTENANCE_SEVERITY', 'MODIFIED_CONDITION_COUNT',
       'MULTIPLE_CONDITION_COUNT', 'NODE_COUNT',
       'NORMALIZED_CYLOMATIC_COMPLEXITY', 'NUM_OPERANDS', 'NUM_OPERATORS',
       'NUM_UNIQUE_OPERANDS', 'NUM_UNIQUE_OPERATORS', 'NUMBER_OF_LINES',
       'PERCENT_COMMENTS', 'LOC_TOTAL', 'Defective'],
      dtype='object')

In [124]:
# Run the four cross-validated experiment variants on every NASA dataset and
# collect one result row per dataset, followed by an overall "Avg" row.
def _result_row(project, accuracy, f1_value, recall):
    """Build one result row in the column layout used for the NASA sheets."""
    return {"Project": project, "Accuracy": accuracy, "F1_Score": f1_value, "Recall": recall}


def _column_mean(rows, column):
    """Mean of one metric column over a list of result rows."""
    return np.mean([row[column] for row in rows])


results = []
results_without_feature_selection = []
results_without_sampling = []
results_without_weighted_learning = []

for file in file_list_nasa:
    # Each training function expects a list and reads only its first entry
    acc, f1, rec = Train_Cross([file], folder_path)
    acc_no_fs, f1_no_fs, rec_no_fs = Train_Cross_Without_Feature_Selection([file], folder_path)
    acc_no_samp, f1_no_samp, rec_no_samp = Train_Cross_Without_Sampling([file], folder_path)
    acc_no_weight, f1_no_weight, rec_no_weight = Train_Cross_Without_Weighted_Learning([file], folder_path)

    results.append(_result_row(file, acc, f1, rec))
    results_without_feature_selection.append(_result_row(file, acc_no_fs, f1_no_fs, rec_no_fs))
    results_without_sampling.append(_result_row(file, acc_no_samp, f1_no_samp, rec_no_samp))
    results_without_weighted_learning.append(_result_row(file, acc_no_weight, f1_no_weight, rec_no_weight))

# Averages over all datasets, computed before the "Avg" rows are appended
avg_acc = _column_mean(results, "Accuracy")
avg_f1 = _column_mean(results, "F1_Score")
avg_rec = _column_mean(results, "Recall")

avg_acc_no_fs = _column_mean(results_without_feature_selection, "Accuracy")
avg_f1_no_fs = _column_mean(results_without_feature_selection, "F1_Score")
avg_rec_no_fs = _column_mean(results_without_feature_selection, "Recall")

avg_acc_no_samp = _column_mean(results_without_sampling, "Accuracy")
avg_f1_no_samp = _column_mean(results_without_sampling, "F1_Score")
avg_rec_no_samp = _column_mean(results_without_sampling, "Recall")

avg_acc_no_weight = _column_mean(results_without_weighted_learning, "Accuracy")
avg_f1_no_weight = _column_mean(results_without_weighted_learning, "F1_Score")
avg_rec_no_weight = _column_mean(results_without_weighted_learning, "Recall")

results.append(_result_row("Avg", avg_acc, avg_f1, avg_rec))
results_without_feature_selection.append(_result_row("Avg", avg_acc_no_fs, avg_f1_no_fs, avg_rec_no_fs))
results_without_sampling.append(_result_row("Avg", avg_acc_no_samp, avg_f1_no_samp, avg_rec_no_samp))
results_without_weighted_learning.append(_result_row("Avg", avg_acc_no_weight, avg_f1_no_weight, avg_rec_no_weight))

In [125]:
# Export the NASA results: one worksheet per experiment variant.
df_results = pd.DataFrame(results)
df_results_without_feature_selection = pd.DataFrame(results_without_feature_selection)
df_results_without_sampling = pd.DataFrame(results_without_sampling)
df_results_without_weighted_learning = pd.DataFrame(results_without_weighted_learning)

output_file = 'dataNASA_results.xlsx'

# Excel limits worksheet names to 31 characters. The former names
# 'Results_Without_Feature_Selection' and 'Results_Without_Weighted_Learning'
# are 33 characters long, so those two sheets use a shorter "No" form.
sheets = {
    'Results': df_results,
    'Results_No_Feature_Selection': df_results_without_feature_selection,
    'Results_Without_Sampling': df_results_without_sampling,
    'Results_No_Weighted_Learning': df_results_without_weighted_learning,
}

with pd.ExcelWriter(output_file) as writer:
    for sheet_name, frame in sheets.items():
        frame.to_excel(writer, sheet_name=sheet_name, index=False)

