In [1]:
import os
from functools import partial

import pandas as pd
from sklearn.impute import SimpleImputer

In [2]:
# Directory holding the PROMISE defect datasets (one CSV file per project release).
folder_path = "dataPROMISE/"

# Only CSV files are datasets; anything else in the folder is ignored.
files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]

# os.listdir() returns entries in arbitrary order, but later cells treat
# consecutive entries of a project as consecutive releases (train on i, test on
# i + 1), so the list is sorted explicitly.
# NOTE(review): this is a plain string sort; it orders the releases listed in
# the printed output correctly, but a version such as "1.10" would sort before
# "1.9" - switch to a version-aware key if such files are added.
file_list = sorted(files)

# Kept for backward compatibility: the original prefix filter was
# startswith(''), which matches every file, so this is the same sorted list.
ant_files = list(file_list)

print(file_list)

['ant-1.3.csv', 'ant-1.4.csv', 'ant-1.5.csv', 'ant-1.6.csv', 'ant-1.7.csv', 'arc.csv', 'camel-1.0.csv', 'camel-1.2.csv', 'camel-1.4.csv', 'camel-1.6.csv', 'ivy-1.0.csv', 'ivy-1.1.csv', 'ivy-1.2.csv', 'jedit-3.2.csv', 'jedit-4.0.csv', 'jedit-4.1.csv', 'jedit-4.2.csv', 'jedit-4.3.csv', 'log4j-1.0.csv', 'log4j-1.1.csv', 'log4j-1.2.csv', 'lucene-2.0.csv', 'lucene-2.2.csv', 'lucene-2.4.csv', 'poi-1.5.csv', 'poi-2.0.csv', 'poi-2.5.csv', 'poi-3.0.csv', 'prop-6.csv', 'redaktor.csv', 'synapse-1.0.csv', 'synapse-1.1.csv', 'synapse-1.2.csv', 'tomcat.csv', 'velocity-1.4.csv', 'velocity-1.5.csv', 'velocity-1.6.csv', 'xalan-2.4.csv', 'xalan-2.5.csv', 'xalan-2.6.csv', 'xalan-2.7.csv', 'xerces-1.1.csv', 'xerces-1.2.csv', 'xerces-1.3.csv', 'xerces-1.4.4.csv']


In [3]:
# Group dataset files by project name. The project name is everything before
# the last "-" in the file name ("ant-1.3.csv" -> "ant"); files without a
# version suffix ("arc.csv") fall back to the text before the first ".".
projects = {}
for file in file_list:
    project_name = file.rpartition("-")[0]
    if not project_name:
        project_name = file.split('.')[0]
    projects.setdefault(project_name, []).append(file)
print(projects)

{'ant': ['ant-1.3.csv', 'ant-1.4.csv', 'ant-1.5.csv', 'ant-1.6.csv', 'ant-1.7.csv'], 'arc': ['arc.csv'], 'camel': ['camel-1.0.csv', 'camel-1.2.csv', 'camel-1.4.csv', 'camel-1.6.csv'], 'ivy': ['ivy-1.0.csv', 'ivy-1.1.csv', 'ivy-1.2.csv'], 'jedit': ['jedit-3.2.csv', 'jedit-4.0.csv', 'jedit-4.1.csv', 'jedit-4.2.csv', 'jedit-4.3.csv'], 'log4j': ['log4j-1.0.csv', 'log4j-1.1.csv', 'log4j-1.2.csv'], 'lucene': ['lucene-2.0.csv', 'lucene-2.2.csv', 'lucene-2.4.csv'], 'poi': ['poi-1.5.csv', 'poi-2.0.csv', 'poi-2.5.csv', 'poi-3.0.csv'], 'prop': ['prop-6.csv'], 'redaktor': ['redaktor.csv'], 'synapse': ['synapse-1.0.csv', 'synapse-1.1.csv', 'synapse-1.2.csv'], 'tomcat': ['tomcat.csv'], 'velocity': ['velocity-1.4.csv', 'velocity-1.5.csv', 'velocity-1.6.csv'], 'xalan': ['xalan-2.4.csv', 'xalan-2.5.csv', 'xalan-2.6.csv', 'xalan-2.7.csv'], 'xerces': ['xerces-1.1.csv', 'xerces-1.2.csv', 'xerces-1.3.csv', 'xerces-1.4.4.csv']}


In [4]:
def pre_processing(file_list, folder_path, index):
    """Load two consecutive dataset files and prepare train/test matrices.

    ``file_list[index]`` is used as the training file and
    ``file_list[index + 1]`` as the test file.

    Parameters
    ----------
    file_list : list of str
        File names of the datasets, relative to ``folder_path``.
    folder_path : str
        Directory containing the CSV files.
    index : int
        Position of the training file; the test file is the next entry.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)`` where the feature matrices are
        the median-imputed outputs of ``SimpleImputer`` and the labels are
        pandas Series with 1 for any non-zero "bug" value and 0 otherwise.
    """
    train_name, test_name = file_list[index], file_list[index + 1]

    # Read both releases before doing any processing.
    frames = [
        pd.read_csv(os.path.join(folder_path, file_name))
        for file_name in (train_name, test_name)
    ]

    def _split(frame):
        # Features are every column except the identifier and the label;
        # the label is binarised (non-zero bug value -> 1).
        features = frame.drop(["name", "bug"], axis=1)
        labels = frame["bug"].ne(0).astype("int64")
        return features, labels

    (X_train, y_train), (X_test, y_test) = (_split(frame) for frame in frames)

    # Medians are learned on the training release only and reused on the test release.
    median_imputer = SimpleImputer(strategy='median')
    X_train = median_imputer.fit_transform(X_train)
    X_test = median_imputer.transform(X_test)

    return X_train, y_train, X_test, y_test

In [5]:
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from imblearn.over_sampling import BorderlineSMOTE
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

def Train(file_list, folder_path, index):
    """Train the full stacking pipeline on one release and score it on the next.

    The data come from ``pre_processing(file_list, folder_path, index)``. The
    steps are: mutual-information feature selection, Borderline-SMOTE
    oversampling of the training set, PCA, and a stacking ensemble with a
    logistic-regression meta-learner.

    Parameters
    ----------
    file_list : list of str
        Release files of one project; ``index`` selects the training release
        and ``index + 1`` the test release.
    folder_path : str
        Directory containing the CSV files.
    index : int
        Position of the training release in ``file_list``.

    Returns
    -------
    float
        F1 score of the stacked model on the test release.
    """
    X_train, y_train, X_test, y_test = pre_processing(file_list, folder_path, index)

    # Feature selection. The scorer is seeded because mutual_info_classif is
    # otherwise the only unseeded step, and k is clamped to the number of
    # available features, mirroring the PCA clamp below.
    selector = SelectKBest(
        score_func=partial(mutual_info_classif, random_state=42),
        k=min(10, X_train.shape[1]),
    )
    X_train = selector.fit_transform(X_train, y_train)
    X_test = selector.transform(X_test)

    # Handle class imbalance with Borderline-SMOTE (training data only).
    smote = BorderlineSMOTE(random_state=42)
    X_train, y_train = smote.fit_resample(X_train, y_train)

    # Dimensionality reduction with PCA, fitted on the training data.
    pca = PCA(n_components=min(10, X_train.shape[1]), random_state=42)
    X_train = pca.fit_transform(X_train)
    X_test = pca.transform(X_test)

    # Base learners; those that support it use balanced class weights.
    base_models = [
        ('logreg', LogisticRegression(class_weight='balanced', random_state=42)),
        ('svc', SVC(class_weight='balanced', probability=True, random_state=42)),
        ('rf', RandomForestClassifier(class_weight='balanced', random_state=42)),
        ('gb', GradientBoostingClassifier(random_state=42)),
        ('dt', DecisionTreeClassifier(class_weight='balanced', random_state=42)),
        ('knn', KNeighborsClassifier()),
        ('ada', AdaBoostClassifier(random_state=42)),
        ('xgb', XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')),
        ('lgbm', LGBMClassifier(random_state=42)),
        ('nb', GaussianNB())
    ]

    # Meta-learner combining the base-model predictions.
    meta_model = LogisticRegression()

    stacking_clf = StackingClassifier(
        estimators=base_models,
        final_estimator=meta_model,
        n_jobs=-1)

    # Fit the stack on the training release and predict the test release.
    stacking_clf.fit(X_train, y_train)
    y_pred = stacking_clf.predict(X_test)

    return f1_score(y_test, y_pred)


In [6]:
def Train_Without_Feature_Selection(file_list, folder_path, index):
    """Ablation of the cross-release pipeline with the feature-selection step removed.

    Uses ``pre_processing(file_list, folder_path, index)`` for the data, then
    applies Borderline-SMOTE to the training set, PCA, and a stacking ensemble
    with a logistic-regression meta-learner.

    Parameters
    ----------
    file_list : list of str
        Release files of one project.
    folder_path : str
        Directory containing the CSV files.
    index : int
        Position of the training release; the next entry is the test release.

    Returns
    -------
    float
        F1 score of the stacked model on the test release.
    """
    features_fit, labels_fit, features_eval, labels_eval = pre_processing(
        file_list, folder_path, index
    )

    # Oversample the training data only with Borderline-SMOTE.
    features_fit, labels_fit = BorderlineSMOTE(random_state=42).fit_resample(
        features_fit, labels_fit
    )

    # PCA keeps at most 10 components and is fitted on the training data.
    component_count = min(10, features_fit.shape[1])
    reducer = PCA(n_components=component_count, random_state=42)
    features_fit = reducer.fit_transform(features_fit)
    features_eval = reducer.transform(features_eval)

    # Base learners of the stack, in the order they are registered.
    learners = []
    learners.append(('logreg', LogisticRegression(class_weight='balanced', random_state=42)))
    learners.append(('svc', SVC(class_weight='balanced', probability=True, random_state=42)))
    learners.append(('rf', RandomForestClassifier(class_weight='balanced', random_state=42)))
    learners.append(('gb', GradientBoostingClassifier(random_state=42)))
    learners.append(('dt', DecisionTreeClassifier(class_weight='balanced', random_state=42)))
    learners.append(('knn', KNeighborsClassifier()))
    learners.append(('ada', AdaBoostClassifier(random_state=42)))
    learners.append(('xgb', XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')))
    learners.append(('lgbm', LGBMClassifier(random_state=42)))
    learners.append(('nb', GaussianNB()))

    # Logistic regression acts as the meta-learner on top of the base learners.
    ensemble = StackingClassifier(
        estimators=learners,
        final_estimator=LogisticRegression(),
        n_jobs=-1,
    )
    ensemble.fit(features_fit, labels_fit)
    predictions = ensemble.predict(features_eval)

    return f1_score(labels_eval, predictions)

In [7]:
def Train_Without_Sampling(file_list, folder_path, index):
    """Ablation of the cross-release pipeline with the oversampling step removed.

    Uses ``pre_processing(file_list, folder_path, index)`` for the data, then
    applies mutual-information feature selection, PCA, and a stacking ensemble
    with a logistic-regression meta-learner. No resampling is performed.

    Parameters
    ----------
    file_list : list of str
        Release files of one project.
    folder_path : str
        Directory containing the CSV files.
    index : int
        Position of the training release; the next entry is the test release.

    Returns
    -------
    float
        F1 score of the stacked model on the test release.
    """
    X_train, y_train, X_test, y_test = pre_processing(file_list, folder_path, index)

    # Feature selection. The scorer is seeded because mutual_info_classif is
    # otherwise the only unseeded step, and k is clamped to the number of
    # available features, mirroring the PCA clamp below.
    selector = SelectKBest(
        score_func=partial(mutual_info_classif, random_state=42),
        k=min(10, X_train.shape[1]),
    )
    X_train = selector.fit_transform(X_train, y_train)
    X_test = selector.transform(X_test)

    # Dimensionality reduction with PCA, fitted on the training data.
    pca = PCA(n_components=min(10, X_train.shape[1]), random_state=42)
    X_train = pca.fit_transform(X_train)
    X_test = pca.transform(X_test)

    # Base learners; those that support it use balanced class weights.
    base_models = [
        ('logreg', LogisticRegression(class_weight='balanced', random_state=42)),
        ('svc', SVC(class_weight='balanced', probability=True, random_state=42)),
        ('rf', RandomForestClassifier(class_weight='balanced', random_state=42)),
        ('gb', GradientBoostingClassifier(random_state=42)),
        ('dt', DecisionTreeClassifier(class_weight='balanced', random_state=42)),
        ('knn', KNeighborsClassifier()),
        ('ada', AdaBoostClassifier(random_state=42)),
        ('xgb', XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')),
        ('lgbm', LGBMClassifier(random_state=42)),
        ('nb', GaussianNB())
    ]

    # Meta-learner combining the base-model predictions.
    meta_model = LogisticRegression()

    stacking_clf = StackingClassifier(
        estimators=base_models,
        final_estimator=meta_model,
        n_jobs=-1)

    # Fit the stack on the training release and predict the test release.
    stacking_clf.fit(X_train, y_train)
    y_pred = stacking_clf.predict(X_test)

    return f1_score(y_test, y_pred)

In [8]:
def Train_Without_Weighted_Learning(file_list, folder_path, index):
    """Ablation of the cross-release pipeline without class-weighted base learners.

    Uses ``pre_processing(file_list, folder_path, index)`` for the data, then
    applies mutual-information feature selection, Borderline-SMOTE on the
    training set, PCA, and a stacking ensemble whose base learners are all
    created without ``class_weight='balanced'``.

    Parameters
    ----------
    file_list : list of str
        Release files of one project.
    folder_path : str
        Directory containing the CSV files.
    index : int
        Position of the training release; the next entry is the test release.

    Returns
    -------
    float
        F1 score of the stacked model on the test release.
    """
    X_train, y_train, X_test, y_test = pre_processing(file_list, folder_path, index)

    # Feature selection. The scorer is seeded because mutual_info_classif is
    # otherwise the only unseeded step, and k is clamped to the number of
    # available features, mirroring the PCA clamp below.
    selector = SelectKBest(
        score_func=partial(mutual_info_classif, random_state=42),
        k=min(10, X_train.shape[1]),
    )
    X_train = selector.fit_transform(X_train, y_train)
    X_test = selector.transform(X_test)

    # Handle class imbalance with Borderline-SMOTE (training data only).
    smote = BorderlineSMOTE(random_state=42)
    X_train, y_train = smote.fit_resample(X_train, y_train)

    # Dimensionality reduction with PCA, fitted on the training data.
    pca = PCA(n_components=min(10, X_train.shape[1]), random_state=42)
    X_train = pca.fit_transform(X_train)
    X_test = pca.transform(X_test)

    # Base learners with default (unweighted) class handling.
    base_models = [
        ('logreg', LogisticRegression(random_state=42)),
        ('svc', SVC(probability=True, random_state=42)),
        ('rf', RandomForestClassifier(random_state=42)),
        ('gb', GradientBoostingClassifier(random_state=42)),
        ('dt', DecisionTreeClassifier(random_state=42)),
        ('knn', KNeighborsClassifier()),
        ('ada', AdaBoostClassifier(random_state=42)),
        ('xgb', XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')),
        ('lgbm', LGBMClassifier(random_state=42)),
        ('nb', GaussianNB())
    ]

    # Meta-learner combining the base-model predictions.
    meta_model = LogisticRegression()

    stacking_clf = StackingClassifier(
        estimators=base_models,
        final_estimator=meta_model,
        n_jobs=-1)

    # Fit the stack on the training release and predict the test release.
    stacking_clf.fit(X_train, y_train)
    y_pred = stacking_clf.predict(X_test)

    return f1_score(y_test, y_pred)

In [9]:
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.decomposition import PCA
from sklearn.ensemble import StackingClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score, train_test_split
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
import pandas as pd
import os

def Train_Cross(file_list, folder_path):
    """Cross-validate the full stacking pipeline on a single dataset file.

    Only ``file_list[0]`` is read. Each of the 5x3 repeated stratified folds
    applies SMOTE to the training part only, fits scaling, mutual-information
    feature selection and PCA on the resampled training part, trains the
    stacking ensemble and scores it on the untouched test part.

    Parameters
    ----------
    file_list : list of str
        List whose first entry is the dataset file to evaluate.
    folder_path : str
        Directory containing the CSV file.

    Returns
    -------
    float
        Mean F1 score over all folds and repeats.
    """
    # Imported once here instead of on every fold iteration.
    from sklearn.metrics import f1_score

    data_file = file_list[0]
    dataset = pd.read_csv(os.path.join(folder_path, data_file))

    # Object-typed columns cannot be fed to the models: a textual "bug" column
    # is converted to float, any other object column is dropped.
    for col in dataset.columns:
        if dataset[col].dtype == "object":
            if col == "bug":
                dataset[col] = dataset[col].astype(float)
            else:
                dataset = dataset.drop([col], axis=1)

    X = dataset.drop(["bug"], axis=1)
    # Binarise the label: any non-zero bug value counts as defective.
    y = dataset["bug"].apply(lambda x: 1 if x != 0 else 0)

    # Preprocessing pipeline, refitted on every training fold. The
    # mutual-information scorer is seeded so repeated runs select the same
    # features (it was the only unseeded step).
    pipeline = Pipeline([
        ('scaler', MinMaxScaler()),
        ('feature_selection', SelectKBest(score_func=partial(mutual_info_classif, random_state=42))),
        ('pca', PCA(random_state=42))
    ])

    kf = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)

    f1_scores = []  # one F1 score per fold
    for train_index, test_index in kf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # SMOTE is applied to the training fold only, so the test fold never
        # contains synthetic samples.
        smote = SMOTE(random_state=42)
        X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

        # Fit the preprocessing on the training fold; only transform the test fold.
        X_train_transformed = pipeline.fit_transform(X_train_resampled, y_train_resampled)
        X_test_transformed = pipeline.transform(X_test)

        # Base learners; those that support it use balanced class weights.
        base_models = [
            ('logreg', LogisticRegression(class_weight='balanced', random_state=42)),
            ('svc', SVC(class_weight='balanced', probability=True, random_state=42)),
            ('rf', RandomForestClassifier(class_weight='balanced', random_state=42)),
            ('gb', GradientBoostingClassifier(random_state=42)),
            ('dt', DecisionTreeClassifier(class_weight='balanced', random_state=42)),
            ('knn', KNeighborsClassifier()),
            ('ada', AdaBoostClassifier(random_state=42)),
            ('xgb', XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')),
            ('lgbm', LGBMClassifier(random_state=42)),
            ('nb', GaussianNB())
        ]

        meta_model = LogisticRegression()

        stacking_clf = StackingClassifier(
            estimators=base_models,
            final_estimator=meta_model,
            n_jobs=-1
        )

        # Train on the resampled training fold and score on the test fold.
        stacking_clf.fit(X_train_transformed, y_train_resampled)
        y_pred = stacking_clf.predict(X_test_transformed)
        f1_scores.append(f1_score(y_test, y_pred))

    # Average F1 across all cross-validation rounds.
    return sum(f1_scores) / len(f1_scores)

In [10]:
def Train_Cross_Without_Feature_Selection(file_list, folder_path):
    """Cross-validated ablation of the pipeline with feature selection removed.

    Only ``file_list[0]`` is read. SMOTE, scaling, PCA and the stacking
    ensemble are chained in one imbalanced-learn ``Pipeline`` that is handed to
    ``cross_val_score``. Every step is therefore fitted on the training part of
    each fold only, and the test part is scored without synthetic samples,
    matching the protocol of ``Train_Cross``. Previously SMOTE and the
    transformers were fitted on the whole dataset before splitting, which
    leaked test-fold information into training.

    Parameters
    ----------
    file_list : list of str
        List whose first entry is the dataset file to evaluate.
    folder_path : str
        Directory containing the CSV file.

    Returns
    -------
    float
        Mean F1 score over the 5x3 repeated stratified folds.
    """
    data_file = file_list[0]
    dataset = pd.read_csv(os.path.join(folder_path, data_file))

    # Object-typed columns cannot be fed to the models: a textual "bug" column
    # is converted to float, any other object column is dropped.
    for col in dataset.columns:
        if dataset[col].dtype == "object":
            if col == "bug":
                dataset[col] = dataset[col].astype(float)
            else:
                dataset = dataset.drop([col], axis=1)
    X = dataset.drop(["bug"], axis=1)
    # Binarise the label: any non-zero bug value counts as defective.
    y = dataset["bug"].apply(lambda x: 1 if x != 0 else 0)

    # Base learners; those that support it use balanced class weights.
    base_models = [
        ('logreg', LogisticRegression(class_weight='balanced', random_state=42)),
        ('svc', SVC(class_weight='balanced', probability=True, random_state=42)),
        ('rf', RandomForestClassifier(class_weight='balanced', random_state=42)),
        ('gb', GradientBoostingClassifier(random_state=42)),
        ('dt', DecisionTreeClassifier(class_weight='balanced', random_state=42)),
        ('knn', KNeighborsClassifier()),
        ('ada', AdaBoostClassifier(random_state=42)),
        ('xgb', XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')),
        ('lgbm', LGBMClassifier(random_state=42)),
        ('nb', GaussianNB())
    ]

    meta_model = LogisticRegression()

    stacking_clf = StackingClassifier(
        estimators=base_models,
        final_estimator=meta_model,
        n_jobs=-1
    )

    # `Pipeline` is the imbalanced-learn pipeline, which accepts a sampler step
    # and applies it while fitting only.
    pipeline = Pipeline([
        ('smote', SMOTE(random_state=42)),
        ('scaler', MinMaxScaler()),
        ('pca', PCA(random_state=42)),
        ('stacking', stacking_clf)
    ])

    # Cross-validation on the original (un-resampled) data.
    kf = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)
    scores = cross_val_score(pipeline, X, y, cv=kf, scoring='f1', n_jobs=-1)

    return scores.mean()

In [11]:
def Train_Cross_Without_Sampling(file_list, folder_path):
    """Cross-validated ablation of the pipeline with oversampling removed.

    Only ``file_list[0]`` is read. Scaling, mutual-information feature
    selection, PCA and the stacking ensemble are chained in one ``Pipeline``
    that is handed to ``cross_val_score``, so every step is fitted on the
    training part of each fold only. Previously the transformers (including
    the label-aware feature selection) were fitted on the whole dataset before
    splitting, which leaked test-fold information into training.

    Parameters
    ----------
    file_list : list of str
        List whose first entry is the dataset file to evaluate.
    folder_path : str
        Directory containing the CSV file.

    Returns
    -------
    float
        Mean F1 score over the 5x3 repeated stratified folds.
    """
    data_file = file_list[0]
    dataset = pd.read_csv(os.path.join(folder_path, data_file))

    # Object-typed columns cannot be fed to the models: a textual "bug" column
    # is converted to float, any other object column is dropped.
    for col in dataset.columns:
        if dataset[col].dtype == "object":
            if col == "bug":
                dataset[col] = dataset[col].astype(float)
            else:
                dataset = dataset.drop([col], axis=1)
    X = dataset.drop(["bug"], axis=1)
    # Binarise the label: any non-zero bug value counts as defective.
    y = dataset["bug"].apply(lambda x: 1 if x != 0 else 0)

    # Base learners; those that support it use balanced class weights.
    base_models = [
        ('logreg', LogisticRegression(class_weight='balanced', random_state=42)),
        ('svc', SVC(class_weight='balanced', probability=True, random_state=42)),
        ('rf', RandomForestClassifier(class_weight='balanced', random_state=42)),
        ('gb', GradientBoostingClassifier(random_state=42)),
        ('dt', DecisionTreeClassifier(class_weight='balanced', random_state=42)),
        ('knn', KNeighborsClassifier()),
        ('ada', AdaBoostClassifier(random_state=42)),
        ('xgb', XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')),
        ('lgbm', LGBMClassifier(random_state=42)),
        ('nb', GaussianNB())
    ]

    meta_model = LogisticRegression()

    stacking_clf = StackingClassifier(
        estimators=base_models,
        final_estimator=meta_model,
        n_jobs=-1
    )

    # Preprocessing and model in a single estimator. The mutual-information
    # scorer is seeded so repeated runs select the same features.
    pipeline = Pipeline([
        ('scaler', MinMaxScaler()),
        ('feature_selection', SelectKBest(score_func=partial(mutual_info_classif, random_state=42))),
        ('pca', PCA(random_state=42)),
        ('stacking', stacking_clf)
    ])

    # Cross-validation on the raw features.
    kf = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)
    scores = cross_val_score(pipeline, X, y, cv=kf, scoring='f1', n_jobs=-1)

    return scores.mean()

In [12]:
def Train_Cross_Without_Weighted_Learning(file_list, folder_path):
    """Cross-validated ablation of the pipeline without class-weighted base learners.

    Only ``file_list[0]`` is read. SMOTE, scaling, mutual-information feature
    selection, PCA and the stacking ensemble (base learners created without
    ``class_weight='balanced'``) are chained in one imbalanced-learn
    ``Pipeline`` that is handed to ``cross_val_score``. Every step is therefore
    fitted on the training part of each fold only, and the test part is scored
    without synthetic samples, matching the protocol of ``Train_Cross``.
    Previously SMOTE and the transformers were fitted on the whole dataset
    before splitting, which leaked test-fold information into training.

    Parameters
    ----------
    file_list : list of str
        List whose first entry is the dataset file to evaluate.
    folder_path : str
        Directory containing the CSV file.

    Returns
    -------
    float
        Mean F1 score over the 5x3 repeated stratified folds.
    """
    data_file = file_list[0]
    dataset = pd.read_csv(os.path.join(folder_path, data_file))

    # Object-typed columns cannot be fed to the models: a textual "bug" column
    # is converted to float, any other object column is dropped.
    for col in dataset.columns:
        if dataset[col].dtype == "object":
            if col == "bug":
                dataset[col] = dataset[col].astype(float)
            else:
                dataset = dataset.drop([col], axis=1)
    X = dataset.drop(["bug"], axis=1)
    # Binarise the label: any non-zero bug value counts as defective.
    y = dataset["bug"].apply(lambda x: 1 if x != 0 else 0)

    # Base learners with default (unweighted) class handling.
    base_models = [
        ('logreg', LogisticRegression(random_state=42)),
        ('svc', SVC(probability=True, random_state=42)),
        ('rf', RandomForestClassifier(random_state=42)),
        ('gb', GradientBoostingClassifier(random_state=42)),
        ('dt', DecisionTreeClassifier(random_state=42)),
        ('knn', KNeighborsClassifier()),
        ('ada', AdaBoostClassifier(random_state=42)),
        ('xgb', XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')),
        ('lgbm', LGBMClassifier(random_state=42)),
        ('nb', GaussianNB())
    ]

    meta_model = LogisticRegression()

    stacking_clf = StackingClassifier(
        estimators=base_models,
        final_estimator=meta_model,
        n_jobs=-1
    )

    # `Pipeline` is the imbalanced-learn pipeline, which accepts a sampler step
    # and applies it while fitting only. The mutual-information scorer is
    # seeded so repeated runs select the same features.
    pipeline = Pipeline([
        ('smote', SMOTE(random_state=42)),
        ('scaler', MinMaxScaler()),
        ('feature_selection', SelectKBest(score_func=partial(mutual_info_classif, random_state=42))),
        ('pca', PCA(random_state=42)),
        ('stacking', stacking_clf)
    ])

    # Cross-validation on the original (un-resampled) data.
    kf = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)
    scores = cross_val_score(pipeline, X, y, cv=kf, scoring='f1', n_jobs=-1)

    return scores.mean()

In [13]:
import numpy as np
# One result table per experiment variant (full pipeline plus three ablations).
results = []
results_without_feature_selection = []
results_without_sampling = []
results_without_weighted_learning = []

# Parallel lists: position k of each list belongs to the same variant.
result_tables = [
    results,
    results_without_feature_selection,
    results_without_sampling,
    results_without_weighted_learning,
]
# Trainers used when a project has several releases (train on i, test on i + 1).
release_trainers = [
    Train,
    Train_Without_Feature_Selection,
    Train_Without_Sampling,
    Train_Without_Weighted_Learning,
]
# Trainers used when a project has a single file (cross-validation).
cv_trainers = [
    Train_Cross,
    Train_Cross_Without_Feature_Selection,
    Train_Cross_Without_Sampling,
    Train_Cross_Without_Weighted_Learning,
]

for project, versions in projects.items():
    if len(versions) == 1:
        project_f1 = [trainer(versions, folder_path) for trainer in cv_trainers]
    else:
        # For each consecutive release pair, run every variant in turn; then
        # average each variant's scores over the pairs.
        pair_scores = [
            [trainer(versions, folder_path, i) for trainer in release_trainers]
            for i in range(len(versions) - 1)
        ]
        project_f1 = [np.mean(variant_scores) for variant_scores in zip(*pair_scores)]

    for table, score in zip(result_tables, project_f1):
        table.append({"Project": project, "F1_Score": score})

# Final "Avg" row: mean of the per-project scores of each variant.
for table in result_tables:
    overall = np.mean([row["F1_Score"] for row in table])
    table.append({"Project": "Avg", "F1_Score": overall})

KeyboardInterrupt: 

In [164]:
# Turn the collected result rows into one DataFrame per experiment variant.
df_results = pd.DataFrame(results)
df_results_without_feature_selection = pd.DataFrame(results_without_feature_selection)
df_results_without_sampling = pd.DataFrame(results_without_sampling)
df_results_without_weighted_learning = pd.DataFrame(results_without_weighted_learning)

output_file = 'dataPROMISE_results.xlsx'

# Excel limits worksheet names to 31 characters. The original names
# 'Results_Without_Feature_Selection' and 'Results_Without_Weighted_Learning'
# are 33 characters long, so those two are shortened; the others are unchanged.
sheets = [
    ('Results', df_results),
    ('Results_Without_Feat_Selection', df_results_without_feature_selection),
    ('Results_Without_Sampling', df_results_without_sampling),
    ('Results_Without_Weighted_Learn', df_results_without_weighted_learning),
]

# Write every variant to its own sheet of a single workbook.
with pd.ExcelWriter(output_file) as writer:
    for sheet_name, frame in sheets:
        frame.to_excel(writer, sheet_name=sheet_name, index=False)



In [165]:
# Directory holding the NASA defect datasets (one CSV file per project).
folder_path_nasa = "dataNASA/"

# Only CSV files are datasets; anything else in the folder is ignored.
files_nasa = [f for f in os.listdir(folder_path_nasa) if f.endswith('.csv')]

# Sorted so the result rows come out in a stable order; os.listdir() order is arbitrary.
file_list_nasa = sorted(files_nasa)

# Kept for backward compatibility. The original built this from the PROMISE
# list `files` instead of the NASA files; its prefix filter was
# startswith(''), which matches every file.
ant_files_nasa = list(file_list_nasa)

print(file_list_nasa)

['CM1.csv', 'KC1.csv', 'KC3.csv', 'MC1.csv', 'MC2.csv', 'MW1.csv', 'PC1.csv', 'PC3.csv', 'PC4.csv', 'PC5.csv']


In [149]:
# One result table per experiment variant (full pipeline plus three ablations).
results = []
results_without_feature_selection = []
results_without_sampling = []
results_without_weighted_learning = []

for file in file_list_nasa:
    # Skip anything in the folder that is not a dataset.
    if not file.endswith('.csv'):
        continue

    # The Train_Cross* functions read file_list[0], so each NASA file is passed
    # as a one-element list together with the NASA folder. The original passed
    # `versions` and `folder_path` left over from the PROMISE loop, so every
    # row was computed on the same PROMISE file.
    single_file = [file]
    avg_score = Train_Cross(single_file, folder_path_nasa)
    avg_score_without_feature_selection = Train_Cross_Without_Feature_Selection(single_file, folder_path_nasa)
    avg_score_without_sampling = Train_Cross_Without_Sampling(single_file, folder_path_nasa)
    avg_score_without_weighted_learning = Train_Cross_Without_Weighted_Learning(single_file, folder_path_nasa)

    results.append({"Project": file, "F1_Score": avg_score})
    results_without_feature_selection.append({"Project": file, "F1_Score": avg_score_without_feature_selection})
    results_without_sampling.append({"Project": file, "F1_Score": avg_score_without_sampling})
    results_without_weighted_learning.append({"Project": file, "F1_Score": avg_score_without_weighted_learning})

# Mean of the per-dataset scores of each variant.
avg_results = np.mean([result["F1_Score"] for result in results])
avg_results_without_feature_selection = np.mean([result["F1_Score"] for result in results_without_feature_selection])
avg_results_without_sampling = np.mean([result["F1_Score"] for result in results_without_sampling])
avg_results_without_weighted_learning = np.mean([result["F1_Score"] for result in results_without_weighted_learning])

# Final "Avg" row of each table.
results.append({"Project": "Avg", "F1_Score": avg_results})
results_without_feature_selection.append({"Project": "Avg", "F1_Score": avg_results_without_feature_selection})
results_without_sampling.append({"Project": "Avg", "F1_Score": avg_results_without_sampling})
results_without_weighted_learning.append({"Project": "Avg", "F1_Score": avg_results_without_weighted_learning})

In [150]:
# Turn the collected result rows into one DataFrame per experiment variant.
df_results = pd.DataFrame(results)
df_results_without_feature_selection = pd.DataFrame(results_without_feature_selection)
df_results_without_sampling = pd.DataFrame(results_without_sampling)
df_results_without_weighted_learning = pd.DataFrame(results_without_weighted_learning)

output_file = 'dataNASA_results.xlsx'

# Excel limits worksheet names to 31 characters. The original names
# 'Results_Without_Feature_Selection' and 'Results_Without_Weighted_Learning'
# are 33 characters long, so those two are shortened; the others are unchanged.
sheets = [
    ('Results', df_results),
    ('Results_Without_Feat_Selection', df_results_without_feature_selection),
    ('Results_Without_Sampling', df_results_without_sampling),
    ('Results_Without_Weighted_Learn', df_results_without_weighted_learning),
]

# Write every variant to its own sheet of a single workbook.
with pd.ExcelWriter(output_file) as writer:
    for sheet_name, frame in sheets:
        frame.to_excel(writer, sheet_name=sheet_name, index=False)

