In [None]:
import warnings
# Install a blanket "ignore" filter so no warning of any category is shown in
# the cell outputs from here on.
# NOTE(review): this also hides warnings that may matter (presumably
# convergence / deprecation notices from the ML libraries) — confirm intended.
warnings.filterwarnings('ignore')

# Table and CSV Creator

In [None]:
import pandas as pd

def create_result_table(method_name, gauss_best, knn_best, xgb_best, rf_best, svm_best, log_best):
    """Build a one-column summary table of the best F1 score of each model.

    Parameters
    ----------
    method_name
        Label of the dataset being summarised. It is only used as the key of
        the temporary header row.
    gauss_best, knn_best, xgb_best, rf_best, svm_best, log_best
        Best F1 score of the corresponding model.

    Returns
    -------
    pandas.DataFrame
        One row per model (index: 'GaussianNB', 'KNN', 'XGBoost',
        'Random Forest', 'SVM', 'Logistic Regression') and a single column
        labelled 'f1_score'.
    """
    # The first entry is a header in disguise: once the frame is transposed,
    # its value ('f1_score') is promoted to the column label and the row
    # itself is discarded.
    wide = pd.DataFrame({
        f'{method_name}': 'f1_score',
        'GaussianNB': [gauss_best],
        'KNN': [knn_best],
        'XGBoost': [xgb_best],
        'Random Forest': [rf_best],
        'SVM': [svm_best],
        'Logistic Regression': [log_best],
    })

    # Models become rows, the single original row becomes the only column.
    tall = wide.transpose()

    # Promote the header row to column labels, then keep only the model rows.
    tall.columns = tall.iloc[0]
    return tall.iloc[1:]


In [None]:
import csv

def csv_creator(method_name):
    """Write the current best F1 scores to the dataset's values CSV.

    The scores are read from the module-level globals ``gauss_best``,
    ``knn_best``, ``xgb_best``, ``rf_best``, ``svm_best`` and ``log_best``.
    The file ``model_data/<method_name>/<method_name>_values.csv`` is
    overwritten with a header row followed by one data row; every score is
    formatted with three decimals. The target directory is not created here,
    so it has to exist already.

    Parameters
    ----------
    method_name
        Dataset label; used both in the file path and as the 'Method' value.
    """
    global gauss_best, knn_best, xgb_best, rf_best, svm_best, log_best

    # (column name, current value) pairs, in the order they appear in the file.
    best_scores = (
        ('gauss_best', gauss_best),
        ('knn_best', knn_best),
        ('xgb_best', xgb_best),
        ('rf_best', rf_best),
        ('svm_best', svm_best),
        ('log_best', log_best),
    )
    header = ['Method'] + [label for label, _ in best_scores]
    record = [method_name] + [f'{score:.3f}' for _, score in best_scores]

    target = f'model_data/{method_name}/{method_name}_values.csv'

    # newline='' lets the csv module control line endings itself.
    with open(target, 'w', newline='') as out:
        csv.writer(out).writerows([header, record])

# Model Practitioner

In [None]:
import os
from shutil import rmtree
from joblib import dump
from sklearn.metrics import f1_score

def save_data(X_train, X_test, y_train, y_test, f1_value, folder_path):
    """Dump the four train/test splits as CSV files into ``folder_path``.

    Each split is wrapped in a DataFrame and written without the index to
    ``<folder_path>/<f1_value:.3f>_<split>.csv`` where ``<split>`` is one of
    ``X_train``, ``X_test``, ``y_train``, ``y_test``.

    Parameters
    ----------
    X_train, X_test, y_train, y_test
        Anything ``pandas.DataFrame`` can be constructed from.
    f1_value
        Score embedded (three decimals) in every file name.
    folder_path
        Existing directory that receives the files.
    """
    prefix = f'{folder_path}/{f1_value:.3f}'
    splits = (
        ('X_train', X_train),
        ('X_test', X_test),
        ('y_train', y_train),
        ('y_test', y_test),
    )
    for split_name, split_values in splits:
        pd.DataFrame(split_values).to_csv(f'{prefix}_{split_name}.csv', index=False)

def train_and_save_model(model, method_name, X_train, X_test, y_train, y_test, model_name, model_best):
    """Fit ``model``, score it on the test split and persist it if it is a new best.

    The score is the weighted F1 of the predictions on ``X_test``. When it is
    strictly greater than ``model_best``, the folder
    ``model_data/<method_name>/<model_name.lower()>`` is wiped and recreated,
    the four data splits are written into it via ``save_data`` and the fitted
    model is dumped next to them as ``<model_name.lower()>-<score:.3f>.joblib``.

    Parameters
    ----------
    model
        Estimator exposing ``fit`` and ``predict``.
    method_name
        Dataset label used in the output path.
    X_train, X_test, y_train, y_test
        Train/test features and labels.
    model_name
        Display name of the model; lower-cased for folder and file names.
    model_best
        Best score seen so far for this model.

    Returns
    -------
    The new score if it beat ``model_best``, otherwise ``model_best`` unchanged.
    """
    fitted = model.fit(X_train, y_train)
    predictions = fitted.predict(X_test)
    score = f1_score(y_test, predictions, average='weighted')

    # Not an improvement: leave whatever is already on disk untouched.
    if not score > model_best:
        return model_best

    slug = model_name.lower()
    out_dir = f'model_data/{method_name}/{slug}'

    # Start from an empty folder so files of the previous best (whose names
    # embed the old score) do not linger next to the new ones.
    if os.path.exists(out_dir):
        rmtree(out_dir)
    os.makedirs(out_dir)

    save_data(X_train, X_test, y_train, y_test, score, out_dir)
    dump(fitted, f'{out_dir}/{slug}-{score:.3f}.joblib')

    return score


# Models

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

def logistic_regression(method_name, X_train, X_test, y_train, y_test):
    """Train a default ``LogisticRegression`` and refresh the global ``log_best``.

    ``y_train`` is flattened with ``.ravel()`` before being handed to
    ``train_and_save_model``; the value that function returns becomes the new
    ``log_best``.
    """
    global log_best

    log_best = train_and_save_model(
        LogisticRegression(),
        method_name,
        X_train,
        X_test,
        y_train.ravel(),
        y_test,
        'Logistic Regression',
        log_best,
    )

def logistic_regression_tuned(method_name, X_train, X_test, y_train, y_test):
    """Train a ``LogisticRegression`` with fixed hyperparameters and refresh ``log_best``.

    Uses ``penalty='l2'`` and ``C=1.0``. ``y_train`` is flattened with
    ``.ravel()``; the value returned by ``train_and_save_model`` is stored in
    the global ``log_best``.
    """
    global log_best

    hyperparameters = {'penalty': 'l2', 'C': 1.0}

    log_best = train_and_save_model(
        LogisticRegression(**hyperparameters),
        method_name,
        X_train,
        X_test,
        y_train.ravel(),
        y_test,
        'Logistic Regression',
        log_best,
    )


## XGB

In [None]:
from xgboost import XGBClassifier

def xgb_classifier(method_name, X_train, X_test, y_train, y_test):
    """Train a default ``XGBClassifier`` and refresh the global ``xgb_best``.

    ``y_train`` is forwarded as given (it is not flattened here); the value
    returned by ``train_and_save_model`` becomes the new ``xgb_best``.
    """
    global xgb_best

    xgb_best = train_and_save_model(
        XGBClassifier(),
        method_name,
        X_train,
        X_test,
        y_train,
        y_test,
        'XGB Classifier',
        xgb_best,
    )

def xgb_classifier_tuned(method_name, X_train, X_test, y_train, y_test):
    """Train an ``XGBClassifier`` with fixed hyperparameters and refresh ``xgb_best``.

    Hyperparameters: 50 estimators, learning rate 0.01, max depth 3,
    subsample 0.8, colsample_bytree 1.0. ``y_train`` is forwarded as given;
    the value returned by ``train_and_save_model`` is stored in the global
    ``xgb_best``.
    """
    global xgb_best

    hyperparameters = {
        'n_estimators': 50,
        'learning_rate': 0.01,
        'max_depth': 3,
        'subsample': 0.8,
        'colsample_bytree': 1.0,
    }

    xgb_best = train_and_save_model(
        XGBClassifier(**hyperparameters),
        method_name,
        X_train,
        X_test,
        y_train,
        y_test,
        'XGB Classifier',
        xgb_best,
    )

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

def random_forest_classifier(method_name, X_train, X_test, y_train, y_test):
    """Train a default ``RandomForestClassifier`` and refresh the global ``rf_best``.

    ``y_train`` is flattened with ``.ravel()``; the value returned by
    ``train_and_save_model`` becomes the new ``rf_best``.
    """
    global rf_best

    rf_best = train_and_save_model(
        RandomForestClassifier(),
        method_name,
        X_train,
        X_test,
        y_train.ravel(),
        y_test,
        'Random Forest Classifier',
        rf_best,
    )

def random_forest_classifier_tuned(method_name, X_train, X_test, y_train, y_test):
    """Train a ``RandomForestClassifier`` with fixed hyperparameters and refresh ``rf_best``.

    Uses ``min_samples_split=5`` and ``n_estimators=50``. ``y_train`` is
    flattened with ``.ravel()``; the value returned by
    ``train_and_save_model`` is stored in the global ``rf_best``.
    """
    global rf_best

    hyperparameters = {'min_samples_split': 5, 'n_estimators': 50}

    rf_best = train_and_save_model(
        RandomForestClassifier(**hyperparameters),
        method_name,
        X_train,
        X_test,
        y_train.ravel(),
        y_test,
        'Random Forest Classifier',
        rf_best,
    )


## SVM

In [None]:
from sklearn.svm import SVC

def svm(method_name, X_train, X_test, y_train, y_test):
    """Train a default ``SVC`` and refresh the global ``svm_best``.

    ``y_train`` is flattened with ``.ravel()``; the value returned by
    ``train_and_save_model`` becomes the new ``svm_best``.
    """
    global svm_best

    svm_best = train_and_save_model(
        SVC(),
        method_name,
        X_train,
        X_test,
        y_train.ravel(),
        y_test,
        'SVM',
        svm_best,
    )

def svm_tuned(method_name, X_train, X_test, y_train, y_test):
    """Train an ``SVC`` with fixed hyperparameters and refresh ``svm_best``.

    Uses ``C=1`` and a linear kernel. ``y_train`` is flattened with
    ``.ravel()``; the value returned by ``train_and_save_model`` is stored in
    the global ``svm_best``.
    """
    global svm_best

    hyperparameters = {'C': 1, 'kernel': 'linear'}

    svm_best = train_and_save_model(
        SVC(**hyperparameters),
        method_name,
        X_train,
        X_test,
        y_train.ravel(),
        y_test,
        'SVM',
        svm_best,
    )

## Gaussian Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

def gaussian_nb(method_name, X_train, X_test, y_train, y_test):
    """Train a default ``GaussianNB`` and refresh the global ``gauss_best``.

    ``y_train`` is flattened with ``.ravel()``; the value returned by
    ``train_and_save_model`` becomes the new ``gauss_best``.
    """
    global gauss_best

    gauss_best = train_and_save_model(
        GaussianNB(),
        method_name,
        X_train,
        X_test,
        y_train.ravel(),
        y_test,
        'GaussianNB',
        gauss_best,
    )

## KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

def k_neigbors_classifier(method_name, X_train, X_test, y_train, y_test):
    """Train a default ``KNeighborsClassifier`` and refresh the global ``knn_best``.

    ``y_train`` is flattened with ``.ravel()``; the value returned by
    ``train_and_save_model`` becomes the new ``knn_best``.
    """
    global knn_best

    knn_best = train_and_save_model(
        KNeighborsClassifier(),
        method_name,
        X_train,
        X_test,
        y_train.ravel(),
        y_test,
        'KNeighborsClassifier',
        knn_best,
    )

def k_neigbors_classifier_tuned(method_name, X_train, X_test, y_train, y_test):
    """Train a ``KNeighborsClassifier`` with fixed hyperparameters and refresh ``knn_best``.

    Uses ``leaf_size=10``, ``n_neighbors=3`` and ``p=1``. ``y_train`` is
    flattened with ``.ravel()``; the value returned by
    ``train_and_save_model`` is stored in the global ``knn_best``.
    """
    global knn_best

    hyperparameters = {'leaf_size': 10, 'n_neighbors': 3, 'p': 1}

    knn_best = train_and_save_model(
        KNeighborsClassifier(**hyperparameters),
        method_name,
        X_train,
        X_test,
        y_train.ravel(),
        y_test,
        'KNeighborsClassifier',
        knn_best,
    )

# Dataset Preparator

## Get Best F1 Score For Every Model

In [None]:
def get_best_f1_score(method_name):
    """Load the stored best F1 scores for a dataset, creating the store if needed.

    The scores live in ``model_data/<method_name>/<method_name>_values.csv``.
    If the directory is missing it is created; if the file is missing it is
    created with a single row whose scores are all 0.

    Parameters
    ----------
    method_name
        Dataset label used to build the path (and stored in the 'Method'
        column when the file is created).

    Returns
    -------
    tuple
        ``(gauss_best, knn_best, xgb_best, rf_best, svm_best, log_best)``
        taken from the first row of the file.
    """
    score_columns = ["gauss_best", "knn_best", "xgb_best", "rf_best", "svm_best", "log_best"]
    csv_file_path = f"model_data/{method_name}/{method_name}_values.csv"

    # Make sure the folder that holds the file is there.
    parent = os.path.dirname(csv_file_path)
    if not os.path.exists(parent):
        os.makedirs(parent)

    # First run for this dataset: seed the file with all-zero scores.
    if not os.path.exists(csv_file_path):
        defaults = {"Method": [method_name]}
        defaults.update({column: [0] for column in score_columns})
        pd.DataFrame(defaults).to_csv(csv_file_path, index=False)

    stored = pd.read_csv(csv_file_path)
    scores = stored[score_columns]

    # Read each column separately so every value keeps its own column dtype.
    return tuple(scores[column].iloc[0] for column in score_columns)

## Data Preprocessing

In [None]:
def data_preprocessing(df, dominance_threshold=80, min_abs_correlation=0.01, max_rows=3000, random_state=None):
    """Clean a raw class-level frame and return a shuffled, size-capped copy.

    Steps, in order:

    1. Drop every column (except "Number of Bugs") in which a single value
       accounts for more than ``dominance_threshold`` percent of the
       non-missing entries.
    2. Drop the identifier columns 'Hash' and 'LongName' when present.
    3. Cap "Number of Bugs" at 1 (values greater than 1 become 1, everything
       else is kept as is).
    4. Drop columns whose absolute correlation with the last numeric column
       is below ``min_abs_correlation``.
       NOTE(review): the last column is presumably "Number of Bugs" — confirm
       against the input files.
    5. Shuffle the rows, reset the index and keep at most ``max_rows`` rows.

    The input frame is never modified; all work happens on a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data; must contain a "Number of Bugs" column.
    dominance_threshold : float, default 80
        Percentage above which a column counts as (near-)constant.
    min_abs_correlation : float, default 0.01
        Minimum absolute correlation a column needs to be kept.
    max_rows : int, default 3000
        Maximum number of rows in the result.
    random_state : optional
        Forwarded to ``DataFrame.sample``; pass a value for a reproducible
        shuffle. The default (None) keeps the shuffle random.

    Returns
    -------
    pandas.DataFrame
        The preprocessed frame.

    Raises
    ------
    KeyError
        If "Number of Bugs" is missing.
    """
    target = "Number of Bugs"

    # Work on a copy so the caller's frame is not left half-processed.
    df = df.copy()

    # 1. (Near-)constant columns carry little information; the target is exempt.
    dominated = [
        column
        for column in df.columns
        if column != target
        and df[column].value_counts(normalize=True).max() * 100 > dominance_threshold
    ]
    df = df.drop(columns=dominated)

    # 2. Identifier columns; tolerate their absence instead of raising.
    df = df.drop(columns=['Hash', 'LongName'], errors='ignore')

    # 3. Cap the target at 1.
    df[target] = df[target].clip(upper=1)

    # 4. Correlate numeric columns only: DataFrame.corr() raises on string
    #    columns in pandas >= 2.0, while older versions silently skipped them.
    numeric = df.select_dtypes(include=['number', 'bool'])
    correlation = numeric.corr().iloc[:-1, -1].abs()
    weak = correlation[correlation < min_abs_correlation].index
    df = df.drop(columns=weak)

    # 5. Shuffle, renumber the rows and cap the size.
    df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    return df.head(max_rows)

## Main Function

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

def main(df, method_name):
    """Run one full train/evaluate pass of every model on ``df``.

    The last column of ``df`` is used as the target, all other columns as
    features. The data is split 80/20, the features are min-max scaled, each
    model (default and tuned variants) is trained through its wrapper
    function, and finally the best scores are written to the dataset's CSV
    via ``csv_creator``.

    The wrappers read and update the module-level ``*_best`` globals, so
    those must be initialised before calling this function.

    Parameters
    ----------
    df : pandas.DataFrame
        Preprocessed data with the target in the last column.
    method_name
        Dataset label used for all output paths.
    """
    # Separate features (X) and target variable (y, as a column vector)
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1].values.reshape(-1, 1)

    # Split the dataset into training and testing sets.
    # NOTE(review): random_state is fixed, so the same df yields the same
    # split on every call; repeated calls only differ for estimators that are
    # themselves randomised — confirm this is intended.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=144, shuffle=True)

    # Fit the scaler on the training split only and reuse its min/max for the
    # test split. Re-fitting on the test split would scale the two splits
    # differently and feed the models test features on another scale than
    # the one they were trained on.
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Perform original and tuned models
    logistic_regression(method_name, X_train_scaled, X_test_scaled, y_train, y_test)
    logistic_regression_tuned(method_name, X_train_scaled, X_test_scaled, y_train, y_test)
    xgb_classifier(method_name, X_train_scaled, X_test_scaled, y_train, y_test)
    xgb_classifier_tuned(method_name, X_train_scaled, X_test_scaled, y_train, y_test)
    random_forest_classifier(method_name, X_train_scaled, X_test_scaled, y_train, y_test)
    random_forest_classifier_tuned(method_name, X_train_scaled, X_test_scaled, y_train, y_test)
    svm(method_name, X_train_scaled, X_test_scaled, y_train, y_test)
    svm_tuned(method_name, X_train_scaled, X_test_scaled, y_train, y_test)
    # GaussianNB is the only model given the unscaled features.
    gaussian_nb(method_name, X_train, X_test, y_train, y_test)
    k_neigbors_classifier(method_name, X_train_scaled, X_test_scaled, y_train, y_test)
    k_neigbors_classifier_tuned(method_name, X_train_scaled, X_test_scaled, y_train, y_test)

    # Persist the (possibly updated) best scores for this dataset
    csv_creator(method_name)

# Methods

## Android Universal Image Loader

In [None]:
# Load the class-level data for this project and run the shared preprocessing.
df= pd.read_csv('data/Android-Universal-Image-Loader/class.csv')

df = data_preprocessing(df)

# Start from the best F1 scores stored by earlier runs (all zeros when no CSV
# exists yet). These module-level names are read and updated by the model
# functions through `global`.
gauss_best, knn_best, xgb_best, rf_best, svm_best, log_best = get_best_f1_score("auil")

# Run the whole pipeline at least 50 times; smaller datasets get more runs
# (150000 // len(df)).
for _ in range(max(50,150000 // len(df))):
    main(df, "auil")

# Last expression of the cell: the table of best F1 scores for this dataset.
create_result_table("auil", gauss_best, knn_best, xgb_best, rf_best, svm_best, log_best)

## Antlr v4

In [None]:
# Load the class-level data for this project and run the shared preprocessing.
df = pd.read_csv('data/antlr4/class.csv')

df = data_preprocessing(df)

# Start from the best F1 scores stored by earlier runs (all zeros when no CSV
# exists yet). These module-level names are read and updated by the model
# functions through `global`.
gauss_best, knn_best, xgb_best, rf_best, svm_best, log_best = get_best_f1_score("antlr4")

# Run the whole pipeline at least 50 times; smaller datasets get more runs
# (150000 // len(df)).
for _ in range(max(50,150000 // len(df))):
    main(df, "antlr4")

# Last expression of the cell: the table of best F1 scores for this dataset.
create_result_table("antlr4", gauss_best, knn_best, xgb_best, rf_best, svm_best, log_best)

## Broadleaf Commerce

In [None]:
# Load the class-level data for this project and run the shared preprocessing.
df = pd.read_csv('data/BroadleafCommerce/class.csv')

df = data_preprocessing(df)

# Start from the best F1 scores stored by earlier runs (all zeros when no CSV
# exists yet). These module-level names are read and updated by the model
# functions through `global`.
gauss_best, knn_best, xgb_best, rf_best, svm_best, log_best = get_best_f1_score("BroadleafCommerce")

# Run the whole pipeline at least 50 times; smaller datasets get more runs
# (150000 // len(df)).
for _ in range(max(50,150000 // len(df))):
    main(df, "BroadleafCommerce")

# Last expression of the cell: the table of best F1 scores for this dataset.
create_result_table("BroadleafCommerce", gauss_best, knn_best, xgb_best, rf_best, svm_best, log_best)

## Ceylon IDE Eclipse

In [None]:
# Load the class-level data for this project and run the shared preprocessing.
df = pd.read_csv('data/ceylon-ide-eclipse/class.csv')

df = data_preprocessing(df)

# Start from the best F1 scores stored by earlier runs (all zeros when no CSV
# exists yet). These module-level names are read and updated by the model
# functions through `global`.
gauss_best, knn_best, xgb_best, rf_best, svm_best, log_best = get_best_f1_score("ceylon-ide-eclipse")

# Run the whole pipeline at least 50 times; smaller datasets get more runs
# (150000 // len(df)).
for _ in range(max(50,150000 // len(df))):
    main(df, "ceylon-ide-eclipse")

# Last expression of the cell: the table of best F1 scores for this dataset.
create_result_table("ceylon-ide-eclipse", gauss_best, knn_best, xgb_best, rf_best, svm_best, log_best)

## Elastic Search

In [None]:
# Load the class-level data for this project and run the shared preprocessing.
df = pd.read_csv('data/elasticsearch/class.csv')

df = data_preprocessing(df)

# Start from the best F1 scores stored by earlier runs (all zeros when no CSV
# exists yet). These module-level names are read and updated by the model
# functions through `global`.
gauss_best, knn_best, xgb_best, rf_best, svm_best, log_best = get_best_f1_score("elastic-search")

# Run the whole pipeline at least 50 times; smaller datasets get more runs
# (150000 // len(df)).
for _ in range(max(50,150000 // len(df))):
    main(df, "elastic-search")

# Last expression of the cell: the table of best F1 scores for this dataset.
create_result_table("elastic-search", gauss_best, knn_best, xgb_best, rf_best, svm_best, log_best)

## Hazelcast

In [None]:
# Load the class-level data for this project and run the shared preprocessing.
df = pd.read_csv('data/hazelcast/class.csv')

df = data_preprocessing(df)

# Start from the best F1 scores stored by earlier runs (all zeros when no CSV
# exists yet). These module-level names are read and updated by the model
# functions through `global`.
gauss_best, knn_best, xgb_best, rf_best, svm_best, log_best = get_best_f1_score("hazelcast")

# Run the whole pipeline at least 50 times; smaller datasets get more runs
# (150000 // len(df)).
for _ in range(max(50,150000 // len(df))):
    main(df, "hazelcast")

# Last expression of the cell: the table of best F1 scores for this dataset.
create_result_table("hazelcast", gauss_best, knn_best, xgb_best, rf_best, svm_best, log_best)

## jUnit

In [None]:
# Load the class-level data for this project and run the shared preprocessing.
df = pd.read_csv('data/junit/class.csv')

df = data_preprocessing(df)

# Start from the best F1 scores stored by earlier runs (all zeros when no CSV
# exists yet). These module-level names are read and updated by the model
# functions through `global`.
gauss_best, knn_best, xgb_best, rf_best, svm_best, log_best = get_best_f1_score("jUnit")

# Run the whole pipeline at least 50 times; smaller datasets get more runs
# (150000 // len(df)).
for _ in range(max(50,150000 // len(df))):
    main(df, "jUnit")

# Last expression of the cell: the table of best F1 scores for this dataset.
create_result_table("jUnit", gauss_best, knn_best, xgb_best, rf_best, svm_best, log_best)

## MapDB

In [None]:
# Load the class-level data for this project and run the shared preprocessing.
df = pd.read_csv('data/MapDB/class.csv')

df = data_preprocessing(df)

# Start from the best F1 scores stored by earlier runs (all zeros when no CSV
# exists yet). These module-level names are read and updated by the model
# functions through `global`.
gauss_best, knn_best, xgb_best, rf_best, svm_best, log_best = get_best_f1_score("MapDB")

# Run the whole pipeline at least 50 times; smaller datasets get more runs
# (150000 // len(df)).
for _ in range(max(50,150000 // len(df))):
    main(df, "MapDB")

# Last expression of the cell: the table of best F1 scores for this dataset.
create_result_table("MapDB", gauss_best, knn_best, xgb_best, rf_best, svm_best, log_best)

## mcMMO

In [None]:
# Load the class-level data for this project and run the shared preprocessing.
df = pd.read_csv('data/mcMMO/class.csv')

df = data_preprocessing(df)

# Start from the best F1 scores stored by earlier runs (all zeros when no CSV
# exists yet). These module-level names are read and updated by the model
# functions through `global`.
gauss_best, knn_best, xgb_best, rf_best, svm_best, log_best = get_best_f1_score("mcMMO")

# Run the whole pipeline at least 50 times; smaller datasets get more runs
# (150000 // len(df)).
for _ in range(max(50,150000 // len(df))):
    main(df, "mcMMO")

# Last expression of the cell: the table of best F1 scores for this dataset.
create_result_table("mcMMO", gauss_best, knn_best, xgb_best, rf_best, svm_best, log_best)

## MCT

In [None]:
# Load the class-level data for this project and run the shared preprocessing.
df = pd.read_csv('data/mct/class.csv')

df = data_preprocessing(df)

# Start from the best F1 scores stored by earlier runs (all zeros when no CSV
# exists yet). These module-level names are read and updated by the model
# functions through `global`.
gauss_best, knn_best, xgb_best, rf_best, svm_best, log_best = get_best_f1_score("MCT")

# Run the whole pipeline at least 50 times; smaller datasets get more runs
# (150000 // len(df)).
for _ in range(max(50,150000 // len(df))):
    main(df, "MCT")

# Last expression of the cell: the table of best F1 scores for this dataset.
create_result_table("MCT", gauss_best, knn_best, xgb_best, rf_best, svm_best, log_best)

## neo4J

In [None]:
# Load the class-level data for this project and run the shared preprocessing.
df = pd.read_csv('data/neo4j/class.csv')

df = data_preprocessing(df)

# Start from the best F1 scores stored by earlier runs (all zeros when no CSV
# exists yet). These module-level names are read and updated by the model
# functions through `global`.
gauss_best, knn_best, xgb_best, rf_best, svm_best, log_best = get_best_f1_score("neo4J")

# Run the whole pipeline at least 50 times; smaller datasets get more runs
# (150000 // len(df)).
for _ in range(max(50,150000 // len(df))):
    main(df, "neo4J")

# Last expression of the cell: the table of best F1 scores for this dataset.
create_result_table("neo4J", gauss_best, knn_best, xgb_best, rf_best, svm_best, log_best)

## Netty

In [None]:
# Load the class-level data for this project and run the shared preprocessing.
df = pd.read_csv('data/netty/class.csv')

df = data_preprocessing(df)

# Start from the best F1 scores stored by earlier runs (all zeros when no CSV
# exists yet). These module-level names are read and updated by the model
# functions through `global`.
gauss_best, knn_best, xgb_best, rf_best, svm_best, log_best = get_best_f1_score("Netty")

# Run the whole pipeline at least 50 times; smaller datasets get more runs
# (150000 // len(df)).
for _ in range(max(50,150000 // len(df))):
    main(df, "Netty")

# Last expression of the cell: the table of best F1 scores for this dataset.
create_result_table("Netty", gauss_best, knn_best, xgb_best, rf_best, svm_best, log_best)

## orientDB

In [None]:
# Load the class-level data for this project and run the shared preprocessing.
df = pd.read_csv('data/orientdb/class.csv')

df = data_preprocessing(df)

# Start from the best F1 scores stored by earlier runs (all zeros when no CSV
# exists yet). These module-level names are read and updated by the model
# functions through `global`.
gauss_best, knn_best, xgb_best, rf_best, svm_best, log_best = get_best_f1_score("orientDB")

# Run the whole pipeline at least 50 times; smaller datasets get more runs
# (150000 // len(df)).
for _ in range(max(50,150000 // len(df))):
    main(df, "orientDB")

# Last expression of the cell: the table of best F1 scores for this dataset.
create_result_table("orientDB", gauss_best, knn_best, xgb_best, rf_best, svm_best, log_best)

## Oryx

In [None]:
# Load the class-level data for this project and run the shared preprocessing.
df = pd.read_csv('data/oryx/class.csv')

df = data_preprocessing(df)

# Start from the best F1 scores stored by earlier runs (all zeros when no CSV
# exists yet). These module-level names are read and updated by the model
# functions through `global`.
gauss_best, knn_best, xgb_best, rf_best, svm_best, log_best = get_best_f1_score("Oryx")

# Run the whole pipeline at least 50 times; smaller datasets get more runs
# (150000 // len(df)).
for _ in range(max(50,150000 // len(df))):
    main(df, "Oryx")

# Last expression of the cell: the table of best F1 scores for this dataset.
create_result_table("Oryx", gauss_best, knn_best, xgb_best, rf_best, svm_best, log_best)

## Titan

In [None]:
# Load the class-level data for this project and run the shared preprocessing.
df = pd.read_csv('data/titan/class.csv')

df = data_preprocessing(df)

# Start from the best F1 scores stored by earlier runs (all zeros when no CSV
# exists yet). These module-level names are read and updated by the model
# functions through `global`.
gauss_best, knn_best, xgb_best, rf_best, svm_best, log_best = get_best_f1_score("Titan")

# Run the whole pipeline at least 50 times; smaller datasets get more runs
# (150000 // len(df)).
for _ in range(max(50,150000 // len(df))):
    main(df, "Titan")

# Last expression of the cell: the table of best F1 scores for this dataset.
create_result_table("Titan", gauss_best, knn_best, xgb_best, rf_best, svm_best, log_best)