In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import matthews_corrcoef


from sklearn import svm
from sklearn.svm import SVC
from sklearn.utils import shuffle
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer
from scipy.stats import wilcoxon
import numpy as np
import pandas as pd
from sklearn.metrics import make_scorer
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import make_scorer

In [None]:
# Colab-only: mount Google Drive so the dataset CSVs under /content/drive can be read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Decision Tree

In [None]:
# Path of the Large Class smell dataset on the mounted Drive.
LargeClassSmell = "/content/drive/MyDrive/Colab files/Python_LargeClassSmell_Dataset.csv"

In [None]:
def Decision_Tree_k10(LargeClassSmell):
    """Tune a decision tree's depth and score it with repeated 10-fold CV.

    Reads the CSV at ``LargeClassSmell``; every column except the last is a
    feature and the last column is the label. ``max_depth`` is tuned by a
    randomized search that maximises MCC, then the best estimator is
    re-scored for MCC and accuracy on the same repeated stratified folds.

    Side effects: sets the globals ``DT_Scores`` and ``DT_accuracy`` (per-fold
    score arrays) and prints a summary. Returns nothing.
    """
    global DT_Scores, DT_accuracy

    frame = pd.read_csv(LargeClassSmell)
    features, labels = frame.iloc[:, 0:-1], frame.iloc[:, -1]

    depth_candidates = {'max_depth': list(np.arange(1, 100, step=10)) + [None]}
    mcc_scorer = make_scorer(matthews_corrcoef)
    folds = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=0)

    search = RandomizedSearchCV(
        DecisionTreeClassifier(),
        depth_candidates,
        cv=folds,
        return_train_score=True,
        scoring=mcc_scorer,
    )
    search.fit(features, labels)
    best_tree = search.best_estimator_

    # Note: the winner is evaluated on the same data and folds used to pick it.
    DT_Scores = cross_val_score(best_tree, features, labels, scoring=mcc_scorer, cv=folds)
    DT_accuracy = cross_val_score(best_tree, features, labels, scoring='accuracy', cv=folds)

    print("Tuned Decision Tree Parameters: {}".format(search.best_params_))
    print("Best MCC score is {}".format(DT_Scores.mean()))
    print("Best accuracy is {}".format(DT_accuracy.mean()))

## Random Forest


In [None]:
def Random_Forest_k10(LargeClassSmell):
    """Tune a random forest's size and score it with repeated 10-fold CV.

    Reads the CSV at ``LargeClassSmell``; every column except the last is a
    feature and the last column is the label. ``n_estimators`` is tuned by a
    randomized search (5 draws over 5 candidates) that maximises MCC, then
    the best estimator is re-scored for MCC and accuracy on the same folds.

    Side effects: sets the globals ``RF_Scores`` and ``RF_Accuracy`` (per-fold
    score arrays) and prints a summary. Returns nothing.
    """
    global RF_Scores, RF_Accuracy

    frame = pd.read_csv(LargeClassSmell)
    features = frame.iloc[:, 0:-1]
    labels = frame.iloc[:, -1]

    mcc_scorer = make_scorer(matthews_corrcoef)
    folds = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=0)
    search = RandomizedSearchCV(
        RandomForestClassifier(),
        {'n_estimators': [100, 200, 300, 400, 500]},
        cv=folds,
        scoring=mcc_scorer,
        n_iter=5,
    )
    search.fit(features, labels)
    best_forest = search.best_estimator_

    RF_Scores = cross_val_score(best_forest, features, labels, scoring=mcc_scorer, cv=folds)
    RF_Accuracy = cross_val_score(best_forest, features, labels, scoring='accuracy', cv=folds)

    print("Tuned Random Forest Parameters: {}".format(search.best_params_))
    print("Best MCC score is {}".format(RF_Scores.mean()))
    print("Best accuracy is {}".format(RF_Accuracy.mean()))


## Logistic Regression

In [None]:
def Logistic_Regression_k10(LargeClassSmell):
    """Tune logistic regression's ``C`` and score it with repeated 10-fold CV.

    The CSV at ``LargeClassSmell`` is shuffled with a fixed seed; every column
    except the last is a feature and the last column is the label. ``C`` is
    tuned by a randomized search that maximises MCC, then the best estimator
    is re-scored for MCC and accuracy on the same repeated stratified folds.

    Side effects: sets the globals ``LR_Scores`` and ``LR_Accuracy`` (per-fold
    score arrays) and prints a summary. Returns nothing.
    """
    global LR_Scores
    global LR_Accuracy
    Dataset = pd.read_csv(LargeClassSmell)
    Dataset = shuffle(Dataset, random_state=0)
    # C must be strictly positive, so the grid is 0.01 .. 0.99; a grid that
    # starts at 0 can sample C=0, for which every fit of that candidate fails.
    param_dist = {'C': np.arange(1, 100) / 100}
    X = Dataset.iloc[:, 0:-1]
    Y = Dataset.iloc[:, -1]
    # The default iteration cap produced "ITERATIONS REACHED LIMIT" warnings
    # on this dataset, so the solver is given more iterations to converge.
    lr = LogisticRegression(max_iter=1000)
    rfolds = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=0)
    mcc = make_scorer(matthews_corrcoef)
    lr_cv = RandomizedSearchCV(lr, param_dist, cv=rfolds, scoring=mcc)
    lr_cv.fit(X, Y)
    LR_Scores = cross_val_score(lr_cv.best_estimator_, X, Y, scoring=mcc, cv=rfolds)
    LR_Accuracy = cross_val_score(lr_cv.best_estimator_, X, Y, scoring='accuracy', cv=rfolds)
    print("Tuned Logistic Regression Parameters: {}".format(lr_cv.best_params_))
    print("Best MCC score is {}".format(LR_Scores.mean()))
    print("Best accuracy is {}".format(LR_Accuracy.mean()))

## Support Vector Machine

In [None]:


def SVM_k10(LargeClassSmell):
    """Grid-search an SVM (C and kernel) and score the winner per fold.

    The CSV at ``LargeClassSmell`` is shuffled with a fixed seed; every column
    except the last is a feature and the last column is the label. The grid
    search maximises MCC on 5x5 repeated stratified folds; the best estimator
    is then scored for MCC and accuracy on 10x10 repeated stratified folds,
    matching the evaluation used for the other classifiers.

    Side effects: sets the globals ``SVM_Scores`` and ``SVM_Accuracy``
    (per-fold score arrays) and prints a summary. Returns nothing.
    """
    global SVM_Scores
    global SVM_Accuracy
    Dataset = pd.read_csv(LargeClassSmell)
    Dataset = shuffle(Dataset, random_state=0)
    param_grid = {'C': np.arange(0.1, 1.0, 0.1),
                  'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}
    X = Dataset.iloc[:, 0:-1]
    Y = Dataset.iloc[:, -1]

    svc = SVC()
    mcc = make_scorer(matthews_corrcoef)
    # The exhaustive search keeps its cheaper 5x5 folds.
    search_folds = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=0)
    svc_cv = GridSearchCV(svc, param_grid, cv=search_folds, scoring=mcc, n_jobs=-1)
    svc_cv.fit(X, Y)

    # cv_results_ holds one mean per candidate (and, with a single scorer, no
    # 'mean_test_accuracy' key at all). The Wilcoxon tests and boxplots need
    # fold-level arrays of the same length as the other models, so the best
    # estimator is re-scored on 10x10 folds.
    rfolds = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=0)
    SVM_Scores = cross_val_score(svc_cv.best_estimator_, X, Y, scoring=mcc, cv=rfolds)
    SVM_Accuracy = cross_val_score(svc_cv.best_estimator_, X, Y, scoring='accuracy', cv=rfolds)
    print("Tuned SVM Parameters: {}".format(svc_cv.best_params_))
    print("Best MCC score is {}".format(SVM_Scores.mean()))
    print("Best accuracy is {}".format(SVM_Accuracy.mean()))



## Multi-Layer Perceptron

In [None]:
def Neural_Network_k10(LargeClassSmell):
    """Score an untuned MLP classifier with repeated 10-fold CV.

    The CSV at ``LargeClassSmell`` is shuffled with a fixed seed; every column
    except the last is a feature and the last column is the label. No
    hyperparameter search is run here: a single ``MLPClassifier`` with
    ``max_iter=1000`` is cross-validated for MCC and then for accuracy.

    Side effects: sets the globals ``NN_Scores`` and ``NN_Accuracy`` (per-fold
    score arrays) and prints a summary. Returns nothing.
    """
    global NN_Scores, NN_Accuracy

    shuffled = shuffle(pd.read_csv(LargeClassSmell), random_state=0)
    features, labels = shuffled.iloc[:, :-1], shuffled.iloc[:, -1]

    folds = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=0)
    mcc_scorer = make_scorer(matthews_corrcoef)
    network = MLPClassifier(max_iter=1000)

    # cross_val_score fits a fresh clone of the network on every fold.
    NN_Scores = cross_val_score(network, features, labels, scoring=mcc_scorer, cv=folds)
    NN_Accuracy = cross_val_score(network, features, labels, scoring='accuracy', cv=folds)

    print("Best MCC score is {}".format(NN_Scores.mean()))
    print("Best accuracy is {}".format(NN_Accuracy.mean()))

## Stochastic Gradient Descent

In [None]:
def SGD_k10(LargeClassSmell):
    """Tune an SGD linear classifier and score it with repeated 10-fold CV.

    The CSV at ``LargeClassSmell`` is shuffled with a fixed seed; every column
    except the last is a feature and the last column is the label. Loss,
    penalty, ``alpha`` and learning-rate schedule are tuned by a randomized
    search that maximises MCC, then the best estimator is re-scored for MCC
    and accuracy on the same repeated stratified folds.

    Side effects: sets the globals ``SGD_Scores`` and ``SGD_Accuracy``
    (per-fold score arrays, not dicts) and prints a summary. Returns nothing.
    """
    global SGD_Scores
    global SGD_Accuracy
    Dataset = pd.read_csv(LargeClassSmell)
    Dataset = shuffle(Dataset, random_state=0)
    param_dist = {'loss': ['hinge', 'log_loss', 'modified_huber', 'squared_hinge', 'perceptron'],
                  # "No penalty" is spelled None; the string 'none' is not an
                  # accepted option in current scikit-learn releases.
                  'penalty': [None, 'l2', 'l1', 'elasticnet'],
                  'alpha': np.arange(0.0001, 0.01, 0.0001),
                  'learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive']}
    X = Dataset.iloc[:, 0:-1]
    Y = Dataset.iloc[:, -1]
    # The 'constant', 'invscaling' and 'adaptive' schedules require eta0 > 0;
    # without an explicit value those candidates fail to fit and score NaN.
    sgd = SGDClassifier(max_iter=1000, eta0=0.01)
    rfolds = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=0)
    mcc = make_scorer(matthews_corrcoef)
    sgd_cv = RandomizedSearchCV(sgd, param_dist, cv=rfolds, scoring=mcc)
    sgd_cv.fit(X, Y)
    SGD_Scores = cross_val_score(sgd_cv.best_estimator_, X, Y, scoring=mcc, cv=rfolds)
    SGD_Accuracy = cross_val_score(sgd_cv.best_estimator_, X, Y, scoring='accuracy', cv=rfolds)
    print("Tuned SGD Parameters: {}".format(sgd_cv.best_params_))
    print("Best MCC score is {}".format(SGD_Scores.mean()))
    print("Best accuracy is {}".format(SGD_Accuracy.mean()))


In [None]:
def wilcoxonTest(firstModelScore, secondModelScore, alpha=0.05):
    """Compare two paired score samples with a Wilcoxon signed-rank test.

    Parameters
    ----------
    firstModelScore, secondModelScore : array-like of equal length
        Paired scores of "Model1" and "Model2" (e.g. one value per CV fold).
    alpha : float, default 0.05
        Significance level; a winner is declared only when ``p <= alpha``.

    Returns
    -------
    dict
        Keys ``"winner: "``, ``"losser: "``, ``"statics: "`` and
        ``"p-Value: "`` (spelled exactly like that, trailing space included).
        Winner and loser are ``"Model1"``/``"Model2"`` chosen by the higher
        mean score, or ``" "`` for both when the difference is not
        significant.
    """
    # Accept lists and Series as well as ndarrays; .mean() is needed below.
    first = np.asarray(firstModelScore, dtype=float)
    second = np.asarray(secondModelScore, dtype=float)

    win, loss = " ", " "
    # 'zsplit' splits the ranks of zero differences between both signs.
    stat, p = wilcoxon(first, second, zero_method='zsplit')
    if p <= alpha:
        if first.mean() > second.mean():
            win, loss = "Model1", "Model2"
        else:
            win, loss = "Model2", "Model1"

    wilcoxonResults = {"winner: ": win, "losser: ": loss, "statics: ": stat, "p-Value: ": p}
    return wilcoxonResults

# Large Class Dataset

In [None]:
Decision_Tree_k10('/content/drive/MyDrive/Colab files/Python_LargeClassSmell_Dataset.csv')


Tuned Decision Tree Parameters: {'max_depth': 1}
Best MCC score is 0.8935090117309602
Best accuracy is 0.9558672086720865


In [None]:
Random_Forest_k10('/content/drive/MyDrive/Colab files/Python_LargeClassSmell_Dataset.csv')

Tuned Random Forest Parameters: {'n_estimators': 400}
Best MCC score is 0.8756300353380578
Best accuracy is 0.951225534477567


In [None]:
Logistic_Regression_k10('/content/drive/MyDrive/Colab files/Python_LargeClassSmell_Dataset.csv')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html


Tuned Logistic Regression Parameters: {'C': 0.14}
Best MCC score is 0.678384242311989
Best accuracy is 0.8821168322794339


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
SVM_k10('/content/drive/MyDrive/Colab files/Python_LargeClassSmell_Dataset.csv')

In [None]:
Neural_Network_k10('/content/drive/MyDrive/Colab files/Python_LargeClassSmell_Dataset.csv')


In [None]:
SGD_k10('/content/drive/MyDrive/Colab files/Python_LargeClassSmell_Dataset.csv')

## Large Class Wilcoxon Test

In [None]:
wilcoxonTest(DT_Scores, RF_Scores)

In [None]:
wilcoxonTest(DT_Scores,LR_Scores)

In [None]:
wilcoxonTest(DT_Scores,SVM_Scores)

In [None]:
wilcoxonTest(DT_Scores,NN_Scores)

In [None]:
# SGD_Scores is the per-fold array from cross_val_score, so it is passed as-is.
wilcoxonTest(DT_Scores, SGD_Scores)

In [None]:
wilcoxonTest(RF_Scores,LR_Scores)

In [None]:
wilcoxonTest(RF_Scores,SVM_Scores)

In [None]:
wilcoxonTest(RF_Scores,NN_Scores)

In [None]:
# SGD_Scores is the per-fold array from cross_val_score, so it is passed as-is.
wilcoxonTest(RF_Scores, SGD_Scores)

In [None]:
wilcoxonTest(LR_Scores,SVM_Scores)

In [None]:
wilcoxonTest(LR_Scores,NN_Scores)

In [None]:
# SGD_Scores is the per-fold array from cross_val_score, so it is passed as-is.
wilcoxonTest(LR_Scores, SGD_Scores)

In [None]:
wilcoxonTest(SVM_Scores,NN_Scores)

In [None]:
# SGD_Scores is the per-fold array from cross_val_score, so it is passed as-is.
wilcoxonTest(SVM_Scores, SGD_Scores)

In [None]:
# SGD_Scores is the per-fold array from cross_val_score, so it is passed as-is.
wilcoxonTest(NN_Scores, SGD_Scores)

# Example Wilcoxon test results


In [None]:

wilcoxon_result_large_class = wilcoxonTest(DT_Scores, RF_Scores)

# Large Class Boxplot

In [None]:

# One column of per-fold MCC scores per classifier. SGD_Scores is a plain
# array like the others, so it is used directly (no ['test_score'] lookup).
df=pd.DataFrame({'DT':DT_Scores,'RF':RF_Scores,'LR':LR_Scores,'SVM':SVM_Scores
                ,'MLP':NN_Scores,'SGD':SGD_Scores} )

In [None]:

# Custom palette; the plot below uses seaborn's "Pastel1" instead.
colors = ['#78C850', '#6890F0', '#F8D030', '#F85888', '#705898', '#98D8D8']

# pd.melt reshapes the wide score table into (variable, value) rows: one box per classifier.
boxplot = sns.boxplot(
    data=pd.melt(df),
    x="variable",
    y="value",
    hue="variable",
    palette="Pastel1",
    legend=False,
)
boxplot.set_title("Hyperparameter tuning using GridSearchCV\nLarge Class", fontsize=14)
boxplot.set_xlabel("Classifier", fontsize=14)
boxplot.set_ylabel("MCC Score", fontsize=14)
plt.show()


In [None]:

df=pd.DataFrame({'DT':DT_accuracy,'RF':RF_Accuracy,'LR':LR_Accuracy,'SVM':SVM_Accuracy
                ,'MLP':NN_Accuracy,'SGD':SGD_Accuracy} )

In [None]:
# pd.melt reshapes the wide accuracy table into (variable, value) rows: one box per classifier.
boxplot = sns.boxplot(
    data=pd.melt(df),
    x="variable",
    y="value",
    hue="variable",
    palette="Pastel1",
    legend=False,
)
boxplot.set_title("Hyperparameter tuning using GridSearchCV\nLarge Class", fontsize=14)
boxplot.set_xlabel("Classifier", fontsize=14)
boxplot.set_ylabel("Accuracy", fontsize=14)
plt.show()

In [None]:
# Example boxplot data

df_large_class = pd.DataFrame({'DT': DT_Scores, 'RF': RF_Scores, 'LR': LR_Scores})


# Long Method Dataset

In [None]:
# Absolute path of the Long Method smell dataset on the mounted Drive
# (the leading "/" matters: without it the path is relative to the CWD).
Long = "/content/drive/MyDrive/Colab files/Python_LongMethodSmell_Dataset.csv"

In [None]:
def LDecision_Tree_k10(Long):
    """Tune a decision tree on the Long Method data and score it with repeated 10-fold CV.

    Reads the CSV at ``Long``; every column except the last is a feature and
    the last column is the label. ``max_depth`` is tuned by a randomized
    search that maximises MCC, then the best estimator is re-scored for MCC
    and accuracy on the same repeated stratified folds.

    Side effects: sets the globals ``LDT_Scores`` and ``LDT_accuracy``
    (per-fold score arrays) and prints a summary. Returns nothing.
    """
    global LDT_Scores
    global LDT_accuracy
    # Read the path that was passed in rather than a hard-coded copy of it.
    Dataset = pd.read_csv(Long)
    param_dist = {'max_depth': list(np.arange(1, 100, step=10)) + [None]}
    X = Dataset.iloc[:, 0:-1]
    Y = Dataset.iloc[:, -1]
    tree = DecisionTreeClassifier()
    rfolds = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=0)
    mcc = make_scorer(matthews_corrcoef)
    tree_cv = RandomizedSearchCV(tree, param_dist, cv=rfolds, return_train_score=True, scoring=mcc)
    tree_cv.fit(X, Y)
    # Assign to the declared L-prefixed globals: un-prefixed names would be
    # mere locals, leaving LDT_* undefined for the prints and later cells.
    LDT_Scores = cross_val_score(tree_cv.best_estimator_, X, Y, scoring=mcc, cv=rfolds)
    LDT_accuracy = cross_val_score(tree_cv.best_estimator_, X, Y, scoring='accuracy', cv=rfolds)
    print("Tuned Decision Tree Parameters: {}".format(tree_cv.best_params_))
    print("Best MCC score is {}".format(LDT_Scores.mean()))
    print("Best accuracy is {}".format(LDT_accuracy.mean()))

## Random Forest


In [None]:
def LRandom_Forest_k10(Long):
    """Tune a random forest on the Long Method data and score it with repeated 10-fold CV.

    Reads the CSV at ``Long``; every column except the last is a feature and
    the last column is the label. ``n_estimators`` is tuned by a randomized
    search (5 draws over 5 candidates) that maximises MCC, then the best
    estimator is re-scored for MCC and accuracy on the same folds.

    Side effects: sets the globals ``LRF_Scores`` and ``LRF_Accuracy``
    (per-fold score arrays) and prints a summary. Returns nothing.
    """
    global LRF_Scores
    global LRF_Accuracy
    # Read the path that was passed in rather than a hard-coded copy of it.
    Dataset = pd.read_csv(Long)
    param_grid = {'n_estimators': [100, 200, 300, 400, 500]}
    X = Dataset.iloc[:, 0:-1]
    Y = Dataset.iloc[:, -1]
    rf = RandomForestClassifier()
    rfolds = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=0)
    mcc = make_scorer(matthews_corrcoef)
    rf_cv = RandomizedSearchCV(rf, param_grid, cv=rfolds, scoring=mcc,  n_iter=5)
    rf_cv.fit(X, Y)
    # Assign to the declared L-prefixed globals: un-prefixed names would be
    # mere locals, leaving LRF_* undefined for the prints and later cells.
    LRF_Scores = cross_val_score(rf_cv.best_estimator_, X, Y, scoring=mcc, cv=rfolds)
    LRF_Accuracy = cross_val_score(rf_cv.best_estimator_, X, Y, scoring='accuracy', cv=rfolds)
    print("Tuned Random Forest Parameters: {}".format(rf_cv.best_params_))
    print("Best MCC score is {}".format(LRF_Scores.mean()))
    print("Best accuracy is {}".format(LRF_Accuracy.mean()))


## Logistic Regression

In [None]:
def LLogistic_Regression_k10(Long):
    """Tune logistic regression on the Long Method data and score it with repeated 10-fold CV.

    The CSV at ``Long`` is shuffled with a fixed seed; every column except
    the last is a feature and the last column is the label. ``C`` is tuned by
    a randomized search that maximises MCC, then the best estimator is
    re-scored for MCC and accuracy on the same repeated stratified folds.

    Side effects: sets the globals ``LLR_Scores`` and ``LLR_Accuracy``
    (per-fold score arrays) and prints a summary. Returns nothing.
    """
    global LLR_Scores
    global LLR_Accuracy
    # Read the path that was passed in rather than a hard-coded copy of it.
    Dataset = pd.read_csv(Long)
    Dataset = shuffle(Dataset, random_state=0)
    # C must be strictly positive, so the grid is 0.01 .. 0.99; a grid that
    # starts at 0 can sample C=0, for which every fit of that candidate fails.
    param_dist = {'C': np.arange(1, 100) / 100}
    X = Dataset.iloc[:, 0:-1]
    Y = Dataset.iloc[:, -1]
    # More iterations than the default, to reduce non-convergence warnings.
    lr = LogisticRegression(max_iter=1000)
    rfolds = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=0)
    mcc = make_scorer(matthews_corrcoef)
    lr_cv = RandomizedSearchCV(lr, param_dist, cv=rfolds, scoring=mcc)
    lr_cv.fit(X, Y)
    # Assign to the declared L-prefixed globals: un-prefixed names would be
    # mere locals, leaving LLR_* undefined for the prints and later cells.
    LLR_Scores = cross_val_score(lr_cv.best_estimator_, X, Y, scoring=mcc, cv=rfolds)
    LLR_Accuracy = cross_val_score(lr_cv.best_estimator_, X, Y, scoring='accuracy', cv=rfolds)
    print("Tuned Logistic Regression Parameters: {}".format(lr_cv.best_params_))
    print("Best MCC score is {}".format(LLR_Scores.mean()))
    print("Best accuracy is {}".format(LLR_Accuracy.mean()))

## Support Vector Machine

In [None]:
def LSVM_k10(Long):
    """Tune an RBF SVM on the Long Method data and score it with repeated 10-fold CV.

    The CSV at ``Long`` is shuffled with a fixed seed; every column except
    the last is a feature and the last column is the label. ``C`` and
    ``gamma`` are tuned by a randomized search that maximises MCC, then the
    best estimator is re-scored for MCC and accuracy on the same folds.

    Side effects: sets the globals ``LSVM_Scores`` and ``LSVM_Accuracy``
    (per-fold score arrays) and prints a summary. Returns nothing.
    """
    global LSVM_Scores
    global LSVM_Accuracy
    # Read the path that was passed in rather than a hard-coded copy of it.
    Dataset = pd.read_csv(Long)
    Dataset = shuffle(Dataset, random_state=0)
    # gamma=0 is left out: it makes the RBF kernel constant, and some
    # scikit-learn versions reject it outright, wasting a search draw.
    param_dist = {"C": [.01, .1, 1, 5, 10, 100], "gamma": [.01, .1, 1, 5, 10, 100]}
    X = Dataset.iloc[:, 0:-1]
    Y = Dataset.iloc[:, -1]
    svm_model = svm.SVC(kernel='rbf')
    rfolds = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=0)
    mcc = make_scorer(matthews_corrcoef)
    svm_cv = RandomizedSearchCV(svm_model, param_dist, cv=rfolds, scoring=mcc)
    svm_cv.fit(X, Y)
    # Assign to the declared L-prefixed globals: un-prefixed names would be
    # mere locals, leaving LSVM_* undefined for the prints and later cells.
    LSVM_Scores = cross_val_score(svm_cv.best_estimator_, X, Y, scoring=mcc, cv=rfolds)
    LSVM_Accuracy = cross_val_score(svm_cv.best_estimator_, X, Y, scoring='accuracy', cv=rfolds)
    print("Tuned SVM Parameters: {}".format(svm_cv.best_params_))
    print("Best MCC score is {}".format(LSVM_Scores.mean()))
    print("Best accuracy is {}".format(LSVM_Accuracy.mean()))

## Multi-Layer Perceptron

In [None]:
def LNeural_Network_k10(Long):
    """Tune an MLP on the Long Method data and score it with repeated 10-fold CV.

    Reads the CSV at ``Long``; every column except the last is a feature and
    the last column is the label. Layer sizes, activation and solver are
    tuned by a randomized search that maximises MCC, then the best estimator
    is re-scored for MCC and accuracy on the same repeated stratified folds.

    Side effects: sets the globals ``LNN_Scores`` and ``LNN_Accuracy``
    (per-fold score arrays) and prints a summary. Returns nothing.
    """
    global LNN_Scores
    global LNN_Accuracy
    # Read the path that was passed in rather than a hard-coded copy of it.
    Dataset = pd.read_csv(Long)
    param_dist = {'hidden_layer_sizes': [(10,), (50,), (100,), (10, 10), (50, 50), (100, 100)],
                  'activation': ['logistic', 'tanh', 'relu'],
                  'solver': ['lbfgs', 'sgd', 'adam']}
    X = Dataset.iloc[:, 0:-1]
    Y = Dataset.iloc[:, -1]
    nn = MLPClassifier(max_iter=1000)
    rfolds = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=0)
    mcc = make_scorer(matthews_corrcoef)
    nn_cv = RandomizedSearchCV(nn, param_dist, cv=rfolds, scoring=mcc)
    nn_cv.fit(X, Y)
    # Assign to the declared L-prefixed globals: un-prefixed names would be
    # mere locals, leaving LNN_* undefined for the prints and later cells.
    LNN_Scores = cross_val_score(nn_cv.best_estimator_, X, Y, scoring=mcc, cv=rfolds)
    LNN_Accuracy = cross_val_score(nn_cv.best_estimator_, X, Y, scoring='accuracy', cv=rfolds)
    print("Tuned Neural Network Parameters: {}".format(nn_cv.best_params_))
    print("Best MCC score is {}".format(LNN_Scores.mean()))
    print("Best accuracy is {}".format(LNN_Accuracy.mean()))

## Stochastic Gradient Descent

In [None]:
def LSGD_k10(Long):
    """Tune an SGD linear classifier on the Long Method data and score it with repeated 10-fold CV.

    The CSV at ``Long`` is shuffled with a fixed seed; every column except
    the last is a feature and the last column is the label. Loss, penalty,
    ``alpha`` and learning-rate schedule are tuned by a randomized search
    that maximises MCC, then the best estimator is re-scored for MCC and
    accuracy on the same repeated stratified folds.

    Side effects: sets the globals ``LSGD_Scores`` and ``LSGD_Accuracy``
    (per-fold score arrays, not dicts) and prints a summary. Returns nothing.
    """
    global LSGD_Scores
    global LSGD_Accuracy
    # Read the path that was passed in rather than a hard-coded copy of it.
    Dataset = pd.read_csv(Long)
    Dataset = shuffle(Dataset, random_state=0)
    # 'log_loss' (not the removed alias 'log') matches SGD_k10, and
    # "no penalty" is spelled None rather than the string 'none'.
    param_dist = {'loss': ['hinge', 'log_loss', 'modified_huber', 'squared_hinge', 'perceptron'],
                  'penalty': [None, 'l2', 'l1', 'elasticnet'],
                  'alpha': np.arange(0.0001, 0.01, 0.0001),
                  'learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive']}
    X = Dataset.iloc[:, 0:-1]
    Y = Dataset.iloc[:, -1]
    # The 'constant', 'invscaling' and 'adaptive' schedules require eta0 > 0;
    # without an explicit value those candidates fail to fit and score NaN.
    sgd = SGDClassifier(max_iter=1000, eta0=0.01)
    rfolds = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=0)
    mcc = make_scorer(matthews_corrcoef)
    sgd_cv = RandomizedSearchCV(sgd, param_dist, cv=rfolds, scoring=mcc)
    sgd_cv.fit(X, Y)
    # Assign to the declared L-prefixed globals: un-prefixed names would be
    # mere locals, leaving LSGD_* undefined for the prints and later cells.
    LSGD_Scores = cross_val_score(sgd_cv.best_estimator_, X, Y, scoring=mcc, cv=rfolds)
    LSGD_Accuracy = cross_val_score(sgd_cv.best_estimator_, X, Y, scoring='accuracy', cv=rfolds)
    print("Tuned SGD Parameters: {}".format(sgd_cv.best_params_))
    print("Best MCC score is {}".format(LSGD_Scores.mean()))
    print("Best accuracy is {}".format(LSGD_Accuracy.mean()))

In [None]:
LDecision_Tree_k10('/content/drive/MyDrive/Colab files/Python_LongMethodSmell_Dataset.csv')

In [None]:
LRandom_Forest_k10('/content/drive/MyDrive/Colab files/Python_LongMethodSmell_Dataset.csv')

In [None]:
LLogistic_Regression_k10('/content/drive/MyDrive/Colab files/Python_LongMethodSmell_Dataset.csv')

In [None]:
LSVM_k10('/content/drive/MyDrive/Colab files/Python_LongMethodSmell_Dataset.csv')

In [None]:
LNeural_Network_k10('/content/drive/MyDrive/Colab files/Python_LongMethodSmell_Dataset.csv')

In [None]:
LSGD_k10('/content/drive/MyDrive/Colab files/Python_LongMethodSmell_Dataset.csv')

## Long Method Wilcoxon Test

In [None]:
wilcoxonTest(LDT_Scores,LRF_Scores)

In [None]:
wilcoxonTest(LDT_Scores,LLR_Scores)

In [None]:
wilcoxonTest(LDT_Scores,LSVM_Scores)

In [None]:
wilcoxonTest(LDT_Scores,LNN_Scores)

In [None]:
# LSGD_Scores is published as a per-fold score array, so it is passed as-is.
wilcoxonTest(LDT_Scores, LSGD_Scores)

In [None]:
wilcoxonTest(LRF_Scores,LLR_Scores)

In [None]:
wilcoxonTest(LRF_Scores,LSVM_Scores)

In [None]:
wilcoxonTest(LRF_Scores,LNN_Scores)

In [None]:
# LSGD_Scores is published as a per-fold score array, so it is passed as-is.
wilcoxonTest(LRF_Scores, LSGD_Scores)

In [None]:
wilcoxonTest(LLR_Scores,LSVM_Scores)

In [None]:
wilcoxonTest(LLR_Scores,LNN_Scores)

In [None]:
# LSGD_Scores is published as a per-fold score array, so it is passed as-is.
wilcoxonTest(LLR_Scores, LSGD_Scores)

In [None]:
wilcoxonTest(LSVM_Scores,LNN_Scores)

In [None]:
# LSGD_Scores is published as a per-fold score array, so it is passed as-is.
wilcoxonTest(LSVM_Scores, LSGD_Scores)

In [None]:
# LSGD_Scores is published as a per-fold score array, so it is passed as-is.
wilcoxonTest(LNN_Scores, LSGD_Scores)

# Example Wilcoxon test results


In [None]:

wilcoxon_result_long_method = wilcoxonTest(LDT_Scores, LLR_Scores)


# Long Method Boxplot

In [None]:

# One column of per-fold MCC scores per classifier. LSGD_Scores is a plain
# array like the others, so it is used directly (no ['test_score'] lookup).
df=pd.DataFrame({'DT':LDT_Scores,'RF':LRF_Scores,'LR':LLR_Scores,'SVM':LSVM_Scores
                ,'MLP':LNN_Scores,'SGD':LSGD_Scores} )

In [None]:
# pd.melt reshapes the wide score table into (variable, value) rows: one box per classifier.
boxplot = sns.boxplot(
    data=pd.melt(df),
    x="variable",
    y="value",
    hue="variable",
    palette="Pastel1",
    legend=False,
)
boxplot.set_title("Hyperparameter tuning using GridSearchCV\nLong Method", fontsize=14)
boxplot.set_xlabel("Classifier", fontsize=14)
boxplot.set_ylabel("MCC Score", fontsize=14)
plt.show()

In [None]:

df=pd.DataFrame({'DT':LDT_accuracy,'RF':LRF_Accuracy,'LR':LLR_Accuracy,'SVM':LSVM_Accuracy
                ,'MLP':LNN_Accuracy,'SGD':LSGD_Accuracy} )

In [None]:
# pd.melt reshapes the wide accuracy table into (variable, value) rows: one box per classifier.
boxplot = sns.boxplot(
    data=pd.melt(df),
    x="variable",
    y="value",
    hue="variable",
    palette="Pastel1",
    legend=False,
)
boxplot.set_title("Hyperparameter tuning using GridSearchCV\nLong Method", fontsize=14)
boxplot.set_xlabel("Classifier", fontsize=14)
boxplot.set_ylabel("Accuracy", fontsize=14)
plt.show()

In [None]:
df_long_method = pd.DataFrame({'DT': LDT_Scores, 'RF': LRF_Scores, 'LR': LLR_Scores})


In [None]:
def compare_datasets(wilcoxon_result, df):
    """Report a Wilcoxon result and, if significant, plot the compared scores.

    Parameters
    ----------
    wilcoxon_result : dict
        Result dict as returned by ``wilcoxonTest``. Keys are matched after
        stripping whitespace and a trailing colon and ignoring case, so both
        ``"p-Value: "`` and ``"p-Value"`` are recognised.
    df : pandas.DataFrame
        One column of scores per classifier.

    Prints the result and its keys. When the p-value is below 0.05 it also
    prints the winner and the column with the highest median and shows a
    boxplot of ``df``; otherwise it prints that there is no significant
    difference. Returns nothing.
    """
    print(f"Wilcoxon Test Results: {wilcoxon_result}")

    print(f"Keys in wilcoxon_result: {wilcoxon_result.keys()}")

    # wilcoxonTest spells its keys with a trailing ": " ("p-Value: ",
    # "winner: "), so exact lookups of 'p-Value' / 'winner:' never match and
    # the significant branch would be unreachable. Normalise before lookup.
    normalised = {
        str(key).strip().rstrip(':').strip().lower(): value
        for key, value in wilcoxon_result.items()
    }
    p_value = normalised.get('p-value')

    if p_value is not None and p_value < 0.05:
        winner = normalised.get('winner')
        print(f"\nWinner based on Wilcoxon Test: {winner}")

        medians = df.median()
        best_classifier = medians.idxmax()
        print(f"Classifier with the highest median: {best_classifier}")

        boxplot = sns.boxplot(x="variable", y="value", hue="variable", data=pd.melt(df), palette="Pastel1")
        boxplot.axes.set_title("Boxplot Comparison", fontsize=14)
        boxplot.set_xlabel("Classifier", fontsize=14)
        boxplot.set_ylabel("MCC Score", fontsize=14)
        plt.show()
    else:
        print("\nNo statistically significant difference.")


## Boxplot for MCC Scores & Accuracy for both Large and Long

In [None]:
def _score_boxplot(scores, title, ylabel):
    """Show a 12x6 boxplot with one box per column of ``scores``; return its Axes."""
    plt.figure(figsize=(12, 6))
    ax = sns.boxplot(x="variable", y="value", hue="variable", data=pd.melt(scores), palette="Pastel1")
    ax.set_title(title, fontsize=14)
    ax.set_xlabel("Classifier", fontsize=14)
    ax.set_ylabel(ylabel, fontsize=14)
    plt.show()
    return ax


# The MLP results are published as NN_* / LNN_* (there are no MLP_* / LMLP_*
# variables), and the SGD scores are plain per-fold arrays, not dicts.

# Boxplot for Large Class MCC
df_large_class_mcc = pd.DataFrame({
    'Decision Tree': DT_Scores,
    'Random Forest': RF_Scores,
    'Logistic Regression': LR_Scores,
    'SVM': SVM_Scores,
    'MLP': NN_Scores,
    'SGD': SGD_Scores
})
boxplot_large_class_mcc = _score_boxplot(df_large_class_mcc, "Large Class - MCC Scores", "MCC Score")

# Boxplot for Large Class Accuracy
df_large_class_accuracy = pd.DataFrame({
    'Decision Tree': DT_accuracy,
    'Random Forest': RF_Accuracy,
    'Logistic Regression': LR_Accuracy,
    'SVM': SVM_Accuracy,
    'MLP': NN_Accuracy,
    'SGD': SGD_Accuracy
})
boxplot_large_class_accuracy = _score_boxplot(df_large_class_accuracy, "Large Class - Accuracy", "Accuracy")

# Boxplot for Long Method MCC
df_long_method_mcc = pd.DataFrame({
    'Decision Tree': LDT_Scores,
    'Random Forest': LRF_Scores,
    'Logistic Regression': LLR_Scores,
    'SVM': LSVM_Scores,
    'MLP': LNN_Scores,
    'SGD': LSGD_Scores
})
boxplot_long_method_mcc = _score_boxplot(df_long_method_mcc, "Long Method - MCC Scores", "MCC Score")

# Boxplot for Long Method Accuracy
df_long_method_accuracy = pd.DataFrame({
    'Decision Tree': LDT_accuracy,
    'Random Forest': LRF_Accuracy,
    'Logistic Regression': LLR_Accuracy,
    'SVM': LSVM_Accuracy,
    'MLP': LNN_Accuracy,
    'SGD': LSGD_Accuracy
})
boxplot_long_method_accuracy = _score_boxplot(df_long_method_accuracy, "Long Method - Accuracy", "Accuracy")


# Average  MCC Scores & Accuracy


In [None]:
# For each classifier, average its mean score on the Large Class data with its
# mean score on the Long Method data. The MLP results are published as
# NN_* / LNN_*, and the SGD scores are plain arrays (no ['test_score'] key).
average_mcc_scores = {
    'Decision Tree': np.mean([DT_Scores.mean(), LDT_Scores.mean()]),
    'Random Forest': np.mean([RF_Scores.mean(), LRF_Scores.mean()]),
    'Logistic Regression': np.mean([LR_Scores.mean(), LLR_Scores.mean()]),
    'SVM': np.mean([SVM_Scores.mean(), LSVM_Scores.mean()]),
    'MLP': np.mean([NN_Scores.mean(), LNN_Scores.mean()]),
    'SGD': np.mean([SGD_Scores.mean(), LSGD_Scores.mean()])
}

average_accuracies = {
    'Decision Tree': np.mean([DT_accuracy.mean(), LDT_accuracy.mean()]),
    'Random Forest': np.mean([RF_Accuracy.mean(), LRF_Accuracy.mean()]),
    'Logistic Regression': np.mean([LR_Accuracy.mean(), LLR_Accuracy.mean()]),
    'SVM': np.mean([SVM_Accuracy.mean(), LSVM_Accuracy.mean()]),
    'MLP': np.mean([NN_Accuracy.mean(), LNN_Accuracy.mean()]),
    'SGD': np.mean([SGD_Accuracy.mean(), LSGD_Accuracy.mean()])
}

# Pick the classifier name whose averaged score is highest.
best_classifier_mcc = max(average_mcc_scores, key=average_mcc_scores.get)
best_classifier_accuracy = max(average_accuracies, key=average_accuracies.get)

print("Best Classifier based on MCC Scores:", best_classifier_mcc)
print("Best Classifier based on Accuracy:", best_classifier_accuracy)
