# Decision tree classifier

In [1]:
from statistics import mean
import shutil
import os
import pandas as pd
from sklearn import tree
import matplotlib.pyplot as plt
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score

#to store mean of results for each smell
results = []

path = 'images_decision_tree'
# Check whether the specified path exists or not
isExist = os.path.exists(path)
if isExist:
    shutil.rmtree(path)

os.mkdir(path)

#######################

os.chdir('datasets/stratifiedKfold')

#folder smell
for i in os.listdir():
    if '.DS_Store' not in i:
        os.chdir(i)
        print("\n===================== "+i+" =====================")

        path = '../../../images_decision_tree/'+i
        # Check whether the specified path exists or not
        isExist = os.path.exists(path)
        if isExist:
            shutil.rmtree(path)

        os.mkdir(path)

        f_measure_smell = []
        f_measure_not_smell = []

        for fold in os.listdir():
            if '.DS_Store' not in fold:
                os.chdir(fold)

                train_set_over = pd.read_csv("train_set_oversampled.csv")
                test_set = pd.read_csv("test_set.csv")

                #split data and label
                X_train = train_set_over.iloc[:, :-1]
                y_train = train_set_over.iloc[:, -1:]

                X_test = test_set.iloc[:, 1:-1]
                y_test = test_set.iloc[:, -1:]

                features_name = X_train.columns.values.tolist()
                class_name = y_train.columns.values.tolist()

                print("\n\t----------------- "+fold+" -----------------\n")

                #nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.
                classifier = DecisionTreeClassifier()
                print("\tTrain size:",len(X_train), ", Test size:", len(X_test))

                decision_tree = classifier.fit(X_train, y_train)


                acc_score = classifier.score(X_test, y_test)
                print("\n\tAccuracy:", acc_score)

                y_pred = classifier.predict(X_test)
                y_pred = pd.DataFrame(y_pred, columns = [class_name[0]])
                y_pred.to_csv('pred_set.csv', index=False)


                print("\tCorrect labels:", accuracy_score(y_pred, y_test, normalize=False), "of", len(X_test))

                per_class_precision = precision_score(y_pred, y_test, average=None)
                print('\n\tPrecision score:', per_class_precision)

                recall = recall_score(y_pred, y_test, average=None)
                print('\tRecall score:', recall)

                F_measure = f1_score(y_pred, y_test, average=None)
                print('\tF-Measure:', F_measure)
                f_measure_not_smell.append(F_measure[0])
                f_measure_smell.append(F_measure[1])

                #save image
                fig = plt.figure(figsize=(100,100))
                fig1 = tree.plot_tree(decision_tree,
                       feature_names=features_name,
                       class_names=['Not' + class_name[0], class_name[0]],
                       filled=True)
                plt.close(fig)
                fig.savefig('../'+path+ '/'+class_name[0]+"_"+fold+".png")

                os.chdir('..')
        os.chdir('..')

        results.append([i,'['+str(mean(f_measure_not_smell))+', '+str(mean(f_measure_smell))+']'])
os.chdir('..')


	----------------- 9 -----------------

	Train size: 132990 , Test size: 7391

	Accuracy: 0.9995941009335678
	Correct labels: 7388 of 7391

	Precision score: [0.99959394 1.        ]
	Recall score: [1.  0.5]
	F-Measure: [0.99979693 0.66666667]

	----------------- 7 -----------------

	Train size: 132990 , Test size: 7391

	Accuracy: 1.0
	Correct labels: 7391 of 7391

	Precision score: [1. 1.]
	Recall score: [1. 1.]
	F-Measure: [1. 1.]

	----------------- 6 -----------------

	Train size: 132990 , Test size: 7391

	Accuracy: 1.0
	Correct labels: 7391 of 7391

	Precision score: [1. 1.]
	Recall score: [1. 1.]
	F-Measure: [1. 1.]

	----------------- 1 -----------------

	Train size: 132988 , Test size: 7391

	Accuracy: 0.9995941009335678
	Correct labels: 7388 of 7391

	Precision score: [0.99986466 0.        ]
	Recall score: [0.99972936 0.        ]
	F-Measure: [0.99979701 0.        ]

	----------------- 10 -----------------

	Train size: 132990 , Test size: 7391

	Accuracy: 0.99824110404546

In [3]:
for result in results:
    print(result[0]+": "+result[1])

LazyClass: [0.9996135998804423, 0.6595464270432178]
SpaghettiCode: [0.9961684909969379, 0.6021941359014766]
RefusedBequest: [0.9986866947402058, 0.7252274048473473]
LargeClass: [0.9990754839258026, 0.6859146996794055]
ComplexClass: [0.9991983774485352, 0.6584893875563247]
