# Decision tree classifier

In [1]:
import shutil
import os
import pandas as pd
from sklearn import tree
import matplotlib.pyplot as plt
from sklearn.tree import export_text
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score


#prediction_array = []
#test_array = []

path = 'images_decision_tree'

# Check whether the specified path exists or not
isExist = os.path.exists(path)
if isExist:
    shutil.rmtree(path)

os.mkdir(path)

#######################

path = 'datasets/training_phase'

# Check whether the specified path exists or not
isExist = os.path.exists(path)
if isExist:
    shutil.rmtree(path)

os.mkdir(path)

############################

os.chdir('datasets/oversampling')

for i in os.listdir():
    dataset = pd.read_csv(i)

    X = dataset.iloc[:, :-1]
    y = dataset.iloc[:, -1:]

    features_name = X.columns.values.tolist()
    class_name = y.columns.values.tolist()

    #print("Features_name:", features_name )
    #print("Class_name:", class_name[0] )
    print("\n===================== "+class_name[0]+" =====================\n")


    classifier = DecisionTreeClassifier(max_depth=10)

    #split data and label
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
    print("Train size:",len(X_train), ", Test size:", len(X_test))

    #test_array.append(y_test)

    decision_tree = classifier.fit(X_train, y_train)

    #r = export_text(
     #   decision_tree)  #, feature_names=['ComplexClass','LazyClass','LargeClass', 'RefusedBequest', 'SpaghettiCode'])


    fig = plt.figure(figsize=(100,100))
    fig1 = tree.plot_tree(decision_tree,
                       feature_names=features_name,
                       class_names=['Not' + class_name[0], class_name[0]],
                       filled=True)
    plt.close(fig)
    fig.savefig("../../images_decision_tree/decision_tree_"+class_name[0]+".png")


    acc_score = classifier.score(X_test, y_test)
    print("\nAccuracy:", acc_score)
    #print(r)

    y_pred = classifier.predict(X_test)
    #prediction_array.append(y_pred)

    #save------------------------------
    path = '../training_phase/'+class_name[0]

    # Check whether the specified path exists or not
    isExist = os.path.exists(path)
    if isExist:
        shutil.rmtree(path)

    os.mkdir(path)

    train_set = pd.concat([X_train, y_train], axis=1)
    train_set = train_set.reset_index()  # make sure indexes pair with number of rows

    test_set = pd.concat([X_test, y_test], axis=1)
    test_set = test_set.reset_index()

    train_set.to_csv('../training_phase/'+class_name[0]+'/train_set_'+class_name[0]+'.csv', index=False)
    test_set.to_csv('../training_phase/'+class_name[0]+'/test_set_'+class_name[0]+'.csv', index=False)

    y_pred = pd.DataFrame(y_pred, columns = [class_name[0]])
    y_pred.to_csv('../training_phase/'+class_name[0]+'/pred_set_'+class_name[0]+'.csv', index=False)




    print("Correct labels:", accuracy_score(y_pred, y_test, normalize=False), "of", len(X_test))


    per_class_precision = precision_score(y_pred, y_test, average=None)
    print('\nPrecision score:', per_class_precision)

    recall = recall_score(y_pred, y_test, average=None)
    print('Recall score:', recall)

    F1_measure = f1_score(y_pred, y_test, average=None)
    print('F1 score:', F1_measure)


os.chdir('../..')



Train size: 88615 , Test size: 59077

Accuracy: 0.9969869830898658
Correct labels: 58899 of 59077

Precision score: [0.99400427 0.99996617]
Recall score: [0.99996592 0.99404682]
F1 score: [0.99697618 0.99699771]


Train size: 88587 , Test size: 59059

Accuracy: 0.9987978123571344
Correct labels: 58988 of 59059

Precision score: [0.99759542 1.        ]
Recall score: [1.         0.99760159]
F1 score: [0.99879626 0.99879936]


Train size: 88659 , Test size: 59107

Accuracy: 0.9939262693082037
Correct labels: 58748 of 59107

Precision score: [0.98794079 0.9998986 ]
Recall score: [0.99989715 0.98810916]
F1 score: [0.99388301 0.99396892]


Train size: 88506 , Test size: 59004

Accuracy: 0.9863060131516508
Correct labels: 58196 of 59004

Precision score: [0.97261481 1.        ]
Recall score: [1.         0.97333949]
F1 score: [0.98611732 0.98648965]


Train size: 88213 , Test size: 58809

Accuracy: 0.9760920947474027
Correct labels: 57403 of 58809

Precision score: [0.9590872  0.99309641]
Re