In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_classif, SelectPercentile
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier

In [8]:
def show_performance_metrics(y_test, y_test_predicted):
    print('PERFORMANCE METRICS')
    print(f"Accuracy: {accuracy_score(y_test, y_test_predicted):.2f} | "
          f"Precision: {precision_score(y_test, y_test_predicted):.2f} | "
          f"Recall: {recall_score(y_test, y_test_predicted):.2f} | "
          f"F1: {f1_score(y_test, y_test_predicted):.2f} ")
    print('Confusion matrix: \n {0}'.format(confusion_matrix(y_test, y_test_predicted)))
    

def show_cv_scores(tree, X_train, y_train, cv = 5):
    cv_scores = cross_val_score(tree, X_train, np.ravel(y_train), cv = cv)
    print("CV SCORES")
    print(cv_scores)
    print( "Accuracy cv: {0: 0.2f}, Std: {1: 0.2f}".format(np.mean(cv_scores), np.std(cv_scores)) )  


def add_columns_names(X_new, X):
    selected_features_names = [
        X.columns[j]
        for i in range(len(X_new.columns))
        for j in range(len(X.columns))
        if X_new.iloc[:, i].equals(X.iloc[:, j])
    ]
    
    X_new.columns = selected_features_names
    return X_new


def get_X_with_select_features_by_mutual_information(X, y, percentile = 30):
    select_features = SelectPercentile(mutual_info_classif, percentile = percentile)

    X_new = select_features.fit_transform(X, y)
    X_new = pd.DataFrame(X_new)
    return add_columns_names(X_new, X)


def train_model(X, y, model):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)
    model.fit(X_train, y_train)

    y_test_predicted = model.predict(X_test)

    show_performance_metrics(y_test, y_test_predicted)
    show_cv_scores(model, X_train, y_train, 5)


def train_model_from_file(file_name):
    breast_cancer_df = pd.read_csv(file_name)

    X = breast_cancer_df.drop(columns = ['diagnosis'])
    y = breast_cancer_df['diagnosis']
    X_selected_by_mutual_information = get_X_with_select_features_by_mutual_information(X, y, 30)

    best_model = RandomForestClassifier(class_weight='balanced', random_state = 42, n_estimators = 100, max_depth = None, min_samples_split = 2)

    train_model(X_selected_by_mutual_information, y, best_model)

train_model_from_file("breast_cancer.csv")

PERFORMANCE METRICS
Accuracy: 0.96 | Precision: 0.95 | Recall: 0.95 | F1: 0.95 
Confusion matrix: 
 [[105   3]
 [  3  60]]
CV SCORES
[0.9375     0.9375     0.9375     0.94936709 0.93670886]
Accuracy cv:  0.94, Std:  0.00
