In [31]:
!pip install --user -r requirements.txt



In [32]:
import json
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

def download_data(out_file, test_size=0.2, random_state=None):
    """Load the breast-cancer dataset, split it, and save the split to ``out_file``.

    Parameters
    ----------
    out_file : str or path-like
        Destination file. Its parent directory is created if it is missing.
    test_size : float, optional
        Forwarded to ``train_test_split``. Defaults to 0.2, the value that was
        previously hard-coded.
    random_state : int or None, optional
        Forwarded to ``train_test_split``. Pass an integer to get the same
        split on every run; ``None`` (default) keeps the split random.

    Notes
    -----
    The file holds a JSON *string* that itself contains the JSON-encoded dict
    (i.e. the payload is encoded twice). That layout is kept deliberately:
    readers that call ``json.load`` followed by ``json.loads`` depend on it.
    """
    import os

    # Get the dataset and split it into train / test partitions.
    x, y = load_breast_cancer(return_X_y=True)
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state
    )

    # `.tolist()` turns each array into plain nested lists, which `json`
    # can serialise.
    data = {'x_train': x_train.tolist(),
            'y_train': y_train.tolist(),
            'x_test': x_test.tolist(),
            'y_test': y_test.tolist()}

    # Make sure the target directory exists so a fresh checkout does not
    # fail with FileNotFoundError.
    out_dir = os.path.dirname(out_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # First encoding: dict -> JSON text.
    data_json = json.dumps(data)

    # Second encoding: JSON text -> JSON string literal written to disk.
    with open(out_file, 'w') as f:
        json.dump(data_json, f)

In [33]:
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from joblib import dump

def decision_tree(data_file, model_path='models/decision_tree_model.joblib',
                  random_state=None):
    """Train a depth-3 decision tree on the saved split and return its test accuracy.

    Parameters
    ----------
    data_file : str or path-like
        JSON file with the keys ``x_train``, ``y_train``, ``x_test`` and
        ``y_test``. Both a plain JSON object and a JSON string wrapping that
        object (double-encoded) are accepted.
    model_path : str or path-like, optional
        Where the fitted model is saved with ``joblib.dump``. The parent
        directory is created if it is missing.
    random_state : int or None, optional
        Forwarded to ``DecisionTreeClassifier``; pass an integer for
        repeatable results.

    Returns
    -------
    The value returned by ``accuracy_score`` for the test partition.
    """
    import os

    # Open and read the data file.
    with open(data_file) as f:
        data = json.load(f)

    # A double-encoded file first loads as a string that must be decoded a
    # second time to get the dict. A file that already loaded as a dict is
    # used as is instead of crashing in `json.loads`.
    if isinstance(data, str):
        data = json.loads(data)

    x_train = data['x_train']
    y_train = data['y_train']
    x_test = data['x_test']
    y_test = data['y_test']

    # Initialize and train the model.
    model = DecisionTreeClassifier(max_depth=3, random_state=random_state)
    model.fit(x_train, y_train)

    # Save the model, creating the target directory on first use.
    model_dir = os.path.dirname(model_path)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)
    dump(model, model_path)

    # Score the predictions on the held-out partition.
    y_pred = model.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)

    return accuracy

In [34]:
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from joblib import dump

def random_forest(data_file, model_path='models/random_forest_model.joblib',
                  random_state=None):
    """Train a 100-tree random forest on the saved split and return its test accuracy.

    Parameters
    ----------
    data_file : str or path-like
        JSON file with the keys ``x_train``, ``y_train``, ``x_test`` and
        ``y_test``. Both a plain JSON object and a JSON string wrapping that
        object (double-encoded) are accepted.
    model_path : str or path-like, optional
        Where the fitted model is saved with ``joblib.dump``. The parent
        directory is created if it is missing.
    random_state : int or None, optional
        Forwarded to ``RandomForestClassifier``; pass an integer for
        repeatable results.

    Returns
    -------
    The value returned by ``accuracy_score`` for the test partition.
    """
    import os

    # Open and read the data file.
    with open(data_file) as f:
        data = json.load(f)

    # A double-encoded file first loads as a string that must be decoded a
    # second time to get the dict. A file that already loaded as a dict is
    # used as is instead of crashing in `json.loads`.
    if isinstance(data, str):
        data = json.loads(data)

    x_train = data['x_train']
    y_train = data['y_train']
    x_test = data['x_test']
    y_test = data['y_test']

    # Initialize and train the model.
    model = RandomForestClassifier(
        n_estimators=100,
        bootstrap=True,
        max_features='sqrt',
        random_state=random_state,
    )
    model.fit(x_train, y_train)

    # Save the model, creating the target directory on first use.
    model_dir = os.path.dirname(model_path)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)
    dump(model, model_path)

    # Score the predictions on the held-out partition.
    y_pred = model.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)

    return accuracy

In [35]:
def show_results(accuracy_file, accuracy_dt, accuracy_rf):
    """Write the two accuracies to ``accuracy_file`` and print them.

    Parameters
    ----------
    accuracy_file : str or path-like
        Text file to (over)write. Its parent directory is created if missing.
    accuracy_dt : object
        Accuracy of the decision tree; rendered with its string form.
    accuracy_rf : object
        Accuracy of the random forest; rendered with its string form.
    """
    import os

    # One result per line. Without the trailing newlines the two entries
    # ran together on a single line in the saved file.
    report = (f"Decision tree (accuracy): {accuracy_dt}\n"
              f"Random forest (accuracy): {accuracy_rf}\n")

    out_dir = os.path.dirname(accuracy_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # The `with` block closes the file; no explicit close() is needed.
    with open(accuracy_file, 'w') as f:
        f.write(report)

    # `report` already ends with a newline, so suppress print's own.
    print(report, end='')

In [36]:
def classify(data_file='data/dataset.csv', accuracy_file='results/accuracy.txt'):
    """Run the whole pipeline: download, train both models, report accuracies.

    Parameters
    ----------
    data_file : str or path-like, optional
        Where the train/test split is written and then read back by the
        training steps. Note that despite the default ``.csv`` name, the
        content written by ``download_data`` is JSON.
    accuracy_file : str or path-like, optional
        Where ``show_results`` writes the accuracy report.
    """
    import os

    # Create every output directory up front so the pipeline also runs on a
    # fresh checkout. `models` is the directory that `decision_tree` and
    # `random_forest` save into by default.
    for directory in (os.path.dirname(data_file),
                      os.path.dirname(accuracy_file),
                      'models'):
        if directory:
            os.makedirs(directory, exist_ok=True)

    # Run the "download_data" task.
    download_data(data_file)

    # Run the "decision_tree" and "random_forest" tasks on the file
    # produced by "download_data".
    accuracy_dt = decision_tree(data_file)
    accuracy_rf = random_forest(data_file)

    # Hand both accuracies to "show_results", which saves and prints them.
    show_results(accuracy_file, accuracy_dt, accuracy_rf)

In [37]:
# Run the full pipeline: download the data, train both models, and save/print
# their accuracies.
classify()

Decision tree (accuracy): 0.9385964912280702
Random forest (accuracy): 0.9824561403508771
