## Project: Prediction and Classification of Breast Cancer

### Author: Munezero Mihigo

### Date: 13 November 2021

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
    
from sklearn.datasets import load_breast_cancer

In [None]:
# Load the breast cancer dataset and print its description
cancer = load_breast_cancer()
print(cancer.DESCR) 

In [None]:
# Keys (fields) available on the loaded dataset object
cancer.keys()

In [None]:
# This function returns the number of features of the breast cancer dataset

def answer_zero():
    """Return how many features the breast cancer dataset has.

    Reads the module-level ``cancer`` dataset object and counts the
    entries of its ``feature_names`` field.
    """
    feature_names = cancer['feature_names']
    return len(feature_names)


answer_zero()

In [None]:
def answer_one():
    """Build a DataFrame of the breast cancer data with a ``target`` column.

    The dataset is loaded afresh on every call, so the result does not
    depend on any module-level state.

    Returns:
        pandas.DataFrame: the feature columns (named after the dataset's
        ``feature_names``) followed by a final ``target`` column.
    """
    from sklearn.datasets import load_breast_cancer
    import numpy as np
    import pandas as pd

    dataset = load_breast_cancer()
    # Glue the label vector onto the feature matrix as one extra column.
    values = np.column_stack((dataset['data'], dataset['target']))
    column_names = np.concatenate((dataset['feature_names'], ['target']))
    return pd.DataFrame(values, columns=column_names)


answer_one()

In [None]:
def answer_two():
    """Count how many samples fall into each ``target`` class.

    Returns:
        pandas.Series: value counts of the ``target`` column of the
        DataFrame produced by ``answer_one``.
    """
    target_column = answer_one()['target']
    return target_column.value_counts()


answer_two()

In [None]:
def answer_three():
    """Split the cancer DataFrame into a feature table and a label column.

    Returns:
        tuple: ``(X, y)`` where ``X`` holds every column except the last
        one and ``y`` is the last column.
    """
    frame = answer_one()
    # Position of the final column, which answer_one() fills with the target.
    last = frame.shape[1] - 1
    features = frame.iloc[:, :last]
    labels = frame.iloc[:, last]
    return features, labels


answer_three()

In [None]:
from sklearn.model_selection import train_test_split

def answer_four():
    """Partition the data into training and test subsets.

    Calls ``train_test_split`` with its default split proportions and
    ``random_state=0`` so the same split is produced on every call.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)``.
    """
    features, labels = answer_three()
    # tuple() keeps the return value a 4-tuple rather than a list.
    return tuple(train_test_split(features, labels, random_state=0))


answer_four()

In [None]:
from sklearn.neighbors import KNeighborsClassifier

def answer_five():
    """Fit a 1-nearest-neighbour classifier on the training split.

    Returns:
        KNeighborsClassifier: the estimator after fitting on the
        ``X_train``/``y_train`` pair returned by ``answer_four``.
    """
    train_features, _, train_labels, _ = answer_four()
    # fit() returns the estimator itself, so it can be handed back directly.
    return KNeighborsClassifier(n_neighbors=1).fit(train_features, train_labels)


answer_five()

In [None]:
def answer_six():
    """Predict the class of a synthetic "average" sample.

    The sample consists of the mean of every feature column (the final
    ``target`` column is excluded) and is classified with the fitted model
    returned by ``answer_five``.

    Returns:
        numpy.ndarray: one-element array holding the predicted class label.
    """
    cancerdf = answer_one()
    # Keep the means as a one-row DataFrame carrying the feature column names:
    # the classifier was fitted on a DataFrame, and recent scikit-learn
    # releases warn when a fitted-with-names model receives a bare ndarray.
    mean_sample = cancerdf.iloc[:, :-1].mean().to_frame().T
    # answer_five() already returns a fitted classifier; no re-fit is needed.
    knn = answer_five()
    return knn.predict(mean_sample)


answer_six()

In [None]:
def answer_seven():
    """Predict the class labels of the test split.

    Returns:
        numpy.ndarray: the fitted classifier's predictions for ``X_test``.
    """
    _, X_test, _, _ = answer_four()
    # answer_five() already returns a classifier fitted on the same
    # deterministic training split, so fitting it again is redundant.
    knn = answer_five()
    return knn.predict(X_test)


answer_seven()

In [None]:
def answer_eight():
    """Score the classifier on the test split.

    Returns:
        float: the value of ``knn.score(X_test, y_test)`` for the fitted
        classifier (mean accuracy for scikit-learn classifiers).
    """
    _, X_test, _, y_test = answer_four()
    # answer_five() already returns a classifier fitted on the same
    # deterministic training split, so fitting it again is redundant.
    knn = answer_five()
    return knn.score(X_test, y_test)


answer_eight()

In [None]:
def accuracy_plot(save_path='../Plots/Training and Test Accuracies for Malignant and Benign Cells.jpg'):
    """Plot and save the classifier's per-class accuracy on both data splits.

    Draws a four-bar chart (malignant/benign x training/test), writes each
    score onto its bar, strips ticks and frame, and saves the figure.

    Args:
        save_path: Destination image file. Defaults to the original
            hard-coded location; its parent directory is created if missing.

    Returns:
        None. The figure is shown by the active backend and written to disk.
    """
    import os

    import matplotlib.pyplot as plt

    # Programmatic form of the ``%matplotlib notebook`` line magic. The bare
    # magic is a syntax error outside IPython, so only run it when an IPython
    # shell is actually active.
    try:
        from IPython import get_ipython
    except ImportError:
        get_ipython = None
    shell = get_ipython() if get_ipython is not None else None
    if shell is not None:
        shell.run_line_magic('matplotlib', 'notebook')

    X_train, X_test, y_train, y_test = answer_four()

    # answer_five() already returns a fitted classifier; no re-fit is needed.
    knn = answer_five()

    # Accuracy per (split, class) pair in the order the bars are drawn;
    # target 0 is treated as malignant and target 1 as benign.
    subsets = [
        (X_train, y_train, 0),
        (X_train, y_train, 1),
        (X_test, y_test, 0),
        (X_test, y_test, 1),
    ]
    scores = []
    for features, labels, target in subsets:
        mask = labels == target
        scores.append(knn.score(features[mask], labels[mask]))

    fig = plt.figure(figsize=(12, 8))

    # Plot the scores as a bar chart
    bars = plt.bar(np.arange(len(scores)), scores,
                   color=['#4c72b0', '#4c72b0', '#55a868', '#55a868'])

    # Directly label the score onto the bars
    axes = plt.gca()
    for bar in bars:
        height = bar.get_height()
        axes.text(bar.get_x() + bar.get_width() / 2, height * .90,
                  f'{height:.2f}', ha='center', color='w', fontsize=11)

    # Remove all the ticks (both axes) and the tick labels on the Y axis.
    # Booleans are required here: the strings 'off'/'on' are truthy, so
    # current matplotlib versions would leave every tick visible.
    plt.tick_params(top=False, bottom=False, left=False, right=False,
                    labelleft=False, labelbottom=True)

    # Remove the frame of the chart
    for spine in axes.spines.values():
        spine.set_visible(False)

    plt.xticks([0, 1, 2, 3],
               ['Malignant\nTraining', 'Benign\nTraining', 'Malignant\nTest', 'Benign\nTest'],
               alpha=0.8)
    plt.title('Training and Test Accuracies for Malignant and Benign Cells', alpha=0.8)

    # Save the plot as an image, creating the target directory first so a
    # missing folder does not discard the finished figure.
    out_dir = os.path.dirname(save_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    fig.savefig(save_path, bbox_inches='tight', dpi=150)
    

In [None]:
# Draw the per-class accuracy bar chart and save it to disk
accuracy_plot()