In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # for data visualization purposes
import seaborn as sns # for data visualization
%matplotlib inline

# Input data files are available in the "../input/" directory.

import os

# Walk the Kaggle input mount and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

In [None]:
import warnings

warnings.filterwarnings('ignore')

In [None]:
# Load the Wisconsin breast-cancer data from the stable UCI repository URL.
# The previous link was a pre-signed Google Cloud Storage URL (X-Goog-Date
# 2024-07-16, X-Goog-Expires 259200 s = 3 days): it stops working once the
# signature expires and it embedded a signing credential in the notebook.
# NOTE(review): confirm the UCI file matches the Kaggle copy used originally.
data = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"

# The file has no header row, so pandas assigns integer column labels here.
df = pd.read_csv(data, header=None)

In [None]:
df.shape

In [None]:
df.head()

In [None]:
#Rename Columns for Better Understanding

col_names = ['Id', 'Clump_thickness', 'Uniformity_Cell_Size', 'Uniformity_Cell_Shape', 'Marginal_Adhesion', 
             'Single_Epithelial_Cell_Size', 'Bare_Nuclei', 'Bland_Chromatin', 'Normal_Nucleoli', 'Mitoses', 'Class']

df.columns = col_names

df.columns

In [None]:
df.head()

In [None]:
# The sample identifier is not a feature, so remove that column in place.
df.drop(columns=['Id'], inplace=True)

In [None]:
df.info()

In [None]:
df['Bare_Nuclei'] = pd.to_numeric(df['Bare_Nuclei'], errors='coerce')

In [None]:
df.dtypes

In [None]:
#Frequency Distribution of Given Values

for var in df.columns:
    print(df[var].value_counts())

In [None]:
#Checking missing values
df.isnull().sum()

In [None]:
#Checking NULL values
df.isna().sum()

In [None]:
df['Mitoses'].isna().sum()

In [None]:
df['Bare_Nuclei'].isna().sum()

In [None]:
# Divide the set into Data set and Training Set

df['Class'].value_counts()

In [None]:
# view summary statistics in numerical variables
print(round(df.describe(),2))

In [None]:
# One 10-bin histogram per column, arranged on a 5x2 grid of subplots.
# The figure size is set through rcParams, so it also applies to later figures
# that do not pass their own figsize.
plt.rcParams['figure.figsize'] = (20, 15)
df.plot.hist(bins=10, subplots=True, layout=(5, 2), sharex=False, sharey=False)
plt.show()

In [None]:
# Pairwise correlation matrix of all columns (pandas default method)
correlation = df.corr()

In [None]:
# Correlation of every column with the target, sorted in descending order
correlation['Class'].sort_values(ascending=False)

In [None]:
# Annotated heatmap of the full correlation matrix (all attributes, not only
# the Class row/column)

plt.figure(figsize=(10,8))
plt.title('Correlation of Attributes with Class variable')
a = sns.heatmap(correlation, square=True, annot=True, fmt='.2f', linecolor='grey')
# Re-apply the existing tick labels with a rotation so the column names fit
a.set_xticklabels(a.get_xticklabels(), rotation=90)
a.set_yticklabels(a.get_yticklabels(), rotation=30)
plt.show()

In [None]:
X = df.drop(['Class'], axis=1)
y = df['Class']

In [None]:
# split X and y into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [None]:
X_train.shape, X_test.shape

In [None]:
#Feature Engineering
X_train.dtypes

In [None]:
X_test.dtypes

In [None]:
print(X_train.shape,X_test.shape)

In [None]:
X_train.isnull().sum()

In [None]:
X_test.isnull().sum()

In [None]:
#Filling missing values with median values
for df1 in [X_train, X_test]:
    for col in X_train.columns:
        col_median=X_train[col].median()
        df1[col].fillna(col_median, inplace=True) 

In [None]:
X_train.isnull().sum()

In [None]:
X_test.isnull().sum()

In [None]:
cols = X_train.columns

In [None]:
#Feature Scaling

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
X_train = pd.DataFrame(X_train, columns=[cols])

In [None]:
X_test = pd.DataFrame(X_test, columns=[cols])

In [None]:
    X_train.head()

In [None]:
# import KNeighbors ClaSSifier from sklearn
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=7)
# fit the model to the training set
knn.fit(X_train, y_train)

In [None]:
#Prediction
y_pred = knn.predict(X_test)
y_pred

In [None]:
knn.predict_proba(X_test)[:,0]

In [None]:
knn.predict_proba(X_test)[:,1]

In [None]:
from sklearn.metrics import accuracy_score
print('Model accuracy score: {0:0.4f}'. format(accuracy_score(y_test, y_pred)))

In [None]:
y_pred_train = knn.predict(X_train)

In [None]:
print('Training-set accuracy score: {0:0.4f}'. format(accuracy_score(y_train, y_pred_train)))

In [None]:
# print the scores on training and test set
print('Training set score: {:.4f}'.format(knn.score(X_train, y_train)))
print('Test set score: {:.4f}'.format(knn.score(X_test, y_test)))

In [None]:
# Print the Confusion Matrix with k =3 and slice it into four pieces
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print('Confusion matrix\n\n', cm)
print('\nTrue Positives(TP) = ', cm[0,0])
print('\nTrue Negatives(TN) = ', cm[1,1])
print('\nFalse Positives(FP) = ', cm[0,1])
print('\nFalse Negatives(FN) = ', cm[1,0])

In [None]:
#Classification Metrices

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

In [None]:
# print the first 10 predicted probabilities of two classes- 2 and 4
y_pred_prob = knn.predict_proba(X_test)[0:10]
y_pred_prob

In [None]:
# store the probabilities in dataframe
y_pred_prob_df = pd.DataFrame(data=y_pred_prob, columns=['Prob of - benign cancer (2)', 'Prob of - malignant cancer (4)'])
y_pred_prob_df

In [None]:
# print the first 10 predicted probabilities for class 4 - Probability of malignant cancer

knn.predict_proba(X_test)[0:10, 1]

In [None]:
# store the predicted probabilities for class 4 - Probability of malignant cancer

y_pred_1 = knn.predict_proba(X_test)[:, 1]


In [None]:
# Class-4 probabilities recomputed into d_1; the histogram below plots
# y_pred_1, which holds the same quantity from the previous cell.
d_1 = knn.predict_proba(X_test)[:, 1]

# Histogram of the predicted malignant probabilities.
# The font size is changed after the figure is created but before any axes or
# text exist, matching the original call order.
plt.figure(figsize=(6, 4))
plt.rcParams.update({'font.size': 12})
plt.hist(y_pred_1, bins=10)
plt.title('Histogram of predicted probabilities of malignant cancer')
plt.xlim(0, 1)
plt.xlabel('Predicted probabilities of malignant cancer')
plt.ylabel('Frequency')