In [2]:
# import dependencies

from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
from pathlib import Path
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.preprocessing import StandardScaler

In [3]:
# Read the CSV file into a Pandas DataFrame
df = pd.read_csv(
    Path('../Data/cancer_patient_data.csv')
)
df.head()

Unnamed: 0,Patient Id,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,Balanced Diet,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level
0,P1,33,1,2,4,5,4,3,2,2,...,3,4,2,2,3,1,2,3,4,Low
1,P10,17,1,3,1,5,3,4,2,2,...,1,3,7,8,6,2,1,7,2,Medium
2,P100,35,1,4,5,6,5,5,4,6,...,8,7,9,2,1,4,6,7,2,High
3,P1000,37,1,7,7,7,7,6,7,7,...,4,2,3,1,4,5,6,7,5,High
4,P101,46,1,6,8,7,7,7,6,7,...,3,2,4,1,4,2,4,2,3,High


In [4]:
# change level variable from a string to an integer
df['Level'] = df['Level'].replace(['Low', 'Medium', 'High'], [1, 2, 3])
df

Unnamed: 0,Patient Id,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,Balanced Diet,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level
0,P1,33,1,2,4,5,4,3,2,2,...,3,4,2,2,3,1,2,3,4,1
1,P10,17,1,3,1,5,3,4,2,2,...,1,3,7,8,6,2,1,7,2,2
2,P100,35,1,4,5,6,5,5,4,6,...,8,7,9,2,1,4,6,7,2,3
3,P1000,37,1,7,7,7,7,6,7,7,...,4,2,3,1,4,5,6,7,5,3
4,P101,46,1,6,8,7,7,7,6,7,...,3,2,4,1,4,2,4,2,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,P995,44,1,6,7,7,7,7,6,7,...,5,3,2,7,8,2,4,5,3,3
996,P996,37,2,6,8,7,7,7,6,7,...,9,6,5,7,2,4,3,1,4,3
997,P997,25,2,4,5,6,5,5,4,6,...,8,7,9,2,1,4,6,7,2,3
998,P998,18,2,6,8,7,7,7,6,7,...,3,2,4,1,4,2,4,2,3,3


In [5]:
# Seperate the features, X,  from the target variable, y
y = df['Level']
X = df.drop(columns='Level')

# display
X.head()

Unnamed: 0,Patient Id,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,Balanced Diet,...,Coughing of Blood,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring
0,P1,33,1,2,4,5,4,3,2,2,...,4,3,4,2,2,3,1,2,3,4
1,P10,17,1,3,1,5,3,4,2,2,...,3,1,3,7,8,6,2,1,7,2
2,P100,35,1,4,5,6,5,5,4,6,...,8,8,7,9,2,1,4,6,7,2
3,P1000,37,1,7,7,7,7,6,7,7,...,8,4,2,3,1,4,5,6,7,5
4,P101,46,1,6,8,7,7,7,6,7,...,9,3,2,4,1,4,2,4,2,3


In [6]:
# view the first five entries of target variable
y[:5]

0    1
1    2
2    3
3    3
4    3
Name: Level, dtype: int64

In [7]:
# Encode the categorical variables using get_dummies
X = pd.get_dummies(X)

In [8]:
# Review the features data
X.head()

Unnamed: 0,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,Balanced Diet,Obesity,...,Patient Id_P990,Patient Id_P991,Patient Id_P992,Patient Id_P993,Patient Id_P994,Patient Id_P995,Patient Id_P996,Patient Id_P997,Patient Id_P998,Patient Id_P999
0,33,1,2,4,5,4,3,2,2,4,...,0,0,0,0,0,0,0,0,0,0
1,17,1,3,1,5,3,4,2,2,2,...,0,0,0,0,0,0,0,0,0,0
2,35,1,4,5,6,5,5,4,6,7,...,0,0,0,0,0,0,0,0,0,0
3,37,1,7,7,7,7,6,7,7,7,...,0,0,0,0,0,0,0,0,0,0
4,46,1,6,8,7,7,7,6,7,7,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [10]:
# Instantiate a StandardScaler instance
scaler = StandardScaler()

# Fit the training data to the standard scaler
X_scaler = scaler.fit(X_train)

# Transform the training data using the scaler
X_train_scaled = X_scaler.transform(X_train)

# Transform the testing data using the scaler
X_test_scaled = X_scaler.transform(X_test)

In [11]:
# Instantiate the KNeighborsClassifier model with n_neighbors = 3 
knn = KNeighborsClassifier(n_neighbors=3)

In [12]:
# Train the model using the training data
knn.fit(X_train_scaled, y_train)

KNeighborsClassifier(n_neighbors=3)

In [13]:
# Create predictions using the testing data
y_pred = knn.predict(X_test_scaled)
y_pred

array([2, 1, 2, 1, 3, 1, 1, 1, 3, 1, 1, 3, 2, 2, 2, 3, 3, 1, 1, 1, 1, 3,
       1, 1, 3, 2, 3, 3, 1, 1, 3, 1, 2, 1, 2, 1, 2, 1, 3, 1, 3, 2, 2, 2,
       3, 3, 3, 1, 2, 3, 1, 1, 2, 2, 1, 3, 3, 3, 1, 2, 2, 2, 1, 3, 1, 3,
       3, 1, 2, 3, 1, 1, 2, 2, 1, 2, 2, 1, 3, 2, 1, 2, 3, 3, 2, 3, 2, 2,
       3, 2, 3, 2, 2, 3, 2, 3, 3, 1, 1, 1, 1, 3, 3, 1, 3, 1, 1, 3, 1, 1,
       2, 2, 1, 1, 3, 2, 1, 3, 2, 3, 2, 3, 3, 2, 3, 3, 3, 2, 2, 2, 3, 2,
       3, 3, 3, 2, 3, 2, 2, 3, 2, 1, 1, 3, 3, 2, 2, 3, 3, 3, 3, 2, 2, 2,
       1, 3, 1, 3, 2, 1, 1, 3, 1, 3, 3, 2, 3, 1, 3, 2, 3, 2, 1, 3, 3, 1,
       2, 2, 1, 3, 3, 1, 1, 1, 3, 3, 3, 2, 2, 2, 1, 1, 3, 2, 2, 3, 3, 3,
       1, 3, 1, 3, 3, 1, 2, 2, 2, 1, 1, 2, 2, 2, 1, 3, 3, 2, 3, 2, 3, 1,
       3, 3, 1, 2, 1, 3, 3, 2, 3, 2, 3, 3, 1, 3, 1, 1, 1, 1, 2, 2, 1, 2,
       3, 3, 1, 1, 3, 3, 2, 2], dtype=int64)

In [14]:
# Print confusion matrix
confusion_matrix(y_pred,y_test)

array([[78,  0,  0],
       [ 0, 78,  0],
       [ 0,  0, 94]], dtype=int64)

In [15]:
 # Print the classification report comparing the testing data to the model predictions
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       1.00      1.00      1.00        78
           2       1.00      1.00      1.00        78
           3       1.00      1.00      1.00        94

    accuracy                           1.00       250
   macro avg       1.00      1.00      1.00       250
weighted avg       1.00      1.00      1.00       250

