In [35]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

In [2]:
patients_data = pd.read_csv("patients_preprocessed.csv")

In [3]:
patients_data

Unnamed: 0,PatientID,Name,Age,Diagnosis,LabResult,Diagnosis_Encoded
0,1001,Patient_1,20,Diabetes,167,0
1,1002,Patient_2,74,Diabetes,153,0
2,1003,Patient_3,40,Hypertension,196,1
3,1004,Patient_4,23,Unknown,157,2
4,1005,Patient_5,35,Diabetes,115,0
...,...,...,...,...,...,...
95,1096,Patient_96,40,Diabetes,96,0
96,1097,Patient_97,60,Diabetes,192,0
97,1098,Patient_98,45,Diabetes,88,0
98,1099,Patient_99,61,Common Cold,81,4


In [5]:
patients_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   PatientID          100 non-null    int64 
 1   Name               100 non-null    object
 2   Age                100 non-null    int64 
 3   Diagnosis          100 non-null    object
 4   LabResult          100 non-null    int64 
 5   Diagnosis_Encoded  100 non-null    int64 
dtypes: int64(4), object(2)
memory usage: 4.8+ KB


In [6]:
patients_data.describe()

Unnamed: 0,PatientID,Age,LabResult,Diagnosis_Encoded
count,100.0,100.0,100.0,100.0
mean,1050.5,48.22,140.34,1.91
std,29.011492,17.538172,36.387149,1.414892
min,1001.0,20.0,80.0,0.0
25%,1025.75,35.75,108.0,0.0
50%,1050.5,47.0,137.0,2.0
75%,1075.25,62.25,172.0,3.0
max,1100.0,78.0,199.0,4.0


In [8]:
patients_data['Diagnosis'].unique()

array(['Diabetes', 'Hypertension', 'Unknown', 'Flu', 'Common Cold'],
      dtype=object)

In [7]:
patients_data['Diagnosis'].value_counts()

Flu             27
Diabetes        26
Unknown         21
Common Cold     14
Hypertension    12
Name: Diagnosis, dtype: int64

In [10]:
# Prepare the dataset for training
# We will remove rows with "Unknown" diagnosis to train the model and then predict for the "Unknown" ones
known_diagnosis_data = patients_data[patients_data['Diagnosis'] != 'Unknown']
unknown_diagnosis_data = patients_data[patients_data['Diagnosis'] == 'Unknown']

In [11]:
known_diagnosis_data

Unnamed: 0,PatientID,Name,Age,Diagnosis,LabResult,Diagnosis_Encoded
0,1001,Patient_1,20,Diabetes,167,0
1,1002,Patient_2,74,Diabetes,153,0
2,1003,Patient_3,40,Hypertension,196,1
4,1005,Patient_5,35,Diabetes,115,0
5,1006,Patient_6,54,Flu,98,3
...,...,...,...,...,...,...
94,1095,Patient_95,75,Flu,108,3
95,1096,Patient_96,40,Diabetes,96,0
96,1097,Patient_97,60,Diabetes,192,0
97,1098,Patient_98,45,Diabetes,88,0


In [13]:
unknown_diagnosis_data.sample(4)

Unnamed: 0,PatientID,Name,Age,Diagnosis,LabResult,Diagnosis_Encoded
18,1019,Patient_19,62,Unknown,188,2
23,1024,Patient_24,62,Unknown,95,2
8,1009,Patient_9,28,Unknown,129,2
28,1029,Patient_29,21,Unknown,160,2


In [14]:
# Features and target
X = known_diagnosis_data[['Age', 'LabResult']]
y = known_diagnosis_data['Diagnosis'] # target

In [15]:
X

Unnamed: 0,Age,LabResult
0,20,167
1,74,153
2,40,196
4,35,115
5,54,98
...,...,...
94,75,108
95,40,96
96,60,192
97,45,88


In [16]:
y

0         Diabetes
1         Diabetes
2     Hypertension
4         Diabetes
5              Flu
          ...     
94             Flu
95        Diabetes
96        Diabetes
97        Diabetes
98     Common Cold
Name: Diagnosis, Length: 79, dtype: object

In [17]:
X_train , x_test , y_train , y_test = train_test_split(X, y , test_size= 0.20 , random_state= 42)

## Decision Tree

In [18]:
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train , y_train)

In [19]:
accuracy = clf.score(x_test , y_test)

In [20]:
accuracy

0.0625

In [24]:
x_unknown = unknown_diagnosis_data[["Age",'LabResult']]
predicted_diagnosis = clf.predict(x_unknown)

In [25]:
predicted_diagnosis

array(['Flu', 'Flu', 'Hypertension', 'Diabetes', 'Diabetes', 'Diabetes',
       'Diabetes', 'Common Cold', 'Flu', 'Diabetes', 'Flu', 'Diabetes',
       'Common Cold', 'Diabetes', 'Common Cold', 'Hypertension', 'Flu',
       'Common Cold', 'Common Cold', 'Common Cold', 'Common Cold'],
      dtype=object)

In [28]:
unknown_diagnosis_data.sample(4)

Unnamed: 0,PatientID,Name,Age,Diagnosis,LabResult,Diagnosis_Encoded
11,1012,Patient_12,66,Unknown,83,2
50,1051,Patient_51,42,Unknown,123,2
21,1022,Patient_22,50,Unknown,130,2
26,1027,Patient_27,24,Unknown,184,2


In [29]:
# Update the dataframe with the imputed values
unknown_diagnosis_data['Diagnosis'] = predicted_diagnosis

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unknown_diagnosis_data['Diagnosis'] = predicted_diagnosis


In [30]:
unknown_diagnosis_data.sample(4)

Unnamed: 0,PatientID,Name,Age,Diagnosis,LabResult,Diagnosis_Encoded
16,1017,Patient_17,21,Diabetes,80,2
6,1007,Patient_7,65,Flu,131,2
36,1037,Patient_37,39,Diabetes,139,2
23,1024,Patient_24,62,Diabetes,95,2


In [31]:
# Combine the data back together 
imputed_data = pd.concat([known_diagnosis_data, unknown_diagnosis_data], axis=0).sort_index()

In [32]:
imputed_data

Unnamed: 0,PatientID,Name,Age,Diagnosis,LabResult,Diagnosis_Encoded
0,1001,Patient_1,20,Diabetes,167,0
1,1002,Patient_2,74,Diabetes,153,0
2,1003,Patient_3,40,Hypertension,196,1
3,1004,Patient_4,23,Flu,157,2
4,1005,Patient_5,35,Diabetes,115,0
...,...,...,...,...,...,...
95,1096,Patient_96,40,Diabetes,96,0
96,1097,Patient_97,60,Diabetes,192,0
97,1098,Patient_98,45,Diabetes,88,0
98,1099,Patient_99,61,Common Cold,81,4


------

## KNN Classifier

In [36]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(x_test)
x_unknown_scaled = scaler.transform(x_unknown)

In [37]:
# train model
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_scaled , y_train)

In [38]:
# validate model on test set
knn_score = knn.score(X_test_scaled , y_test)
knn_score

0.375

------

## Full Code

In [4]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

# Prepare the dataset for training
# We will remove rows with "Unknown" diagnosis to train the model and then predict for the "Unknown" ones
known_diagnosis_data = patients_data[patients_data['Diagnosis'] != 'Unknown']
unknown_diagnosis_data = patients_data[patients_data['Diagnosis'] == 'Unknown']

# Features and target
X = known_diagnosis_data[['Age', 'LabResult']]
y = known_diagnosis_data['Diagnosis']

# Split the data into training and testing sets for model validation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Decision Tree classifier
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Validate the model on the test set
accuracy = clf.score(X_test, y_test)

# Predict the diagnosis for the rows with "Unknown" diagnosis
X_unknown = unknown_diagnosis_data[['Age', 'LabResult']]
predicted_diagnosis = clf.predict(X_unknown)

# Update the dataframe with the imputed values
unknown_diagnosis_data['Diagnosis'] = predicted_diagnosis

# Combine the data back together
imputed_data = pd.concat([known_diagnosis_data, unknown_diagnosis_data], axis=0).sort_index()

accuracy, imputed_data[imputed_data['Diagnosis'] == 'Unknown'].shape[0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


(0.0625, 0)

In [5]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# Since k-NN is sensitive to feature scales, we'll standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_unknown_scaled = scaler.transform(X_unknown)

# Train a k-NN classifier
knn = KNeighborsClassifier(n_neighbors=5)  # We'll use the commonly chosen k=5 for simplicity
knn.fit(X_train_scaled, y_train)

# Validate the model on the test set
knn_accuracy = knn.score(X_test_scaled, y_test)

knn_accuracy


0.375