In [61]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report



In [64]:
# Load the dataset
file_path = 'C:/Users/91709/Downloads/2024_projects/14_HospitalPatientSurvival/training_data.csv'
data = pd.read_csv(file_path)




In [65]:
# Display the first few rows of the dataset
print("First few rows of the dataset:")
data.head()



First few rows of the dataset:


Unnamed: 0,ID_Patient_Care_Situation,Diagnosed_Condition,Patient_ID,Treated_with_drugs,Patient_Age,Patient_Body_Mass_Index,Patient_Smoker,Patient_Rural_Urban,Patient_mental_condition,A,B,C,D,E,F,Z,Number_of_prev_cond,Survived_1_year
0,16201,47,8433,DX2,60,21.655523,NO,URBAN,Stable,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,1
1,9421,3,2972,DX6,2,28.852743,NO,RURAL,Stable,1.0,0.0,1.0,0.0,1.0,0.0,0.0,3.0,0
2,16205,7,8608,Dx6,20,26.179725,NO,RURAL,Stable,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,1
3,5582,31,10074,dx6,8,22.638945,NO,RURAL,Stable,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0
4,20880,43,7462,dx1,53,21.326131,NO,RURAL,Stable,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1


In [66]:
# Display the summary statistics of the dataset
print("\nSummary statistics of the dataset:")
print(data.describe())





Summary statistics of the dataset:
       ID_Patient_Care_Situation  Diagnosed_Condition    Patient_ID  \
count               25079.000000         25079.000000  25079.000000   
mean                16509.925396            26.382631   6237.758523   
std                  9536.958469            15.096967   3603.338849   
min                     1.000000             0.000000      1.000000   
25%                  8220.000000            13.000000   3125.000000   
50%                 16466.000000            26.000000   6228.000000   
75%                 24784.000000            40.000000   9356.500000   
max                 33013.000000            52.000000  12514.000000   

        Patient_Age  Patient_Body_Mass_Index             A             B  \
count  25079.000000             25079.000000  23723.000000  23723.000000   
mean      33.193548                23.439952      0.894111      0.136028   
std       19.507617                 3.778849      0.307702      0.342826   
min        0.000000 

In [67]:
# Display the info of the dataset
print("\nInfo of the dataset:")
print(data.info())




Info of the dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25079 entries, 0 to 25078
Data columns (total 18 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   ID_Patient_Care_Situation  25079 non-null  int64  
 1   Diagnosed_Condition        25079 non-null  int64  
 2   Patient_ID                 25079 non-null  int64  
 3   Treated_with_drugs         25071 non-null  object 
 4   Patient_Age                25079 non-null  int64  
 5   Patient_Body_Mass_Index    25079 non-null  float64
 6   Patient_Smoker             25079 non-null  object 
 7   Patient_Rural_Urban        25079 non-null  object 
 8   Patient_mental_condition   25079 non-null  object 
 9   A                          23723 non-null  float64
 10  B                          23723 non-null  float64
 11  C                          23723 non-null  float64
 12  D                          23723 non-null  float64
 13  E                       

In [68]:
# Check for missing values
print("\nChecking for missing values:")
print(data.isnull().sum())




Checking for missing values:
ID_Patient_Care_Situation       0
Diagnosed_Condition             0
Patient_ID                      0
Treated_with_drugs              8
Patient_Age                     0
Patient_Body_Mass_Index         0
Patient_Smoker                  0
Patient_Rural_Urban             0
Patient_mental_condition        0
A                            1356
B                            1356
C                            1356
D                            1356
E                            1356
F                            1356
Z                            1356
Number_of_prev_cond          1356
Survived_1_year                 0
dtype: int64


In [69]:
# Handling missing values (if any)
# For this example, we'll drop rows with missing values
data = data.dropna()



In [70]:
# Identify categorical columns
categorical_columns = data.select_dtypes(include=['object']).columns.tolist()



In [72]:
# Split the data into features and target
X = data.drop('Survived_1_year', axis=1)  # Replace 'Survival' with the actual target column name
y = data['Survived_1_year']  # Replace 'Survival' with the actual target column name



In [73]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [85]:
# Create a preprocessor for the categorical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_columns)
    ],
    remainder='passthrough'  # Leave the rest of the columns untouched
)



In [86]:
# Create a pipeline that first transforms the data and then applies the scaler
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler(with_mean=False))
])



In [87]:
# Apply the transformations to the training and test data
X_train_transformed = pipeline.fit_transform(X_train)
X_test_transformed = pipeline.transform(X_test)



In [77]:
# Initialize and train the KNN classifier
knn = KNeighborsClassifier(n_neighbors=5)  # You can tune the 'n_neighbors' parameter
knn.fit(X_train_transformed, y_train)



In [78]:
# Make predictions on the test set
y_pred = knn.predict(X_test_transformed)



In [82]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print("\nModel Evaluation:")
print(f"Accuracy: {accuracy}")




Model Evaluation:
Accuracy: 0.7141049968374447


In [83]:
print("Confusion Matrix:")
print(conf_matrix)


Confusion Matrix:
[[ 941  827]
 [ 529 2446]]


In [84]:
print("Classification Report:")
print(class_report)

Classification Report:
              precision    recall  f1-score   support

           0       0.64      0.53      0.58      1768
           1       0.75      0.82      0.78      2975

    accuracy                           0.71      4743
   macro avg       0.69      0.68      0.68      4743
weighted avg       0.71      0.71      0.71      4743

