## Stroke Prediction with a Linear SVM

[Colab Notebook](https://colab.research.google.com/drive/1reXaYY2rGvJkvWl8G6Em7j6QWIAt5gkc)

[Full project on GitHub](https://github.com/z5208980/machine-learning-health/tree/main/stroke) (dataset, model, notebook)

[Dataset Source](https://www.kaggle.com/datasets/fedesoriano/stroke-prediction-dataset)

In [126]:
import numpy as np
import pandas as pd
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import binarize, LabelEncoder, MinMaxScaler, StandardScaler
from sklearn import metrics
from sklearn.metrics import accuracy_score, mean_squared_error, precision_recall_curve
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [167]:
# Raw stroke CSV, read straight from the project's GitHub repository.
data_url = 'https://raw.githubusercontent.com/z5208980/machine-learning-health/main/stroke/data/raw.csv'
df = pd.read_csv(data_url)

# Preview the first rows to sanity-check the columns that were loaded.
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [168]:
# --- Preprocessing: clean, scale and encode the raw frame, then save it ---

# `id` is a per-record identifier, not a feature. `errors="ignore"` keeps this
# cell re-runnable after the column has already been removed.
df = df.drop(columns="id", errors="ignore")

# Impute missing BMI values with the column mean. Assign the result back
# instead of calling `df['bmi'].fillna(..., inplace=True)`: that chained
# in-place form acts on a temporary Series, emits a FutureWarning on recent
# pandas and does nothing once Copy-on-Write is enabled.
df["bmi"] = df["bmi"].fillna(df["bmi"].mean())

# Rescale the continuous columns to [0, 1]. MinMaxScaler works column by
# column, so one fit over all three matches fitting each one separately.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-set statistics leak into the features — consider fitting on
# the training split only.
features = ["age", "avg_glucose_level", "bmi"]
scaler = MinMaxScaler()
df[features] = scaler.fit_transform(df[features])

# Replace each categorical string column with integer codes. A fresh encoder is
# fitted per column because every column has its own set of labels.
# NOTE(review): integer codes impose an arbitrary order on nominal categories;
# the one-hot alternative below avoids that.
features = ["gender", "ever_married", "work_type", "smoking_status", "Residence_type"]
for feature in features:
  encoder = LabelEncoder()
  df[feature] = encoder.fit_transform(df[feature])

# One Hot Encoding (alternative to label encoding, currently disabled)
# df = pd.get_dummies(df, columns=['gender', 'work_type', 'smoking_status'], prefix = ['gender', 'work_type', 'smoking_status'])

# Save the processed frame (path assumes the Colab `/content/sample_data` directory exists).
filename = '/content/sample_data/processed.csv'
df.to_csv(filename, index=False)


In [138]:
# Separate the predictors from the `stroke` target column.
X = df.drop(columns="stroke")
y = df["stroke"]

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=200)

# Standardize using statistics learned from the training split only, then
# apply that same transformation to the held-out split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [148]:
# Fit a linear-kernel support vector classifier on the standardized training
# split and predict labels for the held-out split.
model = SVC(C=1, kernel='linear')
model.fit(X_train, y_train)
y_pred_class = model.predict(X_test)

print('RESULT')
print('Accuracy:', metrics.accuracy_score(y_test, y_pred_class))
# NOTE(review): accuracy alone can look high if the `stroke` classes are
# imbalanced — confirm with a confusion matrix or recall before relying on it.

# Persist the fitted model. The `with` block guarantees the file is flushed and
# closed before any later cell reads it back; a bare `open(...)` passed to
# `pickle.dump` leaves the handle open until garbage collection.
filename = '/content/sample_data/model.sav'
with open(filename, 'wb') as model_file:
  pickle.dump(model, model_file)

RESULT
Accuracy: 0.9577464788732394


## Input

In [147]:
# List the model's input columns as readable labels,
# e.g. "avg_glucose_level" -> "Avg Glucose Level".
for column_name in X:
  label = column_name.replace('_', ' ').title()
  print(label)

Gender
Age
Hypertension
Heart Disease
Ever Married
Work Type
Residence Type
Avg Glucose Level
Bmi
Smoking Status


## Output

In [143]:
# Human-readable meaning of each value found in the `stroke` target column.
# Values without an entry here are skipped.
target_meaning = {0: "Not likely of stroke", 1: "Possibly of stroke"}
for target in df['stroke'].unique():
  if target in target_meaning:
    print(f"{target}: {target_meaning[target]}")

1: Possibly of stroke
0: Not likely of stroke


## Demo

In [151]:
# Load the persisted classifier. The `with` block closes the file handle.
# NOTE: unpickling can execute arbitrary code — only load model files you created or trust.
with open('/content/sample_data/model.sav', 'rb') as model_file:
  model = pickle.load(model_file)   # load model

# Pick one record to run through the model.
row = 526
sample = X.iloc[[row]]          # double brackets keep a 2-D (1, n_features) frame
val = sample.iloc[0].tolist()   # raw feature values, kept for display

# The classifier was trained on StandardScaler-transformed features, so the
# sample must pass through the same fitted scaler before prediction. Feeding
# the untransformed row would evaluate the model in the wrong feature space.
# `scaler` must be the StandardScaler fitted in the train/test split cell.
# NOTE(review): the scaler is not saved with the model — persist both together
# if this demo is meant to run in a fresh session.
sample_scaled = scaler.transform(sample)
output = model.predict(sample_scaled)

print("X=%s, Predicted=%s, Actually=%s" % (val, output[0], y.iloc[row]))

X=[0.0, 0.658203125, 0.0, 0.0, 1.0, 2.0, 1.0, 0.3417966946726987, 0.2531500572737686, 2.0], Predicted=0, Actually=0
