## Hepatitis C Data ML (Logistic Regression, Random Forest, Gradient Boosting)

[Colab Notebook](https://colab.research.google.com/drive/1qT7Gd1duB4HdDdSGsHbByIjNF3GYyVe3)

[Full project on GitHub](https://github.com/z5208980/machine-learning-health/tree/main/hep_c/) (dataset, model, notebook)

[Dataset Source](https://www.kaggle.com/datasets/fedesoriano/hepatitis-c-dataset)


In [51]:
import numpy as np
import pandas as pd
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import binarize, LabelEncoder, MinMaxScaler, StandardScaler
from sklearn import metrics
from sklearn.metrics import accuracy_score, mean_squared_error, precision_recall_curve
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression


In [52]:
# Load the raw Hepatitis C dataset from the project repository.
df = pd.read_csv('https://raw.githubusercontent.com/z5208980/machine-learning-health/main/hep_c/data/raw.csv')
df = df.drop(columns="Unnamed: 0")  # Drop index

# Remove everything up to and including the '=' in each label, then title-case the rest.
df["Category"] = df["Category"].str.replace(".+=", '', regex=True)
df["Category"] = df["Category"].str.title()

# Encode m and f to ints
encoder = LabelEncoder()
df["Sex"] = encoder.fit_transform(df["Sex"])

# Fill in missing data with each numeric column's median.
# numeric_only=True is needed because "Category" is still a string column here:
# without it pandas 1.x emits a deprecation warning and pandas 2.x raises TypeError.
df = df.fillna(df.median(numeric_only=True))  # Not sure if appropriate for medical values
# df.fillna(0, inplace=True)
df.head()

# Save
# filename = '/content/sample_data/processed.csv'
# df.to_csv(filename, index=False)

  del sys.path[0]


Unnamed: 0,Category,Age,Sex,ALB,ALP,ALT,AST,BIL,CHE,CHOL,CREA,GGT,PROT
0,Blood Donor,32,1,38.5,52.5,7.7,22.1,7.5,6.93,3.23,106.0,12.1,69.0
1,Blood Donor,32,1,38.5,70.3,18.0,24.7,3.9,11.17,4.8,74.0,15.6,76.5
2,Blood Donor,32,1,46.9,74.7,36.2,52.6,6.1,8.84,5.2,86.0,33.2,79.3
3,Blood Donor,32,1,43.2,52.0,30.6,22.6,18.9,7.33,4.74,80.0,33.8,75.7
4,Blood Donor,32,1,39.2,74.1,32.6,24.8,9.6,9.15,4.32,76.0,29.9,68.7


In [53]:
# Separate the target label column from the feature columns.
y = df["Category"]
X = df.drop(columns=["Category"])


In [54]:
# Hold out 25% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=200
)

# Standardise the features: fit the scaler on the training split only,
# then apply the same transformation to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [55]:
def LR():
  """Fit a LogisticRegression on the training split, then report and save it.

  Reads the notebook-level ``X_train`` / ``y_train``. Scoring on the test
  split and pickling the model are delegated to the ``accuracy_score(model)``
  helper defined further down in this cell (the local function, not
  ``sklearn.metrics.accuracy_score``), the same way ``GBC`` does, so the
  evaluate-and-save steps are not duplicated here.
  """
  model = LogisticRegression()
  model.fit(X_train, y_train)

  accuracy_score(model)

def RFC():
  """Fit a RandomForestClassifier on the training split, then report and save it.

  Reads the notebook-level ``X_train`` / ``y_train``. Scoring on the test
  split and pickling the model are delegated to the ``accuracy_score(model)``
  helper defined further down in this cell (the local function, not
  ``sklearn.metrics.accuracy_score``), the same way ``GBC`` does, so the
  evaluate-and-save steps are not duplicated here.
  """
  model = RandomForestClassifier()
  model.fit(X_train, y_train)

  accuracy_score(model)

def GBC():
  """Fit a GradientBoostingClassifier on the training split and evaluate it.

  Uses the notebook-level ``X_train`` / ``y_train`` and hands the fitted
  estimator to the local ``accuracy_score`` helper.
  """
  # fit() returns the estimator itself, so construction and training chain.
  fitted = GradientBoostingClassifier().fit(X_train, y_train)
  accuracy_score(fitted)

def accuracy_score(model):
  """Print the test-set accuracy of a fitted model and pickle it to disk.

  Predicts on the notebook-level ``X_test``, prints the accuracy against
  ``y_test``, and writes the model to '/content/sample_data/model.sav'.

  NOTE: this function's name shadows ``accuracy_score`` imported from
  ``sklearn.metrics`` at the top of the notebook, which is why the metric is
  reached through ``metrics.accuracy_score`` below.

  Args:
    model: a fitted estimator exposing ``predict``.
  """
  y_pred_class = model.predict(X_test)

  print('RESULT')
  print('Accuracy:', metrics.accuracy_score(y_test, y_pred_class))

  filename = '/content/sample_data/model.sav'
  # The context manager closes (and flushes) the file even if pickling fails;
  # the previous bare open() left the handle to the garbage collector.
  with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)


# Train, evaluate and save the logistic-regression model.
# RFC() and GBC() are defined above and write to the same model file.
LR()

RESULT
Accuracy: 0.922077922077922


## Input

In [56]:
# Human-readable description for every feature column, keyed by column name.
metadata = {
    "Age": {"description": "Age"},
    "Sex": {"description": "Gender (M|F)"},
    "ALB": {"description": "Albumin (protein produced by liver)"},
    "ALP": {"description": "Alkaline Phosphatase (Blood test of ALP enzyme in blood)"},
    "ALT": {"description": "Alanine Transaminase (Blood test of ALT enzyme in blood)"},
    "AST": {"description": "Aspartate Aminotransferase (Blood test of AST enzyme in blood)"},
    "BIL": {"description": "Bilirubin"},
    "CHE": {"description": "Acetylcholinesterase (Blood test)"},
    "CHOL": {"description": "Cholesterol"},
    "CREA": {"description": "Creatinine"},
    "GGT": {"description": "Gamma-Glutamyl Transferase"},
    "PROT": {"description": "Protein"},
}

# Print the descriptions in the same order as the feature columns of X.
for feature in X.columns:
  entry = metadata[feature]
  print(entry["description"])

Age
Gender (M|F)
Albumin (protein produced by liver)
Alkaline Phosphatase (Blood test of ALP enzyme in blood)
Alanine Transaminase (Blood test of ALT enzyme in blood)
Aspartate Aminotransferase (Blood test of AST enzyme in blood)
Bilirubin
Acetylcholinesterase (Blood test)
Cholesterol
Creatinine
Gamma-Glutamyl Transferase
Protein


## Possible Outcomes

In [57]:
# Show each distinct Category value present in the data, in order of first appearance.
categories = df['Category'].unique()
for target in categories:
  print(target)

Blood Donor
Suspect Blood Donor
Hepatitis
Fibrosis
Cirrhosis


## DEMO

[Model](#)

In [58]:
# Load the model saved by the training cell.
# NOTE: pickle.load can execute arbitrary code, so only load model files that
# this notebook (or another trusted source) produced.
with open('/content/sample_data/model.sav', 'rb') as model_file:
  model = pickle.load(model_file)

# Take one (already scaled) row from the training split as the demo input.
row = 200
val = list(X_train[row])

# predict() expects a 2-D input, so wrap the single row in a list.
# Named `sample` rather than `input` to avoid shadowing the builtin.
sample = [val]
output = model.predict(sample)

print("X=%s, Predicted=%s, Actually=%s" % (sample[0], output[0], y_train.iloc[row]))

X=[-0.7052232780919925, 0.8040078180145834, -0.045733332806742175, -0.03147139495730363, 1.1785683229506227, 0.05812158875788527, -0.2219969538910292, -1.8083413098012011, 0.6218495773044013, -0.12303655751846336, 0.933390836209473, 0.07987049747609182], Predicted=Blood Donor, Actually=Blood Donor
