## MACHINE LEARNING MODEL FOR DIABETES PREDICTION

In [1]:
#libraries
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_regression
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestClassifier
from keras.models import Sequential
from keras.layers import Dense
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
import warnings
import pickle
warnings.filterwarnings("ignore")

### Dabetes symptoms

In [2]:
#loading data
df_symptoms = pd.read_csv('useful/diabetes_data_upload_20_40.csv')
df_symptoms.head()

Unnamed: 0,Age,Gender,Frequent_Urination,Frequent_Thirst,Sudden_Weight_Loss,Weakness,Excessive_Eating,Visual_Blurring,Itching,Irritability,Delayed_Healing,Partial_Paresis,Muscle_Stiffness,Alopecia,Obesity,Class
0,40,Male,No,Yes,No,Yes,No,No,Yes,No,Yes,No,Yes,Yes,Yes,1
1,38,Male,Yes,Yes,No,No,Yes,No,Yes,No,Yes,No,Yes,No,No,1
2,35,Male,Yes,No,No,No,Yes,No,No,Yes,Yes,No,No,Yes,No,1
3,39,Male,Yes,No,Yes,No,No,No,Yes,Yes,No,No,No,Yes,No,1
4,32,Male,No,No,No,No,No,No,No,Yes,Yes,No,No,No,Yes,1


In [3]:
# converting categorical variable into numerical
label_encoder = preprocessing.LabelEncoder()
for column in df_symptoms.columns[1:]:
    df_symptoms[column] =  label_encoder.fit_transform(df_symptoms[column])
    
df_symptoms.head()

Unnamed: 0,Age,Gender,Frequent_Urination,Frequent_Thirst,Sudden_Weight_Loss,Weakness,Excessive_Eating,Visual_Blurring,Itching,Irritability,Delayed_Healing,Partial_Paresis,Muscle_Stiffness,Alopecia,Obesity,Class
0,40,1,0,1,0,1,0,0,1,0,1,0,1,1,1,1
1,38,1,1,1,0,0,1,0,1,0,1,0,1,0,0,1
2,35,1,1,0,0,0,1,0,0,1,1,0,0,1,0,1
3,39,1,1,0,1,0,0,0,1,1,0,0,0,1,0,1
4,32,1,0,0,0,0,0,0,0,1,1,0,0,0,1,1


In [4]:
#select columns based on exploration
x_var = df_symptoms[['Age','Gender','Frequent_Urination','Frequent_Thirst','Weakness','Excessive_Eating','Delayed_Healing','Partial_Paresis']]
y_var = df_symptoms['Class']

In [6]:
X_train,X_test,y_train,y_test = train_test_split(x_var,y_var,test_size = 0.2,random_state=42)

In [7]:
# normalise the columns using standard scalar
scalar_symptom = StandardScaler()
X_train = scalar_symptom.fit_transform(X_train)
X_test = scalar_symptom.transform(X_test)

In [11]:
#dataset 1 scalar file
pickle.dump(scalar_symptom,open('scalar_symptom.pkl','wb'))

In [13]:
#model trained using random forest
for i in range(1,100):
    
    rforest = RandomForestClassifier(n_estimators=i,criterion='entropy',random_state=0)
    rforest.fit(X_train,y_train)

accu = cross_val_score(estimator=rforest, X=X_train ,y=y_train,cv=10)
print("accuracy = {:.2f} %".format(accu.mean()*100))
print("standard deviation = {:.2f} %".format(accu.std()*100))

accuracy = 97.03 %
standard deviation = 4.84 %


In [14]:
#prediction
pred_symptom_diabetic = rforest.predict(X_test)
#evaluation
rf_accuracy = accuracy_score(pred_symptom_diabetic,y_test)
con_matrix = confusion_matrix(pred_symptom_diabetic,y_test)
print(rf_accuracy)
print(con_matrix)

0.9411764705882353
[[12  1]
 [ 1 20]]


In [15]:
# Neural network for converting binary to percentage
num_var = X_train.shape[1]
# 0 for not having and 1 for having diabetes
num_class = 2

# define soft_max_model with softmax activation
soft_max_model = Sequential()
soft_max_model.add(Dense(32, input_dim = num_var, activation ='relu'))
soft_max_model.add(Dense(num_class, activation ='softmax'))

# compile soft_max_model
soft_max_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# convert binary labels to categorical
from keras.utils import to_categorical
y_train_cat = to_categorical(y_train)

# train soft_max_model
soft_max_model.fit(X_train, y_train_cat, epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1e22bb28490>

In [16]:
# perform perdiction on test data
y_prediction = soft_max_model.predict(X_test)*100



### User Demograph

In [17]:
df_demo = pd.read_csv('useful/Diab_pyth_data.csv')
df_demo.head()

Unnamed: 0,Age,Gender,Blood_Pressure,Family_member_with_Diabetes_past_present,BMI,percentage_diabetes
0,40,Male,1,0,28,62.0
1,38,Female,1,1,20,70.0
2,39,Female,0,0,21,60.25
3,32,Female,0,1,25,51.25
4,24,Female,1,1,26,31.5


In [18]:
# converting categorical variable into numerical
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()
df_demo['Gender'] =  label_encoder.fit_transform(df_demo['Gender'])
    
df_demo.head()

Unnamed: 0,Age,Gender,Blood_Pressure,Family_member_with_Diabetes_past_present,BMI,percentage_diabetes
0,40,1,1,0,28,62.0
1,38,0,1,1,20,70.0
2,39,0,0,0,21,60.25
3,32,0,0,1,25,51.25
4,24,0,1,1,26,31.5


In [19]:
x_var1 = df_demo.iloc[:,0:-1]
y_var1 = df_demo.iloc[:,-1]

In [20]:
from sklearn.model_selection import train_test_split
X_train1,X_test1,y_train1,y_test1 = train_test_split(x_var1,y_var1, test_size = 0.3, random_state=42)

In [21]:
# normalise the columns using standard scalar
scalar_demo = StandardScaler()
X_train1 = scalar_demo.fit_transform(X_train1)
X_test1 = scalar_demo.transform(X_test1)

In [35]:
#dataset 2 scalar file
pickle.dump(scalar_demo,open('scalar_demo.pkl','wb'))

In [22]:
#model trained using decision tree
dtree = DecisionTreeRegressor(criterion='mse')
dtree.fit(X_train1, y_train1)

DecisionTreeRegressor(criterion='mse')

In [23]:
y_pred_demo = dtree.predict(X_test1)
mae = mean_absolute_error(y_test1, y_pred_demo)
print("Mean Absolute Error: {:.2f}".format(mae))

Mean Absolute Error: 15.79


### Combine prediction

In [24]:
#user input dataset1
#'Age','Gender','Frequent_Urination','Frequent_Thirst','Weakness','Excessive_Eating','Delayed_Healing','Partial_Paresis']]
user_symptom = np.array([40, 1, 1, 1, 1, 1, 1, 1])
user_symptom = user_symptom.reshape(1, -1)

#user input dataset2
#Age,Gender,Blood_Pressure,Family_member_with_Diabetes_past_present,BMI
user_demo = np.array([40,1,1,1,39])
user_demo = user_demo.reshape(1, -1)

# apply the scaler on the input array
user_symptom_scaled = scalar_symptom.transform(user_symptom)
user_demo_scaled = scalar_demo.transform(user_demo)

print(user_symptom_scaled)
print(user_demo_scaled)

[[1.23973793 0.83971912 1.19087439 1.20953006 1.22859023 1.7765838
  1.47196014 1.44672847]]
[[1.62552335 0.99935605 1.01200523 0.99502215 1.54760889]]


In [25]:
#prediction
pred_user_symptom = soft_max_model.predict(user_symptom_scaled)
pred_user_demo = dtree.predict(user_demo_scaled)



In [26]:
pred_user_symptom[0][1]* 100

86.13027334213257

In [27]:
pred_user_demo[0]

79.5

In [28]:
(pred_user_demo[0] + pred_user_symptom[0][1]* 100)/2

82.81513667106628

### Pickle file

In [29]:
import pickle
#dataset 1
pickle.dump(soft_max_model,open('model_symptom.pkl','wb'))

In [30]:
model_symptom = pickle.load(open('model_symptom.pkl','rb'))

In [31]:
model_symptom.predict(user_symptom_scaled)



array([[0.13869733, 0.86130273]], dtype=float32)

In [32]:
#dataset2
pickle.dump(dtree,open('model_demo.pkl','wb'))

In [33]:
model_demo = pickle.load(open('model_demo.pkl','rb'))

In [34]:
model_demo.predict(user_demo_scaled)

array([79.5])