In [1]:
!pip install pandas scikit-learn numpy



In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score

### Data set

In [2]:
diabetes_report_dataset = pd.read_csv('diabetes.csv')

In [3]:
diabetes_symptom_dataset = pd.read_csv('Dataset 2 _ Early-stage diabetes risk prediction dataset (ESDRPD).csv')

In [4]:
diabetes_symptom_dataset.head()

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,Sudden weight loss,Weakness,Polyphagia,Genital thrush,Visual blurring,Itching,Irritability,Delayed healing,Partial paresis,Muscle stiffness,Alopecia,Obesity,Class
0,40,Male,No,Yes,No,Yes,No,No,No,Yes,No,Yes,No,Yes,Yes,Yes,Positive
1,58,Male,No,No,No,Yes,No,No,Yes,No,No,No,Yes,No,Yes,No,Positive
2,41,Male,Yes,No,No,Yes,Yes,No,No,Yes,No,Yes,No,Yes,Yes,No,Positive
3,45,Male,No,No,Yes,Yes,Yes,Yes,No,Yes,No,Yes,No,No,No,No,Positive
4,60,Male,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Positive


In [5]:
diabetes_report_dataset.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


# Getting data ready for Processing

In [6]:
diabetes_symptom_dataset['Gender'].replace(to_replace=['Female', 'Male'], value=[0, 1], inplace = True)
diabetes_symptom_dataset['Polyuria'].replace(to_replace=['No', 'Yes'], value=[0, 1], inplace = True)
diabetes_symptom_dataset['Polydipsia'].replace(to_replace=['No', 'Yes'], value=[0, 1], inplace = True)
diabetes_symptom_dataset['Sudden weight loss'].replace(to_replace=['No', 'Yes'], value=[0, 1], inplace = True)
diabetes_symptom_dataset['Weakness'].replace(to_replace=['No', 'Yes'], value=[0, 1], inplace = True)
diabetes_symptom_dataset['Polyphagia'].replace(to_replace=['No', 'Yes'], value=[0, 1], inplace = True)
diabetes_symptom_dataset['Genital thrush'].replace(to_replace=['No', 'Yes'], value=[0, 1], inplace = True)
diabetes_symptom_dataset['Visual blurring'].replace(to_replace=['No', 'Yes'], value=[0, 1], inplace = True)
diabetes_symptom_dataset['Itching'].replace(to_replace=['No', 'Yes'], value=[0, 1], inplace = True)
diabetes_symptom_dataset['Irritability'].replace(to_replace=['No', 'Yes'], value=[0, 1], inplace = True)
diabetes_symptom_dataset['Delayed healing'].replace(to_replace=['No', 'Yes'], value=[0, 1], inplace = True)
diabetes_symptom_dataset['Partial paresis'].replace(to_replace=['No', 'Yes'], value=[0, 1], inplace = True)
diabetes_symptom_dataset['Muscle stiffness'].replace(to_replace=['No', 'Yes'], value=[0, 1], inplace = True)
diabetes_symptom_dataset['Alopecia'].replace(to_replace=['No', 'Yes'], value=[0, 1], inplace = True)
diabetes_symptom_dataset['Obesity'].replace(to_replace=['No', 'Yes'], value=[0, 1], inplace = True)
diabetes_symptom_dataset['Class'].replace(to_replace=['Negative', 'Positive'], value=[0, 1], inplace = True)

In [7]:
diabetes_symptom_dataset

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,Sudden weight loss,Weakness,Polyphagia,Genital thrush,Visual blurring,Itching,Irritability,Delayed healing,Partial paresis,Muscle stiffness,Alopecia,Obesity,Class
0,40,1,0,1,0,1,0,0,0,1,0,1,0,1,1,1,1
1,58,1,0,0,0,1,0,0,1,0,0,0,1,0,1,0,1
2,41,1,1,0,0,1,1,0,0,1,0,1,0,1,1,0,1
3,45,1,0,0,1,1,1,1,0,1,0,1,0,0,0,0,1
4,60,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
515,39,0,1,1,1,0,1,0,0,1,0,1,1,0,0,0,1
516,48,0,1,1,1,1,1,0,0,1,1,1,1,0,0,0,1
517,58,0,1,1,1,1,1,0,1,0,0,0,1,1,0,1,1
518,32,0,0,0,0,1,0,0,1,1,0,1,0,0,1,0,0


In [8]:
# preparing data set for classification
X = diabetes_report_dataset.drop( columns = 'Outcome', axis = 1)
y = diabetes_report_dataset['Outcome']

In [9]:
X

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63
764,2,122,70,27,0,36.8,0.340,27
765,5,121,72,23,112,26.2,0.245,30
766,1,126,60,0,0,30.1,0.349,47


# Standardizing data in a particular range for better predictions 

In [10]:
scalar = StandardScaler()

In [11]:
scalar.fit(X)

In [12]:
diabetes_reports_tranformed = scalar.transform(X)

# 

In [13]:
diabetes_reports_tranformed

array([[ 0.63994726,  0.84832379,  0.14964075, ...,  0.20401277,
         0.46849198,  1.4259954 ],
       [-0.84488505, -1.12339636, -0.16054575, ..., -0.68442195,
        -0.36506078, -0.19067191],
       [ 1.23388019,  1.94372388, -0.26394125, ..., -1.10325546,
         0.60439732, -0.10558415],
       ...,
       [ 0.3429808 ,  0.00330087,  0.14964075, ..., -0.73518964,
        -0.68519336, -0.27575966],
       [-0.84488505,  0.1597866 , -0.47073225, ..., -0.24020459,
        -0.37110101,  1.17073215],
       [-0.84488505, -0.8730192 ,  0.04624525, ..., -0.20212881,
        -0.47378505, -0.87137393]])

# Now data is in range of -1 to 1

In [14]:
X = diabetes_reports_tranformed
Y = diabetes_report_dataset['Outcome']

# Train Test Split

In [60]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y, random_state = 2)

In [61]:
classifier = svm.SVC(kernel='linear')

In [62]:
classifier.fit(x_train, y_train)

# Model Evauluation

In [63]:
x_train_predictions = classifier.predict(x_train)
training_data_accuracy = accuracy_score(x_train_predictions, y_train)
training_data_accuracy

0.9350961538461539

# Test data evaluationm

In [64]:
x_test_predictions = classifier.predict(x_test)
testing_data_accuracy = accuracy_score(x_test_predictions, y_test)
testing_data_accuracy

0.9038461538461539

In [65]:
# Doing complete procedure on given inputs
data = (4,110,92,0,0,37.6,0.191,30)

# change into numpy array
data_array = np.asarray(data)

# reshape data
data_reshape = data_array.reshape(1,-1)

# standardize data
standard_data = scalar.transform(data_reshape)

# Predict the outcome
prediction = classifier.predict(standard_data)
print(f'The Prediction is : {prediction}')



ValueError: X has 8 features, but SVC is expecting 16 features as input.

# Saving the bot

In [21]:
import pickle as pk

In [22]:
pk.dump(classifier, open('Diabetes_report_model.pkl', 'wb'))

In [23]:
model = pk.load(open('Diabetes_report_model.pkl','rb'))

In [24]:
model.predict(standard_data)

array([0], dtype=int64)

# Making bot for Symptoms

In [21]:
from sklearn.ensemble import RandomForestClassifier

In [22]:
Classifier = RandomForestClassifier()

# Getting data ready for classification

In [23]:
diabetes_symptom_dataset

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,Sudden weight loss,Weakness,Polyphagia,Genital thrush,Visual blurring,Itching,Irritability,Delayed healing,Partial paresis,Muscle stiffness,Alopecia,Obesity,Class
0,40,1,0,1,0,1,0,0,0,1,0,1,0,1,1,1,1
1,58,1,0,0,0,1,0,0,1,0,0,0,1,0,1,0,1
2,41,1,1,0,0,1,1,0,0,1,0,1,0,1,1,0,1
3,45,1,0,0,1,1,1,1,0,1,0,1,0,0,0,0,1
4,60,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
515,39,0,1,1,1,0,1,0,0,1,0,1,1,0,0,0,1
516,48,0,1,1,1,1,1,0,0,1,1,1,1,0,0,0,1
517,58,0,1,1,1,1,1,0,1,0,0,0,1,1,0,1,1
518,32,0,0,0,0,1,0,0,1,1,0,1,0,0,1,0,0


In [24]:
X = diabetes_symptom_dataset.drop('Class', axis = 1)
Y = diabetes_symptom_dataset['Class']

In [25]:
X

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,Sudden weight loss,Weakness,Polyphagia,Genital thrush,Visual blurring,Itching,Irritability,Delayed healing,Partial paresis,Muscle stiffness,Alopecia,Obesity
0,40,1,0,1,0,1,0,0,0,1,0,1,0,1,1,1
1,58,1,0,0,0,1,0,0,1,0,0,0,1,0,1,0
2,41,1,1,0,0,1,1,0,0,1,0,1,0,1,1,0
3,45,1,0,0,1,1,1,1,0,1,0,1,0,0,0,0
4,60,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
515,39,0,1,1,1,0,1,0,0,1,0,1,1,0,0,0
516,48,0,1,1,1,1,1,0,0,1,1,1,1,0,0,0
517,58,0,1,1,1,1,1,0,1,0,0,0,1,1,0,1
518,32,0,0,0,0,1,0,0,1,1,0,1,0,0,1,0


In [26]:
Y

0      1
1      1
2      1
3      1
4      1
      ..
515    1
516    1
517    1
518    0
519    0
Name: Class, Length: 520, dtype: int64

In [45]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y, random_state = 89) 

# Training the model

In [46]:
Classifier.fit(x_train, y_train)

# Training Data Accuracy

In [47]:
train_predictions = Classifier.predict(x_train)
train_accuracy = accuracy_score(train_predictions, y_train)
train_accuracy

1.0

# Testing data accuracy

In [48]:
test_predictions = Classifier.predict(x_test)
test_accuracy = accuracy_score(test_predictions, y_test)
test_accuracy

0.9903846153846154

In [56]:
pk.dump(Classifier, open('Diabetes_symptoms_model.pkl', 'wb'))

In [57]:
model = RandomForestClassifier()

In [103]:
X = diabetes_reports_tranformed
Y = diabetes_report_dataset['Outcome']

In [104]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y, random_state = 2)

In [60]:
model.fit(x_train, y_train)

# Training Data Accuracy

In [61]:
train_predictions = model.predict(x_train)
train_accuarcy = accuracy_score(train_predictions, y_train)
train_accuracy

1.0

# Testing Data Accuaracy

In [62]:
test_predictions = model.predict(x_test)
test_accuracy = accuracy_score(test_predictions, y_test)
test_accuracy

0.7532467532467533

In [63]:
from sklearn.metrics import precision_score, recall_score, f1_score

In [64]:
threshold = 0 

In [100]:
predictions_binary = (test_predictions > threshold).astype(int)
actual_binary = (y_test > threshold).astype(int)

In [101]:
precision = precision_score(actual_binary, predictions_binary)
recall = recall_score(actual_binary, predictions_binary)
f1 = f1_score(actual_binary, predictions_binary)

In [102]:
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Precision: 1.0
Recall: 0.984375
F1 Score: 0.9921259842519685


In [106]:
threshold = 0 
predictions_binary = (x_test_predictions > threshold).astype(int)
actual_binary = (y_test > threshold).astype(int)
precision = precision_score(actual_binary, predictions_binary)
recall = recall_score(actual_binary, predictions_binary)
f1 = f1_score(actual_binary, predictions_binary)
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Precision: 0.7567567567567568
Recall: 0.5185185185185185
F1 Score: 0.6153846153846154


In [107]:
from sklearn.metrics import confusion_matrix

In [108]:
confusion_matrix(x_test_predictions, y_test)

array([[91, 26],
       [ 9, 28]], dtype=int64)

In [113]:
confusion_matrix(test_predictions, y_test)

array([[40,  1],
       [ 0, 63]], dtype=int64)

In [59]:
# getting data
x = (30,1,0,0,1,1,1,0,0,0,1,1,1,1,1,0)

# setting it as numpy array 
data = np.asarray(x)

# making correct dimensions
data_reshape = data.reshape(1,-1)

# Predicting data
Result = Classifier.predict(data_reshape)

# Results
print(f'The results of X is : {Result}')

The results of X is : [1]


