In [37]:
# methodology on creating a machine learning program that will predict the diabetes outcome given all the features
# use pandas to help describe the data
# shows no. of rows and columns at the end... i.e. the data shape

import pandas
pima_data = pandas.read_csv('https://modcom.co.ke/data/datasets/pima.csv')
print('PIMA INDIANS DATA:\n\n',pima_data)

PIMA INDIANS DATA:

      Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6      148             72             35        0  33.6   
1              1       85             66             29        0  26.6   
2              8      183             64              0        0  23.3   
3              1       89             66             23       94  28.1   
4              0      137             40             35      168  43.1   
5              5      116             74              0        0  25.6   
6              3       78             50             32       88  31.0   
7             10      115              0              0        0  35.3   
8              2      197             70             45      543  30.5   
9              8      125             96              0        0   0.0   
10             4      110             92              0        0  37.6   
11            10      168             74              0        0  38.0   
12            10 

In [38]:
# describe the data

print(pima_data.describe())

       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
count   768.000000  768.000000     768.000000     768.000000  768.000000   
mean      3.845052  120.894531      69.105469      20.536458   79.799479   
std       3.369578   31.972618      19.355807      15.952218  115.244002   
min       0.000000    0.000000       0.000000       0.000000    0.000000   
25%       1.000000   99.000000      62.000000       0.000000    0.000000   
50%       3.000000  117.000000      72.000000      23.000000   30.500000   
75%       6.000000  140.250000      80.000000      32.000000  127.250000   
max      17.000000  199.000000     122.000000      99.000000  846.000000   

              BMI  DiabetesPedigreeFunction         Age     Outcome  
count  768.000000                768.000000  768.000000  768.000000  
mean    31.992578                  0.471876   33.240885    0.348958  
std      7.884160                  0.331329   11.760232    0.476951  
min      0.000000                  

In [39]:
# from describe function we see that all columns are available therefore no need to manipulate data
# we therefore look for null data using isnull function

print(pima_data.isnull().sum())

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [40]:
# no null data to fix from above
# train model to predict outcome.... we will use the first 8 columns to predict the 9th column
# to split the data we have to convert it into an array

pima_data_array = pima_data.values
print('Pima Indians data in form of an array:\n\n', pima_data_array)

# 8 inputs and 1 output
x = pima_data_array[:, 0:8]
y = pima_data_array[:, 8]
print('Inputs to predict outcome:\n\n', x)
print('Output showing the outcome of the prediction:\n\n', y)

Pima Indians data in form of an array:

 [[  6.    148.     72.    ...   0.627  50.      1.   ]
 [  1.     85.     66.    ...   0.351  31.      0.   ]
 [  8.    183.     64.    ...   0.672  32.      1.   ]
 ...
 [  5.    121.     72.    ...   0.245  30.      0.   ]
 [  1.    126.     60.    ...   0.349  47.      1.   ]
 [  1.     93.     70.    ...   0.315  23.      0.   ]]
Inputs to predict outcome:

 [[  6.    148.     72.    ...  33.6     0.627  50.   ]
 [  1.     85.     66.    ...  26.6     0.351  31.   ]
 [  8.    183.     64.    ...  23.3     0.672  32.   ]
 ...
 [  5.    121.     72.    ...  26.2     0.245  30.   ]
 [  1.    126.     60.    ...  30.1     0.349  47.   ]
 [  1.     93.     70.    ...  30.4     0.315  23.   ]]
Output showing the outcome of the prediction:

 [1. 0. 1. 0. 1. 0. 1. 0. 1. 1. 0. 1. 0. 1. 1. 1. 1. 1. 0. 1. 0. 0. 1. 1.
 1. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0. 0. 1. 0. 1. 0. 0.
 1. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 1. 0. 1. 0. 0. 

In [41]:
# for machine learning I choose 63.2% training and 36.8% testing
# program predicts outcome from user's input 
# machine learning library sklearn
# Logistic regression can be used to model and solve binary classification problems.

import sklearn
from sklearn import model_selection
x_train, x_test, y_train, y_test = model_selection.train_test_split(x, y, test_size = 0.368, random_state = 42)

# confirm splitting
print('Pima Indians data shape:\n', pima_data_array.shape, '\nx_train data shape:\n', x_train.shape , 
      '\nx_test data shape:\n', x_test.shape, '\ny_train data shape:\n', y_train.shape, '\ny_test data shape:\n', y_test.shape)

Pima Indians data shape:
 (768, 9) 
x_train data shape:
 (485, 8) 
x_test data shape:
 (283, 8) 
y_train data shape:
 (485,) 
y_test data shape:
 (283,)


In [42]:
# import warnings to remove solver warnings incurred
# import logistic regression library
# TRAIN
import warnings
warnings.filterwarnings('ignore', category = FutureWarning)

from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(x_train, y_train)
print('\n\nUsing Logistic Regression pipeline, model is training...\n\n')

# TEST
y_outcome = lr.predict(x_test)

# ACCURACY
from sklearn.metrics import accuracy_score
print('The accuracy of the logistic regression pipeline is:\n\n', accuracy_score(y_test, y_outcome))

# INPUT FROM USER
new_input = {}
print('\nPlease use the following format to key in the input; e.g.\nPregnancy: 3\nGlucose: 100\nBloodPressure: 86\nSkinThickness: 18\nInsulin: 70\nBMI: 23.5\nDiabetesPedigreeFunction: 0.179\nAge: 29\n')

# arranging 8 element inputs in form of an array
for i in range(8):
    element = input('Enter a new variable: ').split()
    # assign first item to key and second item to value which should be converted to float
    new_input[element[0]] = float(element[1])
    
# confirm all values are in the dictionary
print('\n\nNew dictionary values of unknown Pima Indian: \n', new_input)

# PREDICT from user's input
# extract values from dictionary and create list to be able to predict the data

new_inputvariables = list(new_input.values())
# confirm values are extracted and a list is created
print('\nExtracted list of unknown Pima Indian values: \n', new_inputvariables)

d_status = lr.predict([new_inputvariables])
print('\n\nPLEASE NOTE:\n\n 0 -> NEGATIVE\n 1 -> POSITIVE\n\n YOUR STATUS IS: ', d_status)



Using Logistic Regression pipeline, model is training...


The accuracy of the logistic regression pipeline is:

 0.7773851590106007

Please use the following format to key in the input; e.g.
Pregnancy: 3
Glucose: 100
BloodPressure: 86
SkinThickness: 18
Insulin: 70
BMI: 23.5
DiabetesPedigreeFunction: 0.179
Age: 29

New variable: Pregnancy: 3
New variable: Glucose: 100
New variable: BloodPressure: 86
New variable: SkinThickness: 18
New variable: Insulin: 70
New variable: BMI: 23.5
New variable: DiabetesPedigreeFunction: 0.179
New variable: Age: 29


New dictionary values of unknown Pima Indian: 
 {'Pregnancy:': 3.0, 'Glucose:': 100.0, 'BloodPressure:': 86.0, 'SkinThickness:': 18.0, 'Insulin:': 70.0, 'BMI:': 23.5, 'DiabetesPedigreeFunction:': 0.179, 'Age:': 29.0}

Extracted list of unknown Pima Indian values: 
 [3.0, 100.0, 86.0, 18.0, 70.0, 23.5, 0.179, 29.0]


PLEASE NOTE:

 0 -> NEGATIVE
 1 -> POSITIVE

 YOUR STATUS IS:  [0.]
