### Machine Learning - Brief Intro

In [None]:
# One word: demystification
# Not on the assessment/necessary for most Data Analyst job interviews/jobs

In [None]:
# Do imports and examine the data
import numpy as np
from sklearn.linear_model import LinearRegression
import pandas as pd
pima = pd.read_csv(("https://raw.githubusercontent.com/PyDataWorkshop/datasets/master/pima.csv"))

In [117]:
pima.head()

Unnamed: 0,Preg,Gluc,Dias,Tric,2hSer,BM1,Diab,Age,Diab.1
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [118]:
pima.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Preg    768 non-null    int64  
 1   Gluc    768 non-null    int64  
 2   Dias    768 non-null    int64  
 3   Tric    768 non-null    int64  
 4   2hSer   768 non-null    int64  
 5   BM1     768 non-null    float64
 6   Diab    768 non-null    float64
 7   Age     768 non-null    int64  
 8   Diab.1  768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


<strong>Diab</strong> = Likelihood of patient diabetes based on family history<br>
<strong>Diab.1</strong> = Whether patient has diabetes

In [None]:
# Let's start with a very basic linear regression
# The simplest form of predictive analytics - can even be done in Excel
# Set up linear regression
# Initialize an empty linear regression variable
reg = LinearRegression()
reg

In [120]:
# Train test split with 80% of the data used for training, 20% for testing (sometimes hold-out data)
# Why? Because we would not want to evaluate accuracy based on biased data
# If we give 100% of data, it will have already learned all of that data
from sklearn.model_selection import train_test_split
train, test = train_test_split(pima, test_size = 0.2)
# "Test" is also the hold-out because it is removed and then
# accuracy is measured based on that
print(train.shape)
print(test.shape)
# 80% of ROWS are in train
# 20% of ROWS are in test
# Columns are the same

(614, 9)
(154, 9)
0.8094059405940595


In [121]:
# Pick the non-Diab columns as feature variables (predictors)
train_feat = train.iloc[:,:5]
# iloc reminder: rows first, then columns
# train_feat = np.array(train.iloc[:,1]).reshape(-1, 1)
# Pick the Diab column as a target variable (response)
train_targ = train["Diab"]
train_feat

Unnamed: 0,Preg,Gluc,Dias,Tric,2hSer
585,1,93,56,11,0
128,1,117,88,24,145
392,1,131,64,14,415
108,3,83,58,31,18
109,0,95,85,25,36
...,...,...,...,...,...
103,1,81,72,18,40
294,0,161,50,0,0
427,1,181,64,30,180
501,3,84,72,32,0


In [122]:
train_targ

585    0.417
128    0.403
392    0.389
108    0.336
109    0.247
       ...  
103    0.283
294    0.254
427    0.328
501    0.267
41     0.696
Name: Diab, Length: 614, dtype: float64

In [124]:
# Fit the model
# Tell Python what we want to predict
# Order is predictor, target
reg.fit(train_feat, train_targ)

LinearRegression()

In [125]:
# Test variables
test_feat = test.iloc[:,:5]
# test_feat = np.array(test.iloc[:,1]).reshape(-1, 1)
test_targ = test["Diab"]
print(test_feat)
print(test_targ)

     Preg   Gluc  Dias  Tric  2hSer
562      1    87    68    34     77
460      9   120    72    22     56
483      0    84    82    31    125
450      1    82    64    13     95
240      1    91    64    24      0
..     ...   ...   ...   ...    ...
442      4   117    64    27    120
146      9    57    80    37      0
257      2   114    68    22      0
322      0   124    70    20      0
259     11   155    76    28    150

[154 rows x 5 columns]
562    0.401
460    0.733
483    0.233
450    0.415
240    0.192
       ...  
442    0.230
146    0.096
257    0.092
322    0.254
259    1.353
Name: Diab, Length: 154, dtype: float64


In [126]:
# Predict the age of patients for the first five values
print("Predictions: ", "\n", reg.predict(test_feat)[0:5,], "\nActual: \n",\
      test_targ[0:5,], "\n", "Slope: ", reg.coef_, "Intercept: ", reg.intercept_)
# How does this work? Each feature variable is multiplied by the slope and added to the intercept
# For example, if for every horsepower a car is $100 more, multiply $100 times each mpg obtained

Predictions:  
 [0.4933748  0.45630597 0.49187907 0.43442806 0.45078751] 
Actual: 
 562    0.401
460    0.733
483    0.233
450    0.415
240    0.192
Name: Diab, dtype: float64 
 Slope:  [-0.00260826  0.00074559 -0.00031292  0.00288871  0.00023291] Intercept:  0.33624457186212425


In [None]:
reg.predict(test_feat)

In [127]:
from sklearn.metrics import r2_score
print("R squared:", r2_score(test_targ.values, reg.predict(test_feat)))

R squared: 0.08236603616885263


### Logistic Regression

In [128]:
# Set up logistic regression
# Logistic regression predicts the ODDS, rather than the OUTCOME - see the binary column above
# If its prediction is 0.5 or greater, predict patient has diabetes; if 0.49 or less, predict they do not
# Import from sklearn
from sklearn.linear_model import LogisticRegression
# Initialize an empty logistic regression variable
lr = LogisticRegression()
lr

LogisticRegression()

In [129]:
# Train test split with 80% of the data used for training, 20% for testing
from sklearn.model_selection import train_test_split
train, test = train_test_split(pima, test_size = 0.2)
# Subset all the columns besides Diab.1
train_feat = train.iloc[:,:8]
# Subset just the Diab.1 column
train_targ = train["Diab.1"]

In [131]:
# Populate our null variable with predictions
lr.fit(train_feat, train_targ)
# Subset all the columns in Test besides Diab.1
test_feat = test.iloc[:,:8]
# Subset just the Diab.1 column
test_targ = test['Diab.1']

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [132]:
# See how accurate the model is
lr.score(test_feat, test_targ)

0.7402597402597403

In [133]:
# We need more details
# Let's look at our predictions
lr.predict(test_feat)

array([0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0])

In [134]:
# Rather than comparing every prediction to every actual, we can create what's called a confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(lr.predict(test_feat), test_targ)

array([[85, 27],
       [13, 29]])

We have two potential correct outcomes, as well as two potential incorrect outcomes. Our incorrect outcomes can take the following forms:

Our model made a positive prediction, and it was wrong. (False Positive)

Our model made a negative prediction, and it was wrong. (False Negative)

Percentage of true positives = TP / (TP + FP) aka <strong>precision</strong><br>
Percentage of true negatives = TN / (TN + FN) aka <strong>recall</strong>

While the overall accuracy is also a useful metric, the difference between these concepts in a business context can have large ramifications, making simple accuracy often not enough for a robust model.

In [None]:
# Seaborn <=> Confusion Matrix
"""labels = [‘True Neg’,’False Pos’,’False Neg’,’True Pos’]
labels = np.asarray(labels).reshape(2,2)
sns.heatmap(cf_matrix, annot=labels, fmt=‘’, cmap='Blues')
group_names = [‘True Neg’,’False Pos’,’False Neg’,’True Pos’]
group_counts = [“{0:0.0f}”.format(value) for value in
cf_matrix.flatten()]
group_percentages = [“{0:.2%}”.format(value) for value in
cf_matrix.flatten()/np.sum(cf_matrix)]
labels = [f”{v1}\n{v2}\n{v3}” for v1, v2, v3 in
zip(group_names,group_counts,group_percentages)]
labels = np.asarray(labels).reshape(2,2)
sns.heatmap(cf_matrix, annot=labels, fmt=‘’, cmap='Blues')"""

In [135]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
print("Precision:", precision_score(lr.predict(test_feat), test_targ))
print("Recall:", recall_score(lr.predict(test_feat), test_targ))

Precision: 0.5178571428571429
Recall: 0.6904761904761905


### Different Algorithms

In [152]:
# Import three new models
from sklearn.svm import LinearSVC 
from sklearn.neural_network import MLPClassifier 
from sklearn.ensemble import RandomForestClassifier
# Initialize the models
svm = LinearSVC()
mlp = MLPClassifier()
rf = RandomForestClassifier()
# Train test split
train, test = train_test_split(pima, test_size=.2)  
train_feat = train.iloc[:, :8]
train_targ = train["Diab.1"]
# Fit the models
svm.fit(train_feat, train_targ)
mlp.fit(train_feat, train_targ)
rf.fit(train_feat, train_targ)
# Examine the scores
print("SVM accuracy", svm.score(test_feat, test_targ))
print("Neural net accuracy", mlp.score(test_feat, test_targ))
print("Random forest", rf.score(test_feat, test_targ))
# Bootstrapping allows you to test multiple accuracies
# Then average them out to find total accuracy



SVM accuracy 0.6363636363636364
Neural net accuracy 0.7662337662337663
Random forest 0.935064935064935


In [157]:
# Confusion matrices
confusion_matrix(svm.predict(test_feat), test_targ)

array([[98, 56],
       [ 0,  0]])

In [158]:
confusion_matrix(mlp.predict(test_feat),test_targ)

array([[90, 28],
       [ 8, 28]])

In [159]:
confusion_matrix(rf.predict(test_feat),test_targ)
# Random forest is by far the most successful; go with that one

array([[94,  6],
       [ 4, 50]])