### Imports

In [1]:
import scipy as sp
import numpy
import matplotlib as plot
from matplotlib import pyplot
import pandas as pd
from pandas import set_option
import sklearn
from pandas import read_csv
# Location of the Pima Indians diabetes CSV, relative to the notebook's working
# directory. Forward slashes are used on purpose: the backslash spelling relied
# on unrecognized escape sequences ('\c', '\p'), which trigger a warning on
# modern Python and only resolve as a path on Windows.
filename = 'code/chapter_04/pima-indians-diabetes.data.csv'
# Column labels are supplied explicitly through `names` when reading the file.
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
indians = read_csv(filename, names=names)
# Show (rows, columns) as the cell output.
indians.shape

(768, 9)

In [2]:
# Number of rows in each value of the 'class' column.
class_counts = indians.groupby('class').size()
class_counts

class
0    500
1    268
dtype: int64

In [None]:
# Pairwise Pearson correlation coefficients between all columns.
pearson_corr = indians.corr(method='pearson')
pearson_corr

In [None]:
# Skewness of every column's distribution.
column_skew = indians.skew()
column_skew

In [None]:
# One density curve per column, arranged in a 3x3 grid; each panel keeps its
# own x-axis because sharex is disabled.
indians.plot.density(subplots=True, layout=(3, 3), sharex=False)

In [None]:
# One box-and-whisker plot per column in a 3x3 grid; neither axis is shared,
# so every panel is scaled independently.
indians.plot.box(subplots=True, layout=(3, 3), sharex=False, sharey=False)

In [None]:
# Draw the correlation matrix of `indians` as a colour-coded grid.
correlations = indians.corr()

# Tick positions and labels are derived from the matrix itself instead of a
# hard-coded count and the global column list, so they always line up with
# whatever columns `corr()` actually returned.
labels = list(correlations.columns)
ticks = numpy.arange(len(labels))

fig, ax = pyplot.subplots()
# Pin the colour scale to the full range a correlation can take.
cax = ax.matshow(correlations, vmin=-1, vmax=1)
fig.colorbar(cax)
ax.set_xticks(ticks)
ax.set_yticks(ticks)
ax.set_xticklabels(labels)
ax.set_yticklabels(labels)
pyplot.show()

In [None]:
# Pearson correlation table for all column pairs, displayed as the cell output.
pearson_table = indians.corr(method='pearson')
pearson_table

In [3]:
# Correlation between the 'preg' column and the 'class' column.
preg_col = indians['preg']
class_col = indians['class']
r_value = preg_col.corr(class_col)
print(f"Correlation between 'preg' and 'class': {r_value}")

Correlation between 'preg' and 'class': 0.2218981530339867


In [None]:
from pandas.plotting import scatter_matrix

# Grid of scatter plots for every pair of columns in the frame.
scatter_matrix(frame=indians)

# MinMaxScaler (Rescale)
- Transform / pre-processing tool
- If the attributes of a dataset are on very different scales, we can rescale every attribute to the range 0 to 1
- Referred to as normalization
- Good for optimization algorithms like gradient descent
- Also useful for algorithms that weight inputs, like regression and neural networks, and for distance-based methods like KNN

In [4]:
from numpy import set_printoptions
from sklearn.preprocessing import MinMaxScaler

# Print NumPy arrays with three digits of precision.
set_printoptions(precision=3)

# Min-max scaler targeting the range 0..1.
# NOTE: the variable is spelled `scalar` because a later cell uses that name.
scalar = MinMaxScaler(feature_range=(0, 1))

In [None]:
# Columns 0-7 are the inputs, column 8 is the output.
X, y = indians.values[:, :8], indians.values[:, 8]

# Fit the scaler on X and transform X in a single call.
rescaledX = scalar.fit_transform(X)

# First row before and after rescaling, for comparison.
print(X[0])
print(rescaledX[0])

# Normalizer (Normalize Data)
- Rescales each row (observation) so that it has a length (norm) of 1, i.e. it becomes a unit vector.

In [None]:
# Bring in the Normalizer transformer, which a later cell fits on X.
from sklearn.preprocessing import Normalizer
# Print NumPy arrays with three digits of precision.
set_printoptions(precision=3)

### Multicollinearity Problem
- An analogy: two friends give you similar directions to a coffee shop. Their directions overlap, so one set of directions is redundant.
- We are looking for correlations in the range of 0.8 or above

### Indians

In [7]:
# Split the underlying array: columns 0-7 become the inputs (X),
# column 8 becomes the output (y).
X, y = indians.values[:, :8], indians.values[:, 8]

In [None]:
# Create the normalizer, fit it on X, then apply it to the same data.
scaler = Normalizer()
scaler.fit(X)
normalizedX = scaler.transform(X)

In [None]:
# Show the first five normalized rows (all columns).
print(normalizedX[:5])

[[0.034 0.828 0.403 0.196 0.    0.188 0.004 0.28 ]
 [0.008 0.716 0.556 0.244 0.    0.224 0.003 0.261]
 [0.04  0.924 0.323 0.    0.    0.118 0.003 0.162]
 [0.007 0.588 0.436 0.152 0.622 0.186 0.001 0.139]
 [0.    0.596 0.174 0.152 0.731 0.188 0.01  0.144]]


### Indian Attributes
- **preg**: Number of times pregnant
- **plas**: Plasma glucose concentration after 2 hours in an oral glucose tolerance test (OGTT)
- **pres**: Diastolic blood pressure (mm Hg)
- **skin**: Triceps skinfold thickness (mm)
- **test**: 2-hour serum insulin (mu U/ml)
- **mass**: Body mass index (BMI), calculated as weight in kg/(height in m)^2
- **pedi**: Diabetes pedigree function (a measure of diabetes likelihood based on family history)
- **age**: Age in years
- **class**: Class variable (0 or 1) indicating the presence (1) or absence (0) of diabetes


# Chap 10

### **From Intro to ML page 293**

### Confusion Matrix

In [8]:
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [9]:
# Inputs are columns 0-7 of the raw array; the output is column 8.
features = indians.values
X = features[:, :8]
y = features[:, 8]

# Split into train and test parts. No test_size is passed, so the library
# default applies; random_state=0 fixes the seed for the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [10]:
# Logistic regression using the liblinear solver, trained on the training split.
model = LogisticRegression(solver='liblinear')
model.fit(X=X_train, y=y_train)

In [11]:
# Predict the output for every row of the held-out test inputs.
predicted = model.predict(X=X_test)

In [15]:
# Print the confusion matrix for the test split.
print(confusion_matrix(y_test, predicted)) # Note the order of parameters: actual labels first, predicted labels second

[[119  11]
 [ 26  36]]
