In [28]:
import pandas as pd

# Path to the header-less CSV and the labels to give its nine columns.
filename = '../../data/prima-indians-diabetes.csv'
names = [
    'Preg', 'Plas', 'Pres', 'Skin',
    'Test', 'Mass', 'Pedi', 'Age',
    'Class',
]

# Supplying `names` makes pandas treat the first row as data; header=None
# just states that default explicitly.
df = pd.read_csv(filename, header=None, names=names)
array = df.values
# Columns 0-7 are the model inputs, column 8 is the output to predict.
x, y = array[:, :8], array[:, 8]

In [29]:
from sklearn.model_selection import KFold # Iteraciones
from sklearn.model_selection import cross_val_score # Metodo de validación
from sklearn.linear_model import LogisticRegression # Modelo matemático

# 10-fold cross-validation of a logistic regression on (x, y).
num_folds = 10
# Fixed seed: with shuffle=True the fold assignment is random, so without it
# the reported accuracy changes on every run.
seed = 7
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
# solver: the optimisation algorithm used to fit the coefficients. It is not
# the penalty method; regularisation is chosen by the separate `penalty`
# argument.
model = LogisticRegression(solver="lbfgs", max_iter=1000)
# One accuracy score per fold.
results = cross_val_score(model, x, y, cv=kfold)
display(results)
# Mean and standard deviation of the per-fold scores, as percentages.
mean_p = results.mean()*100.0
std_p = results.std()*100.0
print(f"Accuracy: {mean_p:,.2f}% {std_p:,.2f}%")


array([0.79220779, 0.83116883, 0.74025974, 0.75324675, 0.79220779,
       0.84415584, 0.81818182, 0.72727273, 0.76315789, 0.69736842])

Accuracy: 77.59% 4.53%


In [30]:
from sklearn.model_selection import train_test_split # División por porcentaje
from sklearn.linear_model import LogisticRegression

# Single hold-out evaluation: fit on 67% of the rows, score on the other 33%.
test_size = .33
# Fixed seed so the random train/test partition, and therefore the reported
# accuracy, is the same on every run.
seed = 7
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=test_size, random_state=seed)
model = LogisticRegression(solver="lbfgs", max_iter=1000)
model.fit(x_train, y_train)
# score() returns one scalar (mean accuracy on the test rows), so there is
# nothing to average. Calling .mean() on it fails when the scalar is a plain
# Python float.
results = model.score(x_test, y_test)
display(results)
print(f"Accuracy: {results*100.0:,.2f}%")


0.7913385826771654

Accuracy: 79.13%


In [31]:
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
# Repeated K-fold: 10-fold cross-validation run 5 times with different
# shuffles, giving 50 accuracy scores in total.
num_folds = 10
num_repeated = 5
# Fixed seed so the sequence of shuffles, and the resulting scores, are
# reproducible across runs.
seed = 7
repeatedkfold = RepeatedKFold(
    n_splits=num_folds, n_repeats=num_repeated, random_state=seed)
model=LogisticRegression(solver='lbfgs', max_iter=1000)
results = cross_val_score(model, x, y, cv=repeatedkfold)
# Mean and standard deviation over all folds of all repeats, as percentages.
mean_p = results.mean()*100.0
std_p = results.std()*100.0
print(f"Accuracy: {mean_p:,.2f}% {std_p:,.2f}%")


Accuracy: 77.42% 5.66%


In [33]:
# Evaluate using leave one out cross validation
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# Leave-one-out splitter: each fold holds out a single sample for testing.
loocv = LeaveOneOut()
model = LogisticRegression(max_iter=1000, solver='lbfgs')
results = cross_val_score(estimator=model, X=x, y=y, cv=loocv)
# Convert the mean and standard deviation of the per-fold scores to percent.
mean_p, std_p = (stat * 100.0 for stat in (results.mean(), results.std()))
print(f"Accuracy: {mean_p:,.2f}% {std_p:,.2f}%")


Accuracy: 77.60% 41.69%


In [None]:
# Evaluate using shuffle split cross validation
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# Shuffle-split cross-validation: 10 independent random 67/33 partitions.
test_size = 0.33
n_splits = 10
# Fixed seed so the random partitions, and the resulting scores, are
# reproducible across runs.
seed = 7
# No separate train_test_split is needed here: ShuffleSplit produces the
# train/test partitions itself and cross_val_score consumes them.
# NOTE: the name `kfold` is kept from the original cell, although this is a
# ShuffleSplit rather than a KFold splitter.
kfold = ShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=seed)
model = LogisticRegression(solver='lbfgs', max_iter=1000)
results = cross_val_score(model, x, y, cv=kfold)
# Mean and standard deviation of the per-split scores, as percentages.
mean_p = results.mean()*100.0
std_p = results.std()*100.0
print(f"Accuracy: {mean_p:,.2f}% {std_p:,.2f}%")