In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model    import Ridge
from sklearn.ensemble import RandomForestClassifier
import pickle

In [6]:
# Logistic regression on the Pima Indians diabetes CSV, scored with shuffled 10-fold CV.
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data1 = pd.read_csv(url, names=names)
array1 = data1.values

# Columns 0-7 are the inputs; column 8 ('class') is the target.
X, y = array1[:, :8], array1[:, 8]

# Fixed random_state keeps the shuffled fold assignment repeatable.
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
model = LogisticRegression(solver='liblinear')
results = cross_val_score(model, X, y, cv=kfold)

# Mean and standard deviation of the per-fold scores, expressed as percentages.
mean_pct = 100.0 * results.mean()
std_pct = 100.0 * results.std()
print("Accuracy: %.3f%% (%.3f%%)" % (mean_pct, std_pct))

Accuracy: 77.086% (5.091%)


In [5]:
# KNN Regression on the housing data set.
# The feature/target arrays carry housing-specific names on purpose: binding them to
# `X`/`y` would overwrite the diabetes arrays that the classification cells further
# down read, and those cells cannot fit a classifier on the continuous MEDV target
# when the notebook is run top to bottom.
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.data"
names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
# sep=r"\s+" is the documented replacement for the deprecated delim_whitespace=True.
data = pd.read_csv(url, sep=r"\s+", names=names)
array = data.values
X_housing = array[:, 0:13]  # the 13 columns before MEDV
y_housing = array[:, 13]    # MEDV, the regression target
kfold = KFold(n_splits=10)
model = KNeighborsRegressor()
# The scorer returns the *negated* MSE, so values closer to zero are better.
scoring = 'neg_mean_squared_error'
results = cross_val_score(model, X_housing, y_housing, cv=kfold, scoring=scoring)
print(results.mean())

-107.28683898039215


In [7]:
# Compare classifiers on the diabetes data.
# Re-select features/target from `array1` (the diabetes array built in the logistic
# regression cell) so this cell does not depend on whatever the previously executed
# cell happened to leave in `X`/`y`.
X = array1[:, 0:8]
y = array1[:, 8]

# (label, estimator) pairs to evaluate.
models = [
    ('LR', LogisticRegression(solver='liblinear')),
    ('LDA', LinearDiscriminantAnalysis()),
]

# Evaluate each model in turn
results = []
names = []
scoring = 'accuracy'
# The splitter is identical for every model, so build it once outside the loop.
kfold = KFold(n_splits=10)
for name, model in models:
    cv_results = cross_val_score(model, X, y, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    # label: mean (standard deviation) of the per-fold accuracy
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

LR: 0.769515 (0.048411)
LDA: 0.773462 (0.051592)


In [8]:
# Grid search for algorithm tuning: pick Ridge's `alpha` by 3-fold cross-validation.

# Candidate values for the estimator's `alpha` parameter.
alphas = np.array([1, 0.1, 0.01, 0.001, 0.0001, 0])
param_grid = {'alpha': alphas}

model = Ridge()
grid = GridSearchCV(model, param_grid, cv=3)
grid.fit(X, y)

# Best mean cross-validated score, followed by the alpha of the winning estimator.
best_score, best_alpha = grid.best_score_, grid.best_estimator_.alpha
print(best_score)
print(best_alpha)

0.27961755931297233
1.0


In [9]:
# Random forest Classification
num_trees = 1000
max_features = 3
seed = 7  # same seed value the other seeded cells in this notebook use
# Build the splitter here rather than reusing the `kfold` variable left over from the
# loop in the model-comparison cell; the configuration (10 unshuffled folds) is the same.
kfold = KFold(n_splits=10)
# Without random_state every run grows different trees, so the mean score printed
# below would change from run to run.
model = RandomForestClassifier(n_estimators=num_trees, max_features=max_features, random_state=seed)
results = cross_val_score(model, X, y, cv=kfold)
print(results.mean())

0.7655502392344498


In [10]:
# Save Model Using Pickle
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)
# Fit the model on the 67% training split
model = LogisticRegression(solver='liblinear')
model.fit(X_train, y_train)
# Save the model to disk; the context manager closes (and flushes) the file
# before it is reopened for reading below.
filename = 'finallized_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)


# Load the model from disk
# NOTE: unpickling can execute arbitrary code - only load files you created yourself.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
# Score the restored model on the held-out 33%.
result = loaded_model.score(X_test, y_test)
print(result)

0.7559055118110236
