In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd

# SK learn Models->  https://scikit-learn.org/stable/
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn import preprocessing

# Source CSV and the labels to assign to its five columns, in file order.
fileName = 'weather-MP.csv'
colNames = [
    'outlook',
    'temperature',
    'humidity',  # previously misspelled 'humdity'; the cells visible here only access columns by position
    'windy',
    'play',
]

# Passing `names=` without `header=` makes pandas treat every row of the file,
# including the first one, as data and label the columns from colNames.
data = pd.read_csv(fileName, names=colNames)

In [None]:
# Convert the text-valued columns to integer codes so the models can use them.
# Only the categorical columns are encoded; the other columns are left as read.
# (Encoding every column at once is also possible with
# data.apply(preprocessing.LabelEncoder().fit_transform), but that would
# relabel the numeric columns as well.)
from sklearn.preprocessing import LabelEncoder

categorical_columns = ['outlook', 'windy', 'play']

# One encoder per column: a fitted LabelEncoder only remembers the classes of
# the last column it was fitted on, so sharing a single instance would discard
# the code -> label mapping of every column but the last. Keeping them in a
# dict allows e.g. label_encoders['outlook'].inverse_transform(...) later on.
label_encoders = {}
for column in categorical_columns:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])

# Kept for backward compatibility: as before, `labelencoder` ends up being the
# encoder that was fitted on the 'play' column.
labelencoder = label_encoders['play']

In [None]:
# The models below are given plain arrays, so pull the underlying values out
# of the dataframe first.
array = data.values

# Column layout of the array:
#   columns 0-3 -> predictor attributes believed to affect whether play goes ahead
#   column 4    -> target: whether or not play went ahead
# The models try to match patterns in the predictors to the target.
predictors, targets = array[:, :4], array[:, 4]

In [None]:
# Evaluation on ALL DATA: each model is trained on every row and then scored
# on those same rows.
print("Testing Method:  ALL DATA\n---------------------------------------------\n")

# Gaussian Naive Bayes -- fit() returns the estimator, so select and train in one step
model1 = GaussianNB().fit(predictors, targets)
result1 = model1.score(predictors, targets)
print("Gaussian Naive Bayes Model : ", end="")
print("Accuracy: ", round(100.0 * result1, 2), "%\n")

# Logistic Regression -- same select / train / score sequence
model2 = LogisticRegression().fit(predictors, targets)
result2 = model2.score(predictors, targets)
print("Logistic Regression Model  : ", end="")
print("Accuracy: ", round(100.0 * result2, 2), "%")

In [None]:
from sklearn.model_selection import train_test_split

# Hold-out evaluation: split the predictor/target arrays built earlier into a
# training part and a testing part, train on the former, score on the latter.
seed = 1          # fixed random_state so the split is repeatable
test_size = 0.33  # proportion of the data used for testing; the rest is used for training

(predictors_train, predictors_test,
 targets_train, targets_test) = train_test_split(
    predictors,
    targets,
    test_size=test_size,
    random_state=seed,
)

print("Testing Method:  HOLD-OUT\n---------------------------------------------\n")

# Gaussian Naive Bayes: train on the training split, score on the held-out split
model1 = GaussianNB().fit(predictors_train, targets_train)
result1 = model1.score(predictors_test, targets_test)
print("Gaussian Naive Bayes Model : ", end="")
print("Accuracy: ", round(100.0 * result1, 2), "%\n")

# Logistic Regression: train on the training split, score on the held-out split
model2 = LogisticRegression().fit(predictors_train, targets_train)
result2 = model2.score(predictors_test, targets_test)
print("Logistic Regression Model  : ", end="")
print("Accuracy: ", round(100.0 * result2, 2), "%")

In [None]:
# 10-fold cross-validation with scikit-learn
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# SK learn Models
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

print("Testing Method:  Cross-Fold Validation\n---------------------------------------------\n")

num_folds = 10  # number of subsets/folds
seed = 1        # fixed random_state for repeatable results

# --- Gaussian Naive Bayes ---------------------------------------------------
# The KFold object holds the fold controls (count, shuffling, seed).
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
model1 = GaussianNB()

# cross_val_score() performs one train/test round per fold and returns an
# array with one score per fold; the mean (and spread) summarises them.
results1 = cross_val_score(model1, predictors, targets, cv=kfold)
result1 = round(100.0 * results1.mean(), 2)

print("Gaussian Naive Bayes Model : ", end="")
print("Accuracy:", result1, "\tStandard Deviation", round(100.0 * results1.std(), 2))

# --- Logistic Regression ----------------------------------------------------
# A fresh splitter with the same settings as the one used above.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
model2 = LogisticRegression()

results2 = cross_val_score(model2, predictors, targets, cv=kfold)
result2 = round(100.0 * results2.mean(), 2)

print("Logistic Regression Model  : ", end="")
print("Accuracy:", result2, "\tStandard Deviation", round(100.0 * results2.std(), 2))