In [1]:
from sklearn import datasets
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
import pandas as pd
import numpy as np
import csv

### Load dataset

In [3]:
data = datasets.load_iris()
x = data.data
y = data.target

In [74]:
hi,bin_edges = np.histogram(x, bins='auto')
bin_edges

array([0.1       , 0.80909091, 1.51818182, 2.22727273, 2.93636364,
       3.64545455, 4.35454545, 5.06363636, 5.77272727, 6.48181818,
       7.19090909, 7.9       ])

### KFolds

In [156]:
folds = 10
kf = KFold(n_splits=folds, shuffle=True)
print(kf)

KFold(n_splits=10, random_state=None, shuffle=True)


In [165]:
sum = 0
sum_f1 = 0
for train_index, test_index in kf.split(x):
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model = GaussianNB().fit(x_train, y_train)
    y_pred = model.predict(x_test)
    errors = (x_test.shape[0],(y_test != y_pred).sum())
    accuracy = 100-errors[1]/errors[0]*100
    sum += accuracy
    sum_f1 += f1_score(y_test, y_pred, average='macro')
average = sum/folds
average_f1 = sum_f1/folds
print('Accuracy:', average)
print('F1:', average_f1)

Accuracy: 95.33333333333334
F1: 0.9463247863247866


### StratifiedKFolds

In [143]:
folds = 10
skf = StratifiedKFold(n_splits=folds, random_state=11)
print(skf)

StratifiedKFold(n_splits=10, random_state=11, shuffle=False)


In [146]:
sum = 0
sum_f1 = 0
for train_index, test_index in skf.split(x,y):
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model = GaussianNB().fit(x_train, y_train)
    y_pred = model.predict(x_test)
    errors = (x_test.shape[0],(y_test != y_pred).sum())
    accuracy = 100-errors[1]/errors[0]*100
    sum += accuracy
    sum_f1 += f1_score(y_test, y_pred, average='macro')
average = sum/folds
average_f1 = sum_f1/folds
print('Accuracy:', average)
print('F1:', average_f1)

Accuracy: 95.33333333333333
F1: 0.9524410774410773


In [13]:
dataset = pd.read_csv('cars.csv')
train, test = split(dataset)
train.head()

Unnamed: 0,Buying,Maint,Doors,Persons,Lug_boot,Safety,Class
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc
5,vhigh,vhigh,2,2,med,high,unacc
6,vhigh,vhigh,2,2,big,low,unacc


In [8]:
def split(data):
    msk = np.random.rand(len(data)) < 0.8
    train = data[msk]
    test = data[~msk]
    return train, test

### Splitting 80/20 

In [93]:
dataset = pd.read_csv('cars.csv')
from sklearn.metrics import precision_score, accuracy_score, f1_score, recall_score, confusion_matrix
from sklearn import preprocessing

train, test = split(dataset)
train_labels = train['Class'].as_matrix()
train_attr = train.drop(['Class'], axis=1).as_matrix()

test_labels = test['Class'].as_matrix()
test_attr = test.drop(['Class'], axis=1).as_matrix()

test_train_size = 0.2

le = preprocessing.LabelEncoder()
for i in range(6):
    train_attr[:,i] = le.fit_transform(train_attr[:,i])
    
for i in range(6):
    test_attr[:,i] = le.fit_transform(test_attr[:,i])
    
model = MultinomialNB()
fit = model.fit(train_attr, train_labels)

test_pred = fit.predict(test_attr)

acc = accuracy_score(test_labels, test_pred)
prec = precision_score(test_labels, test_pred, average='macro')
recall = recall_score(test_labels,test_pred, average='macro')
f1 = f1_score(test_labels, test_pred, average='macro')

print('Accuracy: %s Precision: %s Recall: %s F1: %s' % (acc,prec,recall,f1))

Accuracy: 0.6547945205479452 Precision: 0.3303406998158379 Recall: 0.2554347826086957 F1: 0.2083560319831298


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
