## Classification

#### Loading and preparing the dataset

In [1]:
from sklearn.datasets import load_iris
import numpy as np

# Fetch the Iris dataset that ships with scikit-learn
dataset = load_iris()

# Separate the feature matrix from the class labels
x, y = dataset.data, dataset.target

# Record the dimensions of the feature matrix
n_samples, n_features = x.shape
# Hint: print(dataset.DESCR) displays the dataset's full description

In [2]:
# The per-attribute mean (one value per column) is the threshold used
# to discretize the attributes
attribute_means = np.mean(x, axis=0)

In [3]:
# There must be exactly one threshold per feature
assert attribute_means.shape == (n_features,)
# 1 where the attribute is at or above its mean, 0 otherwise
x_d = (x >= attribute_means).astype(int)

### Implementing the OneR algorithm

##### Quick Book Definitions:
<b>OneR</b> is a simple algorithm that predicts the class of a sample by finding the most
frequent class for each value of a feature. OneR is shorthand for One Rule, indicating that we only
use a single rule for this classification, choosing the feature with the best performance.

In [4]:
from collections import defaultdict
from operator import itemgetter

In [5]:
def train_feature_value(x, y_true, feature, value):
    """ Function to count each time a feature value corresponds to a specific class.
        Returns the most frequent class and error """
    # Create a simple dictionary to count how frequently they give certain predictions
    class_counts = defaultdict(int)
    # Iterate through each sample and count the frequency of each class/value pair
    for sample, y in zip(x, y_true):
        if sample[feature] == value:
            class_counts[y] += 1
            
    # Now get the best one by sorting (highest first) and choosing the first item
    sorted_class_counts = sorted(class_counts.items(), key=itemgetter(1), reverse=True)
    most_frequent_class = sorted_class_counts[0][0]
    # The error is the number of samples that do not classify as the most frequent class
    # and have the frequent value
    n_samples = x.shape[1]
    error = sum([class_count for class_value, class_count in class_counts.items()
                 if class_value != most_frequent_class])
    return most_frequent_class, error

In [6]:
def train(x, y_true, feature):
    """Build the OneR rule set for a single feature.

    For every distinct value that ``feature`` takes in ``x``, the most
    frequent class among the samples with that value becomes the prediction
    for it.

    Returns a tuple ``(predictors, total_error)``: ``predictors`` maps each
    feature value to its predicted class, and ``total_error`` is the sum of
    the per-value errors reported by ``train_feature_value``.
    """
    # Reject feature indices that fall outside the columns of x
    _, n_features = x.shape
    assert 0 <= feature < n_features

    predictors = {}
    total_error = 0
    # One rule per distinct value found in this feature's column
    for current_value in set(x[:, feature]):
        predicted_class, value_error = train_feature_value(
            x, y_true, feature, current_value)
        predictors[current_value] = predicted_class
        total_error += value_error
    return predictors, total_error

In [7]:
# Since the sklearn.cross_validation module was deprecated in version 0.18 the model_selection module is used
from sklearn.model_selection import train_test_split

# Hold out a test split of the discretized data; the fixed seed keeps it reproducible
xd_train, xd_test, y_train, y_test = train_test_split(x_d, y, random_state=14)

# Train one candidate rule set per feature column, storing its predictors
# and its total training error under the feature's index
all_predictors, errors = {}, {}
for feature_index in range(xd_train.shape[1]):
    predictors, total_error = train(xd_train, y_train, feature_index)
    all_predictors[feature_index], errors[feature_index] = predictors, total_error

In [8]:
# The OneR feature is the one whose rule set has the lowest training error
best_feature, best_error = sorted(errors.items(), key=lambda pair: pair[1])[0]

# The model is the chosen feature index plus its value -> class lookup table
model = dict(feature=best_feature, predictor=all_predictors[best_feature])

# A single sample is classified as:
#   model['predictor'][int(sample[model['feature']])]

In [9]:
def predict(x_test, model):
    """Classify every sample in ``x_test`` with a trained OneR model.

    ``model`` is a dict with the index of the chosen feature under
    ``'feature'`` and a value -> class lookup table under ``'predictor'``.
    Returns a NumPy array holding one predicted class per sample.
    """
    feature_index = model['feature']
    class_for_value = model['predictor']
    predictions = []
    for row in x_test:
        # The lookup table is keyed by the integer value of the feature
        predictions.append(class_for_value[int(row[feature_index])])
    return np.array(predictions)

In [10]:
# Classify the held-out samples with the trained OneR model
y_predicted = predict(xd_test, model)

# Share of test samples whose predicted class matches the true class, in percent
accuracy = 100 * np.mean(y_test == y_predicted)
print("The test accuracy is {:.1f}%".format(accuracy))

The test accuracy is 65.8%
