In [3]:
def decision_1(x):
    """Classify one record with a hand-written two-level decision tree.

    Args:
        x: Indexable record. ``x[0]`` is compared with ``"yes"``, ``x[1]``
            with the threshold 29.5 and ``x[2]`` with ``"good"``.

    Returns:
        ``"less"`` or ``"more"``.
    """
    if x[0] == "yes":
        # On this branch only the numeric field x[1] decides.
        return "less" if x[1] < 29.5 else "more"
    # On every other value of x[0] only the field x[2] decides.
    return "less" if x[2] == "good" else "more"



# The two risk labels returned by the decision-tree helpers below.
LESS, MORE = 'less', 'more'

def risk_based_on_age(age):
    """Return ``LESS`` when ``age`` is below 29.5, otherwise ``MORE``."""
    if age < 29.5:
        return LESS
    return MORE

def risk_based_on_diet(diet):
    """Return ``LESS`` when ``diet`` equals ``"good"``, otherwise ``MORE``."""
    if diet == "good":
        return LESS
    return MORE

def risk_based_on_being_smoker(smoker, age, diet):
    """Root of the decision tree: choose which attribute decides the label.

    When ``smoker`` equals ``'yes'`` the label comes from the age rule;
    for any other value it comes from the diet rule.
    """
    if smoker == 'yes':
        return risk_based_on_age(age)
    return risk_based_on_diet(diet)

def decision_2(x):
    """Classify one record with the decision tree built from small helpers.

    Args:
        x: Iterable of exactly three values, unpacked as
            ``(smoker, age, diet)``.

    Returns:
        The label produced by ``risk_based_on_being_smoker``.
    """
    smoker_flag, years, diet_quality = x
    return risk_based_on_being_smoker(
        smoker=smoker_flag, age=years, diet=diet_quality
    )


    

In [4]:
# Sanity check for decision_2: smoker == 'yes' with age 31 (not below 29.5)
# must be labelled 'more'.
x = ('yes', 31, 'good')
assert decision_2(x) == 'more'

In [5]:
# Display the label decision_2 assigns to the sample x.
decision_2(x)

'more'

In [6]:
def get_test_data_from_line(line_of_data):
    """Turn the split fields of one test-file line into a typed record.

    Args:
        line_of_data: Iterable of exactly three fields, unpacked as
            ``(smoke, age, diet)``.

    Returns:
        Tuple ``(smoke, int(age), diet)``; only the age field is converted.
    """
    smoker_field, age_field, diet_field = line_of_data
    age_value = int(age_field)
    return (smoker_field, age_value, diet_field)

def mapping_lines_from_file(fn, file_name):
    """Read a comma-separated text file and map ``fn`` over its rows.

    Args:
        fn: Callable applied to each row; it receives the list of
            comma-separated fields of one line.
        file_name: Path of the text file to read.

    Returns:
        List containing ``fn(fields)`` for every non-blank line, in file
        order.
    """
    with open(file_name, 'r') as health_test_file:
        stripped_lines = (line.strip() for line in health_test_file)
        # Blank lines carry no record. Without this filter they would reach
        # fn as [''] and break any fn that unpacks a fixed number of fields.
        rows = (line.split(',') for line in stripped_lines if line)
        # Materialise inside the with-block: the generators above read
        # lazily from the still-open file handle.
        return list(map(fn, rows))

def gettest():
    """Load the test records from ``health-test.txt``.

    Returns:
        List with one ``get_test_data_from_line`` result per line of the
        file, as produced by ``mapping_lines_from_file``.
    """
    return mapping_lines_from_file(get_test_data_from_line, "health-test.txt")
    
    

In [7]:
# Display the parsed test records.
gettest()

[('yes', 21, 'poor'),
 ('no', 50, 'good'),
 ('no', 23, 'good'),
 ('yes', 45, 'poor'),
 ('yes', 51, 'good'),
 ('no', 60, 'good'),
 ('no', 15, 'poor'),
 ('no', 18, 'good')]

In [8]:
def evaluate_testset_1():
    """Return the fraction of test records that ``decision_1`` labels 'more'.

    Returns:
        Number of 'more' decisions divided by the number of test records.

    Raises:
        ZeroDivisionError: If ``gettest()`` yields no records.
    """
    evaluation = [decision_1(person) for person in gettest()]
    # list.count compares with ==, exactly like an explicit counting loop.
    more_count = evaluation.count('more')
    return more_count / len(evaluation)
  
    
def evaluate_testset_2():
    """Return the fraction of test records labelled ``MORE``.

    NOTE(review): despite the ``_2`` suffix this applies ``decision_1``,
    not ``decision_2`` -- confirm that this is intended.

    Returns:
        Number of ``MORE`` decisions divided by the number of test records.
    """
    records = gettest()
    more_labels = [
        label for label in map(decision_1, records) if label == MORE
    ]
    return len(more_labels) / len(records)

In [9]:
# Display the share of test records that decision_1 labels 'more' (loop-based version).
evaluate_testset_1()

0.375

In [10]:
# Display the same ratio computed by the map/sum-based version.
evaluate_testset_2()

0.375

In [11]:
def get_train_data_from_line(train_line):
    """Split one training-file line into its features and its label.

    Args:
        train_line: Sequence of fields; the last one is the label and the
            preceding ones are parsed by ``get_test_data_from_line``.

    Returns:
        Pair ``(features, label)`` where ``features`` is a tuple.
    """
    # Parse the features before reading the label so that a malformed line
    # fails in the same place as it always has.
    parsed_features = get_test_data_from_line(train_line[:-1])
    label = train_line[-1]
    return (tuple(parsed_features), label)

def gettrain():
    """Load the labelled training records from ``health-train.txt``.

    Returns:
        List with one ``get_train_data_from_line`` result, i.e. a
        ``(features, label)`` pair, per line of the file.
    """
    return mapping_lines_from_file(get_train_data_from_line, 'health-train.txt')
    

In [12]:
# Display the parsed (features, label) training pairs.
gettrain()

[(('yes', 54, 'good'), 'less'),
 (('no', 55, 'good'), 'less'),
 (('no', 26, 'good'), 'less'),
 (('yes', 40, 'good'), 'more'),
 (('yes', 25, 'poor'), 'less'),
 (('no', 13, 'poor'), 'more'),
 (('no', 15, 'good'), 'less'),
 (('no', 50, 'poor'), 'more'),
 (('yes', 33, 'good'), 'more'),
 (('no', 35, 'good'), 'less'),
 (('no', 41, 'good'), 'less'),
 (('yes', 30, 'poor'), 'more'),
 (('no', 39, 'poor'), 'more'),
 (('no', 20, 'good'), 'less'),
 (('yes', 18, 'poor'), 'less'),
 (('yes', 55, 'good'), 'more')]

In [13]:
def data_distances_discrete(a, b):
    """Distance between two records whose fields 0 and 2 are categorical.

    Fields 0 and 2 each contribute 1 when they differ and 0 when they are
    equal; field 1 contributes the squared difference after division by 50.

    Args:
        a: Indexable record with at least three fields.
        b: Indexable record with at least three fields.

    Returns:
        The sum of the three contributions.
    """
    first_mismatch = a[0] != b[0]
    scaled_gap_squared = ((a[1] - b[1]) / 50) ** 2
    third_mismatch = a[2] != b[2]
    return first_mismatch + scaled_gap_squared + third_mismatch
  
def neighbor(x, trainset):
    """Predict the label of ``x`` with a 1-nearest-neighbour lookup.

    Args:
        x: Feature record in the format ``data_distances_discrete`` expects.
        trainset: Non-empty sequence of ``(features, label)`` pairs.

    Returns:
        Label of the training sample whose features have the smallest
        ``data_distances_discrete`` distance to ``x``. On ties the sample
        that appears first in ``trainset`` wins.

    Raises:
        ValueError: If ``trainset`` is empty.
    """
    if len(trainset) == 0:
        raise ValueError("neighbor() needs a non-empty training set")

    # min() returns the first minimal element, which matches looking up the
    # first index of the smallest distance, but needs a single pass only.
    nearest_sample = min(
        trainset,
        key=lambda sample: data_distances_discrete(x, sample[0]),
    )
    return nearest_sample[1]



In [14]:
# Classify the sample x with the nearest-neighbour rule over the full training set.
x = ('yes', 31, 'good')
neighbor(x, gettrain())

'more'

In [15]:
# Compare the decision tree with the nearest-neighbor classifier.

def nnc_map_fn(train_set):
    """Build a one-argument nearest-neighbour classifier bound to ``train_set``.

    Args:
        train_set: Training data forwarded to ``neighbor`` on every call.

    Returns:
        Callable taking a sample ``x`` and returning ``neighbor(x, train_set)``.
    """
    def classify(x):
        return neighbor(x, train_set)

    return classify

def compare():
    """Find the test records on which decision tree and nearest neighbour differ.

    Returns:
        Pair ``(disagreements, ratio)``: the list of test records for which
        ``decision_2`` and the nearest-neighbour classifier return different
        labels, and that list's length divided by the number of test records.
    """
    test_set = gettest()
    classify_nnc = nnc_map_fn(gettrain())

    disagreements = []
    for sample in test_set:
        if decision_2(sample) != classify_nnc(sample):
            disagreements.append(sample)

    return disagreements, len(disagreements) / len(test_set)

In [16]:
# Display the disagreeing test records and their share of the test set.
compare()

([('yes', 51, 'good')], 0.125)

In [17]:
_NO_INITIAL = object()  # sentinel so that None can be passed as a real initial value


def reduce(fn, iterable, initial=_NO_INITIAL):
    """Fold ``iterable`` into one value by applying ``fn`` from left to right.

    Args:
        fn: Two-argument callable ``fn(accumulated, element)``.
        iterable: Any iterable of elements.
        initial: Optional starting value. When omitted, the first element
            of ``iterable`` is used as the starting value.

    Returns:
        The accumulated value; for a single element and no ``initial`` that
        element is returned unchanged.

    Raises:
        TypeError: If ``iterable`` is empty and no ``initial`` is given.
    """
    iterator = iter(iterable)

    if initial is _NO_INITIAL:
        try:
            reduced = next(iterator)
        except StopIteration:
            # Do not let a bare StopIteration escape: callers could mistake
            # it for normal iterator exhaustion. Report the empty input
            # with the same exception type functools.reduce uses.
            raise TypeError(
                "reduce() of empty iterable with no initial value"
            ) from None
    else:
        reduced = initial

    # The iterator has already advanced past any element consumed above.
    for element in iterator:
        reduced = fn(reduced, element)
    return reduced
        
    
    
def data_distances_continuous(a, b):
    """Squared distance between two fully numeric three-field records.

    Fields 0 and 2 contribute their squared difference; field 1 contributes
    the squared difference after division by 50.0.

    Args:
        a: Indexable numeric record with at least three fields.
        b: Indexable numeric record with at least three fields.

    Returns:
        The sum of the three squared terms.
    """
    first_term = (a[0] - b[0]) ** 2
    scaled_term = ((a[1] - b[1]) / 50.0) ** 2
    third_term = (a[2] - b[2]) ** 2
    return first_term + scaled_term + third_term
    
class NearestMeanClassifier:
    """Nearest-mean classifier over numeric feature vectors.

    ``train`` stores one mean vector per label in ``self.means``;
    ``predict`` returns the label whose mean has the smallest
    ``data_distances_continuous`` distance to a sample.
    """

    def _collect_classes(self, dataset):
        """Group the feature vectors of ``dataset`` by their label.

        Args:
            dataset: Iterable of ``(vector, label)`` pairs.

        Returns:
            Dict mapping each label to the list of its vectors, in the
            order they appear in ``dataset``.
        """
        class_collection = {}
        for item, label in dataset:
            class_collection.setdefault(label, []).append(item)
        return class_collection

    def _map_collection_mean(self, class_collection):
        """Return a new dict mapping each label to the mean of its vectors.

        The argument is left untouched so the grouped vectors remain
        available to the caller.
        """
        return {
            label: self._calc_items_mean(items)
            for label, items in class_collection.items()
        }

    def _calc_items_mean(self, items):
        """Return the component-wise mean of a non-empty list of vectors."""
        # Accumulate eagerly into tuples. Folding lazy map objects would
        # wrap one more iterator layer around the running sum per vector,
        # and a very deep chain can exhaust the interpreter's
        # recursion/stack depth once it is finally consumed.
        total = tuple(items[0])
        for vector in items[1:]:
            total = self._sum_classes_vectors(total, vector)
        count = len(items)
        return tuple(component / count for component in total)

    def _sum_classes_vectors(self, a, b):
        """Return the component-wise sum of ``a`` and ``b`` as a tuple."""
        return tuple(left + right for left, right in zip(a, b))

    def train(self, dataset):
        """Compute and store the mean vector of every label.

        Args:
            dataset: Iterable of ``(vector, label)`` pairs with numeric
                vectors.

        After the call ``self.means`` maps each label to its mean vector.
        """
        class_collection = self._collect_classes(dataset)
        self.means = self._map_collection_mean(class_collection)

    def predict(self, x):
        """Return the label whose stored mean is closest to ``x``.

        Args:
            x: Numeric feature vector.

        Returns:
            The nearest label, or ``None`` when no stored mean yields a
            distance below infinity (for example when ``self.means`` is
            empty). On ties the label stored first wins.

        Raises:
            AttributeError: If ``train`` has not been called yet, because
                ``self.means`` is only created there.
        """
        nearest_label = None
        smallest_distance = float('inf')

        for label, mean in self.means.items():
            current_distance = data_distances_continuous(x, mean)
            # Strict comparison keeps the first label among equal distances.
            if current_distance < smallest_distance:
                smallest_distance = current_distance
                nearest_label = label

        return nearest_label

In [18]:
# Numeric codes for the categorical field values used in the data files.
value_mapping = {
    'yes': 1.0,
    'no': 0.0,
    'good': 0.0,
    'poor': 1.0,
}

def discrete_to_continuous(value):
    """Return the numeric code of ``value``, or ``value`` itself if it has none.

    Values without an entry in ``value_mapping`` (such as the age, which is
    already a number) are passed through unchanged.
    """
    try:
        return value_mapping[value]
    except KeyError:
        return value

def convert(dataset):
    """Convert the feature vectors of a labelled dataset to numeric form.

    Args:
        dataset: Iterable of ``(vector, label)`` pairs.

    Returns:
        List of ``(numeric_vector, label)`` pairs in the same order, where
        every component went through ``discrete_to_continuous`` and the
        label is kept as is.
    """
    converted = []
    for vector, label in dataset:
        numeric_vector = tuple(discrete_to_continuous(field) for field in vector)
        converted.append((numeric_vector, label))
    return converted

def build_and_train():
    """Train a ``NearestMeanClassifier`` on the training file.

    Returns:
        The classifier's ``means`` attribute after training on the
        converted output of ``gettrain()``.
    """
    model = NearestMeanClassifier()
    model.train(convert(gettrain()))
    return model.means
# Display the per-label mean vectors learned from the training file.
build_and_train()

{'less': (0.3333333333333333, 32.111111111111114, 0.2222222222222222),
 'more': (0.5714285714285714, 37.142857142857146, 0.5714285714285714)}

In [28]:
def predict_test():
    """Collect the test records on which all three classifiers agree.

    The decision tree (``decision_2``), the nearest-neighbour classifier and
    the nearest-mean classifier are applied to every test record.

    Returns:
        List of ``(record, label)`` pairs for the records where the three
        predictions are equal; ``label`` is that common prediction.
    """
    samples = gettest()
    labelled = gettrain()

    nearest_neighbor = nnc_map_fn(labelled)
    nearest_mean = NearestMeanClassifier()
    nearest_mean.train(convert(labelled))

    agreed = []
    for sample in samples:
        tree_label = decision_2(sample)
        neighbor_label = nearest_neighbor(sample)

        # The nearest-mean classifier works on numeric vectors only.
        numeric_sample = tuple(discrete_to_continuous(field) for field in sample)
        mean_label = nearest_mean.predict(numeric_sample)

        if tree_label == neighbor_label == mean_label:
            agreed.append((sample, tree_label))
    return agreed

# Display the test records on which all three classifiers agree, with the agreed label.
predict_test()

[(('no', 50, 'good'), 'less'),
 (('no', 23, 'good'), 'less'),
 (('yes', 45, 'poor'), 'more'),
 (('no', 60, 'good'), 'less'),
 (('no', 15, 'poor'), 'more'),
 (('no', 18, 'good'), 'less')]