### K-Nearest Neighbors

In [1]:
from collections import Counter
import matplotlib.pyplot as plt

In [4]:
def raw_majority_vote(labels):
    """Return the label that occurs most often in labels.

    Ties get no special treatment: the result is whichever label
    Counter.most_common ranks first.
    """
    return Counter(labels).most_common(1)[0][0]

def majority_vote(labels):
    """Return the most common label, breaking ties by dropping far labels.

    Assumes labels are ordered from nearest to farthest. While the top
    vote count is shared by more than one label, the last (farthest)
    label is discarded and the vote is retaken.
    """
    remaining = labels
    while True:
        tally = Counter(remaining)
        leader, leader_votes = tally.most_common(1)[0]
        tied_for_first = sum(1 for votes in tally.values()
                             if votes == leader_votes)
        if tied_for_first == 1:
            return leader                  # unique winner, so return it
        remaining = remaining[:-1]         # retry without the farthest label
    
    
def knn_classify(k, labeled_points, new_point):
    """Classify new_point by a vote among its k nearest labeled points.

    Each element of labeled_points should be a pair (point, label), where
    point is a sequence of numeric coordinates comparable with new_point.
    Returns the label chosen by majority_vote from the k closest points.
    """

    def squared_distance(labeled_point):
        # Squared Euclidean distance gives the same ordering as the true
        # distance, so the square root is skipped. zip stops at the
        # shorter sequence if the dimensions differ.
        point, _ = labeled_point
        return sum((p - q) ** 2 for p, q in zip(point, new_point))

    # order the labeled points from nearest to farthest
    by_distance = sorted(labeled_points, key=squared_distance)

    # find the labels for the k closest
    k_nearest_labels = [label for _, label in by_distance[:k]]

    # and let them vote
    return majority_vote(k_nearest_labels)
    

In [None]:
if __name__ == "__main__":

    # each entry is ([longitude, latitude], favorite_language)
    cities = [([-122.3, 47.53], "Python"),        # Seattle
              ([-96.85, 32.85], "Java"),          # Austin
              ([-89.33, 43.13], "R"),             # Madison
              # ... and so on
             ]

    # we want each language to have a different marker and color
    markers = {"Java" : "o", "Python" : "s", "R" : "^"}
    colors = {"Java" : "r", "Python" : "b", "R" : "g"}

    # one (longitudes, latitudes) pair per language; built from the marker
    # table so every language that has a marker also has a series, and
    # defined before the loop below that fills it
    plots = {language: ([], []) for language in markers}

    for (longitude, latitude), language in cities:
        plots[language][0].append(longitude)
        plots[language][1].append(latitude)

    # create a scatter series for each language
    for language, (x, y) in plots.items():
        plt.scatter(x, y, color=colors[language], marker=markers[language],
                    label=language, zorder=10)

    # finish and show the figure once, after all series have been added
    # NOTE(review): plot_state_borders is not defined in this file; it must
    # be supplied elsewhere before this script can run
    plot_state_borders(plt)         # pretend we have a function that does this

    plt.legend(loc=0)
    plt.axis([-130, -60, 20, 55])

    plt.title("Favorite Programming Languages")
    plt.show()

    # try several different values for k
    for k in [1, 3, 5, 7]:
        num_correct = 0

        for city in cities:
            location, actual_language = city
            # leave the city itself out so it cannot vote for its own label
            other_cities = [other_city
                            for other_city in cities
                            if other_city != city]

            predicted_language = knn_classify(k, other_cities, location)

            if predicted_language == actual_language:
                num_correct += 1

        print(k, "neighbors[s]:", num_correct, "correct_out_of", len(cities))
    
    
    
    
    
    
    
    