In [1]:
import clustering_code
from collections import defaultdict
from pprint import pprint

# Path to the survey export. The loading cell splits each line on tabs and
# skips the first line (presumably the header row).
input_file = "survey_responses.txt"

Clustering Code Loaded


The survey has the following fields : 

* Timestamp	
* What's your family or last name?	
* Distance from where you were born to UMT.	
* Distance from where you lived at 15 to UMT.	
* How many years have you been in post-secondary school?	
* Were you an undergraduate marketing major? 	
* Were you an undergraduate business major?	
* How many people live in your house/apartment (including you)? 

We'll work with this data to do a little bit of clustering. The code is based on Joel Grus's code from _Data Science from Scratch_, with a few modifications. 

In [3]:
# Build {last name: [remaining answers, still as strings]} from the
# tab-separated survey file. A repeated last name overwrites the earlier row.
student_data = defaultdict(list)
with open(input_file,'r') as ifile :
    next(ifile, None) # skip the first line; the default avoids StopIteration on an empty file
    for row in ifile : # stream line by line instead of loading everything with readlines()
        row = row.strip().split("\t")
        if len(row) < 2 : # blank or truncated line: there is no name column to key on
            continue
        this_student = row[1]
        student_data[this_student] = row[2:]


We need numerical data for clustering, so we'll convert the Yes/No responses to 1/0.

In [4]:
# Let's change No to 0 and Yes to 1, so everything is numerical
yes_no_codes = {"No" : 0, "Yes" : 1}

for student, answers in student_data.items() :
    # Swap "No"/"Yes" for 0/1, leave every other answer as it is, and turn
    # each value into a float. Re-assigning an existing key while iterating
    # is fine because the dict never changes size.
    student_data[student] = [float(yes_no_codes.get(answer, answer))
                             for answer in answers]
            

In [5]:
# Let's just print the data so it's easier to see
pprint(student_data)

# Let's make a function that prints the means in a nice way.

def pprint_means(the_means, var_labels=None) :
    """Print each cluster's mean vector, one labelled value per line.

    Parameters
    ----------
    the_means : iterable of iterables of numbers
        One mean vector per cluster; clusters are numbered in iteration
        order starting at 0.
    var_labels : sequence of str, optional
        Label for each position in a mean vector. Defaults to the six
        survey fields used in this notebook. Positions beyond the supplied
        labels are printed as "Var <position>" rather than raising
        IndexError.

    Returns
    -------
    None. Output is written with print(); values are rounded to 2 places.
    """
    if var_labels is None :
        var_labels = ["Birth Dist","Age 15 Dist",
                      "Post-Secondary","Mkt Major",
                      "Biz Major","HH Size"]

    for idx, cluster_mean in enumerate(the_means) :
        print("--- Printing Cluster " + str(idx) + " ---")

        for idx2, item in enumerate(cluster_mean) :
            # Fall back to a generic label when the vector is longer than the label list
            if idx2 < len(var_labels) :
                label = var_labels[idx2]
            else :
                label = "Var " + str(idx2)
            print(": ".join([label,str(round(item,2))]))

        print("----------------------\n")


defaultdict(<class 'list'>,
            {'Arave': [249.0, 249.0, 7.0, 0.0, 0.0, 2.0],
             'Berens': [929.0, 5.0, 7.0, 0.0, 0.0, 2.0],
             'Chandler': [2169.0, 2169.0, 10.0, 0.0, 0.0, 2.0],
             'Dezihan': [1166.0, 210.0, 5.0, 1.0, 1.0, 4.0],
             'Diehl': [4568.0, 6.0, 4.0, 1.0, 1.0, 3.0],
             'Flesch': [114.0, 222.0, 5.0, 1.0, 1.0, 5.0],
             'Freyn': [1600.0, 1600.0, 4.0, 0.0, 1.0, 4.0],
             'Grant': [271.0, 268.0, 8.0, 0.0, 1.0, 2.0],
             'Hansen': [625.0, 625.0, 10.0, 0.0, 0.0, 2.0],
             'Harper': [115.0, 115.0, 5.0, 1.0, 1.0, 2.0],
             'Jambor': [391.0, 92.0, 5.0, 1.0, 1.0, 1.0],
             'Kassner': [1743.0, 5.0, 5.0, 0.0, 0.0, 3.0],
             'Khormali': [6600.0, 6600.0, 10.0, 0.0, 0.0, 2.0],
             'Kolberg': [2132.0, 2.0, 5.0, 0.0, 0.0, 3.0],
             'Layton': [128.0, 147.0, 7.0, 1.0, 1.0, 3.0],
             'Makris': [187.0, 191.0, 5.0, 1.0, 1.0, 4.0],
             'Marbut'

We'll pause here for a second to talk about the data. 

In [7]:
# Now, let's explore some clusters. Try different values of
# k and see what emerges

k = 4

assignments, means = clustering_code.train_dict(student_data, k)

# Sorted version: pair every student with their cluster, ordered by cluster
# number. sorted() is stable, so students within a cluster keep the order
# in which `assignments` yields them.
s_assign = [(student, assignments[student])
            for student in sorted(assignments, key=assignments.get)]

print(k, "-means:", sep="")
for student, cluster in s_assign :
    print(cluster, student, sep=" : ")


4-means:
0 : Chandler
0 : Diehl
0 : Wiener
0 : Spoja
1 : Persico
1 : Freyn
1 : Milligan
1 : Kassner
1 : Kolberg
1 : Murphy
1 : Norman
1 : Dezihan
1 : Berens
1 : Murray
2 : Hansen
2 : Sliwinski
2 : Harper
2 : curnow
2 : Primm
2 : Flesch
2 : Jambor
2 : Makris
2 : Arave
2 : Grant
2 : Sicheri
2 : Marbut
2 : Ray
2 : Layton
3 : Nakajima
3 : Zor
3 : Khormali
3 : Yang


The clustering algorithm also returns the means of the clusters. How do we interpret these? 

In [8]:
# Show each cluster's mean vector, labelled with the field names set in pprint_means.
pprint_means(means)

--- Printing Cluster 0 ---
Birth Dist: 3384.5
Age 15 Dist: 1061.0
Post-Secondary: 6.5
Mkt Major: 0.25
Biz Major: 0.25
HH Size: 2.75
----------------------

--- Printing Cluster 1 ---
Birth Dist: 1281.8
Age 15 Dist: 470.2
Post-Secondary: 5.4
Mkt Major: 0.3
Biz Major: 0.7
HH Size: 3.0
----------------------

--- Printing Cluster 2 ---
Birth Dist: 333.21
Age 15 Dist: 382.07
Post-Secondary: 6.14
Mkt Major: 0.5
Biz Major: 0.79
HH Size: 2.79
----------------------

--- Printing Cluster 3 ---
Birth Dist: 6005.75
Age 15 Dist: 6005.75
Post-Secondary: 6.5
Mkt Major: 0.25
Biz Major: 0.5
HH Size: 2.0
----------------------



In [11]:
# Here's a place where we'll do some work rescaling the data.

# maybe start by getting largest miles.
# Gather the first two values of every student's row (the two distance
# fields) into one flat list. A comprehension keeps its loop variables
# local, so the notebook-level `k` (the number of clusters) is no longer
# overwritten with a student name, as a `for k in student_data` loop would do.
miles = [distance
         for responses in student_data.values()
         for distance in responses[:2]]


In [13]:
# Largest of the collected distance values; max() raises ValueError if `miles` is empty.
max_miles = max(miles)

Once you've rescaled the data, play around with some other clustering results. What emerges?