In [155]:
#lin alg
import numpy as np

#csv IOs and dataframes
import pandas as pd

#clustering
from sklearn.cluster import KMeans
from FixedKmeans import same_size_kmeans

import matplotlib.pyplot as plt

%matplotlib notebook

## Same-Sized K-means or Classic K-means

When performing K-means, we can choose to run either the traditional algorithm, which does not guarantee clusters of the same size, or the same-size version, which gives equal sizes at the cost of fit.

Since ensuring UPE families are similar sizes is a high priority, I recommend using the same-size version. With the classic version, similar sizes can instead be achieved by applying non-preference-based size balancing at the end of execution.

To use same-sized K-means, change ```same_size_k``` to ```True```. To use size-balancing for classic K-means, change ```balance_classic``` to ```True```.

Adjust the number of families using ```num_families```.

In [156]:
# Requested number of families (clusters).
num_families = 6

# True  -> use the same-size K-means variant.
# False -> use classic scikit-learn K-means.
same_size_k = True

# Only consulted when same_size_k is False: rebalance family sizes after
# classic K-means by moving members from the largest to the smallest family.
balance_classic = True

## Data Cleaning

In [157]:
# The path may be relative or absolute; what works will vary with your environment.
df = pd.read_csv('family_matching.csv')

# Column-name sets that later cells fill in:
#   drop_cols - columns to delete from the frame
#   id_cols   - identifier columns to keep out of the clustering input
drop_cols, id_cols = set(), set()

In [158]:
# Dimensions of the loaded survey data as (n_rows, n_columns)
df.shape

(40, 46)

#### Print out the full csv

In [159]:
# Display the survey responses exactly as loaded from the CSV
df

Unnamed: 0,Timestamp,Username,First Name,Last Name,Do you like vanilla ice cream?,Do you like chocolate ice cream?,Do you like strawberry ice cream?,Do you like Python?,Do you like Java?,Do you like C/C++?,...,Do you like Indie music?,What about K-Pop?,Do you live on campus?,Do you live in Allston?,Do you live in Cambridge?,Do you live in Brookline?,"You want to enter the tech industry (Google, Facebook, Microsoft, Startups...).",You want to pursue a career in Finance.,You want to work in a role related to Game Design.,You want to eventually work in a role that is algorithm/math heavy.
0,2021/02/02 7:49:10 PM EST,linsyw@bu.edu,Linsy,Wang,Yes,Yes,Yes,Yes,Yes,Yes,...,Yes,No,No,Yes,No,No,Yes,Yes,No,No
1,2021/02/02 7:49:12 PM EST,pkuzdzal@bu.edu,Patrick,Kuzdzal,No,Yes,Yes,Yes,No,No,...,Yes,No,Yes,No,No,No,Yes,No,No,No
2,2021/02/02 7:49:14 PM EST,emcateer@bu.edu,Erin,McAteer,Yes,Yes,Yes,Yes,Yes,No,...,Yes,No,Yes,No,No,No,Yes,No,No,No
3,2021/02/02 7:49:19 PM EST,vitor@bu.edu,Victor,Vicente,Yes,Yes,No,Yes,Yes,Yes,...,Yes,No,No,No,No,Yes,Yes,No,Yes,No
4,2021/02/02 7:49:22 PM EST,srk22@bu.edu,Snigdha,Kalathur,Yes,No,Yes,Yes,Yes,No,...,No,No,No,No,No,No,Yes,No,No,Yes
5,2021/02/02 7:49:30 PM EST,johnbest@bu.edu,John,Bestavros,Yes,Yes,No,Yes,No,Yes,...,No,No,Yes,No,No,No,Yes,No,Yes,Yes
6,2021/02/02 7:49:45 PM EST,cewalsh@bu.edu,Conor,Walsh,Yes,No,No,Yes,Yes,Yes,...,Yes,No,Yes,No,No,No,Yes,No,No,No
7,2021/02/02 7:50:06 PM EST,sid15@bu.edu,Siddhant,Kothari,Yes,Yes,No,Yes,Yes,No,...,No,No,No,No,No,Yes,Yes,Yes,No,No
8,2021/02/02 7:50:08 PM EST,zwan1312@bu.edu,Zhenghui,Wang,Yes,No,No,Yes,Yes,No,...,Yes,No,Yes,No,No,No,Yes,No,Yes,No
9,2021/02/02 7:50:11 PM EST,shkim219@bu.edu,Seonghoo (Paul),Kim,Yes,No,Yes,Yes,Yes,No,...,Yes,Yes,Yes,No,No,No,No,Yes,No,Yes


In [160]:
# Preview only the first rows (head() defaults to five)
df.head()

Unnamed: 0,Timestamp,Username,First Name,Last Name,Do you like vanilla ice cream?,Do you like chocolate ice cream?,Do you like strawberry ice cream?,Do you like Python?,Do you like Java?,Do you like C/C++?,...,Do you like Indie music?,What about K-Pop?,Do you live on campus?,Do you live in Allston?,Do you live in Cambridge?,Do you live in Brookline?,"You want to enter the tech industry (Google, Facebook, Microsoft, Startups...).",You want to pursue a career in Finance.,You want to work in a role related to Game Design.,You want to eventually work in a role that is algorithm/math heavy.
0,2021/02/02 7:49:10 PM EST,linsyw@bu.edu,Linsy,Wang,Yes,Yes,Yes,Yes,Yes,Yes,...,Yes,No,No,Yes,No,No,Yes,Yes,No,No
1,2021/02/02 7:49:12 PM EST,pkuzdzal@bu.edu,Patrick,Kuzdzal,No,Yes,Yes,Yes,No,No,...,Yes,No,Yes,No,No,No,Yes,No,No,No
2,2021/02/02 7:49:14 PM EST,emcateer@bu.edu,Erin,McAteer,Yes,Yes,Yes,Yes,Yes,No,...,Yes,No,Yes,No,No,No,Yes,No,No,No
3,2021/02/02 7:49:19 PM EST,vitor@bu.edu,Victor,Vicente,Yes,Yes,No,Yes,Yes,Yes,...,Yes,No,No,No,No,Yes,Yes,No,Yes,No
4,2021/02/02 7:49:22 PM EST,srk22@bu.edu,Snigdha,Kalathur,Yes,No,Yes,Yes,Yes,No,...,No,No,No,No,No,No,Yes,No,No,Yes


Timestamps are not relevant to our family matching so we will just drop this column.

In [161]:
# Register the timestamp column for removal; the actual drop happens in a later cell.
drop_cols.update(['Timestamp'])

#### Our first three columns are all identifier data

In [162]:
# Show just the three identifier columns for every row
df.loc[:, ['Username', 'First Name', 'Last Name']]

Unnamed: 0,Username,First Name,Last Name
0,linsyw@bu.edu,Linsy,Wang
1,pkuzdzal@bu.edu,Patrick,Kuzdzal
2,emcateer@bu.edu,Erin,McAteer
3,vitor@bu.edu,Victor,Vicente
4,srk22@bu.edu,Snigdha,Kalathur
5,johnbest@bu.edu,John,Bestavros
6,cewalsh@bu.edu,Conor,Walsh
7,sid15@bu.edu,Siddhant,Kothari
8,zwan1312@bu.edu,Zhenghui,Wang
9,shkim219@bu.edu,Seonghoo (Paul),Kim


We will store this data separately for use in identifying our matching results later, but it will not be run through k-means.

In [163]:
# Identifier columns: kept for reporting results, excluded from the k-means input.
id_cols = {'Username', 'First Name', 'Last Name'}

In [164]:
# Check which distinct answers occur in one sample question column
# (the encoding step below expects only 'Yes' and 'No')
df['Do you like vanilla ice cream?'].unique()

array(['Yes', 'No'], dtype=object)

In [165]:
# Show the frame once more before the 'Yes'/'No' answers are encoded as 1/0
df

Unnamed: 0,Timestamp,Username,First Name,Last Name,Do you like vanilla ice cream?,Do you like chocolate ice cream?,Do you like strawberry ice cream?,Do you like Python?,Do you like Java?,Do you like C/C++?,...,Do you like Indie music?,What about K-Pop?,Do you live on campus?,Do you live in Allston?,Do you live in Cambridge?,Do you live in Brookline?,"You want to enter the tech industry (Google, Facebook, Microsoft, Startups...).",You want to pursue a career in Finance.,You want to work in a role related to Game Design.,You want to eventually work in a role that is algorithm/math heavy.
0,2021/02/02 7:49:10 PM EST,linsyw@bu.edu,Linsy,Wang,Yes,Yes,Yes,Yes,Yes,Yes,...,Yes,No,No,Yes,No,No,Yes,Yes,No,No
1,2021/02/02 7:49:12 PM EST,pkuzdzal@bu.edu,Patrick,Kuzdzal,No,Yes,Yes,Yes,No,No,...,Yes,No,Yes,No,No,No,Yes,No,No,No
2,2021/02/02 7:49:14 PM EST,emcateer@bu.edu,Erin,McAteer,Yes,Yes,Yes,Yes,Yes,No,...,Yes,No,Yes,No,No,No,Yes,No,No,No
3,2021/02/02 7:49:19 PM EST,vitor@bu.edu,Victor,Vicente,Yes,Yes,No,Yes,Yes,Yes,...,Yes,No,No,No,No,Yes,Yes,No,Yes,No
4,2021/02/02 7:49:22 PM EST,srk22@bu.edu,Snigdha,Kalathur,Yes,No,Yes,Yes,Yes,No,...,No,No,No,No,No,No,Yes,No,No,Yes
5,2021/02/02 7:49:30 PM EST,johnbest@bu.edu,John,Bestavros,Yes,Yes,No,Yes,No,Yes,...,No,No,Yes,No,No,No,Yes,No,Yes,Yes
6,2021/02/02 7:49:45 PM EST,cewalsh@bu.edu,Conor,Walsh,Yes,No,No,Yes,Yes,Yes,...,Yes,No,Yes,No,No,No,Yes,No,No,No
7,2021/02/02 7:50:06 PM EST,sid15@bu.edu,Siddhant,Kothari,Yes,Yes,No,Yes,Yes,No,...,No,No,No,No,No,Yes,Yes,Yes,No,No
8,2021/02/02 7:50:08 PM EST,zwan1312@bu.edu,Zhenghui,Wang,Yes,No,No,Yes,Yes,No,...,Yes,No,Yes,No,No,No,Yes,No,Yes,No
9,2021/02/02 7:50:11 PM EST,shkim219@bu.edu,Seonghoo (Paul),Kim,Yes,No,Yes,Yes,Yes,No,...,Yes,Yes,Yes,No,No,No,No,Yes,No,Yes


In [166]:
# Convert all of our 'No'/'Yes' answers to 0/1 values so they can be clustered.
def yes_no_to_int(answer):
    """Map one survey answer to 0 ('No') or 1 ('Yes').

    Values that are already 0 or 1 pass through unchanged, so re-running this
    cell does not corrupt previously encoded data. Anything else (a blank
    answer, unexpected text, NaN) raises ValueError instead of being silently
    counted as a 'Yes'.
    """
    if isinstance(answer, str):
        normalized = answer.strip()
        if normalized == 'No':
            return 0
        if normalized == 'Yes':
            return 1
    elif answer in (0, 1):
        return int(answer)
    raise ValueError("expected 'Yes' or 'No' but got " + repr(answer))


for question in df.columns:
    # Identifier columns and columns scheduled for removal are not answers.
    if question in id_cols or question in drop_cols:
        continue
    try:
        df[question] = df[question].apply(yes_no_to_int)
    except ValueError as err:
        # Re-raise with the column name so the offending question is easy to find.
        raise ValueError("Column " + repr(question) + ": " + str(err)) from err

In [167]:
# The answer columns should now hold 0/1 instead of 'No'/'Yes'
df

Unnamed: 0,Timestamp,Username,First Name,Last Name,Do you like vanilla ice cream?,Do you like chocolate ice cream?,Do you like strawberry ice cream?,Do you like Python?,Do you like Java?,Do you like C/C++?,...,Do you like Indie music?,What about K-Pop?,Do you live on campus?,Do you live in Allston?,Do you live in Cambridge?,Do you live in Brookline?,"You want to enter the tech industry (Google, Facebook, Microsoft, Startups...).",You want to pursue a career in Finance.,You want to work in a role related to Game Design.,You want to eventually work in a role that is algorithm/math heavy.
0,2021/02/02 7:49:10 PM EST,linsyw@bu.edu,Linsy,Wang,1,1,1,1,1,1,...,1,0,0,1,0,0,1,1,0,0
1,2021/02/02 7:49:12 PM EST,pkuzdzal@bu.edu,Patrick,Kuzdzal,0,1,1,1,0,0,...,1,0,1,0,0,0,1,0,0,0
2,2021/02/02 7:49:14 PM EST,emcateer@bu.edu,Erin,McAteer,1,1,1,1,1,0,...,1,0,1,0,0,0,1,0,0,0
3,2021/02/02 7:49:19 PM EST,vitor@bu.edu,Victor,Vicente,1,1,0,1,1,1,...,1,0,0,0,0,1,1,0,1,0
4,2021/02/02 7:49:22 PM EST,srk22@bu.edu,Snigdha,Kalathur,1,0,1,1,1,0,...,0,0,0,0,0,0,1,0,0,1
5,2021/02/02 7:49:30 PM EST,johnbest@bu.edu,John,Bestavros,1,1,0,1,0,1,...,0,0,1,0,0,0,1,0,1,1
6,2021/02/02 7:49:45 PM EST,cewalsh@bu.edu,Conor,Walsh,1,0,0,1,1,1,...,1,0,1,0,0,0,1,0,0,0
7,2021/02/02 7:50:06 PM EST,sid15@bu.edu,Siddhant,Kothari,1,1,0,1,1,0,...,0,0,0,0,0,1,1,1,0,0
8,2021/02/02 7:50:08 PM EST,zwan1312@bu.edu,Zhenghui,Wang,1,0,0,1,1,0,...,1,0,1,0,0,0,1,0,1,0
9,2021/02/02 7:50:11 PM EST,shkim219@bu.edu,Seonghoo (Paul),Kim,1,0,1,1,1,0,...,1,1,1,0,0,0,0,1,0,1


### Delete drop columns

In [168]:
# Remove every column collected in drop_cols.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second run finds the columns already gone and does
# nothing, where the in-place version raised KeyError.
df = df.drop(columns=list(drop_cols), errors='ignore')

In [169]:
# Confirm the columns listed in drop_cols (e.g. Timestamp) are gone
df

Unnamed: 0,Username,First Name,Last Name,Do you like vanilla ice cream?,Do you like chocolate ice cream?,Do you like strawberry ice cream?,Do you like Python?,Do you like Java?,Do you like C/C++?,Do you like Javascript?,...,Do you like Indie music?,What about K-Pop?,Do you live on campus?,Do you live in Allston?,Do you live in Cambridge?,Do you live in Brookline?,"You want to enter the tech industry (Google, Facebook, Microsoft, Startups...).",You want to pursue a career in Finance.,You want to work in a role related to Game Design.,You want to eventually work in a role that is algorithm/math heavy.
0,linsyw@bu.edu,Linsy,Wang,1,1,1,1,1,1,0,...,1,0,0,1,0,0,1,1,0,0
1,pkuzdzal@bu.edu,Patrick,Kuzdzal,0,1,1,1,0,0,0,...,1,0,1,0,0,0,1,0,0,0
2,emcateer@bu.edu,Erin,McAteer,1,1,1,1,1,0,0,...,1,0,1,0,0,0,1,0,0,0
3,vitor@bu.edu,Victor,Vicente,1,1,0,1,1,1,1,...,1,0,0,0,0,1,1,0,1,0
4,srk22@bu.edu,Snigdha,Kalathur,1,0,1,1,1,0,1,...,0,0,0,0,0,0,1,0,0,1
5,johnbest@bu.edu,John,Bestavros,1,1,0,1,0,1,1,...,0,0,1,0,0,0,1,0,1,1
6,cewalsh@bu.edu,Conor,Walsh,1,0,0,1,1,1,1,...,1,0,1,0,0,0,1,0,0,0
7,sid15@bu.edu,Siddhant,Kothari,1,1,0,1,1,0,1,...,0,0,0,0,0,1,1,1,0,0
8,zwan1312@bu.edu,Zhenghui,Wang,1,0,0,1,1,0,1,...,1,0,1,0,0,0,1,0,1,0
9,shkim219@bu.edu,Seonghoo (Paul),Kim,1,0,1,1,1,0,0,...,1,1,1,0,0,0,0,1,0,1


# Inferencing

#### Separate the data from the ids for now

In [170]:
# X      - identifiers plus answers; the cluster labels get attached to it later.
# X_data - answers only; this is what the clustering algorithm sees.
# X is an independent copy, so adding a 'Labels' column to it later cannot leak
# back into df. With the previous alias (X = df), re-running this cell after
# labelling put 'Labels' into X_data and fed it to the clustering.
X = df.copy()
X_data = df.drop(columns=list(id_cols))

In [171]:
# X: identifier columns together with the encoded answers
X

Unnamed: 0,Username,First Name,Last Name,Do you like vanilla ice cream?,Do you like chocolate ice cream?,Do you like strawberry ice cream?,Do you like Python?,Do you like Java?,Do you like C/C++?,Do you like Javascript?,...,Do you like Indie music?,What about K-Pop?,Do you live on campus?,Do you live in Allston?,Do you live in Cambridge?,Do you live in Brookline?,"You want to enter the tech industry (Google, Facebook, Microsoft, Startups...).",You want to pursue a career in Finance.,You want to work in a role related to Game Design.,You want to eventually work in a role that is algorithm/math heavy.
0,linsyw@bu.edu,Linsy,Wang,1,1,1,1,1,1,0,...,1,0,0,1,0,0,1,1,0,0
1,pkuzdzal@bu.edu,Patrick,Kuzdzal,0,1,1,1,0,0,0,...,1,0,1,0,0,0,1,0,0,0
2,emcateer@bu.edu,Erin,McAteer,1,1,1,1,1,0,0,...,1,0,1,0,0,0,1,0,0,0
3,vitor@bu.edu,Victor,Vicente,1,1,0,1,1,1,1,...,1,0,0,0,0,1,1,0,1,0
4,srk22@bu.edu,Snigdha,Kalathur,1,0,1,1,1,0,1,...,0,0,0,0,0,0,1,0,0,1
5,johnbest@bu.edu,John,Bestavros,1,1,0,1,0,1,1,...,0,0,1,0,0,0,1,0,1,1
6,cewalsh@bu.edu,Conor,Walsh,1,0,0,1,1,1,1,...,1,0,1,0,0,0,1,0,0,0
7,sid15@bu.edu,Siddhant,Kothari,1,1,0,1,1,0,1,...,0,0,0,0,0,1,1,1,0,0
8,zwan1312@bu.edu,Zhenghui,Wang,1,0,0,1,1,0,1,...,1,0,1,0,0,0,1,0,1,0
9,shkim219@bu.edu,Seonghoo (Paul),Kim,1,0,1,1,1,0,0,...,1,1,1,0,0,0,0,1,0,1


In [172]:
# X_data: encoded answers only (identifier columns removed); input to the clustering
X_data

Unnamed: 0,Do you like vanilla ice cream?,Do you like chocolate ice cream?,Do you like strawberry ice cream?,Do you like Python?,Do you like Java?,Do you like C/C++?,Do you like Javascript?,Do you like playing sports?,Do your hobbies fall within the arts?,Do you do a lot of outdoor activities?,...,Do you like Indie music?,What about K-Pop?,Do you live on campus?,Do you live in Allston?,Do you live in Cambridge?,Do you live in Brookline?,"You want to enter the tech industry (Google, Facebook, Microsoft, Startups...).",You want to pursue a career in Finance.,You want to work in a role related to Game Design.,You want to eventually work in a role that is algorithm/math heavy.
0,1,1,1,1,1,1,0,1,1,1,...,1,0,0,1,0,0,1,1,0,0
1,0,1,1,1,0,0,0,1,0,1,...,1,0,1,0,0,0,1,0,0,0
2,1,1,1,1,1,0,0,1,0,1,...,1,0,1,0,0,0,1,0,0,0
3,1,1,0,1,1,1,1,0,0,0,...,1,0,0,0,0,1,1,0,1,0
4,1,0,1,1,1,0,1,1,1,0,...,0,0,0,0,0,0,1,0,0,1
5,1,1,0,1,0,1,1,0,0,0,...,0,0,1,0,0,0,1,0,1,1
6,1,0,0,1,1,1,1,1,0,0,...,1,0,1,0,0,0,1,0,0,0
7,1,1,0,1,1,0,1,1,0,1,...,0,0,0,0,0,1,1,1,0,0
8,1,0,0,1,1,0,1,0,0,0,...,1,0,1,0,0,0,1,0,1,0
9,1,0,1,1,1,0,0,1,0,1,...,1,1,1,0,0,0,0,1,0,1


### K-means performed using the following [API](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html)

In [173]:
# Cluster the respondents into num_families groups.
# The cluster count comes from the num_families setting instead of a
# hard-coded 6, so changing that setting actually takes effect.
if same_size_k:
    # NOTE(review): size_flexibility=0 presumably demands strictly equal sizes;
    # confirm against the FixedKmeans module.
    model = same_size_kmeans(X_data, k=num_families, size_flexibility=0, max_iter=10)
else:
    # random_state is fixed so classic K-means gives the same result on every run.
    model = KMeans(n_clusters=num_families, random_state=0).fit(X_data)

In [174]:
# Peek at the cluster id assigned to each respondent.
# The expression sits at the top level of the cell on purpose: a bare
# expression nested inside an `if` block is never displayed by the notebook.
model.cluster_id if same_size_k else model.labels_

In [175]:
# Add the cluster id of every respondent back onto X as a 'Labels' column.
# The same-size model exposes its assignments as `cluster_id`, scikit-learn's
# KMeans as `labels_`.
X['Labels'] = model.cluster_id if same_size_k else model.labels_

In [176]:
# X now carries each respondent's assigned cluster in the 'Labels' column
X

Unnamed: 0,Username,First Name,Last Name,Do you like vanilla ice cream?,Do you like chocolate ice cream?,Do you like strawberry ice cream?,Do you like Python?,Do you like Java?,Do you like C/C++?,Do you like Javascript?,...,What about K-Pop?,Do you live on campus?,Do you live in Allston?,Do you live in Cambridge?,Do you live in Brookline?,"You want to enter the tech industry (Google, Facebook, Microsoft, Startups...).",You want to pursue a career in Finance.,You want to work in a role related to Game Design.,You want to eventually work in a role that is algorithm/math heavy.,Labels
0,linsyw@bu.edu,Linsy,Wang,1,1,1,1,1,1,0,...,0,0,1,0,0,1,1,0,0,2
1,pkuzdzal@bu.edu,Patrick,Kuzdzal,0,1,1,1,0,0,0,...,0,1,0,0,0,1,0,0,0,2
2,emcateer@bu.edu,Erin,McAteer,1,1,1,1,1,0,0,...,0,1,0,0,0,1,0,0,0,0
3,vitor@bu.edu,Victor,Vicente,1,1,0,1,1,1,1,...,0,0,0,0,1,1,0,1,0,2
4,srk22@bu.edu,Snigdha,Kalathur,1,0,1,1,1,0,1,...,0,0,0,0,0,1,0,0,1,5
5,johnbest@bu.edu,John,Bestavros,1,1,0,1,0,1,1,...,0,1,0,0,0,1,0,1,1,2
6,cewalsh@bu.edu,Conor,Walsh,1,0,0,1,1,1,1,...,0,1,0,0,0,1,0,0,0,4
7,sid15@bu.edu,Siddhant,Kothari,1,1,0,1,1,0,1,...,0,0,0,0,1,1,1,0,0,4
8,zwan1312@bu.edu,Zhenghui,Wang,1,0,0,1,1,0,1,...,0,1,0,0,0,1,0,1,0,4
9,shkim219@bu.edu,Seonghoo (Paul),Kim,1,0,1,1,1,0,0,...,1,1,0,0,0,0,1,0,1,4


### With classic K-means we can be left with unbalanced families. If `balance_classic` is enabled, we balance their sizes by running a while loop where we give a member from the largest family to the smallest family until all families are within one member of each other in size

### Once we're done, we're left with the final families

In [177]:
# Family names indexed by cluster label (label 0 -> 'Euclid', 1 -> 'Prim', ...).
# NOTE(review): 'Djikstra' is presumably meant to be 'Dijkstra'; the spelling is
# kept so the printed output does not change.
family_names = ['Euclid', 'Prim', 'Hopper', 'Pascal', 'Boole', 'Djikstra']


def family_name_for(label):
    """Return the family name for a cluster label.

    Labels without a predefined name get a generated 'Family <label>' name, so
    no respondent is silently left out when there are more clusters than names.
    """
    if 0 <= label < len(family_names):
        return family_names[label]
    return 'Family ' + str(label)


# Start with one (possibly empty) family per requested cluster, in label order.
families = {family_name_for(label): [] for label in range(num_families)}

# Iterate over values rather than X[...][i], which only works while the frame
# has a default 0..n-1 index.
for label, first_name, last_name in zip(X['Labels'], X['First Name'], X['Last Name']):
    family = family_name_for(int(label))
    families.setdefault(family, []).append(first_name + ' ' + last_name)

# Classic K-means gives no size guarantee, so optionally even the families out:
# move one member at a time from the largest to the smallest family until all
# sizes are within one of each other. Ties go to the first family in dict order.
if not same_size_k and balance_classic and families:
    while True:
        largest_fam = max(families, key=lambda name: len(families[name]))
        smallest_fam = min(families, key=lambda name: len(families[name]))

        if len(families[largest_fam]) - len(families[smallest_fam]) <= 1:
            break

        # The most recently added member is the one moved (not preference-based).
        member_to_move = families[largest_fam].pop()
        print("move " + member_to_move + " from " + largest_fam + " to " + smallest_fam)
        families[smallest_fam].append(member_to_move)

    print()

# Report the final families.
for family, members in families.items():
    print(family)
    print(members)
    print()

Euclid
['Erin McAteer', 'David Sullo', 'Anand Shetler', 'Justin Sayah', 'Cali Dolfi', 'Nina Athma', 'Qing Liu']

Prim
['Dominic Maglione', 'Melissa Lopez', 'Matt Henriksen', 'Evan Hsu', 'Nathan Ho', 'Tilak Agarwal']

Hopper
['Linsy Wang', 'Patrick Kuzdzal', 'Victor Vicente', 'John Bestavros', 'Benji Spetter-Goldstein', 'Keshav Maheshwari', 'Vivian Gunawan']

Pascal
['Gabriel Moncau', 'John Bolognino', 'Amy Feng', 'Carlos Lopez', 'Delaine Rogers', 'Savannah Cardenas', 'Rani Shah']

Boole
['Conor Walsh', 'Siddhant  Kothari', 'Zhenghui  Wang', 'Seonghoo (Paul) Kim', 'Kradon Zhao', 'Eren Budur', 'Jana Mikaela Aguilar']

Djikstra
['Snigdha Kalathur', 'Anming Gu', 'Francis Pacini', 'Priya Kumari', 'Noah Jean-Baptiste', 'Joshua  Pei']

