In [1]:
import time
import pandas as pd
import numpy as np
import itertools

# wall-clock reference point; the last cell reports the seconds elapsed since here
start = time.time()

# raw sign-up exports, read from the current working directory (tutors first, then students)
full_tutors, full_students = (
    pd.read_csv(csv_name) for csv_name in ("tutors.csv", "students.csv")
)

In [2]:
# Shorten the name columns to 'Student' / 'Tutor', the labels the later cells refer to.
full_students = full_students.rename({'Student Name - Last, First': 'Student'}, axis='columns')
full_tutors = full_tutors.rename({'Name': 'Tutor'}, axis='columns')
# Neither raw dataframe is reassigned again after this cell.

In [3]:
# Partition the applicants into five priority tiers, students1 (highest) .. students5.
# column headers from the sign-up form
application = 'Previous application with TMC'
freeLunch = 'Does the student qualify for the free or reduced lunch program?'
# possible answers to the `application` question
prevTutored = 'Yes, my student has previously applied and received lessons with TMC'
prevApplied = 'Yes, my student has applied for lessons with TMC before but was not placed for lessons'
newStudent = "No, this is my student's first time applying for lessons with TMC"

# boolean row masks, one per answer
was_tutored = full_students[application] == prevTutored
was_waitlisted = full_students[application] == prevApplied
is_new = full_students[application] == newStudent
lunch_yes = full_students[freeLunch] == 'Yes'
lunch_no = full_students[freeLunch] == 'No'

# NOTE(review): a row whose answers match none of the combinations below
# (e.g. a blank free-lunch answer for a non-returning student) ends up in no tier.
students1 = full_students.loc[was_tutored]
students2 = full_students.loc[was_waitlisted & lunch_yes]
students3 = full_students.loc[is_new & lunch_yes]
students4 = full_students.loc[was_waitlisted & lunch_no]
students5 = full_students.loc[is_new & lunch_no]

In [4]:
# Lookup lists shared by the cells below.
# availability columns of the sign-up form, one per weekday
days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
# time-window labels as they appear inside each weekday answer
times = ['Before 11am', '11am- 1pm', '1pm- 3pm', '3pm- 5pm', 'After 5pm']
# one label per (day, window) pair in day-major order: 'Monday Before 11am', 'Monday 11am- 1pm', ...
daystimes = [' '.join((day, window)) for day in days for window in times]
# instrument preference columns of the student form, most preferred first
choices = ['Instrument (First Choice)', 'Instrument (Second Choice)', 'Instrument (Third Choice)']

In [5]:
"""
Function to preprocess the given dataframe.
Arguments: data - raw dataframe (read from tutors.csv or students.csv)
           status - 'students' or 'tutors'
Returns: new dataframe with only the necessary columns for matching
"""
def preprocess(data, status):
    # filter out irrelevant columns, remove/replace invalid data entries
    data = data.replace(np.nan, '')
    if status == 'students':
        data = data[['Student'] + choices + days]
        # consolidate all instruments into one column, in the order of preference
        data['Instrument'] = data[choices].agg(', '.join, axis=1)
        data = data.drop(choices, 1)
    elif status == 'tutors':
        data = data[['Tutor', 'Which instrument(s) would you like to teach?'] + days]
        data = data.rename(columns={'Which instrument(s) would you like to teach?': 'Instrument'})
        
    # standardize instrument name to be lowercase
    data['Instrument'] = data['Instrument'].str.lower()
    
    # extract day/time availabilities into separate columns
    index = 0
    for i in range(len(days)):
        for j in range(len(times)):
            d = days[i]
            t = times[j]
            data[daystimes[index]] = data[d].str.contains(t)
            index += 1
    
    # keep only day, time combination columns in dataframe
    data = data.drop(days, 1)
    
    # add column for number of availabilities
    temp = data[daystimes]
    temp['Count'] = temp.sum(1)
    data['Count'] = temp['Count']
    data = data.sort_values(by='Count')
    
    return data

In [6]:
# Build the matching-ready dataframes.
# turns off pandas' chained-assignment warnings for the rest of the session
pd.set_option('mode.chained_assignment', None)
tutors = preprocess(full_tutors, 'tutors')

# each priority tier is preprocessed on its own, so its rows are sorted by availability within the tier
students1, students2, students3, students4, students5 = [
    preprocess(tier, 'students')
    for tier in (students1, students2, students3, students4, students5)
]

# stack the tiers: priority order first, least to most available inside each tier
students = pd.concat([students1, students2, students3, students4, students5])

In [7]:
# Working pool of still-unmatched tutors: rows are dropped from tutors2 as matches are made,
# while `tutors` keeps the complete preprocessed list.
tutors2 = tutors.copy(deep=True)

In [8]:
"""
Takes in a student in the form of a tuple with (length, Series) -- values from a df.iterrows() iterable.
Returns a list of matched tutor, instrument, and time slots, or a list of Nones if no match can be created.
Matches are considered valid if there is an instrument overlap between student and tutor, and at least one time availability overlap.
"""
def match(student):
    student_info = student[1]
    # remove empty strings from student's instruments choice (i.e. if student has less than 3 preferences)
    student_instruments = list(filter(None, student_info['Instrument'].split(', ')))
    
    t_without_count = tutors2.drop(columns='Count')
    
    # iterate through student's instruments, beginning with first choice
    for instrument in student_instruments:
        # iterate through all remaining tutors, searching for a match with specific instrument choice
        tutors_iter = t_without_count.iterrows()
        for tutor in tutors_iter:
            tutor_info = tutor[1]
            tutor_instruments = tutor_info['Instrument']
            # merge into one Series with values as lists of [tutor value, student value]
            combined_info = tutor_info.combine(student_info, lambda x, y: [x, y])
            # times where both tutor and student are available
            times = [t for t in combined_info.index if combined_info[t] == [True, True]]
            
            # if instrument matches and there is at least one shared time availability
            if instrument in tutor_instruments and len(times) > 0:
                return [tutor_info[0], instrument, times]
    
    # no match found for any of the three instrument choices
    return [None, None, None]

In [9]:
# Row iterator over the students, in their stacked order, with the helper 'Count' column left out.
s_without_count = students.drop('Count', axis='columns')
students_iter = s_without_count.iterrows()

In [10]:
# Empty results table; the matching loop appends one row per student.
match_columns = ['Student', 'Tutor', 'Instrument', 'Time(s)']
matches = pd.DataFrame(columns=match_columns)

In [11]:
# Walk the students in order and give each one the first tutor that match() finds.
for s in students_iter:
    tutor_name, instrument, shared_times = match(s)
    # record the outcome; unmatched students get a row of Nones.
    # The name is looked up by label: positional [] on a Series is deprecated in pandas.
    matches.loc[len(matches)] = [s[1]['Student'], tutor_name, instrument, shared_times]
    # remove tutor if successfully matched, so they are not offered to later students
    if tutor_name is not None:
        # NOTE(review): this drops every remaining tutor with that name, so two tutors
        # who share a name would both leave the pool -- confirm names are unique.
        tutors2 = tutors2.drop(tutors2.index[tutors2['Tutor'] == tutor_name])

In [12]:
# students for whom a tutor was found
matched = matches.loc[matches['Tutor'].notna()]

In [13]:
# students for whom no tutor was found
no_match = matches.loc[matches['Tutor'].isna()]

In [14]:
# tutors still in the pool after the matching loop, i.e. unmatched tutors
tutors2.loc[:, ['Tutor', 'Instrument', 'Count']]

Unnamed: 0,Tutor,Instrument,Count
15,sad,guitar,4
5,Ahmed Baqai,guitar,10


In [15]:
# Attach contact details to each match (add any further columns needed to the selections below).
matched = (
    matched
    .merge(full_students[['Student', 'Email Address']], on='Student', how='left')
    .merge(full_tutors[['Tutor', 'Email']], on='Tutor', how='left')
    .rename(columns={'Email Address': 'Student Email', 'Email': 'Tutor Email'})
)
matched

Unnamed: 0,Student,Tutor,Instrument,Time(s),Student Email,Tutor Email
0,"Mosey, Lil",zz@gmail.com,piano,[Tuesday After 5pm],lm@gmail.com,saads@gmail.com
1,"berkelee, berk",James McDonald,cello,"[Saturday Before 11am, Thursday Before 11am, W...",blahblah@berkeley.edu,ahmed.baqai@gmail.com
2,"Glover, Donald",Lizzy,bass,[Sunday After 5pm],gg@gmail.com,safdsa@gmail.com
3,"Pitt, Bradasdf",Rohan,flute,[Monday Before 11am],ahmed.baqai@gmail.com,asafs@gmail.com
4,"Bob, Alice",Ryan Nadeem,violin,"[Friday Before 11am, Monday Before 11am]",123movies@gmail.com,ahmed.baqai@gmail.com
5,"Lee, Ender",Arnav Rao,trumpet,"[Monday After 5pm, Wednesday After 5pm]",enderlee2006@yahoo.com,ahmed.baqai@gmail.com
6,"Pitt, Brad",Tony Stark,flute,"[Monday After 5pm, Monday Before 11am, Sunday ...",ahmed.baqai@gmail.com,ahmed.baqai@gmail.com
7,"Stalin, Joseph",Bob the Builder,clarinet,"[Saturday After 5pm, Sunday After 5pm, Tuesday...",jstalin@gmail.com,yeswecan@aol.com
8,"cjh, asdkfjh",askfhalkjh,clarinet,[Tuesday After 5pm],prims@aol.com,asjdkhfk@jkasdhf.com
9,"Kardashian, Khloe",Jose,piano,[Friday After 5pm],tyg@gmail.com,sasaas2@gmail.com


In [16]:
# Unmatched students with their contact details (add any further columns needed to the selection below).
no_match = (
    no_match
    .merge(full_students[['Student', 'Email Address']], on='Student', how='left')
    .rename(columns={'Email Address': 'Student Email'})
    # these columns are all empty for unmatched students; keyword form is required
    # because the positional axis argument of drop() was removed in pandas 2.0
    .drop(columns=['Tutor', 'Instrument', 'Time(s)'])
)
no_match

Unnamed: 0,Student,Student Email
0,"quarantine, help",coron@virus.org
1,"A, B",helpme@yahoo.com
2,"Lopez, Jennifer",yhs@gmail.com
3,"Chan, Jackie",ahmed.baqai@gmail.com
4,"Brown, Stacy",ahmed.baqai@gmail.com
5,"Roberts, George",ffs@gmail.com
6,"Charles, James",ahmed.baqai@gmail.com
7,"Charlesjhfas, James",ahmed.baqai@gmail.com
8,"Dev, Neil",ahmed.baqai@gmail.com
9,"Patel, Adisdf",ahmed.baqai@gmail.com


In [17]:
# wall-clock seconds elapsed since `start` was set in the first cell
elapsed_seconds = time.time() - start
elapsed_seconds

4.6782073974609375