In [1]:
import pandas as pd
import numpy as np
import random

In [None]:
"""
Need to implement a random seed for control purposes
To really test the effectiveness of check_for_missed(), we need to analyze the percentage of
matches that are being met per student
The problem with those being missed is two-fold:
(1) They are those with no matches within the top 3 POSs, lack of offering by school 
(2) Their top POSs are not in high demand, which means they would be prioritized for lower level matches,
or just missed multiple times

(2, 6, 9)
2 students are missed in both B1 and B2 despite check_for_missed()
6 students are missed in both B2 and B3
9 students are missed in both B3 and B4
This could be in part that they got prioritized for a previous match and just have a lot of holes in their 
POS based on what the school offers. 
1 student got missed in all blocks, but they have a completely missed POS. This indicates they took YS, 
but the school did not offer any POS that corresponded with their YS matches. 

"""

In [None]:
"""
Goal: 
- Create a rudimentary matching algorithm 
- Note the process and decisions made here: 
Decisions: 
- Scope of Rough Draft: work for one school, then focus on generalizing functions/script
- Want to track the number of clusters that were dropped due to no matching POS per school
"""

In [None]:
# (1) Need to read in by_school match csv and tally the demand of YS clusters per school
# (2) Need to convert YS clusters into POSs per school
# (3) Need to determine capacity vector for school

In [2]:
# Middle schools to process. Each name is used verbatim to build the
# 'YS_Criteria_by_School/<school> YSCriteria.csv' path and to look up the
# school's sheet in planning.xlsx.
schools = ['Oakland Middle School',
    'Siegel Middle School',
    'Whitworth-Buchanan Middle School',
    'Christiana Middle School',
    'Smyrna Middle School',
    'Stewarts Creek Middle School',
    'Rockvale Middle School',
    'Rocky Fork Middle School',
    'Blackman Middle School',
    'Thurman Francis Arts Academy',
    'Rock Springs Middle School',
    'LaVergne Middle School'
]

In [3]:
# Compare each school's assigned room capacity against its 8th-grade head count.

# Rooms with these names are counted as large rooms; every other room is small.
LARGE_ROOM_NAMES = ('Library', 'Auditorium')
LARGE_ROOM_SEATS = 50
SMALL_ROOM_SEATS = 35
# Excel limits worksheet names to 31 characters, so a longer school name shows up
# truncated in planning.xlsx (e.g. 'Whitworth-Buchanan Middle Schoo').
EXCEL_SHEET_NAME_LIMIT = 31

capacity_report = {
    'School':[],
    'Status':[],
    'Assigned Capacity':[],
    '8th Graders':[],
    'Number of Large Rooms':[],
    'Number of Small Rooms':[]
}

for school in schools:
    # read the school's planning sheet to get its list of rooms
    plan_df = pd.read_excel('planning.xlsx', sheet_name=school[:EXCEL_SHEET_NAME_LIMIT])
    rooms = list(plan_df['MS Room #'])

    # tally rooms by size and the seats they provide
    lg_rooms = sum(room in LARGE_ROOM_NAMES for room in rooms)
    sm_rooms = len(rooms) - lg_rooms
    capacity = lg_rooms * LARGE_ROOM_SEATS + sm_rooms * SMALL_ROOM_SEATS

    # volume = number of rows in the school's YS match roster
    path = f'YS_Criteria_by_School/{school} YSCriteria.csv'
    ys_df = pd.read_csv(path)
    volume = len(ys_df)

    # one entry per column for this school
    capacity_report['School'].append(school)
    capacity_report['Status'].append('Ready' if capacity >= volume else 'Insufficient Space')
    capacity_report['Assigned Capacity'].append(capacity)
    capacity_report['8th Graders'].append(volume)
    capacity_report['Number of Large Rooms'].append(lg_rooms)
    capacity_report['Number of Small Rooms'].append(sm_rooms)


cap_df = pd.DataFrame(capacity_report)

In [4]:
# Replace the textual status with a boolean flag: True only when the assigned
# capacity is strictly greater than the number of 8th graders.
cap_df['Status'] = cap_df['Assigned Capacity'].gt(cap_df['8th Graders'])
cap_df

Unnamed: 0,School,Status,Assigned Capacity,8th Graders,Number of Large Rooms,Number of Small Rooms
0,Oakland Middle School,True,680,420,1,18
1,Siegel Middle School,True,555,430,2,13
2,Whitworth-Buchanan Middle School,True,625,322,2,15
3,Christiana Middle School,True,595,376,0,17
4,Smyrna Middle School,True,540,321,1,14
5,Stewarts Creek Middle School,True,555,386,2,13
6,Rockvale Middle School,True,575,537,1,15
7,Rocky Fork Middle School,True,540,276,1,14
8,Blackman Middle School,False,525,617,0,15
9,Thurman Francis Arts Academy,True,525,86,0,15


In [5]:
def get_POS_from_clusters(school):
    """Convert a school's YouScience cluster matches into that school's POS offerings.

    Reads ``YS_Criteria_by_School/{school} YSCriteria.csv`` (the match roster) and
    ``../direct_join_prepared.xlsx`` (the cluster-to-POS crosswalk), then replaces every
    cluster name found anywhere in the roster with the value in the crosswalk's
    ``school`` column.

    Parameters
    ----------
    school : str
        School name; used in the roster file name and as the crosswalk column name.

    Returns
    -------
    pandas.DataFrame
        The roster with clusters replaced by POS values and the 'Unnamed: 0' column
        dropped. Clusters whose crosswalk cell is blank become 0, as does the
        string '0'.
    """
    ys_match_df = pd.read_csv(f'YS_Criteria_by_School/{school} YSCriteria.csv')
    crosswalk = pd.read_excel('../direct_join_prepared.xlsx')[['YouScience Clusters', school]]

    # key = YS cluster, value = the school's corresponding POS (0 when the cell is blank).
    # pd.isna detects blank cells regardless of whether they arrive as float or numpy NaN.
    to_replace = {
        cluster: 0 if pd.isna(pos) else pos
        for cluster, pos in zip(crosswalk['YouScience Clusters'], crosswalk[school])
    }

    # a literal '0' string shows up in some rosters (suspected: positions added in
    # updated rosters); normalise it to the numeric 0 used for "no match"
    to_replace['0'] = 0

    return ys_match_df.replace(to_replace=to_replace).drop('Unnamed: 0', axis=1)


In [6]:
# need to be careful not to read a grouping of POSs as a single POS
# groupings will be ', ' separated
def get_POS_demand(school_pos_df):

    from collections import Counter
    C = Counter()
    for i in range(len(school_pos_df)):
        # read in student's POS
        student = school_pos_df.iloc[i]
        # compile list of POS
        pos_list = []
        for rank in ['First','Second','Third','Fourth','Fifth','Sixth']:
            if student[rank] not in pos_list:
                pos_list.append(student[rank])
        # update counter
        C.update(pos_list)
    
    # checking for problems, assumes cap_df exists
    lg_rooms = cap_df.loc[cap_df.School == school]['Number of Large Rooms'].values[0]
    sm_rooms = cap_df.loc[cap_df.School == school]['Number of Small Rooms'].values[0]
    print(f'{school} registered {lg_rooms} large classroom(s) available.')
    need_lg = 0
    for i in C.most_common():
        if i[0] == 0:
            continue
        if lg_rooms > 0:
            check = i[1] <= 200
            lg_rooms -= 1
        else:
            check = i[1] <= 140
        
        if check == False:
            need_lg += 1
    print(f'{school} needs {need_lg} additional large rooms to meet POS demand.')
        
    return C

In [7]:
# POS demand per school: school name -> list of (POS, count), most demanded first
school_demand = {}

for school in schools:
    # translate the school's YS matches into POSs (unmatched clusters become 0)
    school_pos_df = get_POS_from_clusters(school)
    # tally demand; the call also prints the room check used to spot allocation issues
    demand_counter = get_POS_demand(school_pos_df)
    school_demand[school] = demand_counter.most_common()
    # Need to execute matching here -OR- export pos_df into usable format for later.
    # Rough-draft scope: stop after the first school, which leaves `school` and
    # `school_pos_df` pointing at it for the cells below.
    break

Oakland Middle School registered 1 large classroom(s) available.
Oakland Middle School needs 2 additional large rooms to meet POS demand.


In [None]:
# sanity check: row count and distinct-id count should agree (420, 420)
len(school_pos_df), school_pos_df['id'].nunique(dropna=False)

# Match Philosophy

I think it makes the most sense to iterate through the most-demanded POS choices first. The most commonly selected POS is not necessarily the most common first choice, but overall demand is a reasonable starting point for iteration: there is clear demand for the POS in question, and within it we can prioritize the students who picked it as their first or second choice. We must start somewhere.

## Handling Coupled POS

The second-largest problem I see is that some YS clusters are mapped to a group of POS options rather than one-to-one. For instance, some students might have matched with the 'POS' <i>"Sport & Human Performance, Veterinary & Animal Science"</i>. Do we automatically put such a student into both POSs, or do we sort the batch of students after the fact and assign some to one and the rest to the other?

### Starting Place

For now, I will treat Coupled POS groups as multiple POSs. I will split them into n equally-demanded POS categories. Instead of 1 POS of "Sport & Human Performance, Veterinary & Animal Science", there will be 2 POS with the same demand, "<strong>Sport & Human Performance</strong>" and "<strong>Veterinary & Animal Science</strong>". 

## Oakland Middle School as Test Case

In [8]:
# choice to get started is to split Coupled POS matches into separate, equally-demanded POS matches
def uncouple_pos_matches(demand_counter_object):
    """Split coupled POS matches into separate, equally-demanded POS entries.

    Parameters
    ----------
    demand_counter_object : iterable of (pos, demand) pairs
        For example the output of ``Counter.most_common()``. A ``pos`` of 0 marks a
        missing / empty match. A coupled match is one string holding several POS
        names separated by commas.

    Returns
    -------
    list of list
        ``[pos, demand]`` entries in input order with empty matches removed. Each
        coupled match contributes one entry per POS name, all carrying the coupled
        match's demand. Entries are lists (not tuples) so demand can be modified
        later.
    """
    non_empty = []
    for pair in demand_counter_object:
        name, demand = pair[0], pair[1]
        # the 0 comes from missing / empty matches
        if name == 0:
            continue

        if ',' in name:
            # Split on the bare comma and strip, so "A,B" and "A,  B" are uncoupled
            # the same way as the expected "A, B"; drop empty fragments.
            for part in name.split(','):
                part = part.strip()
                if part:
                    non_empty.append([part, demand])
        else:
            non_empty.append(list(pair))

    return non_empty

In [97]:
# One roster per block, each starting as its own full list of student ids.
# fill_pos_block looks these up by name ("B#_roster") and removes students as they are seated.
B1_roster, B2_roster, B3_roster, B4_roster = (
    school_pos_df.id.to_list() for _ in range(4)
)

# POS roster object, keyed by POS in descending order of demand. 'All' collects every
# student ever placed in the POS (used to check for previous assignment); 'B1'..'B4'
# hold the per-block rosters.
pos_rosters = {
    entry[0]: {'All': [], 'B1': [], 'B2': [], 'B3': [], 'B4': []}
    for entry in uncouple_pos_matches(school_demand['Oakland Middle School'])
}

# Seat cap per POS, aligned with pos_rosters key order: the first POSs get the large
# rooms (50 seats), the remaining ones get small rooms (35 seats).
oakland_row = cap_df.loc[cap_df.School == 'Oakland Middle School']
num_lg_rooms = oakland_row['Number of Large Rooms'].values[0]  # 1 for oakland
lg_vector = [50] * num_lg_rooms
sm_vector = [35] * (len(pos_rosters) - num_lg_rooms)
capacity_vector = lg_vector + sm_vector



In [10]:
# status of block roster
def roster_status(block='B1'):
    """Print each POS's head count for ``block`` and the total seated out of all students.

    Reads the notebook-level ``pos_rosters`` and ``school_pos_df``; returns None.
    """
    print(f'{block} Status')
    seated = 0
    for pos_name, rosters in pos_rosters.items():
        block_size = len(rosters[block])
        seated += block_size
        print(pos_name, block_size)
    print(f'{seated}/{len(school_pos_df)}\n')
    return

# keep B1_unassigned for starting with in the next block
# for filling other courses: (1) fill so that all underfilled evenly?, (2) <do it randomly> 
# Yes AND Filled by most empty POS thus people will be sorted alphabetically into unpredictable POS.
def backfill_POS(block='B1'):
    """Place every still-unassigned student into whichever POS has the most open seats.

    Reads the notebook-level ``unassigned`` (student ids) and ``OS_vector`` (open seats
    per POS, in ``pos_rosters`` key order). Mutates ``pos_rosters`` (both the 'All' and
    the ``block`` rosters) and ``OS_vector`` in place, then prints the block's status.

    Parameters
    ----------
    block : str
        Key of the block being backfilled, e.g. 'B1'.

    Returns
    -------
    int
        Number of students left unplaced; always 0 because every student is placed.
    """
    # randomise the order so placement does not follow roster order
    shuffled = random.sample(unassigned, len(unassigned))
    # POS keys do not change while backfilling, so build the list once
    pos_keys = list(pos_rosters.keys())

    for student in shuffled:
        # index of the most empty POS (first one on ties)
        ind = OS_vector.index(max(OS_vector))
        pos_key = pos_keys[ind]
        pos_rosters[pos_key]['All'].append(student)
        pos_rosters[pos_key][block].append(student)
        # one fewer open seat in that POS
        OS_vector[ind] -= 1

    # report on the block that was actually filled (was hard-coded to 'B1')
    roster_status(block=block)
    return 0



In [79]:
# prioritizing those missed with matches

def check_for_missed(ranked_df, current_block):
    """Move students who were missed in the previous block to the front of ``ranked_df``.

    Reads the notebook-level ``missed_with_matches`` dict (block -> list of student
    ids). Students listed for the block before ``current_block`` that also appear in
    ``ranked_df`` are placed first, in the order they appear in that list; all other
    rows follow in their existing order.

    Parameters
    ----------
    ranked_df : pandas.DataFrame
        Candidate rows with an ``id`` column. Expected to carry a default 0..n-1
        index, since the remaining rows are selected by ``range(len(ranked_df))``.
    current_block : str
        'B1', 'B2', 'B3' or 'B4'.

    Returns
    -------
    pandas.DataFrame
        ``ranked_df`` itself for 'B1' (there is no previous block); otherwise a
        reordered copy with a fresh 0..n-1 index.
    """
    # looking at previous block
    prev_block = {
        'B2':'B1',
        'B3':'B2',
        'B4':'B3',
    }
    # unnecessary for first block assignments
    if current_block == 'B1':
        return ranked_df
    block = prev_block[current_block]

    # index label of the first row for each student id, built in one pass
    first_index = {}
    for label, student_id in zip(ranked_df.index, ranked_df.id):
        first_index.setdefault(student_id, label)

    # locations of priority students that are present in this slice
    target_indices = [
        first_index[student_id]
        for student_id in missed_with_matches[block]
        if student_id in first_index
    ]

    # priority rows first, then every remaining row in its current order
    prioritized = set(target_indices)
    new_index = target_indices + [i for i in range(len(ranked_df)) if i not in prioritized]

    return ranked_df.reindex(labels=new_index, axis='index').reset_index(drop=True)
     
    
    

In [98]:
# Goal is to fill 1 pos at a time? or 1 block at a time
# assumes: pos_rosters, capacity_vector, school
# inputs: block number
# modifies: Block roster lists "B#_roster"
 
def fill_pos_block(block='B1'):
    """Seat students in each POS for one block, working through their ranked choices.

    POSs are processed in ``pos_rosters`` key order. For each POS, students' 'First'
    choices are considered, then 'Second', and so on through 'Sixth', until the POS
    reaches its seat cap or all six rankings have been examined. Within a ranking the
    candidates (rows whose choice is not 0) are visited in random order, except that
    ``check_for_missed`` moves students missed in the previous block to the front.

    A student is seated in the POS when all of the following hold:
      (i)   the POS name occurs in the student's choice for the current ranking,
      (ii)  the student is not yet seated in this block,
      (iii) the student has not been seated in this POS in any block.

    Relies on the notebook-level names ``school_pos_df``, ``pos_rosters``,
    ``capacity_vector``, ``school`` and the roster list ``"<block>_roster"``.
    Mutates ``pos_rosters`` and that roster list in place, and prints progress.

    Parameters
    ----------
    block : str
        Block key, e.g. 'B1'; also selects the roster list ``B1_roster``.

    Returns
    -------
    tuple
        ``(remaining_roster, open_seats_vector)``: the block's roster list with only
        unseated students left (the same list object as the global), and the open
        seats per POS in ``pos_rosters`` key order.

    Raises
    ------
    NameError
        If no ``"<block>_roster"`` variable exists.
    """
    rankings = ['First', 'Second', 'Third', 'Fourth', 'Fifth', 'Sixth']

    # Look the roster up by name rather than eval()-ing a constructed expression.
    roster_name = f'{block}_roster'
    try:
        remaining_roster = globals()[roster_name]
    except KeyError:
        raise NameError(f"name '{roster_name}' is not defined") from None

    pos_list = list(pos_rosters.keys())
    open_seats_vector = []

    for pos_index, pos_key in enumerate(pos_list):
        cap = capacity_vector[pos_index]
        if cap <= 0:
            # nothing can be seated; record no open seats instead of a stale value
            open_seats_vector.append(0)
            continue

        n = 0  # students seated in this POS during this block
        for rank in rankings:
            # randomized slice of students with a match at this ranking,
            # with previously missed students moved to the front
            ranked = school_pos_df.loc[school_pos_df[rank] != 0].sample(frac=1).reset_index(drop=True)
            ranked = check_for_missed(ranked_df=ranked, current_block=block)

            for student, choice in zip(ranked['id'], ranked[rank]):
                # non-string choices (e.g. NaN) cannot name a POS, so skip them
                if not isinstance(choice, str):
                    continue
                if (pos_key in choice) and (student in remaining_roster) \
                        and (student not in pos_rosters[pos_key]['All']):
                    # assign to master and to the block-specific roster
                    pos_rosters[pos_key]['All'].append(student)
                    pos_rosters[pos_key][block].append(student)
                    remaining_roster.remove(student)
                    n += 1
                    if n == cap:
                        break

            if n == cap:
                print(f'{school}, {pos_key} {block} filled.')
                break
            print(f'{school}, {n} of {cap} students assigned to {pos_key}, {rank} choice demand met')
        else:
            # every ranking was examined without reaching the cap
            print(f'{school}, {n} students assigned to {pos_key}, All choices examined.')

        open_seats_vector.append(cap - n)

    # checking status
    roster_status(block=block)

    return remaining_roster, open_seats_vector
        
        

In [99]:
# Seed both random sources so the assignment is reproducible between runs:
# fill_pos_block shuffles with DataFrame.sample (NumPy's global RNG when no
# random_state is given) and backfill_POS shuffles with random.sample.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Per block: students left unseated by fill_pos_block, used by check_for_missed to
# prioritise them in the following block.
missed_with_matches = {
    'B1':[],
    'B2':[],
    'B3':[],
    'B4':[],
}
for block in ['B1', 'B2','B3','B4']:
    # run assignment for POS's; `unassigned` and `OS_vector` are read by backfill_POS
    unassigned, OS_vector = fill_pos_block(block=block)
    # track students who were missed for next block priority; ids containing 'Missing'
    # or 'Empty' are left out (presumably placeholder ids -- confirm against the rosters)
    missed_with_matches[block] = [
        student for student in unassigned
        if not ('Missing' in student or 'Empty' in student)
    ]
    # backfill remaining open seats with the students still unassigned in this block
    backfill_POS(block=block)
        
    


Oakland Middle School, Sport & Human Performance B1 filled.
Oakland Middle School, Veterinary & Animal Science B1 filled.
Oakland Middle School, 0 of 35 students assigned to Horticulture Sciences, First choice demand met
Oakland Middle School, 0 of 35 students assigned to Horticulture Sciences, Second choice demand met
Oakland Middle School, 16 of 35 students assigned to Horticulture Sciences, Third choice demand met
Oakland Middle School, Horticulture Sciences B1 filled.
Oakland Middle School, 2 of 35 students assigned to Audio Visual Production, First choice demand met
Oakland Middle School, 3 of 35 students assigned to Audio Visual Production, Second choice demand met
Oakland Middle School, 9 of 35 students assigned to Audio Visual Production, Third choice demand met
Oakland Middle School, 28 of 35 students assigned to Audio Visual Production, Fourth choice demand met
Oakland Middle School, Audio Visual Production B1 filled.
Oakland Middle School, 0 of 35 students assigned to Digita

In [100]:
# investigating who gets missed throughout the assignment rounds
for block in ['B1', 'B2', 'B3', 'B4']:
    print(block, len(missed_with_matches[block]))

# pool everyone missed before the final block, then compare total vs. distinct students
l = [
    student
    for earlier_block in ['B1', 'B2', 'B3']
    for student in missed_with_matches[earlier_block]
]

print(len(l), len(set(l)))

B1 4
B2 9
B3 18
B4 27
31 24


In [106]:
# NOTE: need to initialize a seed for control
# Are students that have some matches being missed consecutively?
# w/o check for missing (2, 6, 9)
missed_sets = {blk: set(ids) for blk, ids in missed_with_matches.items()}

# students missed in two consecutive blocks
int1 = missed_sets['B1'] & missed_sets['B2']
int2 = missed_sets['B2'] & missed_sets['B3']
int3 = missed_sets['B3'] & missed_sets['B4']
# students missed in every block
int4 = missed_sets['B1'] & missed_sets['B2'] & missed_sets['B3'] & missed_sets['B4']

print(len(int1), len(int2), len(int3), len(int4))

1 6 9 1


In [108]:
# look up a single student's matches by id
school_pos_df[school_pos_df['id'] == '1490408']

Unnamed: 0,id,name,email,First,Second,Third,Fourth,Fifth,Sixth,Enough Choices
262,1490408,JACKSON MITYOK,jmityok000@student.rcschools.net,0,0,0,0,0,0,0


In [88]:
# head count per POS for the final block
roster_status(block='B4')

B4 Status
Sport & Human Performance 40
Veterinary & Animal Science 35
Horticulture Sciences 35
Audio Visual Production 30
Digital Arts & Design 25
Criminal Justice & Correction Services 28
Cybersecurity 25
Coding 31
Human Services 25
Marketing Management 25
Banking & Finance 25
STEM Engineering 24
Business Management 24
Culinary Arts 24
Leadership in Government 24
420/420



In [84]:
# per POS: total placements ('All') followed by the B1..B4 roster sizes, then the POS name
for POS, rosters in pos_rosters.items():
    sizes = [len(rosters[key]) for key in ('All', 'B1', 'B2', 'B3', 'B4')]
    print(*sizes, POS)


190 50 50 50 40 Sport & Human Performance
140 35 35 35 35 Veterinary & Animal Science
140 35 35 35 35 Horticulture Sciences
135 35 35 35 30 Audio Visual Production
119 35 35 24 25 Digital Arts & Design
118 35 31 24 28 Criminal Justice & Correction Services
119 35 34 25 25 Cybersecurity
96 20 21 24 31 Coding
91 20 21 25 25 Human Services
90 20 21 24 25 Marketing Management
90 20 21 24 25 Banking & Finance
89 20 21 24 24 STEM Engineering
88 20 20 24 24 Business Management
88 20 20 24 24 Culinary Arts
87 20 20 23 24 Leadership in Government
