In [1]:
import pandas as pd
import numpy as np

In [None]:
"""
Goal: 
- Create a rudimentary matching algorithm 
- Note the process and decisions made here: 
Decisions: 
- Scope of Rough Draft: work for one school, then focus on generalizing functions/script
- Want to track the number of clusters that were dropped due to no matching POS per school
"""

In [None]:
# (1) Need to read in by_school match csv and tally the demand of YS clusters per school
# (2) Need to convert YS clusters into POSs per school
# (3) Need to determine capacity vector for school

In [2]:
# Middle schools covered by this notebook. Each name is used below to pick the
# planning-workbook sheet and to build the path of the school's YS criteria CSV.
schools = [
    'Oakland Middle School',
    'Siegel Middle School',
    'Whitworth-Buchanan Middle School',
    'Christiana Middle School',
    'Smyrna Middle School',
    'Stewarts Creek Middle School',
    'Rockvale Middle School',
    'Rocky Fork Middle School',
    'Blackman Middle School',
    'Thurman Francis Arts Academy',
    'Rock Springs Middle School',
    'LaVergne Middle School',
]

In [3]:
# Compare each school's seat capacity (from its planning sheet) against the
# number of rows in its YS criteria roster.
# One list per report column; every pass of the loop appends one entry to each.
capacity_report = {
    'School': [],
    'Status': [],
    'Assigned Capacity': [],
    '8th Graders': [],
    'Number of Large Rooms': [],
    'Number of Small Rooms': [],
}

for school in schools:
    # The Whitworth-Buchanan sheet is stored without the trailing "l"
    # (presumably a sheet-name length limit -- confirm); every other sheet
    # is named exactly like the school.
    sheet = (
        'Whitworth-Buchanan Middle Schoo'
        if school == 'Whitworth-Buchanan Middle School'
        else school
    )
    plan_df = pd.read_excel('planning.xlsx', sheet_name=sheet)

    # Rooms named 'Library' or 'Auditorium' count as large (50 seats);
    # every other room counts as small (35 seats).
    rooms = list(plan_df['MS Room #'])
    lg_rooms = sum(1 for room in rooms if room in ['Library', 'Auditorium'])
    sm_rooms = len(rooms) - lg_rooms
    capacity = 50 * lg_rooms + 35 * sm_rooms

    # Volume = number of rows in the school's YS match roster.
    path = f'YS_Criteria_by_School/{school} YSCriteria.csv'
    ys_df = pd.read_csv(path)
    volume = len(ys_df)

    # Append this school's row to the report, one value per column.
    status = 'Ready' if capacity >= volume else 'Insufficient Space'
    for column, value in (
        ('School', school),
        ('Status', status),
        ('Assigned Capacity', capacity),
        ('8th Graders', volume),
        ('Number of Large Rooms', lg_rooms),
        ('Number of Small Rooms', sm_rooms),
    ):
        capacity_report[column].append(value)


cap_df = pd.DataFrame(capacity_report)

In [5]:
# Replace the text status with a boolean flag: True when the assigned capacity
# covers the school's 8th-grade roster. Uses >= so that a school whose capacity
# exactly equals its enrolment still counts as having enough space, matching
# the 'Ready' rule used when the report was built.
cap_df['Status'] = cap_df['Assigned Capacity'] >= cap_df['8th Graders']
cap_df

Unnamed: 0,School,Status,Assigned Capacity,8th Graders,Number of Large Rooms,Number of Small Rooms
0,Oakland Middle School,True,680,420,1,18
1,Siegel Middle School,True,555,430,2,13
2,Whitworth-Buchanan Middle School,True,625,322,2,15
3,Christiana Middle School,True,595,376,0,17
4,Smyrna Middle School,True,540,321,1,14
5,Stewarts Creek Middle School,True,555,386,2,13
6,Rockvale Middle School,True,575,537,1,15
7,Rocky Fork Middle School,True,540,276,1,14
8,Blackman Middle School,False,525,617,0,15
9,Thurman Francis Arts Academy,True,525,86,0,15


In [6]:
def get_POS_from_clusters(school):
    """Translate a school's YouScience cluster matches into that school's POS names.

    Reads ``YS_Criteria_by_School/{school} YSCriteria.csv`` and the
    ``'YouScience Clusters'`` and ``{school}`` columns of
    ``../direct_join_prepared.xlsx``, then replaces every cluster name found in
    the roster with the school's corresponding POS value.

    Parameters
    ----------
    school : str
        School name; used both in the CSV file name and as the column name in
        the direct-join workbook.

    Returns
    -------
    pandas.DataFrame
        The roster with cluster names replaced. Clusters with no POS for this
        school (missing cell in the workbook) become ``0``, as does the literal
        string ``'0'``. The ``'Unnamed: 0'`` column is dropped when present.

    Raises
    ------
    KeyError
        If the workbook lacks the ``'YouScience Clusters'`` or ``{school}`` column.
    """
    # read in YouScience Matching Df
    ys_path = f'YS_Criteria_by_School/{school} YSCriteria.csv'
    ys_match_df = pd.read_csv(ys_path)

    # Read in appropriate columns from direct_join_prepared.xlsx
    dj_path = '../direct_join_prepared.xlsx'
    djp = pd.read_excel(dj_path)
    djp = djp[['YouScience Clusters', school]]

    # key = YS cluster, value = the school's corresponding POS.
    # pd.isna detects a missing cell whatever its float flavour, instead of
    # treating every float-typed value as "missing".
    to_replace = {
        cluster: (0 if pd.isna(pos) else pos)
        for cluster, pos in zip(djp['YouScience Clusters'], djp[school])
    }

    # at some point '0' is introduced somewhere. I suspect from the added positions in updated rosters
    to_replace['0'] = 0

    # errors='ignore' keeps this working for rosters saved without the index column.
    return ys_match_df.replace(to_replace=to_replace).drop(
        columns='Unnamed: 0', errors='ignore'
    )


In [11]:
# need to be careful not to read a grouping of POS's as a single POS
# groupings will be ', ' separated
def get_POS_demand(school_pos_df, school_name=None):
    """Count how many students list each POS among their six ranked choices.

    Each student contributes at most one count per distinct value across the
    'First' .. 'Sixth' columns. Coupled POS strings are counted as a single
    key here; they are not split.

    As a side effect, prints how many large rooms the school registered in
    ``cap_df`` and how many POS exceed the demand threshold.

    Parameters
    ----------
    school_pos_df : pandas.DataFrame
        Roster with columns 'First', 'Second', 'Third', 'Fourth', 'Fifth', 'Sixth'.
    school_name : str, optional
        School to look up in ``cap_df``. When omitted, the notebook-level
        variable ``school`` is used, which is how existing callers work.

    Returns
    -------
    collections.Counter
        Mapping of POS value (including ``0`` for empty matches) to the number
        of students who listed it.
    """
    from collections import Counter

    if school_name is None:
        # Backward-compatible fallback: the loop variable of the calling cell.
        school_name = school

    ranks = ['First', 'Second', 'Third', 'Fourth', 'Fifth', 'Sixth']
    demand = Counter()
    for choices in school_pos_df[ranks].itertuples(index=False, name=None):
        # dict.fromkeys de-duplicates while keeping first-seen order, so a POS
        # listed under several ranks by one student is counted once.
        demand.update(list(dict.fromkeys(choices)))

    # checking for problems, assumes cap_df exists
    school_row = cap_df.loc[cap_df.School == school_name]
    lg_rooms = school_row['Number of Large Rooms'].values[0]
    print(f'{school_name} registered {lg_rooms} large classroom(s) available.')

    need_lg = 0
    for pos, count in demand.most_common():
        # 0 marks an empty / unmatched choice, not a real POS
        if pos == 0:
            continue
        # The most-demanded POS are given the large rooms first.
        # Limits are presumably seats x 4 blocks (50*4 and 35*4) -- confirm.
        if lg_rooms > 0:
            limit = 200
            lg_rooms -= 1
        else:
            limit = 140
        if count > limit:
            need_lg += 1
    print(f'{school_name} needs {need_lg} additional large rooms to meet POS demand.')

    return demand

In [31]:
# Demand per school: school name -> list of (POS, count) pairs, most demanded first.
school_demand = {}

for school in schools:
    # Read in the school's YS matches and convert them to POS values
    # (clusters with no POS for this school come back as 0).
    school_pos_df = get_POS_from_clusters(school)
    # Want to calculate POS demand to assess any issues with allocation of rooms
    school_demand[school] = get_POS_demand(school_pos_df).most_common()
    # Need to execute matching here -OR- export pos_df into usable format for later
    # Rough draft: stop after the first school. `school` and `school_pos_df`
    # keep that school's values and are read by later cells.
    break

Oakland Middle School registered 1 large classroom(s) available.
Oakland Middle School needs 2 additional large rooms to meet POS demand.


In [42]:
# Sanity check: the row count should equal the number of distinct ids
# (expected (420, 420) for the school loaded above).
len(school_pos_df), len(school_pos_df.id.unique())

(420, 420)

# Match Philosophy

I think it makes the most sense to iterate through the most-demanded POS choices first. The most commonly selected POS is not necessarily the most common first choice, but it does provide a reasonable starting point for iteration: there is clear demand for the POS in question, and we can prioritize the students who picked it as their first or second choice. We must start somewhere.

## Handling Coupled POS

The second-largest problem I see is that some YS clusters are mapped to a group of POS options rather than 1-1. For instance, some students might have matched with the 'POS' <i>"Sport & Human Performance, Veterinary & Animal Science"</i>. Do we automatically put such a student into both POS, or do we sort the batch of students after the fact and assign some to one and the rest to the other?

### Starting Place

For now, I will treat Coupled POS groups as multiple POSs. I will split them into n equally-demanded POS categories. Instead of 1 POS of "Sport & Human Performance, Veterinary & Animal Science", there will be 2 POS with the same demand, "<strong>Sport & Human Performance</strong>" and "<strong>Veterinary & Animal Science</strong>". 

## Oakland Middle School as Test Case

In [55]:
# choice to get started is to split Coupled POS matches into separate, equally-demanded POS matches
def uncouple_pos_matches(demand_counter_object):
    """Split coupled POS entries into separate POS entries with the same demand.

    Parameters
    ----------
    demand_counter_object : iterable of (pos, demand) pairs
        For example the output of ``Counter.most_common()``. ``pos`` is a POS
        name, a comma-separated group of POS names, or a non-string
        placeholder (``0`` / NaN) for an empty match.

    Returns
    -------
    list of [str, demand] lists
        Non-empty (NE) matches only, in input order. A coupled entry such as
        ``("A, B", 3)`` yields ``["A", 3]`` and ``["B", 3]``. Lists rather than
        tuples are returned so the demand value can be decremented later.
    """
    # NE meaning Non-Empty matches, only
    NE_pos_list = []
    for pos in demand_counter_object:
        pos_name, demand = pos[0], pos[1]

        # 0 (and any other non-string, e.g. NaN) comes from missing / empty matches
        if not isinstance(pos_name, str):
            continue

        if ',' in pos_name:
            # Split on the bare comma and strip, so "A,B" and "A ,  B" are
            # uncoupled the same way as "A, B"; drop empty fragments.
            for match in pos_name.split(','):
                match = match.strip()
                if match:
                    NE_pos_list.append([match, demand])
        else:
            # Not coupled: pass through as a list rather than a tuple
            NE_pos_list.append(list(pos))

    return NE_pos_list

In [126]:
# Per-block rosters of students still waiting for an assignment.
# Each block starts from its own independent copy of the full id list.
B1_roster = school_pos_df.id.to_list()
B2_roster = school_pos_df.id.to_list()
B3_roster = school_pos_df.id.to_list()
B4_roster = school_pos_df.id.to_list()

# prepare pos roster object, each pos will have an 'All' cat for checking for previously assigned students
# and individual blocks, POS are in desc order of demand
pos_rosters = {}
for pos in uncouple_pos_matches(school_demand['Oakland Middle School']):
    # pos is a 2-element list [pos_key, demand value]
    pos_rosters[pos[0]] = {
        'All': [],
        'B1': [],
        'B2': [],
        'B3': [],
        'B4': [],
    }

# to iterate through, need to get number of lg_rooms to know cap size per POS...
num_lg_rooms = cap_df.loc[cap_df.School == 'Oakland Middle School']['Number of Large Rooms'].values[0]
# 1 for oakland
# The most-demanded POS get the large (50-seat) rooms, all others a 35-seat room.
lg_vector, sm_vector = [50] * num_lg_rooms, [35] * (len(pos_rosters) - num_lg_rooms)
capacity_vector = lg_vector + sm_vector
# One zeroed slot per POS; `[] * n` is always an empty list, so the
# placeholder has to contain a value to get the intended length.
comparison_vector = [0] * len(capacity_vector)


In [127]:
# Goal is to fill 1 pos at a time? or 1 block at a time
# assumes: pos_rosters, capacity_vector, school_pos_df, school
# inputs: block number
# modifies: Block roster lists "B#_roster"

def fill_pos_block(block='B1'):
    """Assign students to each POS for one block, most-demanded POS first.

    For every POS in ``pos_rosters`` (in insertion order) the roster is scanned
    rank by rank ('First' .. 'Sixth'); a student who listed the POS at the
    current rank and is still unassigned in this block is added, until the POS
    reaches its seat cap from ``capacity_vector`` or all six ranks have been
    examined. Progress is printed as it goes.

    Relies on the notebook-level names ``pos_rosters``, ``capacity_vector``,
    ``school_pos_df``, ``school`` and the list named ``{block}_roster``.

    Parameters
    ----------
    block : str, default 'B1'
        Block key ('B1' .. 'B4'); selects both the ``{block}_roster`` list and
        the per-POS list in ``pos_rosters`` that is filled.

    Returns
    -------
    list
        The block's roster list (the same object, mutated in place), now
        holding only the ids of students left unassigned in this block.

    Raises
    ------
    NameError
        If no notebook-level list named ``{block}_roster`` exists.
    """
    # Look the roster up by name instead of eval()-ing a string built from the
    # argument; NameError is kept as the failure mode for an unknown block.
    roster_name = f'{block}_roster'
    if roster_name not in globals():
        raise NameError(f"name '{roster_name}' is not defined")
    remaining_roster = globals()[roster_name]

    rankings = ['First', 'Second', 'Third', 'Fourth', 'Fifth', 'Sixth']
    pos_list = list(pos_rosters.keys())

    # iterating over POS's in block until all students assigned pos during block
    for pos_index, pos_key in enumerate(pos_list):
        # start from first choices
        r = 0
        rank = rankings[r]

        # slice of students with a non-empty match at this rank, plus row cursor
        ranked = school_pos_df.loc[school_pos_df[rank] != 0].reset_index(drop=True)
        i = 0

        # seats filled so far / seats available for this POS
        n = 0
        cap = capacity_vector[pos_index]

        # filling
        while n < cap:
            try:
                row = ranked.iloc[i]
                student, choice = row.id, row[rank]
            except IndexError:
                # ran off the end of the current rank's slice
                print(f'{school}, {n} of {cap} students assigned to {pos_key}, {rank} choice demand met')
                # move to next choice ranking
                r += 1
                # check if all rankings have been examined
                if r > 5:
                    print(f'{school}, {n} students assigned to {pos_key}, All choices examined.')
                    # TODO: record the cap - n open seats for this POS/block
                    # once the backfill policy is decided.
                    break
                # else update the df slice for consideration
                rank = rankings[r]
                ranked = school_pos_df.loc[school_pos_df[rank] != 0].reset_index(drop=True)
                # reset index counter
                i = 0
                continue

            # A choice may be a ', '-separated group of POS. Compare against the
            # individual names so a POS whose name is contained in another POS
            # name is not matched by accident; non-string choices never match.
            if isinstance(choice, str):
                choice_parts = [part.strip() for part in choice.split(',')]
            else:
                choice_parts = []

            # criteria for assigning student to pos
            if (pos_key in choice_parts) and (student in remaining_roster):
                # assign to master
                pos_rosters[pos_key]['All'].append(student)
                # assign to block specific
                pos_rosters[pos_key][block].append(student)
                n += 1
                remaining_roster.remove(student)
            i += 1
            if n == cap:
                print(f'{school}, {pos_key} {block} filled.')
    return remaining_roster
        
        

In [128]:
# Fill block B1 (the default) and keep the ids of students left without a seat.
unasigned = fill_pos_block()

Oakland Middle School, Sport & Human Performance B1 filled.
Oakland Middle School, Veterinary & Animal Science B1 filled.
Oakland Middle School, 0 of 35 students assigned to Horticulture Sciences, First choice demand met
Oakland Middle School, 0 of 35 students assigned to Horticulture Sciences, Second choice demand met
Oakland Middle School, 19 of 35 students assigned to Horticulture Sciences, Third choice demand met
Oakland Middle School, Horticulture Sciences B1 filled.
Oakland Middle School, 2 of 35 students assigned to Audio Visual Production, First choice demand met
Oakland Middle School, 3 of 35 students assigned to Audio Visual Production, Second choice demand met
Oakland Middle School, 9 of 35 students assigned to Audio Visual Production, Third choice demand met
Oakland Middle School, 28 of 35 students assigned to Audio Visual Production, Fourth choice demand met
Oakland Middle School, Audio Visual Production B1 filled.
Oakland Middle School, 0 of 35 students assigned to Digita

In [123]:
# Display the per-POS seat caps, in the same order as the keys of pos_rosters.
capacity_vector

[50, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35]

In [129]:
# Print the B1 head-count of every POS, then the total number of students
# placed in block B1.
x = 0
for key, blocks in pos_rosters.items():
    b1_count = len(blocks['B1'])
    print(key, b1_count)
    x += b1_count
print(x)

Sport & Human Performance 50
Veterinary & Animal Science 35
Horticulture Sciences 35
Audio Visual Production 35
Digital Arts & Design 35
Criminal Justice & Correction Services 35
Cybersecurity 35
Coding 8
Human Services 12
Marketing Management 22
Banking & Finance 6
STEM Engineering 4
Business Management 1
Culinary Arts 0
Leadership in Government 2
315


In [91]:
# Prototype of the fill loop for a single POS (the most-demanded one).
# NOTE: does not handle cases where there is not enough demand to be found for POS,
# it would just iterate through all choices and return the number assigned

rankings = ['First', 'Second', 'Third', 'Fourth', 'Fifth', 'Sixth']
r = 0
rank = rankings[r]
pos = list(pos_rosters.keys())[0]
roster = []  # will actually be a specific entry in pos_rosters: pos_rosters[pos_key]['All'] & pos_rosters[pos_key][block]
m_roster = list(school_pos_df.id)
# slice: students with a non-empty match at the current rank
ranked = school_pos_df.loc[school_pos_df[rank] != 0].reset_index(drop=True)
school = 'Oakland Middle School'
n = 0
# NOTE(review): the scan starts at row 200 of the first-choice slice, so earlier
# rows are never examined for their first choice -- confirm this is intentional.
i = 200
while n < 50:  # 50 or 35 will come from capacity_vector
    # need to create a function that resets choices? Maybe picks also?
    try:
        student, choice = ranked.iloc[i].id, ranked.iloc[i][rank]
    except IndexError:
        print(f'{school}: {n} students assigned to POS: {pos}, {rank} choice demand met')
        # FUNCTION: move on to next choice ranking
        r += 1
        # Stop once every ranking has been examined; checking against
        # len(rankings) keeps rankings[r] below from running out of range.
        if r >= len(rankings):  # FUNCTION: need to create a policy in the event that choices are spent and still have capacity
            print(f'{n} students assigned to POS: {pos}, All choices examined.')
            break
        # FUNCTION: update which ranking slice we are looking at
        rank = rankings[r]
        ranked = school_pos_df.loc[school_pos_df[rank] != 0].reset_index(drop=True)
        # FUNCTION: make sure i goes back to 0 for the new slice
        i = 0
        # Restart the loop so row 0 of the new slice is read, instead of
        # re-testing the previous student and then skipping row 0.
        continue
    if (pos in choice) and (student in m_roster):
        roster.append(student)
        n += 1
        m_roster.remove(student)
    i += 1
    if n == 50:
        print(f'{school}: {pos} filled.')

# appending number of seats open in session
open_seats = 50 - n  # 50 or 35 will come from capacity_vector
roster.append(open_seats)

print(len(roster), len(m_roster), len(roster) + len(m_roster), "Open seats:", open_seats)


13 students assigned to POS: Sport & Human Performance, First choice demand met
32 students assigned to POS: Sport & Human Performance, Second choice demand met
46 students assigned to POS: Sport & Human Performance, Third choice demand met
46 students assigned to POS: Sport & Human Performance, Fourth choice demand met
51 370 421 Open seats: 0


## Unfilled POS matches...

We need clauses that address what happens when a POS doesn't fill (i.e., the room holds 35 but only 22 students matched for that session), and when / whether to autofill the remaining seats with unassigned students. <br>

Should we iterate through demand and then backfill leftover students? After the above iteration, we could append to each roster list the number of seats still available in the session. 
Ex. "32 students assigned to POS: Sport & Human Performance, All choices examined." That roster would then be appended with a 3 as a placeholder. When we are autofilling students, we can iterate back through the block's rosters and read the last entry of each to see whether it is filled, then fill the open seats in the most-demanded POSs until no student is left unassigned for that block. 

### POS Roster-Block Management

If we do it this way, we need to remember that the length of each of these lists will be 1 more than the number of matches present. Moreover, the last entry must be removed before adding unassigned students and then re-appended with the new number of open seats. 

In [None]:
# Need clause for under-demanded POS (i.e., say room for 35 students, only 15 matched)
# Backfill: walk the POS list once and seat still-unassigned students wherever
# a roster's trailing open-seat marker is greater than zero.

# condition for while unassigned exist: if len(m_roster) > 0, will be B#_roster later
j = 0
pos_keys = list(pos_rosters.keys())
# Also stop once every POS has been visited, so j cannot run past the list.
while len(m_roster) > 0 and j < len(pos_keys):
    # assumed going into this that a block has been selected
    # block = ...
    # iterate through pos_roster[pos_key][block] lists reading open_seats
    pos_key = pos_keys[j]
    # read in open_seats for pos (the last entry of the roster is the marker)
    open_seats = roster.pop(-1)  # pos_rosters[pos_key][block][-1]
    if open_seats > 0:
        print(f'{pos_key} open: {open_seats}, assigning matches...')

        # fill unassigned students clause: take students from the front of the
        # unassigned list, never more than are available
        n_fill = min(open_seats, len(m_roster))
        for _ in range(n_fill):
            # add student to pos_rosters[pos_key][block]
            roster.append(m_roster.pop(0))
        open_seats -= n_fill

    # put the (possibly updated) open-seat marker back at the end of the roster
    roster.append(open_seats)
    j += 1  # move on to next pos_key
            