In [1]:
""" 
NOTE: Under revision to incorporate Pathway roster export
"""

import pandas as pd
import numpy as np
import random
import os

# implementing random seed for control
sd = 42
np.random.seed(sd)

In [2]:
# school names for consistency 
schools = ['Oakland Middle School',
    'Siegel Middle School',
    'Whitworth-Buchanan Middle School',
    'Christiana Middle School',
    'Smyrna Middle School',
    'Stewarts Creek Middle School',
    'Rockvale Middle School',
    'Rocky Fork Middle School',
    'Blackman Middle School',
    'Thurman Francis Arts Academy',
    'Rock Springs Middle School',
    'LaVergne Middle School'
]

# capacity report compiled info for all schools at once, see capacity_report.py

# preparing directories
for school in schools:
    path = f'../YouScienceData/Schedules/{school}'
    pathway_path = f'../YouScienceData/Schedules/{school}/Pathway_Rosters'
    if not os.path.exists(path):
        os.makedirs(path)
    if not os.path.exists(pathway_path):
        os.makedirs(pathway_path)

# Functions
Note that file_paths will need to be updated for change over to .py scripts

<ul>
    <li><strong>get_POS_from_clusters</strong> (<i>school</i>), Returns df with translated POS or 0's for missing, feeds from YS_Criteria_by_School prepared sets</li></br>
    <li><strong>get_POS_demand</strong> (<i>school_pos_df</i>), Needs converted POS df, Returns Counter dictionary object with POS demand count, prints <u># lg rooms and if more are needed based on demand</u></li></br>
    <li><strong>uncouple_pos_matches</strong> (<i>pos_demand_counter_object</i>), returns a desc list of POS by demand, splits grouped POSs into separate POS each with the same demand, is called by fnc initialize_block_rosters</li></br>
    <li><strong>initialize_block_rosters</strong> (<i>school</i>), returns pos_rosters empty dictionary structured for building block rosters, returns capacity vector which is used for determining sizes of pos rosters, calls fnc uncouple_pos_rosters</li></br>
    <li><strong>roster_status</strong> <i>block='B1'</i>, shows current block status, initially was designed to check for fill, is called by fnc backfill_POS(), prints <u>#'s in each POS per specified block</u></li></br>
    <li><strong>backfill_POS</strong> (<i>block='B1'</i>), returns length of unassigned for block (0 implies block is completely backfilled), uses global "<strong>unassigned</strong>", calls roster_status at end to show results</li></br>
    <li><strong>fill_pos_block</strong> (<i>block='B1'</i>), returns remaining roster of unmatched students and open seat vector of POS that still have empty seats, assumes global <strong>B1_roster, B2_roster, B3_roster, B4_roster</strong> are initialized with list of students, per block it fills POS's in order of demand by finding students with matches for that POS as First match, then Second and so on until the POS is filled by capacity or all students with matches have been sorted, prints <u>Current match level being examined and current capacity of POS, flags when POS is empty after demanded matches are checked or when POS is filled</u></li></br>
    <li><strong>check_for_missed</strong> (<i>ranked_df, current_block</i>), is called within fnc fill_POS_block(), returns ranked_df during matching process to prioritize students who were unmatched after previous block's sorting</li></br>
    <li><strong>append_schedule</strong> (<i>-none-</i>), assumed global <strong>school_pos_df, pos_rosters</strong>, appends 4 columns "B1,...,B4" to translated school_POS_df by using the pos_rosters after a school's sorting is complete</li></br>
    <li><strong>clean_to_export</strong> (<i>school_pos_df</i>), assumed global <strong>school_pos_df</strong>, tidies larger school_pos_df to specfic features to send to board. Created in case additional transformations are needed during development.</li></br>
    <li><strong>last_name</strong> (<i>name</i>), used to extract last name from school_pos_df in order to alphabetize cleaned_df to be teacher-friendly</li>
    <li><strong>export_POS_rosters</strong> (<i>pos_keys, cleaned_df</i>), default pos_keys assumes pos_rosters dict object, default cleaned_df assumes cleaned_df df object, creates a .xlsx file per pathway for a school that contains four blocks' rosters in near-perfect az order and index set to begin with 1 for ease of teacher access.</li>
</ul>

In [22]:
def get_POS_from_clusters(school):
    # read in YouScience Matching Df
    ys_path = f'YS_Criteria_by_School/{school} YSCriteria.csv'
    ys_match_df = pd.read_csv(ys_path)
    # Read in appropriate columns form direct_join_prepared.xlsx
    dj_path = f'../direct_join_prepared.xlsx'
    djp = pd.read_excel(dj_path)
    djp = djp[['YouScience Clusters',school]]

    # create replacement dictionary
    to_replace = {}
    for i in range(len(djp)):
        cluster = djp.iloc[i]
        # key = YS cluster from 'YouScience Clusters' column, value = school's coresponding POS from {school} column
        if type(cluster[school]) != float:
            to_replace[cluster['YouScience Clusters']] = cluster[school]
        else:
            to_replace[cluster['YouScience Clusters']] = 0
    
    # at some point '0' is introduced somewhere. I suspect from the added positions in updated rosters
    to_replace['0'] = 0

    return ys_match_df.replace(to_replace=to_replace).drop('Unnamed: 0', axis=1)

def get_POS_demand(school_pos_df):

    from collections import Counter
    C = Counter()
    for i in range(len(school_pos_df)):
        # read in student's POS
        student = school_pos_df.iloc[i]
        # compile list of POS
        pos_list = []
        for rank in ['First','Second','Third','Fourth','Fifth','Sixth']:
            if student[rank] not in pos_list:
                pos_list.append(student[rank])
        # update counter
        C.update(pos_list)
    
    # Updated 11/01 to read from capacity_report.csv
    cap_df = pd.read_csv('../YouScienceData/Reports/Capacity_Report.csv')
    # checking for problems, assumes cap_df exists
    lg_rooms = cap_df.loc[cap_df.School == school]['Number of Large Rooms'].values[0]
    sm_rooms = cap_df.loc[cap_df.School == school]['Number of Small Rooms'].values[0]
    print(f'{school} registered {lg_rooms} large classroom(s) available.')
    need_lg = 0
    for i in C.most_common():
        if i[0] == 0:
            continue
        if lg_rooms > 0:
            check = i[1] <= 200
            lg_rooms -= 1
        else:
            check = i[1] <= 140
        
        if check == False:
            need_lg += 1
    print(f'{school} needs {need_lg} additional large rooms to meet POS demand.')
        
    return C

def uncouple_pos_matches(demand_counter_object):
# NE meanning Non-Empty matches, only
    NE_pos_list = [] 
    for pos in demand_counter_object:
        # the 0 comes from missing / empty matches
        if pos[0] == 0:
            continue
        
        # Break up the Coupled POS matches into separate, equally-demanded POS matches
        if ',' in pos[0]:
            # temp list of elements
            l = pos[0].split(', ')
            for match in l:
                NE_pos_list.append([match, pos[1]])
        # If not coupled, pass in as a list rather than tuple for decrement process later? 
        else:
            NE_pos_list.append(list(pos))

    return NE_pos_list

def initialize_block_rosters(school):
    # initialize dictionary for POS rosters per school
    pos_rosters = {}
    
    for pos_couplet in uncouple_pos_matches(school_demand[school]):
        # pos_key will be a doubleton set [pos_key, demand value]
        pos_rosters[pos_couplet[0]] = {
            'All':[],
            'B1':[],
            'B2':[],
            'B3':[],
            'B4':[],
        }

    # preparing capacity vector
    cap_df = pd.read_csv('../YouScienceData/Reports/Capacity_Report.csv')
    num_lg_rooms = cap_df.loc[cap_df.School == school]['Number of Large Rooms'].values[0] 
    lg_vector, sm_vector = [50] * num_lg_rooms, [35] * (len(pos_rosters) - num_lg_rooms)
    capacity_vector = lg_vector + sm_vector

    return pos_rosters, capacity_vector

def roster_status(block='B1'):
    print(f'{block} Status')
    logf.write(f'{block}_Status\n')
    x = 0
    for key in list(pos_rosters.keys()):
        x += len(pos_rosters[key][block])
        print(key, len(pos_rosters[key][block]))
        logf.write(f'{block}_Status: {key}, {len(pos_rosters[key][block])}\n')
    print(f'{x}/{len(school_pos_df)}\n')  
    logf.write(f'{block}_Status: {x}/{len(school_pos_df)}\n')
    return

def backfill_POS(block='B1'):
    # initialize randomized remaining list
    r_unassigned = random.sample(unassigned, len(unassigned))

    while len(r_unassigned) > 0:
        # get index of most empty POS
        ind = OS_vector.index(max(OS_vector))
        pos_key = list(pos_rosters.keys())[ind]
        # adding student to POS roster
        pos_rosters[pos_key]['All'].append(r_unassigned[0])
        pos_rosters[pos_key][block].append(r_unassigned[0])
        # noting student added to that POS
        OS_vector[ind] -= 1
        # removing student from unassigned list
        r_unassigned.pop(0)

    # tracking progress
    roster_status(block=block)
    return len(r_unassigned)

def fill_pos_block(block='B1'):
    # initialize reference block roster
    remaining_roster = eval(f'{block}_roster')
    pos_list = list(pos_rosters.keys())
    # creating open_seats_vector
    open_seats_vector = []
    # iterating over POS's in block until all students assigned pos during block
    for pos_key in pos_list:
        # initialize choice to filter by
        rankings = ['First', 'Second', 'Third', 'Fourth', 'Fifth', 'Sixth']
        r = 0
        rank = rankings[r]

        # initialize randomized slice of pos_df and index counter, check for priority students
        ranked = school_pos_df.loc[school_pos_df[rank] != 0].sample(frac=1).reset_index(drop=True)
        ranked = check_for_missed(ranked_df=ranked,current_block=block)
        i = 0

        # initialize size of course
        n = 0
        cap = capacity_vector[pos_list.index(pos_key)]

        # filling
        while n < cap:
            try:
                student, choice = ranked.iloc[i].id, ranked.iloc[i][rank]
            except IndexError:
                #print(f'{school}, {n} of {cap} students assigned to {pos_key}, {rank} choice demand met')
                logf.write(f'fill_status: {school}, {n} of {cap} students assigned to {pos_key}, {rank} choice demand met\n')
                # move to next choice ranking
                r += 1
                # check if all rankings have been examined
                if r > 5:
                    #print(f'{school}, {n} students assigned to {pos_key}, All choices examined.')
                    logf.write(f'fill_status: {school}, {n} students assigned to {pos_key}, All choices examined.\n')
                    # check for remaining seats 
                    open_seats = cap - n
                    break
                # else update the df slice for consideration
                rank = rankings[r]
                ranked = school_pos_df.loc[school_pos_df[rank] != 0].sample(frac=1).reset_index(drop=True)
                ranked = check_for_missed(ranked_df=ranked,current_block=block)
                # reset index counter
                i = 0
                continue
            # NOTE: criteria for assigning student to pos: 
            #   ->(i) matched, 
            #   ->(ii) isn't already matched for current block, 
            #   ->(iii) isn't already matched for this pos prior
            if (pos_key in choice) and (student in remaining_roster) and (student not in pos_rosters[pos_key]['All']):
                # assign to master
                pos_rosters[pos_key]['All'].append(student)
                # assign to block specific
                pos_rosters[pos_key][block].append(student)
                n += 1
                remaining_roster.remove(student)
            i += 1
            if n == cap:
                #print(f'{school}, {pos_key} {block} filled.')
                logf.write(f'fill_status: {school}, {pos_key} {block} filled.\n')
                # hence no remaining open seats in POS
                open_seats = 0
        open_seats_vector.append(open_seats)
    # checkign status
    roster_status(block=block)

    return remaining_roster, open_seats_vector

def check_for_missed(ranked_df, current_block):
    # looking at previous block
    prev_block = {
        'B2':'B1',
        'B3':'B2',
        'B4':'B3',
    }
    # will be unneccessary for first block assignments
    if current_block == 'B1':
        return ranked_df
    else:
        block = prev_block[current_block]

    # current list of students with POS match   
    student_ids = list(ranked_df.id)

    # initializing lcoations of priority students with POS match
    target_indices = []

    # checking for priority students
    for id in missed_took_YS[school][block]:
        if id in student_ids:
            ind = ranked_df.loc[ranked_df.id == id].index.values[0]
            target_indices.append(ind)
    
    # preparing new_index to reindex ranked_df before continuing
    new_index = [x for x in target_indices]
    # adding back existing student_ids
    new_index += [i for i in range(len(ranked_df)) if i not in new_index]

    # reindex ranked_df
    return ranked_df.reindex(labels=new_index, axis='index').reset_index(drop=True)

# designed more with the idea of a class object in mind
def append_schedule():
    # create empty columns
    for block in ['B1','B2','B3','B4']:
        school_pos_df[block] = ''

    # iterate through POS offerings to fill in schedule B1-B4
    for pos_key in list(pos_rosters.keys()):
        # iterate through blocks
        for block in ['B1','B2','B3','B4']:
            # iterate through Block roster
            for id in pos_rosters[pos_key][block]:
                # find and write pos under student's corresponding slot
                # row indexer
                row = school_pos_df.loc[school_pos_df.id == id].index.values
                school_pos_df.loc[school_pos_df.index[row], block] = pos_key
                
    return

def last_name(name):
    return name.split(',')[0].split()[-1]

def clean_to_export(school_pos_df):
    features = [
        'id',
        'name',
        'email',
        'B1',
        'B2',
        'B3',
        'B4',
    ]
    
    # alphabetize students
    school_pos_df.sort_values(['Last Name'], inplace=True)

    # space here if we wish to transform further during iteration
    return school_pos_df[features]

pos_str_exceptions = set()

def export_POS_rosters(pos_keys, cleaned_df):

    for pos_key in pos_keys:
        # fixing unexpected '/' in pos_key str
        new_key = pos_key.replace('/', ', ')
            
        if new_key != pos_key:
            pos_str_exceptions.add(pos_key)
        
        # get block slices, index starting with 1 (teacher-friendly)
        b1_df = cleaned_df.loc[cleaned_df.B1 == pos_key]
        b1_df.index = np.arange(1, len(b1_df) + 1)
        b2_df = cleaned_df.loc[cleaned_df.B2 == pos_key]
        b2_df.index = np.arange(1, len(b2_df) + 1)
        b3_df = cleaned_df.loc[cleaned_df.B3 == pos_key]
        b3_df.index = np.arange(1, len(b3_df) + 1)
        b4_df = cleaned_df.loc[cleaned_df.B4 == pos_key]
        b4_df.index = np.arange(1, len(b4_df) + 1)

        # create .xlsx file for pathway
        with pd.ExcelWriter(f'../YouScienceData/Schedules/{school}/Pathway_Rosters/{new_key}.xlsx') as exwriter:
            b1_df.to_excel(exwriter, sheet_name='Block 1')
            b2_df.to_excel(exwriter, sheet_name='Block 2')
            b3_df.to_excel(exwriter, sheet_name='Block 3')
            b4_df.to_excel(exwriter, sheet_name='Block 4')


# Execution

In [26]:
log_name = input("Input file_name for log file")
with open(f'../Logs/{log_name}.txt','w') as logf:


    # dictionary for demand by school for later evaluation
    school_demand = {}
    # tracking students who were unassigned during each block but who took YS test
    missed_took_YS = {}

    for school in schools:
        # Read in school's YS matches and convert them to POS's, replace nan with 0s
        school_pos_df = get_POS_from_clusters(school)
        # initialize block rosters to contain entire school at start
        # will be reduced through process as students get assigned 
        B1_roster = school_pos_df.id.to_list()
        B2_roster = school_pos_df.id.to_list()
        B3_roster = school_pos_df.id.to_list()
        B4_roster = school_pos_df.id.to_list()
        # Want to calculate POS demand to assess any issues with allocation of rooms
        school_demand[school] = get_POS_demand(school_pos_df).most_common()
        # initialize block rosters and capacity vector for school
        pos_rosters, capacity_vector = initialize_block_rosters(school)
        # begin assignment & initialize tracker with dictionary per school
        missed_took_YS[school] = {
            'B1':[],
            'B2':[],
            'B3':[],
            'B4':[],
        }
        for block in ['B1', 'B2','B3','B4']:
            # fill forwards
            unassigned, OS_vector = fill_pos_block(block=block)
            # update tracker for those missed who took YS test
            missed_took_YS[school][block] = [x for x in unassigned if ('Missing' not in x) and ('Empty' not in x)]
            # fill backwards
            backfill_POS(block=block)
        
        append_schedule()
        # building last name feature to alphabetize cleaned_df
        for name in school_pos_df.name:
            school_pos_df.loc[school_pos_df.name == name, 'Last Name'] = last_name(name=name)
            
        cleaned_df = clean_to_export(school_pos_df)
        # export school-wide Pathway Schedule
        cleaned_df.to_csv(f'../YouScienceData/Schedules/{school}/{school}_all_students.csv')
        # export Pathway-specific rosters
        export_POS_rosters(pos_keys=list(pos_rosters.keys()), cleaned_df=cleaned_df)
        
logf.close()

Oakland Middle School registered 1 large classroom(s) available.
Oakland Middle School needs 3 additional large rooms to meet POS demand.
B1 Status
BioSTEM 50
Dietetics & Nutrition 35
Veterinary & Animal Science/Horticulture Science 35
Sport & Human Performance 35
Teaching as a Profession 35
Agricultural Engineering & Applied Technologiess 35
Audio Visual Production 35
Digital Arts & Design 13
Mechantronics 28
Human Services 10
Marketing Management 6
Banking & Finance 3
MEP Systems 1
Automotive Maintenance and Light Repair 0
Business Management 0
Residential & Commercial Construction 0
Culinary Arts 0
Leadership in Government 0
321/420

B1 Status
BioSTEM 50
Dietetics & Nutrition 35
Veterinary & Animal Science/Horticulture Science 35
Sport & Human Performance 35
Teaching as a Profession 35
Agricultural Engineering & Applied Technologiess 35
Audio Visual Production 35
Digital Arts & Design 14
Mechantronics 28
Human Services 14
Marketing Management 13
Banking & Finance 13
MEP Systems 13
A

OSError: Cannot save file into a non-existent directory: '../YouScienceData/Schedules/Oakland Middle School/Pathway_Rosters/Veterinary & Animal Science'

## For Exporting POS rosters

As is, per school, the nested dictionary object "pos_rosters[pos_key][All/block]" contains ids only. Each feature that makes sense to have on hand for a teacher leading a Pathway forum (based on my experience in the classroom) is already prepared in the cleaned_df.  <br/><br/>
For draft 1, these features pending review:</br>
<ul>
    <li><i>Indexed to begin with 1 for teacher counting, alphabetized</i><br/>*Strictly alphabetized by last name without considering first name in case of shared last, etc.</li>
    <li><strong>Block</strong></li>
    <li><strong>Student Name</strong> <br/>(<i>feature of school_pos_df</i>)</li>
    <li><strong>Student id</strong> <br/>(<i>key between pos_roster and school_pos_df</i>)</li>
</ul>

One block per sheet or file. It might be best to have an excel file per POS with 5 sheets, one for 'All', and four for the individual blocks. </br>

<strong><i>BONUS:</i></strong> Additionally, a single .txt file per POS with the Pathway's teacher name as a title. Just thinking minimizing the lift per school.

In [None]:
pos_keys = list(pos_rosters.keys())
for pos_key in pos_keys:
    # getting block 1 slice and reindexing to be intuitive to teachers
    b1_df = cleaned_df.loc[cleaned_df.B1 == pos_key]
    b1_df.index = np.arange(1, len(b1_df) + 1)
    b2_df = cleaned_df.loc[cleaned_df.B2 == pos_key]
    b2_df.index = np.arange(1, len(b2_df) + 1)
    b3_df = cleaned_df.loc[cleaned_df.B3 == pos_key]
    b3_df.index = np.arange(1, len(b3_df) + 1)
    b4_df = cleaned_df.loc[cleaned_df.B4 == pos_key]
    b4_df.index = np.arange(1, len(b4_df) + 1)
    
    with pd.ExcelWriter(f'../YouScienceData/Schedules/{school}/Pathway_Rosters/{pos_key}.xlsx') as exwriter:
        b1_df.to_excel(exwriter, sheet_name='Block 1')
        b2_df.to_excel(exwriter, sheet_name='Block 2')
        b3_df.to_excel(exwriter, sheet_name='Block 3')
        b4_df.to_excel(exwriter, sheet_name='Block 4')


In [29]:
list(pos_rosters.keys())[2].replace('/', ', ')

'Veterinary & Animal Science, Horticulture Science'

In [27]:
pos_str_exceptions

{'Veterinary & Animal Science/Horticulture Science'}