# Overview

The purpose is to compile CSV files per school that contain the students of that school with their RuCo-determined ranked pathways of study (POS). Rather than having 3 records per student, as in the raw YouScience (YS) files, each student will have a single record with their 6 ranked POS, or blank POS if their YouScience info was not found. <br>

### Current Features (12) of Criteria Table
<ul>
    <li><strong><i>"id"</i></strong>, 3 kinds of results,
    <ol>
        <li>the student's school id provided by YS file</li> 
        <li>'Missing #' if the student is on the school's skyward roster but does not have YS results</li>
        <li>'Empty #', which is a placeholder student for last-minute additions on the day of the event.</li>
    </ol>
    <li><i><s>"name"</s><strong>"First_Name"/"Last_Name"</strong></i>, Initially I had combined into one feature, but at several points later in the algorithm I split them back. Just reverted to two features.</li> 
    <li><strong><i>"Email"</i></strong>, remains the strongest method of matching student records between tables.</li>
    <li><strong><i>"School"</i></strong>, added in Beta in case check was needed to confirm school.</li>
    <li><strong><i>"First"-"Sixth"</i></strong>, spelled-out features that are the ranked YS clusters per RuCo criteria</li>
    <li><strong><i>"Enough_Choices"</i></strong>, initial flag for checking if a student had at least 4 unique YS clusters to determine if a perfect schedule is even possible for school.*</li>
</ul>

\* While it is not currently useful, it was added with future development in mind: creating metrics for measuring the efficacy of the algorithm as well as the capacity of the schools to meet the demands of their students. <strong>NOTE</strong>: As is, it does not measure either. 

### RuCo Ranking Methodology

Ranked Clusters by criteria (1 being <strong><i>most</i></strong> prioritized, 6 being <strong><i>least</i></strong> prioritized):
<ol>
    <li>1st aptitude_fit</li>
    <li>1st interest_fit</li>
    <li>2nd aptitude_fit</li>
    <li>3rd aptitude_fit</li>
    <li>2nd interest_fit</li>
    <li>3rd interest_fit</li>
</ol>


In [1]:
import pandas as pd
import numpy as np
import random
import datetime
import os

# Seed NumPy's global random generator so repeated runs are reproducible.
np.random.seed(42)

# Canonical school names. The exact spelling matters: each name is
# interpolated into the input and output CSV file names.
schools = [
    'Oakland Middle School',
    'Siegel Middle School',
    'Whitworth-Buchanan Middle School',
    'Christiana Middle School',
    'Smyrna Middle School',
    'Stewarts Creek Middle School',
    'Rockvale Middle School',
    'Rocky Fork Middle School',
    'Blackman Middle School',
    # 'Thurman Francis Arts Academy',  # currently excluded from the run
    'Rock Springs Middle School',
    'LaVergne Middle School',
]

# Number of extra "unassigned" placeholder records appended to every
# school's roster/schedule.
pad_n_unassigned_places = 15

In [2]:
# Ask for the event year; anything that does not parse as an integer
# falls back to 2022.
try:
    event_year = int(input('Enter 4-digit year of YouScience Event'))
except ValueError:
    event_year = 2022
print(f"Running for YS Career Fair {event_year}")

# Output folder that receives the per-school criteria files for this year.
ys_criteria_p = f'../YouScienceData/YS_Criteria_by_School/{event_year}/'

# Create the folder (and any missing parents). exist_ok=True makes this a
# no-op when the folder is already there, replacing the separate
# check-then-create step, which could fail if the folder appeared between
# the check and the creation.
os.makedirs(ys_criteria_p, exist_ok=True)


Running for YS Career Fair 2022


In [3]:
# reads in multiple records per student -> dict with single record per student
def get_ranked_choices(school, event_year=event_year):
    """Collapse a school's YouScience rows into one ranked record per student.

    Reads ``../YouScienceData/Updated_YouScience/{event_year}/{school}_YouScience.csv``
    and uses its ``id``, ``first_name``, ``last_name``, ``email``,
    ``fit_rank``, ``aptitude_fit`` and ``interest_fit`` columns.

    Args:
        school: School name; used in the file name and stored in the
            'School' column of every record.
        event_year: Year folder to read from. The default is the value the
            module-level ``event_year`` had when this function was defined.

    Returns:
        dict mapping each criteria column name ('id', 'School',
        'First_Name', 'Last_Name', 'Email', 'First' ... 'Sixth',
        'Enough_Choices') to a list with one entry per student, in order of
        first appearance in the file. 'Enough_Choices' is 1 when the six
        ranked clusters contain at least 4 distinct values, otherwise 0.

    Raises:
        IndexError: if a student has no row for one of fit_rank 1, 2 or 3.
    """
    # Updated YS csv file path
    p = f'../YouScienceData/Updated_YouScience/{event_year}/{school}_YouScience.csv'
    df = pd.read_csv(p)

    # RuCo ranking, most to least prioritized:
    # (criteria column, fit_rank to look up, source column to take)
    ranking = (
        ('First', 1, 'aptitude_fit'),
        ('Second', 1, 'interest_fit'),
        ('Third', 2, 'aptitude_fit'),
        ('Fourth', 3, 'aptitude_fit'),
        ('Fifth', 2, 'interest_fit'),
        ('Sixth', 3, 'interest_fit'),
    )

    # criteria table schematic (key order is the exported column order)
    criteria = {
        'id': [],
        'School': [],
        'First_Name': [],
        'Last_Name': [],
        'Email': [],
        'First': [],
        'Second': [],
        'Third': [],
        'Fourth': [],
        'Fifth': [],
        'Sixth': [],
        'Enough_Choices': [],
    }

    # fill table student records containing assigned YS clusters
    # (named student_id so the builtin id() is not shadowed)
    for student_id in df.id.unique():
        student_slice = df.loc[df.id == student_id]

        # identifying info is taken from the student's first row
        criteria['id'].append(student_id)
        criteria['School'].append(school)
        criteria['First_Name'].append(student_slice.first_name.values[0])
        criteria['Last_Name'].append(student_slice.last_name.values[0])
        criteria['Email'].append(student_slice.email.values[0])

        # ranked clusters, taken in RuCo priority order
        picks = []
        for column, rank, fit in ranking:
            pick = student_slice.loc[student_slice.fit_rank == rank, fit].values[0]
            criteria[column].append(pick)
            picks.append(pick)

        # Flagging if less than 4 distinct ranked clusters
        criteria['Enough_Choices'].append(1 if len(set(picks)) >= 4 else 0)

    return criteria

# appends students missing YS results, plus n blank placeholder slots, to the criteria dict obj returned by get_ranked_choices()
def add_students_missing_YS_results(criteria, school, event_year=event_year, n=pad_n_unassigned_places):
    """Extend ``criteria`` with students lacking YS results and empty slots.

    Reads ``../YouScienceData/Missing_YS/{event_year}/{school}_missingYS.csv``
    and uses its ``First``, ``Last`` and ``Email`` columns.

    Args:
        criteria: dict of column name -> list, as built by
            get_ranked_choices(); it is modified in place.
        school: School name; used in the file name and the 'School' column.
        event_year: Year folder to read from.
        n: Number of 'Unassigned' placeholder records to append.

    Returns:
        The same ``criteria`` dict, with one 'Missing <k>' record per row of
        the file (k starts at 1) followed by ``n`` 'Empty <k>' records
        (k starts at 0). All six ranked columns and 'Enough_Choices' are 0
        for these records.
    """
    # columns that have no information for these records
    zero_filled = ('First', 'Second', 'Third', 'Fourth', 'Fifth', 'Sixth', 'Enough_Choices')

    def _append_record(record_id, first_name, last_name, email):
        # one record: known info first, then 0 for everything unknown
        criteria['id'].append(record_id)
        criteria['School'].append(school)
        criteria['First_Name'].append(first_name)
        criteria['Last_Name'].append(last_name)
        criteria['Email'].append(email)
        for column in zero_filled:
            criteria[column].append(0)

    # missing YS csv file path
    roster = pd.read_csv(f'../YouScienceData/Missing_YS/{event_year}/{school}_missingYS.csv')

    # students on the roster without YS results, numbered from 1
    for number, (_, row) in enumerate(roster.iterrows(), start=1):
        _append_record(f'Missing {number}', row.First, row.Last, row.Email)

    # n unassigned padding slots for the school; the id is 0-based
    # (format used later) while the 'Open' labels are 1-based
    for slot in range(n):
        _append_record(f'Empty {slot}', 'Unassigned', f'Open {slot+1}', f'Open {slot+1}')

    return criteria

In [4]:
# run: build and export one criteria csv per school
for school in schools:
    # records with known YS results first, then the students without
    # results and the unassigned placeholder slots
    criteria = add_students_missing_YS_results(
        criteria=get_ranked_choices(school=school),
        school=school,
    )

    # export file pathway: <criteria folder>/<school> YSCriteria.csv
    criteria_pathway = f'{ys_criteria_p}{school} YSCriteria.csv'

    # columns follow the dict's key order
    criteria_table = pd.DataFrame(criteria, columns=criteria.keys())
    criteria_table.to_csv(criteria_pathway)
    