In [1]:
import pandas as pd
import numpy as np

In [None]:
"""
Objectives:
1) Determine the number of students per school that have 0-4 'Yes's (matches based on value system as is)
Procedure:
<> read in criteria per school
<> read in direct_join table
<> determine whether each choice has a POS match or not
<> determine the number of matches per student as a feature

NOTE: The following schools do not have Skyward/YouScience info:
- Daniel McKee
- Eagleville
- Smyrna West
NOTE: The number of matches could be misleading if one of the choices from the ranking system is repeated
"""

In [3]:
# Load the direct_join table, then remove the schools that have no
# Skyward/YouScience info so they are not carried into the analysis.
missing = ['Daniel McKee', 'Eagleville', 'Smyrna West']
dj_table = pd.read_excel('../direct_join.xlsx')
dj_table.drop(columns=missing, inplace=True)


Unnamed: 0,YouScience Clusters,Blackman MS/HS,Christiana/Riverdale,Daniel McKee,Eagleville,LMS/HS,Oakland MS/HS,Rock Springs MS/HS,Rockvale MS / HS,Rocky Fork MS / Smyrna HS,Siegel MS / HS,Smyrna MS/HS,Smyrna West,Stewarts Creek MS/HS,Thurman Francis/ SHS,Whitworth/Riverdale HS
0,Business,Business Management,Business Management,Business Management,Office Management,,Business Management,"Business Management, Marketing Management/Entr...",Office Management,,"Business Management/Accounting, Marketing Mana...",,,"Business Management, Marketing Management/Entr...",,Business Management
1,Agriculture & Natural Resources,Horticulture Sciences,Agricultural Engineering & Applied Technologie...,Veterinary & Animal Science/Horticulture Science,"Agribusiness/Ag Mechanics, Horticulture/Vet Sc...",Horticulture Science,Agricultural Engineering & Applied Technologie...,Horticulture Sciences,Horticulture Sciences,Horticulture Science,Horticulture Sciences,Horticulture Science,,Horticulture Sciences,Horticulture Science,Agricultural Engineering & Applied Technologie...
2,Hospitality & Tourism,Culinary Arts,,,,,Culinary Arts,Culinary Arts,Culinary Arts,Culinary Arts,Culinary Arts,Culinary Arts,,Culinary Arts,Culinary Arts,
3,Engineering,STEM Engineering,"Automotive Maintenance and Light Repair, STEM:...",,Advanced STEM Applications,,"MEP Systems, Automotive Maintenance and Light ...","Advanced STEM Applications, Automotive Collisi...",MEP Systems,"MEP Systems, STEM Technology","STEM, Aviation flight","MEP Systems, STEM Technology",,"Advanced STEM Applications, Automotive Collisi...","MEP Systems, STEM Technology","Automotive Maintenance and Light Repair, STEM:..."
4,Teaching,,Teaching as a Profession,,,,Teaching as a Profession,,Teaching as a Profession,Teaching as a Profession,,Teaching as a Profession,,,Teaching as a Profession,Teaching as a Profession


In [8]:
# Build the list of school (organization) names found in the compiled RCS data.
X = pd.read_csv('../Heatmaps/RCS_compiled.csv')
X.drop('Unnamed: 0', axis=1, inplace=True)

# The iteration order of a set of strings changes between interpreter sessions
# (string hashing is randomized), so indexing list(set(...)) with a hard-coded
# permutation is not reproducible after a kernel restart. Sorting yields the
# same alphabetical order every run; the positional renaming of the dj_table
# columns relies on this order being stable.
schools = sorted(X['organization'].dropna().unique())
nschools = list(schools)
nschools

['Blackman Middle School',
 'Christiana Middle School',
 'LaVergne Middle School',
 'Oakland Middle School',
 'Rock Springs Middle School',
 'Rockvale Middle School',
 'Rocky Fork Middle School',
 'Siegel Middle School',
 'Smyrna Middle School',
 'Stewarts Creek Middle School',
 'Thurman Francis Arts Academy',
 'Whitworth-Buchanan Middle School']

In [11]:
# Map every school column of dj_table onto the matching entry of nschools.
# The pairing is purely positional: the k-th school column gets nschools[k].
cols = list(dj_table.columns)
cols.remove('YouScience Clusters')
renaming_dict = {col: nschools[pos] for pos, col in enumerate(cols)}

renaming_dict

{'Blackman MS/HS': 'Blackman Middle School',
 'Christiana/Riverdale': 'Christiana Middle School',
 'LMS/HS': 'LaVergne Middle School',
 'Oakland MS/HS': 'Oakland Middle School',
 'Rock Springs MS/HS': 'Rock Springs Middle School',
 'Rockvale MS / HS': 'Rockvale Middle School',
 'Rocky Fork MS / Smyrna HS': 'Rocky Fork Middle School',
 'Siegel MS / HS': 'Siegel Middle School',
 'Smyrna MS/HS': 'Smyrna Middle School',
 'Stewarts Creek MS/HS': 'Stewarts Creek Middle School',
 'Thurman Francis/ SHS': 'Thurman Francis Arts Academy',
 'Whitworth/Riverdale HS': 'Whitworth-Buchanan Middle School'}

In [None]:
# Apply the new column names, save the prepared table, and preview it
dj_table.rename(columns=renaming_dict, inplace=True)
dj_table.to_excel('../direct_join_prepared.xlsx')
dj_table.head()

In [79]:
def get_yes_no(df, matches=None):
    """Flag, per row of ``df``, which of the six ranked choices are in ``matches``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``id``, ``First``, ``Second``, ``Third``,
        ``Fourth``, ``Fifth`` and ``Sixth``.
    matches : container, optional
        Values that count as a match (tested with ``in``). When omitted, the
        notebook-level variable ``matches`` is used, which keeps existing
        ``get_yes_no(df)`` calls working.

    Returns
    -------
    pandas.DataFrame
        One row per input row with the columns:
        ``id``; ``M1``..``M6`` (1 if that choice is in ``matches``, else 0);
        ``Total Matches`` (sum of M1..M6); ``Total Unique Matches`` (number of
        distinct matched values); ``Unique Matches`` ('Yes' when no matched
        value is repeated, otherwise 'No').

    Raises
    ------
    NameError
        If ``matches`` is not given, no notebook-level ``matches`` exists, and
        ``df`` has at least one row.
    """
    if matches is None:
        # Backward compatibility with callers that set a global before calling.
        if 'matches' in globals():
            matches = globals()['matches']
        elif len(df):
            raise NameError("name 'matches' is not defined")

    positions = ['First', 'Second', 'Third', 'Fourth', 'Fifth', 'Sixth']
    match_dict = {
        'id': [],
        'M1': [],
        'M2': [],
        'M3': [],
        'M4': [],
        'M5': [],
        'M6': [],
        'Total Matches': [],
        'Total Unique Matches': [],
        'Unique Matches': []
    }
    for row_idx in range(len(df)):
        row = df.iloc[row_idx]
        match_dict['id'].append(row['id'])

        # One 0/1 flag per ranked choice, kept in a list so no eval() is needed
        flags = [int(row[position] in matches) for position in positions]
        for rank, flag in enumerate(flags, start=1):
            match_dict['M' + str(rank)].append(flag)
        match_dict['Total Matches'].append(sum(flags))

        # Count distinct matched values and how often a matched value repeats
        seen = []
        repeats = 0
        for position, flag in zip(positions, flags):
            if flag != 1:
                continue
            choice = row[position]
            if choice not in seen:
                seen.append(choice)
            else:
                repeats += 1
        match_dict['Total Unique Matches'].append(len(seen))
        match_dict['Unique Matches'].append('Yes' if repeats == 0 else 'No')

    return pd.DataFrame(match_dict)


In [93]:
# For every school, count how many students land in each unique-match bucket:
# exactly 0, 1, 2 or 3 unique matches, and 4-or-more in the last bucket.
bucket_labels = [str(n) + ' Matches' for n in range(5)]
school_matchlist = {'School': []}
for label in bucket_labels:
    school_matchlist[label] = []

# read in the school > determine which criteria have matches
fits = list(dj_table['YouScience Clusters'])
for scl in nschools:
    # A cluster counts as a match when the school's cell for it is not NaN;
    # only the first 16 rows of the table are considered.
    temp = list(dj_table[scl][:16])
    matches = [cluster for cluster, offering in zip(fits, temp) if str(offering) != 'nan']

    df = pd.read_csv('YS_Criteria_by_School/' + scl + ' YSCriteria.csv').drop('Unnamed: 0', axis=1)
    # get_yes_no picks up the module-level `matches` assigned just above
    Y = get_yes_no(df)

    school_matchlist['School'].append(scl)
    unique_counts = Y['Total Unique Matches']
    for n, label in enumerate(bucket_labels):
        if n == 4:
            in_bucket = unique_counts >= n
        else:
            in_bucket = unique_counts == n
        school_matchlist[label].append(int(in_bucket.sum()))
    

In [95]:
# Write the per-school match tallies out as a spreadsheet
matches_per_school = pd.DataFrame(school_matchlist)
matches_per_school.to_excel('Matches_per_School.xlsx')