In [123]:
# Import packages 
import pandas as pd
from fuzzywuzzy import process

In [124]:
# Import School List Data
school_list = pd.read_json("../../frontend/public/data/basic_school_information.json")

In [125]:
# Import Sports Data
sports = pd.read_pickle("sports_analysis/sports_3.pkl")

In [126]:
school_list = school_list.transpose()

In [127]:
# Filter list of schools for schools in sports dataset
choices = school_list[school_list.schoolCity.isin(['Calgary', 'Cochrane', 'Airdrie', 'Chestermere'])].schoolName

In [128]:
# Filter out writing centers and e-learning centers
choices_short = []
for i in choices:
    if 'writing' not in i.lower() and 'e-learning' not in i.lower():
        choices_short.append(i)

In [129]:
# Make into dataframe
choices_s = choices[choices.isin(choices_short)]

In [134]:
# Make a set of distinct school names from sports data set to fuzzy match.
#This is much faster than fuzzy matching every row
school_names = []
for i in set(sports.WINNER):
    school_names.append(i)

In [135]:
# Convert to pandas data frame
school_names = pd.DataFrame(school_names, columns = ['Original'])

In [137]:
# List of schools that are closed or are a college
schools_to_remove = ["Viscount Bennett", "Central Collegiate Instate", "Central Collegiate Institute", 
                     "St. Mary's College", "Commercial High School", "South Calgary High School", "East Calgary",
                     "Mount Royal College", "East Calgary High School", "East Calgary High"                    ]

In [138]:
# Create list of only open schools in sports dataset
open_schools = school_names[~school_names.Original.isin(schools_to_remove)]

In [139]:
# Create neccessary columns for matching step
open_schools.loc[:,'clean'] = ''
open_schools.loc[:,'match_percent'] = None
open_schools.loc[:,'School_code'] = ''

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  open_schools.loc[:,'clean'] = ''
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  open_schools.loc[:,'match_percent'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  open_schools.loc[:,'School_code'] = ''


In [142]:
# Perform fuzzy matching with school dataset
for i in open_schools.index:
    if open_schools['Original'][i].endswith('Lacombe'): # Manually match these first two schools that do not match proper.
        match = process.extractOne("Father Lacombe School", 
                               choices_s)
    elif open_schools['Original'][i] == 'St. Gabriel':
        match = process.extractOne("St. Gabriel the Archangel", 
                               choices_s)
    else:
        match = process.extractOne(open_schools.Original[i].replace("Bishop Grandin", "Our Lady of the Rockies High School")
                               .replace('Lacombe', "Father Lacombe")+ " High School", # Bishop Grandin was renamed
                               choices_s) # Lacombe does not match unless named as such.
    open_schools['clean'][i] = match[0] # High School added after every school to match format in school dataset
    open_schools['match_percent'][i] = match[1]
    open_schools['School_code'][i] = match[2]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  open_schools['clean'][i] = match[0] # High School added after every school to match format in school dataset
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  open_schools['match_percent'][i] = match[1]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  open_schools['School_code'][i] = match[2]


In [144]:
# Now that we have the clean names and the corresponding school code lets join it with our data.
# First lets subset the columns we need, and name that appropriately
school_cleaner = pd.DataFrame({'WINNER' : open_schools.Original,
                              'WINNER_CLEAN' : open_schools.clean,
                              'School_code' : open_schools.School_code})

In [145]:
# Now to merge
sports_cleannames = pd.merge(school_cleaner, sports)

In [147]:
# Remove uncleaned name
sports_cleannames = sports_cleannames.drop(columns = 'WINNER')

In [148]:
sports_cleannames = sports_cleannames.rename(columns = {'WINNER_CLEAN' : 'WINNER'})

In [151]:
# Create total wins by school and year
total_wins = sports_cleannames.groupby(['WINNER', 'School_code', 'YEAR WON', 'SPORT']).agg(
    wins = pd.NamedAgg(column = 'WINNER', aggfunc = len))

In [152]:
# Total wins 2019-20 by school
wins201920 = total_wins.query("`YEAR WON` == '2019-20'").sort_values('wins', ascending = False)

In [153]:
# All wins sorted
sports_sorted = sports_cleannames.sort_values(['School_code', 'YEAR WON','SPORT', 'DIVISION','LEVEL', 'GENDER'], 
                                                           ascending = [True, False, True, True, True, True])