In [1]:
from pathlib import Path
import pandas as pd

In [2]:
# Reading in the excel file as a pandas DataFrame
excel_filepath = Path('Resources/2019 Keystone Exams School Level Data.xlsx')
keystone2019_df = pd.read_excel(excel_filepath, header=4)
keystone2019_df

Unnamed: 0,Grade,AUN,School Number,County,District Name,School Name,Subject,Group,Number Scored,Percent Advanced,Percent Proficient,Percent Basic,Percent Below Basic
0,11,112011103,3,Adams,BERMUDIAN SPRINGS SD,BERMUDIAN SPRINGS HS,Algebra I,All Students,139,27.3,45.3,21.6,5.8
1,11,112011103,3,Adams,BERMUDIAN SPRINGS SD,BERMUDIAN SPRINGS HS,Algebra I,Historically Underperforming,47,14.9,29.8,38.3,17.0
2,11,112011103,3,Adams,BERMUDIAN SPRINGS SD,BERMUDIAN SPRINGS HS,Biology,All Students,139,25.9,38.1,23.7,12.2
3,11,112011103,3,Adams,BERMUDIAN SPRINGS SD,BERMUDIAN SPRINGS HS,Biology,Historically Underperforming,48,14.6,25.0,33.3,27.1
4,11,112011103,3,Adams,BERMUDIAN SPRINGS SD,BERMUDIAN SPRINGS HS,Literature,All Students,139,7.2,71.2,16.5,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4481,11,112679403,4658,York,YORK SUBURBAN SD,YORK SUBURBAN SHS,Algebra I,Historically Underperforming,68,25.0,45.6,22.1,7.4
4482,11,112679403,4658,York,YORK SUBURBAN SD,YORK SUBURBAN SHS,Biology,All Students,220,53.2,36.4,8.2,2.3
4483,11,112679403,4658,York,YORK SUBURBAN SD,YORK SUBURBAN SHS,Biology,Historically Underperforming,69,24.6,47.8,20.3,7.2
4484,11,112679403,4658,York,YORK SUBURBAN SD,YORK SUBURBAN SHS,Literature,All Students,222,39.2,51.4,8.1,1.4


In [3]:
keystone2019_df = keystone2019_df.dropna(how='any')

In [4]:
# Filtering the data to show only 'All Students'
keystone2019_df = keystone2019_df.loc[keystone2019_df['Group'] == 'All Students']

In [5]:
subject_counts = keystone2019_df.groupby('School Name')['Subject'].transform('nunique')
indices_to_drop = subject_counts < 3
keystone2019_df.drop(keystone2019_df[indices_to_drop].index, inplace=True)

In [6]:
# Check the count of values for 'School Name' to match with 'Subject'
keystone2019_df['School Name'].value_counts()

CENTRAL HS              6
BERMUDIAN SPRINGS HS    3
POTTSGROVE SHS          3
LOWER MERION HS         3
LOWER MORELAND HS       3
                       ..
MAPLEWOOD JSHS          3
SAEGERTOWN JSHS         3
BIG SPRING HS           3
CAMP HILL SHS           3
YORK SUBURBAN SHS       3
Name: School Name, Length: 687, dtype: int64

In [7]:
# Check to match 'School Name', note that there are two Central HS so the values will be 688 each.
keystone2019_df['Subject'].value_counts()

Algebra I     688
Biology       688
Literature    688
Name: Subject, dtype: int64

In [8]:
# Function to calculate the sum of percent advanced and proficient
def percentSum(row):
    percent_proficient_above = row['Percent Advanced'] + row['Percent Proficient']
    row['Percent Advanced & Proficient'] = percent_proficient_above
    return row

In [9]:
keystone2019_cleaned = keystone2019_df.groupby(['School Name', 'Subject']).apply(percentSum)
keystone2019_cleaned

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  keystone2019_cleaned = keystone2019_df.groupby(['School Name', 'Subject']).apply(percentSum)


Unnamed: 0,Grade,AUN,School Number,County,District Name,School Name,Subject,Group,Number Scored,Percent Advanced,Percent Proficient,Percent Basic,Percent Below Basic,Percent Advanced & Proficient
0,11,112011103,3,Adams,BERMUDIAN SPRINGS SD,BERMUDIAN SPRINGS HS,Algebra I,All Students,139,27.3,45.3,21.6,5.8,72.6
2,11,112011103,3,Adams,BERMUDIAN SPRINGS SD,BERMUDIAN SPRINGS HS,Biology,All Students,139,25.9,38.1,23.7,12.2,64.0
4,11,112011103,3,Adams,BERMUDIAN SPRINGS SD,BERMUDIAN SPRINGS HS,Literature,All Students,139,7.2,71.2,16.5,5.0,78.4
6,11,112011603,17,Adams,CONEWAGO VALLEY SD,NEW OXFORD SHS,Algebra I,All Students,270,28.5,43.7,23.3,4.4,72.2
8,11,112011603,17,Adams,CONEWAGO VALLEY SD,NEW OXFORD SHS,Biology,All Students,264,31.1,40.5,18.6,9.8,71.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4476,11,112679107,5168,York,YORK CO SCHOOL OF TECHNOLOGY,YORK CO SCHOOL OF TECHNOLOGY,Biology,All Students,374,14.4,46.0,30.2,9.4,60.4
4478,11,112679107,5168,York,YORK CO SCHOOL OF TECHNOLOGY,YORK CO SCHOOL OF TECHNOLOGY,Literature,All Students,381,10.5,60.9,22.3,6.3,71.4
4480,11,112679403,4658,York,YORK SUBURBAN SD,YORK SUBURBAN SHS,Algebra I,All Students,220,49.1,37.7,10.9,2.3,86.8
4482,11,112679403,4658,York,YORK SUBURBAN SD,YORK SUBURBAN SHS,Biology,All Students,220,53.2,36.4,8.2,2.3,89.6


In [10]:
# Export as both excel and JSON files
keystone2019_cleaned.to_excel('Resources/2019Keystone_School_cleaned.xlsx', index=False)
keystone2019_cleaned.to_json('Resources/2019Keystone_School_cleaned.json', orient='records')