## Data Cleaning for Slope Chart

### Sources:
  * https://patch.com/pennsylvania/across-pa/these-are-best-school-districts-pa-new-ranking-says
  * 2018-2019 Keystone Results file
  * 2018-2019 Per Pupil Expenditure file

In [1]:
import pandas as pd
from splinter import Browser
from bs4 import BeautifulSoup

In [2]:
# Set up Splinter
browser = Browser('chrome')

In [3]:
# Visit the Patch "Top 25 Districts" article
url = "https://patch.com/pennsylvania/across-pa/these-are-best-school-districts-pa-new-ranking-says"
browser.visit(url)

In [4]:
# Parse the website
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

In [5]:
# Create a variable for the main area of the list
schools_list=soup.find("ol").find_all("li")
print(schools_list)

[<li><a href="https://www.niche.com/k12/d/radnor-township-school-district-pa/" rel="nofollow" target="_blank">Radnor Township School District</a>, Wayne: A+</li>, <li><a href="https://www.niche.com/k12/d/tredyffrin-easttown-school-district-pa/" rel="nofollow" target="_blank">Tredyffrin-Easttown School District</a>, Wayne: A+</li>, <li><a href="https://www.niche.com/k12/d/north-allegheny-school-district-pa/" rel="nofollow" target="_blank">North Allegheny School District</a>, Pittsburgh: A+</li>, <li><a href="https://www.niche.com/k12/d/lower-merion-school-district-pa/" rel="nofollow" target="_blank">Lower Merion School District</a>, Ardmore: A+</li>, <li><a href="https://www.niche.com/k12/d/mt-lebanon-school-district-pa/" rel="nofollow" target="_blank">Mt. Lebanon School District</a>, Pittsburgh: A+</li>, <li><a href="https://www.niche.com/k12/d/upper-st-clair-school-district-pa/" rel="nofollow" target="_blank">Upper St. Clair School District</a>, Pittsburgh: A+</li>, <li><a href="https

In [6]:
# Use list comprehension to loop through returned list above and split text into 3 columns
# Load split data into data frame
# Code created with help of TA Adam Siens
article_top_25_df=pd.DataFrame([[i.strip() for i in [s.text.split(',')[0], *s.text.split(',')[1].split(':')]] for s in schools_list])

# Rename columns
article_top_25_df=article_top_25_df.rename(columns={0:"School District",1:"City",2:"Rating"})
article_top_25_df

Unnamed: 0,School District,City,Rating
0,Radnor Township School District,Wayne,A+
1,Tredyffrin-Easttown School District,Wayne,A+
2,North Allegheny School District,Pittsburgh,A+
3,Lower Merion School District,Ardmore,A+
4,Mt. Lebanon School District,Pittsburgh,A+
5,Upper St. Clair School District,Pittsburgh,A+
6,Fox Chapel Area School District,Pittsburgh,A+
7,South Fayette Township School District,McDonald,A+
8,Derry Township School District,Hershey,A+
9,State College Area School District,State College,A+


In [7]:
# Import Excel files for per pupil expenditures
xls=pd.ExcelFile("Resources/Full 2018-2019 Per Pupil Expenditures.xlsx")

# Create data frame (this df includes charter schools)
ppe_2019_df=pd.read_excel(xls, sheet_name='ESSA Exp per ADM LEA 2018-19')
ppe_2019_df.head()

Unnamed: 0,AUN,LEA Name,County,2018-2019 Local Personnel expenditures,2018-2019 Local NonPersonnel expenditures,2018-2019 State Personnel expenditures,2018-2019 State NonPersonnel expenditures,2018-2019 Federal Personnel expenditures,2018-2019 Federal NonPersonnel expenditures,2018-2019\nADM,2018-2019 Local Per Pupil Expenditure,2018-2019 Local NonPer Pupil Expenditure,2018-2019 State Per Pupil Expenditure,2018-2019 State NonPer Pupil Expenditure,2018-2019 Federal Per Pupil Expenditure,2018-2019 Federal NonPer Pupil Expenditure
0,101260303,Albert Gallatin Area SD,Fayette,9019178.9,1875359.59,24385187.4,5070416.69,1652730.61,259155.65,3197.265,2820.9,586.55,7626.89,1585.86,516.92,81.06
1,101260803,Brownsville Area SD,Fayette,2935542.0,480648.0,7677570.0,1257079.0,677433.0,110918.0,1521.309,1929.62,315.94,5046.69,826.31,445.3,72.91
2,101261302,Connellsville Area SD,Fayette,12683500.82,5306622.67,31679118.11,13254095.21,1858869.96,777728.62,3954.586,3207.29,1341.89,8010.73,3351.58,470.05,196.66
3,101262903,Frazier SD,Fayette,4459973.09,1759801.7,7194693.97,2838858.99,235423.8,92892.76,1065.209,4186.95,1652.07,6754.26,2665.07,221.01,87.21
4,101264003,Laurel Highlands SD,Fayette,15197925.0,19714162.0,30380386.0,3830566.49,869577.0,880731.0,2720.162,5587.14,7247.42,11168.59,1408.21,319.68,323.78


In [8]:
# Add column for total per pupil expenditures
ppe_2019_df["Total Per Pupil Expenditure 2018-19"]= \
    ppe_2019_df["2018-2019 Local Per Pupil Expenditure"] + \
    ppe_2019_df["2018-2019 Local NonPer Pupil Expenditure"] + \
    ppe_2019_df["2018-2019 State Per Pupil Expenditure"] + \
    ppe_2019_df["2018-2019 State NonPer Pupil Expenditure"] + \
    ppe_2019_df["2018-2019 Federal Per Pupil Expenditure"] + \
    ppe_2019_df["2018-2019 Federal NonPer Pupil Expenditure"]
ppe_2019_df.head()

Unnamed: 0,AUN,LEA Name,County,2018-2019 Local Personnel expenditures,2018-2019 Local NonPersonnel expenditures,2018-2019 State Personnel expenditures,2018-2019 State NonPersonnel expenditures,2018-2019 Federal Personnel expenditures,2018-2019 Federal NonPersonnel expenditures,2018-2019\nADM,2018-2019 Local Per Pupil Expenditure,2018-2019 Local NonPer Pupil Expenditure,2018-2019 State Per Pupil Expenditure,2018-2019 State NonPer Pupil Expenditure,2018-2019 Federal Per Pupil Expenditure,2018-2019 Federal NonPer Pupil Expenditure,Total Per Pupil Expenditure 2018-19
0,101260303,Albert Gallatin Area SD,Fayette,9019178.9,1875359.59,24385187.4,5070416.69,1652730.61,259155.65,3197.265,2820.9,586.55,7626.89,1585.86,516.92,81.06,13218.18
1,101260803,Brownsville Area SD,Fayette,2935542.0,480648.0,7677570.0,1257079.0,677433.0,110918.0,1521.309,1929.62,315.94,5046.69,826.31,445.3,72.91,8636.77
2,101261302,Connellsville Area SD,Fayette,12683500.82,5306622.67,31679118.11,13254095.21,1858869.96,777728.62,3954.586,3207.29,1341.89,8010.73,3351.58,470.05,196.66,16578.2
3,101262903,Frazier SD,Fayette,4459973.09,1759801.7,7194693.97,2838858.99,235423.8,92892.76,1065.209,4186.95,1652.07,6754.26,2665.07,221.01,87.21,15566.57
4,101264003,Laurel Highlands SD,Fayette,15197925.0,19714162.0,30380386.0,3830566.49,869577.0,880731.0,2720.162,5587.14,7247.42,11168.59,1408.21,319.68,323.78,26054.82


In [9]:
# Remove unnecessary columns
ppe_2019_clean_df=ppe_2019_df[["AUN","LEA Name","Total Per Pupil Expenditure 2018-19"]]
ppe_2019_clean_df.head()

Unnamed: 0,AUN,LEA Name,Total Per Pupil Expenditure 2018-19
0,101260303,Albert Gallatin Area SD,13218.18
1,101260803,Brownsville Area SD,8636.77
2,101261302,Connellsville Area SD,16578.2
3,101262903,Frazier SD,15566.57
4,101264003,Laurel Highlands SD,26054.82


In [10]:
# Import Excel file for scores
scores_df=pd.read_excel("Resources/2019 Keystone Exams School Level Data.xlsx")
scores_df.head()

Unnamed: 0,Grade,AUN,School Number,County,District Name,School Name,Subject,Group,Number Scored,Percent Advanced,Percent Proficient,Percent Basic,Percent Below Basic
0,11,112011103,3,Adams,BERMUDIAN SPRINGS SD,BERMUDIAN SPRINGS HS,Algebra I,All Students,139,27.3,45.3,21.6,5.8
1,11,112011103,3,Adams,BERMUDIAN SPRINGS SD,BERMUDIAN SPRINGS HS,Algebra I,Historically Underperforming,47,14.9,29.8,38.3,17.0
2,11,112011103,3,Adams,BERMUDIAN SPRINGS SD,BERMUDIAN SPRINGS HS,Biology,All Students,139,25.9,38.1,23.7,12.2
3,11,112011103,3,Adams,BERMUDIAN SPRINGS SD,BERMUDIAN SPRINGS HS,Biology,Historically Underperforming,48,14.6,25.0,33.3,27.1
4,11,112011103,3,Adams,BERMUDIAN SPRINGS SD,BERMUDIAN SPRINGS HS,Literature,All Students,139,7.2,71.2,16.5,5.0


In [11]:
# Remove rows for historically underperforming students
all_students_scores_df=scores_df.loc[scores_df["Group"]=="All Students"].reset_index()

# Remove unnecessary columns
all_students_scores_clean_df=all_students_scores_df[["AUN","District Name","Subject","Group","Number Scored", \
                                                    "Percent Advanced","Percent Proficient", \
                                                    "Percent Basic", "Percent Below Basic"]]
all_students_scores_clean_df.head()


Unnamed: 0,AUN,District Name,Subject,Group,Number Scored,Percent Advanced,Percent Proficient,Percent Basic,Percent Below Basic
0,112011103,BERMUDIAN SPRINGS SD,Algebra I,All Students,139,27.3,45.3,21.6,5.8
1,112011103,BERMUDIAN SPRINGS SD,Biology,All Students,139,25.9,38.1,23.7,12.2
2,112011103,BERMUDIAN SPRINGS SD,Literature,All Students,139,7.2,71.2,16.5,5.0
3,112011603,CONEWAGO VALLEY SD,Algebra I,All Students,270,28.5,43.7,23.3,4.4
4,112011603,CONEWAGO VALLEY SD,Biology,All Students,264,31.1,40.5,18.6,9.8


In [12]:
# Create a dataframe for just Algebra 1 scores
algebra_scores_df=all_students_scores_clean_df.loc[all_students_scores_clean_df["Subject"]=="Algebra I"].reset_index()

# Create a column for overall proficiency
algebra_scores_df["Algebra I Overall Proficiency"]=algebra_scores_df["Percent Advanced"]+algebra_scores_df["Percent Proficient"]

# Eliminate unnecessary columns
alg_scores_reduced_df=algebra_scores_df[["AUN","District Name","Algebra I Overall Proficiency"]]
alg_scores_reduced_df.head()

Unnamed: 0,AUN,District Name,Algebra I Overall Proficiency
0,112011103,BERMUDIAN SPRINGS SD,72.6
1,112011603,CONEWAGO VALLEY SD,72.2
2,112013054,FAIRFIELD AREA SD,72.3
3,112013753,GETTYSBURG AREA SD,74.6
4,112015203,LITTLESTOWN AREA SD,80.6


In [13]:
# Create a dataframe for just Biology scores
biology_scores_df=all_students_scores_clean_df.loc[all_students_scores_clean_df["Subject"]=="Biology"].reset_index()

# Create a column for overall proficiency
biology_scores_df["Biology Overall Proficiency"]=biology_scores_df["Percent Advanced"]+biology_scores_df["Percent Proficient"]

# Eliminate unnecessary columns
bio_scores_reduced_df=biology_scores_df[["AUN","District Name","Biology Overall Proficiency"]]
bio_scores_reduced_df.head()

Unnamed: 0,AUN,District Name,Biology Overall Proficiency
0,112011103,BERMUDIAN SPRINGS SD,64.0
1,112011603,CONEWAGO VALLEY SD,71.6
2,112013054,FAIRFIELD AREA SD,72.3
3,112013753,GETTYSBURG AREA SD,73.9
4,112015203,LITTLESTOWN AREA SD,82.7


In [14]:
# Create a dataframe for just Literature scores
literature_scores_df=all_students_scores_clean_df.loc[all_students_scores_clean_df["Subject"]=="Literature"].reset_index()

# Create a column for overall proficiency
literature_scores_df["Literature Overall Proficiency"]=literature_scores_df["Percent Advanced"]+literature_scores_df["Percent Proficient"]

# Eliminate unnecessary columns
lit_scores_reduced_df=literature_scores_df[["AUN","District Name","Literature Overall Proficiency"]]
lit_scores_reduced_df.head()

Unnamed: 0,AUN,District Name,Literature Overall Proficiency
0,112011103,BERMUDIAN SPRINGS SD,78.4
1,112011603,CONEWAGO VALLEY SD,70.1
2,112013054,FAIRFIELD AREA SD,73.7
3,112013753,GETTYSBURG AREA SD,85.5
4,112015203,LITTLESTOWN AREA SD,82.8


In [15]:
# Merge Algebra I, Biology, and Literature overall proficiency data frames
proficiency_df=alg_scores_reduced_df.merge(bio_scores_reduced_df, on=("AUN","District Name"), how="inner").merge(lit_scores_reduced_df, on=("AUN","District Name"), how="inner")
proficiency_df

# Merge overall proficency with per pupil expenditure
value_measure_df=proficiency_df.merge(ppe_2019_clean_df, on="AUN", how="inner")

# Remove LEA Name column
value_measure_df=value_measure_df.drop("LEA Name", axis=1)

# Create column for average overall proficency across subjects
value_measure_df["Average Overall Proficiency"]=(value_measure_df["Algebra I Overall Proficiency"]+ \
                                                 value_measure_df["Biology Overall Proficiency"]+ \
                                                 value_measure_df["Literature Overall Proficiency"])/3

# Create column for value ratio
value_measure_df["Value Ratio"]=value_measure_df["Average Overall Proficiency"]/value_measure_df["Total Per Pupil Expenditure 2018-19"]

# Sort data by highest ratio
value_measure_sorted_df=value_measure_df.sort_values(by="Value Ratio", ascending=False)
value_measure_sorted_df

Unnamed: 0,AUN,District Name,Algebra I Overall Proficiency,Biology Overall Proficiency,Literature Overall Proficiency,Total Per Pupil Expenditure 2018-19,Average Overall Proficiency,Value Ratio
1787,105250001,PERSEUS HOUSE CS OF EXCELLENCE,13.5,15.7,17.2,0.00,15.466667,inf
2160,126513420,NEW FOUNDATIONS CS,52.6,43.5,61.5,0.00,52.533333,inf
165662,101638803,WASHINGTON SD,53.6,37.1,65.3,0.00,52.000000,inf
165520,126519434,UNIVERSAL AUDENRIED CHARTER SCHOOL,16.0,19.3,45.7,0.00,27.000000,inf
1901,104377003,UNION AREA SD,66.6,77.1,79.1,0.00,74.266667,inf
...,...,...,...,...,...,...,...,...
165511,126515001,PHILADELPHIA CITY SD,,,45.0,12758.61,,
165512,126515001,PHILADELPHIA CITY SD,,,17.2,12758.61,,
165513,126515001,PHILADELPHIA CITY SD,,,,12758.61,,
165523,109530304,AUSTIN AREA SD,72.8,72.7,,21904.98,,


In [17]:
# Quit the browsing session
browser.quit()