### Note
* Instructions have been included for each segment. You do not have to follow them exactly, but they are included to help you think through the steps.

In [1]:
# Dependencies and Setup
import pandas as pd

# File to Load (Remember to Change These)
school_data_to_load = '/Users/rollycostillas/Desktop/upenn/homework/myrepository/04-pandas-challenge/Resources/schools_complete.csv'
student_data_to_load = '/Users/rollycostillas/Desktop/upenn/homework/myrepository/04-pandas-challenge/Resources/students_complete.csv'

# Read School and Student Data File and store into Pandas DataFrames
school_data = pd.read_csv(school_data_to_load)
student_data = pd.read_csv(student_data_to_load)

# Add column for per student budget to the school_data DataFrame
school_data["Per Student Budget"] = school_data["budget"] / school_data["size"]

# Add column for overall passing to the student_data DataFrame based on the student's math & reading scores
student_data["Overall Passing"] = False
student_data.loc[(student_data["math_score"] >= 70) & (student_data["reading_score"] >= 70), "Overall Passing"] = True

# Combine the data into a single dataset.  
school_data_complete = pd.merge(student_data, school_data, how="left", on=["school_name", "school_name"])

FileNotFoundError: [Errno 2] No such file or directory: '/Users/rollycostillas/Desktop/upenn/homework/myrepository/04-pandas-challenge/Resources/schools_complete.csv'

## District Summary

* Calculate the total number of schools

* Calculate the total number of students

* Calculate the total budget

* Calculate the average math score 

* Calculate the average reading score

* Calculate the percentage of students with a passing math score (70 or greater)

* Calculate the percentage of students with a passing reading score (70 or greater)

* Calculate the percentage of students who passed math **and** reading (% Overall Passing)

* Create a dataframe to hold the above results

* Optional: give the displayed data cleaner formatting

In [None]:
# calculate the total number of schools
total_schools = len(school_data["school_name"])

# calculate the total number of students
total_students = len(school_data_complete["Student ID"])

In [None]:
# calculate the total budget
total_budget = "${:,.2f}".format(school_data["budget"].sum())

In [None]:
# calculate the average math score
avg_math = round(school_data_complete["math_score"].mean(), 6)

In [None]:
# calculate the average reading score
avg_reading = round(school_data_complete["reading_score"].mean(), 6)

In [None]:
# filter to find number of students who passed math
math_pass_count = len(school_data_complete.loc[(school_data_complete["math_score"] >= 70), :])

# filter to find number of students who passed reading
reading_pass_count = len(school_data_complete.loc[(school_data_complete["reading_score"] >= 70), :])

# sum values of Overall Passing to get a count of students passing both (True = 1, False = 0)
both_pass_count = school_data_complete["Overall Passing"].sum()

# calculate the percent passing in each category by dividing by total students & multiply by 100; also format
math_pass = "{:.6f}".format((math_pass_count / total_students) * 100)
reading_pass = "{:.6f}".format((reading_pass_count / total_students) * 100)
both_pass = "{:.6f}".format((both_pass_count / total_students) * 100)

# format total students that were done using for calculations
total_students = "{:,.0f}".format(total_students)

In [None]:
# create a list of dictionaries to input all values into a dataframe
district_summary_df = pd.DataFrame([{"Total Schools": total_schools, "Total Students": total_students,
               "Total Budget": total_budget, "Average Math Score": avg_math, 
               "Average Reading Score": avg_reading, "% Passing Math": math_pass, 
               "% Passing Reading": reading_pass, "% Overall Passing": both_pass}])

# show dataframe output
district_summary_df

## School Summary

* Create an overview table that summarizes key metrics about each school, including:
  * School Name
  * School Type
  * Total Students
  * Total School Budget
  * Per Student Budget
  * Average Math Score
  * Average Reading Score
  * % Passing Math
  * % Passing Reading
  * % Overall Passing (The percentage of students that passed math **and** reading.)
  
* Create a dataframe to hold the above results

In [None]:
# get School Name, School Type, Total Students, Total School Budget, Per Student Budget columns from base school_data dataframe
school_summary_df = school_data.iloc[:,1:6]

# rename columns for better descriptions
school_summary_df = school_summary_df.rename(columns={"type":"School Type","size":"Total Students","budget":"Total School Budget"})

In [None]:
# sort dataframe by school_name (ascending)
school_summary_df = school_summary_df.sort_values("school_name")

In [None]:
# group all school data by school_name
school_calc_group = school_data_complete.groupby(["school_name"])

# calculate the average math & reading scores for the groupby
avg_scores = school_calc_group[["math_score","reading_score"]].mean()

# rename columns for better descriptions
avg_scores = avg_scores.rename(columns={"math_score":"Average Math Score","reading_score":"Average Reading Score"})

# merge the basic school data and average scores to one dataframe
school_summary_df = pd.merge(school_summary_df, avg_scores, how="left", on="school_name")

In [None]:
# calculate the percent passing math & reading individually using lambda function on the groupby
pass_percs = school_calc_group[["math_score","reading_score"]].apply(lambda x: ((x>=70).sum()) / x.count() * 100)

# rename columns for better descriptions
pass_percs = pass_percs.rename(columns={"math_score":"% Passing Math","reading_score":"% Passing Reading"})

# merge basic school data + average scores with the individual passing percentages
school_summary_df = pd.merge(school_summary_df, pass_percs, how="left", on="school_name")

In [None]:
# calculate overall passing percentage on the groupby 
# sum only counts True values, count counts both True and False values
both_pass = school_calc_group["Overall Passing"].sum() / school_calc_group["Overall Passing"].count() * 100

In [None]:
# merge basic school data + average scores + individual passing percentages with overall passing percentage
school_summary_df = pd.merge(school_summary_df, both_pass, how="left", on="school_name")

# rename columns for better descriptions
school_summary_df = school_summary_df.rename(columns={"Overall Passing":"% Overall Passing"})

In [None]:
# format all columns as needed
school_summary_df["Total School Budget"] = school_summary_df["Total School Budget"].map("${:,.2f}".format)
school_summary_df["Per Student Budget"] = school_summary_df["Per Student Budget"].map("${:,.2f}".format)
school_summary_df["Average Math Score"] = school_summary_df["Average Math Score"].map("{:.6f}".format)
school_summary_df["Average Reading Score"] = school_summary_df["Average Reading Score"].map("{:.6f}".format)
school_summary_df["% Passing Math"] = school_summary_df["% Passing Math"].map("{:.6f}".format)
school_summary_df["% Passing Reading"] = school_summary_df["% Passing Reading"].map("{:.6f}".format)
school_summary_df["% Overall Passing"] = school_summary_df["% Overall Passing"].map("{:.6f}".format)

In [None]:
# rename columns for better descriptions
school_summary_df = school_summary_df.rename(columns={"school_name":"School Name"})

# set School Name to index
school_summary_df = school_summary_df.set_index("School Name",drop=True)
school_summary_df

## Top Performing Schools (By % Overall Passing)

* Sort and display the top five performing schools by % overall passing.

In [None]:
# Sort and display the top five performing schools by % overall passing
top_performing_df = school_summary_df.sort_values("% Overall Passing", ascending = False)
top_performing_df.head()

## Bottom Performing Schools (By % Overall Passing)

* Sort and display the five worst-performing schools by % overall passing.

In [None]:
# sort and display the five worst-performing schools by % overall passing.
bottom_performing_df = school_summary_df.sort_values("% Overall Passing")
bottom_performing_df.head()

## Math Scores by Grade

* Create a table that lists the average Reading Score for students of each grade level (9th, 10th, 11th, 12th) at each school.

  * Create a pandas series for each grade. Hint: use a conditional statement.
  
  * Group each series by school
  
  * Combine the series into a dataframe
  
  * Optional: give the displayed data cleaner formatting

In [None]:
# group by school and grade, find math score average
math_by_grades_df = school_data_complete.groupby(["school_name","grade"])["math_score"] \
                        .mean() \
                        .map("{:.6f}".format) \
                        .unstack() \
                        .rename_axis(None, axis=1)
# rename the index
math_by_grades_df.index.names = ["School Name"]

# reorder the columns
math_by_grades_df = math_by_grades_df[["9th","10th","11th","12th"]]
math_by_grades_df

## Reading Score by Grade 

* Perform the same operations as above for reading scores

In [None]:
# group by school and grade, find reading score average
reading_by_grades_df = school_data_complete.groupby(["school_name","grade"])["reading_score"] \
                        .mean() \
                        .map("{:.6f}".format) \
                        .unstack() \
                        .rename_axis(None, axis=1)

# rename the index
reading_by_grades_df.index.names = ["School Name"]

# reorder the columns
reading_by_grades_df = reading_by_grades_df[["9th","10th","11th","12th"]]
reading_by_grades_df

## Scores by School Spending

* Create a table that breaks down school performances based on average Spending Ranges (Per Student). Use 4 reasonable bins to group school spending. Include in the table each of the following:
  * Average Math Score
  * Average Reading Score
  * % Passing Math
  * % Passing Reading
  * Overall Passing Rate (Average of the above two)

In [None]:
# set bins
bins = [0, 583.99, 628.99, 643.99, 674.99]

# set bin labels
bin_names = ["<\$584", "\$585-\$629", "\$630-\$644", "\$645-\$675"]

In [None]:
# store select columns from complete data in new df for manipulation
school_spending_df = school_data_complete.loc[:,["math_score","reading_score","Overall Passing","Per Student Budget"]]

In [None]:
# cut into spending range bins & create new column
school_spending_df["Spending Ranges (Per Student)"] = pd.cut(school_spending_df["Per Student Budget"], bins, labels=bin_names, include_lowest=True)

In [None]:
# set Spending Range column as index
school_spending_df = school_spending_df.set_index("Spending Ranges (Per Student)",drop=True)

# group df by Spending Range
school_spending_group = school_spending_df.groupby("Spending Ranges (Per Student)")

In [None]:
# calculate average math & reading scores for the groupby & rename columns for better descriptions
school_spending_avgs_df = school_spending_group[["math_score","reading_score"]].mean() \
                            .rename(columns={"math_score":"Average Math Score","reading_score":"Average Reading Score"})

In [None]:
# calculate the percent passing math & reading individually using lambda function on the groupby & rename coluumns for better descriptions
school_spending_pass_df = school_spending_group[["math_score","reading_score"]].apply(lambda x: ((x>=70).sum()) / x.count() * 100) \
                            .rename(columns={"math_score":"% Passing Math","reading_score":"% Passing Reading"})

# merge average scores with the individual passing percentages
school_spending_summary_df = pd.merge(school_spending_avgs_df, school_spending_pass_df, how="left", on="Spending Ranges (Per Student)")

In [None]:
# calculate overall passing percentage on the groupby 
# sum only counts True values, count counts both True and False values
overall_pass = school_spending_group["Overall Passing"].sum() / school_spending_group["Overall Passing"].count() * 100

# merge average scores + individual passing percentages with overall passing percentage & rename columns for better descriptions
school_spending_summary_df = pd.merge(school_spending_summary_df, overall_pass, how="left", on="Spending Ranges (Per Student)") \
                                .rename(columns={"Overall Passing":"% Overall Passing"})

In [None]:
# format all columns as needed
school_spending_summary_df["Average Math Score"] = school_spending_summary_df["Average Math Score"].map("{:.2f}".format)
school_spending_summary_df["Average Reading Score"] = school_spending_summary_df["Average Reading Score"].map("{:.2f}".format)
school_spending_summary_df["% Passing Math"] = school_spending_summary_df["% Passing Math"].map("{:.2f}".format)
school_spending_summary_df["% Passing Reading"] = school_spending_summary_df["% Passing Reading"].map("{:.2f}".format)
school_spending_summary_df["% Overall Passing"] = school_spending_summary_df["% Overall Passing"].map("{:.2f}".format)

# show the dataframe output
school_spending_summary_df

## Scores by School Size

* Perform the same operations as above, based on school size.

In [None]:
# set bins
bins = [0, 999.9, 1999.9, 5000]

# set bin labels
bin_names = ["Small (<1000)", "Medium (1000-2000)", "Large (2000-5000)"]

In [None]:
# store select columns from complete data in new df for manipulation
school_size_df = school_data_complete.loc[:,["math_score","reading_score","Overall Passing","size"]]

# cut into school size bins & create new column
school_size_df["School Size"] = pd.cut(school_size_df["size"], bins, labels=bin_names, include_lowest=True)

# create group by of school size bins
school_size_group = school_size_df.groupby("School Size")

In [None]:
# calculate average math & reading scores for the groupby & rename columns for better descriptions
size_grades_avg_df = school_size_group[["math_score","reading_score"]].mean() \
                        .rename(columns={"math_score":"Average Math Score","reading_score":"Average Reading Score"})

In [None]:
# calculate the percent passing math & reading individually using lambda function on the groupby & rename coluumns for better descriptions
size_pass_perc = school_size_group[["math_score","reading_score"]].apply(lambda x: ((x>=70).sum()) / x.count() * 100) \
                    .rename(columns={"math_score":"% Passing Math","reading_score":"% Passing Reading"})

# merge average scores with the individual passing percentages
school_size_summary_df = pd.merge(size_grades_avg_df, size_pass_perc, how="left", on="School Size")

In [None]:
# calculate overall passing percentage on the groupby 
# sum only counts True values, count counts both True and False values
size_both_pass = school_size_group["Overall Passing"].sum() / school_size_group["Overall Passing"].count() * 100

In [None]:
# merge average scores + individual passing percentages with overall passing percentage & rename columns for better descriptions
school_size_summary_df = pd.merge(school_size_summary_df, size_both_pass, how="left", on="School Size") \
                            .rename(columns={"Overall Passing":"% Overall Passing"})

In [None]:
# format all columns as needed
school_size_summary_df["Average Math Score"] = school_size_summary_df["Average Math Score"].map("{:.6f}".format)
school_size_summary_df["Average Reading Score"] = school_size_summary_df["Average Reading Score"].map("{:.6f}".format)
school_size_summary_df["% Passing Math"] = school_size_summary_df["% Passing Math"].map("{:.6f}".format)
school_size_summary_df["% Passing Reading"] = school_size_summary_df["% Passing Reading"].map("{:.6f}".format)
school_size_summary_df["% Overall Passing"] = school_size_summary_df["% Overall Passing"].map("{:.6f}".format)

# show the dataframe output
school_size_summary_df

## Scores by School Type

* Perform the same operations as above, based on school type

In [None]:
# store select columns from complete data in new df for manipulation & groupby type
school_type_group = school_data_complete[["math_score","reading_score","Overall Passing","type"]].groupby(["type"])

In [None]:
# find the average math and reading score for each school type & rename columns for better descriptions
school_type_avgs_df = school_type_group[["math_score","reading_score"]].mean() \
                        .rename(columns={"math_score":"Average Math Score","reading_score":"Average Reading Score"})

In [None]:
# find percent who pass math & reading individually using lambda function & rename columns for better descriptions
school_type_pass_df = school_type_group[["math_score","reading_score"]].apply(lambda x: ((x>=70).sum()) / x.count() * 100) \
                        .rename(columns={"math_score":"% Passing Math","reading_score":"% Passing Reading"})

In [None]:
# calculate percent of students who are passing overall in groupby
# sum only counts True values, count counts both True and False values
school_type_pass_perc = school_type_group["Overall Passing"].sum() / school_type_group["Overall Passing"].count() * 100

In [None]:
# create new dataframe by merging average grades & individual pass percentages
school_type_df = pd.merge(school_type_avgs_df, school_type_pass_df, on="type")

# merge overall pass percent into the new dataframe from above & reset the index
school_type_df = pd.merge(school_type_df, school_type_pass_perc, on="type").reset_index()

# rename columns for better descriptions & set School Type as index
school_type_df = school_type_df.rename(columns={"type":"School Type", "Overall Passing":"% Overall Passing"}).set_index("School Type")

In [None]:
# format all columns as needed
school_type_df["Average Math Score"] = school_type_df["Average Math Score"].map("{:.6f}".format)
school_type_df["Average Reading Score"] = school_type_df["Average Reading Score"].map("{:.6f}".format)
school_type_df["% Passing Math"] = school_type_df["% Passing Math"].map("{:.2f}".format)
school_type_df["% Passing Reading"] = school_type_df["% Passing Reading"].map("{:.6f}".format)
school_type_df["% Overall Passing"] = school_type_df["% Overall Passing"].map("{:.6f}".format)

# show dataframe output
school_type_df