In [1]:
# Dependencies and Setup
import pandas as pd

# File to Load
school_data_to_load = "Resources/schools_complete.csv"
student_data_to_load = "Resources/students_complete.csv"

# Read School and Student Data File and store into Pandas DataFrames
school_data = pd.read_csv(school_data_to_load, encoding="utf-8")
student_data = pd.read_csv(student_data_to_load, encoding="utf-8")

In [2]:
# Combine the data into a single dataset.  
school_data_complete = pd.merge(student_data, school_data, how="left", on=["school_name", "school_name"])

In [3]:
# Set District ID for groupby
school_data_complete["District ID"] = 0

# Calculate Per Student Budget
budget_per_student = school_data_complete["budget"]/school_data_complete["size"]
school_data_complete["budget_per_student"] = budget_per_student

District Summary

In [4]:
# Calculate District Summary
district_group = school_data_complete.groupby(["District ID"])
total_schools = district_group["school_name"].nunique()
total_students = district_group["Student ID"].nunique()
total_budget = school_data["budget"].sum()
avg_math_score = district_group["math_score"].mean()
avg_reading_score = district_group["reading_score"].mean()
passed_math = school_data_complete[school_data_complete["math_score"] >= 70].groupby(["District ID"]).size()
passing_math_rate = passed_math/total_students * 100
passed_reading = school_data_complete[school_data_complete["reading_score"] >= 70].groupby(["District ID"]).size()
passing_reading_rate = passed_reading/total_students * 100
passed_both = school_data_complete[(school_data_complete["math_score"] >= 70) & 
                                    (school_data_complete["reading_score"] >= 70)].groupby(["District ID"]).size()
overall_passing_rate = passed_both/total_students * 100

In [5]:
# District Summary dataframe
district_summary = pd.DataFrame({
    "Total Schools": total_schools,
    "Total Students": total_students,
    "Total Budget": total_budget,
    "Average Math Score": avg_math_score,
    "Average Reading Score": avg_reading_score,
    "% Passing Math": passing_math_rate,
    "% Passing Reading": passing_reading_rate,
    "% Overall Passing": overall_passing_rate
})

In [6]:
# Use Map to format columns
district_summary["Total Students"] = district_summary["Total Students"].map("{:,}".format)
district_summary["Total Budget"] = district_summary["Total Budget"].map("${:,}".format)
district_summary["Average Math Score"] = district_summary["Average Math Score"].map("{:.2f}".format)
district_summary["Average Reading Score"] = district_summary["Average Reading Score"].map("{:.2f}".format)
district_summary["% Passing Math"] = district_summary["% Passing Math"].map("{:.2f}".format)
district_summary["% Passing Reading"] = district_summary["% Passing Reading"].map("{:.2f}".format)
district_summary["% Overall Passing"] = district_summary["% Overall Passing"].map("{:.2f}".format)
district_summary.rename_axis(None, axis = 0)

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
0,15,39170,"$24,649,428",78.99,81.88,74.98,85.81,65.17


School Summary

In [7]:
# Calculate School Summary
school_group = school_data_complete.groupby(["School ID"])
school_name = school_group["school_name"].unique().str.get(0)
school_type = school_group["type"].unique().str.get(0)
total_students = school_group.size().astype(int)
total_budget = school_group["budget"].unique().astype(int)
budget_per_student = school_group["budget_per_student"].unique().astype(int)
avg_math_score = school_group["math_score"].mean()
avg_reading_score = school_group["reading_score"].mean()
passed_math = school_data_complete[school_data_complete["math_score"] >= 70].groupby(["School ID"]).size()
passing_math_rate = passed_math/total_students * 100
passed_reading = school_data_complete[school_data_complete["reading_score"] >= 70].groupby(["School ID"]).size()
passing_reading_rate = passed_reading/total_students * 100
passed_both = school_data_complete[(school_data_complete["math_score"] >= 70) & 
                                    (school_data_complete["reading_score"] >= 70)].groupby(["School ID"]).size()
overall_passing_rate = passed_both/total_students * 100

In [8]:
# School Summary dataframe
school_summary = pd.DataFrame({
    "School": school_name,
    "School Type": school_type,
    "Total Students": total_students,
    "Total School Budget": total_budget,
    "Per Student Budget": budget_per_student,
    "Average Math Score": avg_math_score,
    "Average Reading Score": avg_reading_score,
    "% Passing Math": passing_math_rate,
    "% Passing Reading": passing_reading_rate,
    "% Overall Passing": overall_passing_rate
})
school_summary = school_summary.sort_values("School",ascending=True)

In [9]:
# Use the School Summary dataframe for the Scores analyses below
spend_df=school_summary.copy()
size_df=school_summary.copy()
type_df=school_summary.copy()

In [10]:
# Remove unecessary columns from the DataFrame and save the new DataFrame
spend_df = school_summary[["Per Student Budget", "Average Math Score", "Average Reading Score", "% Passing Math", "% Passing Reading",
                       "% Overall Passing"]]
size_df = school_summary[["Total Students", "Average Math Score", "Average Reading Score", "% Passing Math", "% Passing Reading",
                       "% Overall Passing"]]
type_df = school_summary[["School Type", "Average Math Score", "Average Reading Score", "% Passing Math", "% Passing Reading",
                       "% Overall Passing"]]

In [11]:
# Use Map to format columns
school_summary["Total Students"] = school_summary["Total Students"].map("{:,}".format)
school_summary["Total School Budget"] = school_summary["Total School Budget"].map("${:,}".format)
school_summary["Per Student Budget"] = school_summary["Per Student Budget"].map("${:.2f}".format)
school_summary["Average Math Score"] = school_summary["Average Math Score"].map("{:.2f}".format)
school_summary["Average Reading Score"] = school_summary["Average Reading Score"].map("{:.2f}".format)
school_summary["% Passing Math"] = school_summary["% Passing Math"].map("{:.2f}".format)
school_summary["% Passing Reading"] = school_summary["% Passing Reading"].map("{:.2f}".format)
school_summary["% Overall Passing"] = school_summary["% Overall Passing"].map("{:.2f}".format)
school_summary.rename_axis(None, axis = 0)

Unnamed: 0,School,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
7,Bailey High School,District,4976,"$3,124,928",$628.00,77.05,81.03,66.68,81.93,54.64
6,Cabrera High School,Charter,1858,"$1,081,356",$582.00,83.06,83.98,94.13,97.04,91.33
1,Figueroa High School,District,2949,"$1,884,411",$639.00,76.71,81.16,65.99,80.74,53.2
13,Ford High School,District,2739,"$1,763,916",$644.00,77.1,80.75,68.31,79.3,54.29
4,Griffin High School,Charter,1468,"$917,500",$625.00,83.35,83.82,93.39,97.14,90.6
3,Hernandez High School,District,4635,"$3,022,020",$652.00,77.29,80.93,66.75,80.86,53.53
8,Holden High School,Charter,427,"$248,087",$581.00,83.8,83.81,92.51,96.25,89.23
0,Huang High School,District,2917,"$1,910,635",$655.00,76.63,81.18,65.68,81.32,53.51
12,Johnson High School,District,4761,"$3,094,650",$650.00,77.07,80.97,66.06,81.22,53.54
9,Pena High School,Charter,962,"$585,858",$609.00,83.84,84.04,94.59,95.95,90.54


 Top Performing Schools (By % Overall Passing)

In [12]:
# Top Performing Schools (By % Overall Passing)
school_top5 = pd.DataFrame({
    "School": school_name,
    "School Type": school_type,
    "Total Students": total_students,
    "Total School Budget": total_budget,
    "Per Student Budget": budget_per_student,
    "Average Math Score": avg_math_score,
    "Average Reading Score": avg_reading_score,
    "% Passing Math": passing_math_rate,
    "% Passing Reading": passing_reading_rate,
    "% Overall Passing": overall_passing_rate
})
school_top5 = school_top5.nlargest(5,'% Overall Passing')

In [13]:
# Use Map to format columns
school_top5["Total Students"] = school_top5["Total Students"].map("{:,}".format)
school_top5["Total School Budget"] = school_top5["Total School Budget"].map("${:,}".format)
school_top5["Per Student Budget"] = school_top5["Per Student Budget"].map("${:.2f}".format)
school_top5["Average Math Score"] = school_top5["Average Math Score"].map("{:.2f}".format)
school_top5["Average Reading Score"] = school_top5["Average Reading Score"].map("{:.2f}".format)
school_top5["% Passing Math"] = school_top5["% Passing Math"].map("{:.2f}".format)
school_top5["% Passing Reading"] = school_top5["% Passing Reading"].map("{:.2f}".format)
school_top5["% Overall Passing"] = school_top5["% Overall Passing"].map("{:.2f}".format)
school_top5.rename_axis(None, axis = 0)

Unnamed: 0,School,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
6,Cabrera High School,Charter,1858,"$1,081,356",$582.00,83.06,83.98,94.13,97.04,91.33
14,Thomas High School,Charter,1635,"$1,043,130",$638.00,83.42,83.85,93.27,97.31,90.95
4,Griffin High School,Charter,1468,"$917,500",$625.00,83.35,83.82,93.39,97.14,90.6
5,Wilson High School,Charter,2283,"$1,319,574",$578.00,83.27,83.99,93.87,96.54,90.58
9,Pena High School,Charter,962,"$585,858",$609.00,83.84,84.04,94.59,95.95,90.54


Bottom Performing Schools (By % Overall Passing)

In [14]:
# Bottom Performing Schools (By % Overall Passing)
school_bot5 = pd.DataFrame({
    "School": school_name,
    "School Type": school_type,
    "Total Students": total_students,
    "Total School Budget": total_budget,
    "Per Student Budget": budget_per_student,
    "Average Math Score": avg_math_score,
    "Average Reading Score": avg_reading_score,
    "% Passing Math": passing_math_rate,
    "% Passing Reading": passing_reading_rate,
    "% Overall Passing": overall_passing_rate
})
school_bot5 = school_bot5.nsmallest(5,'% Overall Passing')

In [15]:
# Use Map to format columns
school_bot5["Total Students"] = school_bot5["Total Students"].map("{:,}".format)
school_bot5["Total School Budget"] = school_bot5["Total School Budget"].map("${:,}".format)
school_bot5["Per Student Budget"] = school_bot5["Per Student Budget"].map("${:.2f}".format)
school_bot5["Average Math Score"] = school_bot5["Average Math Score"].map("{:.2f}".format)
school_bot5["Average Reading Score"] = school_bot5["Average Reading Score"].map("{:.2f}".format)
school_bot5["% Passing Math"] = school_bot5["% Passing Math"].map("{:.2f}".format)
school_bot5["% Passing Reading"] = school_bot5["% Passing Reading"].map("{:.2f}".format)
school_bot5["% Overall Passing"] = school_bot5["% Overall Passing"].map("{:.2f}".format)
school_bot5.rename_axis(None, axis = 0)

Unnamed: 0,School,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
11,Rodriguez High School,District,3999,"$2,547,363",$637.00,76.84,80.74,66.37,80.22,52.99
1,Figueroa High School,District,2949,"$1,884,411",$639.00,76.71,81.16,65.99,80.74,53.2
0,Huang High School,District,2917,"$1,910,635",$655.00,76.63,81.18,65.68,81.32,53.51
3,Hernandez High School,District,4635,"$3,022,020",$652.00,77.29,80.93,66.75,80.86,53.53
12,Johnson High School,District,4761,"$3,094,650",$650.00,77.07,80.97,66.06,81.22,53.54


In [16]:
# Math Scores by Grade
school_group = school_data_complete.groupby(["School ID"])
school_name = school_group["school_name"].unique().str.get(0)

grade9 = school_data_complete.loc[school_data_complete["grade"] == "9th", 
                                        ["School ID","school_name","reading_score","math_score"]]
grade9_group = grade9.groupby(["School ID"])
math_avg9th = grade9_group["math_score"].mean()
reading_avg9th = grade9_group["reading_score"].mean()

grade10 = school_data_complete.loc[school_data_complete["grade"] == "10th", 
                                        ["School ID","school_name","reading_score","math_score"]]
grade10_group = grade10.groupby(["School ID"])
math_avg10th = grade10_group["math_score"].mean()
reading_avg10th = grade10_group["reading_score"].mean()

grade11 = school_data_complete.loc[school_data_complete["grade"] == "11th", 
                                        ["School ID","school_name","reading_score","math_score"]]
grade11_group = grade11.groupby(["School ID"])
math_avg11th = grade11_group["math_score"].mean()
reading_avg11th = grade11_group["reading_score"].mean()

grade12 = school_data_complete.loc[school_data_complete["grade"] == "12th", 
                                        ["School ID","school_name","reading_score","math_score"]]
grade12_group = grade12.groupby(["School ID"])
math_avg12th = grade12_group["math_score"].mean()
reading_avg12th = grade12_group["reading_score"].mean()

Math Scores by Grade

In [17]:
# School-Math Summary dataframe
school_math_avg = pd.DataFrame({
    "School": school_name,
    "9th": math_avg9th,
    "10th": math_avg10th,
    "11th": math_avg11th,
    "12th": math_avg12th
})
school_math_avg = school_math_avg.sort_values("School",ascending=True)
school_math_avg.rename_axis(None, axis = 0)

Unnamed: 0,School,9th,10th,11th,12th
7,Bailey High School,77.083676,76.996772,77.515588,76.492218
6,Cabrera High School,83.094697,83.154506,82.76556,83.277487
1,Figueroa High School,76.403037,76.539974,76.884344,77.151369
13,Ford High School,77.361345,77.672316,76.918058,76.179963
4,Griffin High School,82.04401,84.229064,83.842105,83.356164
3,Hernandez High School,77.438495,77.337408,77.136029,77.186567
8,Holden High School,83.787402,83.429825,85.0,82.855422
0,Huang High School,77.027251,75.908735,76.446602,77.225641
12,Johnson High School,77.187857,76.691117,77.491653,76.863248
9,Pena High School,83.625455,83.372,84.328125,84.121547


Reading Scores by Grade

In [18]:
# School-Reading Summary dataframe
school_reading_avg = pd.DataFrame({
    "School": school_name,
    "9th": reading_avg9th,
    "10th": reading_avg10th,
    "11th": reading_avg11th,
    "12th": reading_avg12th
})
school_reading_avg = school_reading_avg.sort_values("School",ascending=True)
school_reading_avg.rename_axis(None, axis = 0)

Unnamed: 0,School,9th,10th,11th,12th
7,Bailey High School,81.303155,80.907183,80.945643,80.912451
6,Cabrera High School,83.676136,84.253219,83.788382,84.287958
1,Figueroa High School,81.198598,81.408912,80.640339,81.384863
13,Ford High School,80.632653,81.262712,80.403642,80.662338
4,Griffin High School,83.369193,83.706897,84.288089,84.013699
3,Hernandez High School,80.86686,80.660147,81.39614,80.857143
8,Holden High School,83.677165,83.324561,83.815534,84.698795
0,Huang High School,81.290284,81.512386,81.417476,80.305983
12,Johnson High School,81.260714,80.773431,80.616027,81.227564
9,Pena High School,83.807273,83.612,84.335938,84.59116


Scores by School Spending

In [19]:
# Create the bins in which Data will be held
# Bins are 0, 584, 629, 644, 675   
bins = [0, 584, 629, 644, 675]

# Create the names for the five bins
group_labels = ["<$584", "$585-629", "630-644", "645-675"]

In [20]:
# Place the data series into a new column inside of the DataFrame
spend_df["Spending Ranges (Per Student)"] = pd.cut(spend_df["Per Student Budget"], bins, labels=group_labels)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [21]:
# Creating a group based off of the bins
spend_group = spend_df.groupby("Spending Ranges (Per Student)")
spend_group[["Average Math Score", "Average Reading Score", "% Passing Math", "% Passing Reading",
                       "% Overall Passing"]].mean()

Unnamed: 0_level_0,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
Spending Ranges (Per Student),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
<$584,83.455399,83.933814,93.460096,96.610877,90.369459
$585-629,81.899826,83.155286,87.133538,92.718205,81.418596
630-644,78.518855,81.624473,73.484209,84.391793,62.857656
645-675,76.99721,81.027843,66.164813,81.133951,53.526855


Scores by School Size

In [22]:
# Create the bins in which Data will be held
bins = [0, 1000, 2000, 5000]

# Create the names for the five bins
group_labels = ["Small (<1000)", "Medium (1000-2000)", "Large (2000-5000)"]

In [23]:
# Place the data series into a new column inside of the DataFrame
size_df["School Size"] = pd.cut(size_df["Total Students"], bins, labels=group_labels)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [24]:
# Creating a group based off of the bins
size_group = size_df.groupby("School Size")
size_group[["Average Math Score", "Average Reading Score", "% Passing Math", "% Passing Reading",
                       "% Overall Passing"]].mean()

Unnamed: 0_level_0,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
School Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Small (<1000),83.821598,83.929843,93.550225,96.099437,89.883853
Medium (1000-2000),83.374684,83.864438,93.599695,96.79068,90.621535
Large (2000-5000),77.746417,81.344493,69.963361,82.766634,58.286003


Scores by School Type

In [25]:
# Creating group by School Type
type_group = type_df.groupby("School Type")
type_group[["Average Math Score", "Average Reading Score", "% Passing Math", "% Passing Reading",
                       "% Overall Passing"]].mean()

Unnamed: 0_level_0,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
School Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Charter,83.473852,83.896421,93.62083,96.586489,90.432244
District,76.956733,80.966636,66.548453,80.799062,53.672208


Observations:

Charter type schools are outperforming District type schools in all categories. See results of the Scores by School Type and the Top 5 and Bottom 5 performance schools.

The schools with a higher budget per student are being outperformed by the schools with the lower budget per student. Spending more money per student does not guarantee higher performance in math and reading scores or passing.

The large size schools are also underperforming, especially in the % Overall Passing. Bigger is not always better.