# PyCity Schools Analysis

There is a wealth of information about the schools in this district contained in these files.  The district consists of 15 different high schools that vary in size, and type.  The 15 schools have a combined enrollment of 39,170 students, and a total budget of $24,649,428.   

In [2]:
# Dependencies and Setup
import pandas as pd
from pathlib import Path

# File to Load (Remember to Change These)
school_data_to_load = Path("../Resources/schools_complete.csv")
student_data_to_load = Path("../Resources/students_complete.csv")

# Read School and Student Data File and store into Pandas DataFrames
schools_df = pd.read_csv(school_data_to_load)
student_df = pd.read_csv(student_data_to_load)

# Combine the data into a single dataset.  
data_complete = pd.merge(student_df, schools_df, how="outer", on=["school_name", "school_name"])

Huang, Figueroa, Shelton, Hernandez, Griffin, Wilson, Cabrera, Bailey, Holden, Pena, Wright, Rodriguez, Johnson, Ford, Thomas

There are 15 schools - store in a variable

In [3]:
school_count = len(schools_df["school_name"])
student_count = len(student_df["Student ID"].unique())
print(student_count)

39170


In [4]:
budgets_all = schools_df["budget"].unique()
print(budgets_all)

budgets_total = budgets_all.sum()
print(budgets_total)


[1910635 1884411 1056600 3022020  917500 1319574 1081356 3124928  248087
  585858 1049400 2547363 3094650 1763916 1043130]
24649428


In [5]:
avg_reading_score = data_complete["reading_score"].mean()
print(avg_reading_score)

avg_math_score = data_complete["math_score"].mean()
print(avg_math_score)

81.87784018381414
78.98537145774827


length of the Student ID column is 39170   and this is the same length for all columns

Creating a new column for pass or fail math score with 70 and higher being pass and lower being fail
and one for reading also

In [6]:

data_complete["math_pass"] = data_complete["math_score"].apply(lambda x:"pass" if x>=70 else "fail")

data_complete["reading_pass"] = data_complete["reading_score"].apply(lambda x:"pass" if x>=70 else "fail")

math_num_pass = data_complete["math_pass"].value_counts()
reading_num_pass = data_complete["reading_pass"].value_counts()

get percents passing
math_percent_pass
reading_percent_pass
overall_percent_pass

In [7]:
math_percent_pass = (math_num_pass / student_count)*100
reading_percent_pass = (reading_num_pass / student_count)*100

def get_status(row):
    if row["math_pass"] == "pass" and row["reading_pass"] == "pass":
        return "pass"
    
        

data_complete["overall pass"] = data_complete.apply(lambda row: get_status(row), axis=1)

overall_num_pass = data_complete["overall pass"].value_counts()

overall_percent_pass = (overall_num_pass / student_count)*100


In [14]:
district_summary = pd.DataFrame([{"Total Schools":school_count,"Total Students":student_count,
                                  "Total Budget":budgets_total,"Average Math Score": avg_math_score,
                                  "Average Reading Score":avg_reading_score,"% Passing Math":math_percent_pass,
                                  "% Passing Reading":reading_percent_pass,"% Overall Passing":overall_percent_pass}])

district_summary["Total Students"] = district_summary["Total Students"].map("{:,}".format)
district_summary["Total Budget"] = district_summary["Total Budget"].map("${:,.2f}".format)

district_summary

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
0,15,39170,"$24,649,428.00",78.985371,81.87784,math_pass pass 74.980853 fail 25.019147 ...,reading_pass pass 85.805463 fail 14.1945...,"overall pass pass 65.172326 Name: count, dt..."


Make a dataframe for each individual school

In [5]:
school_name_df = data_complete.set_index("school_name")

Huang_df = school_name_df.loc["Huang High School", :]

In [6]:
grouped_school_totals = data_complete.groupby(["school_name","math_pass"])
math_pass_school_df = pd.DataFrame(grouped_school_totals[["math_pass"]].count())

grouped_school_totals2 = data_complete.groupby(["school_name","reading_pass"])
reading_pass_school_df = pd.DataFrame(grouped_school_totals2[["reading_pass"]].count())

math_pass_school1_df = math_pass_school_df.rename(columns = {"math_pass":"math_pass_or_fail"})
read_pass_school1_df = reading_pass_school_df.rename(columns = {"reading_pass":"reading_pass_or_fail"})

In [7]:
pass_merge_df = pd.merge(schools_df, math_pass_school1_df, on= "school_name", how = "inner")
pass_merge2_df = pd.merge(pass_merge_df,read_pass_school1_df, on="school_name",how = "inner")


# drop the duplicate rows with the smaller number - the fails
pass_drop_df=pass_merge2_df.drop_duplicates(subset="school_name", keep="last")

renamed_pass_df = pass_drop_df.rename(columns={"math_pass_or_fail":"Num_pass_math","reading_pass_or_fail":"Num_pass_read"})

renamed_pass_df["% passing math"]=renamed_pass_df["Num_pass_math"]/renamed_pass_df["size"]*100
renamed_pass_df["% passing reading"]=renamed_pass_df["Num_pass_read"]/renamed_pass_df["size"]*100

#add column for per student budget
renamed_pass_df["Per student budget"]=renamed_pass_df["budget"]/renamed_pass_df["size"]

In [8]:
avg_school_scorem = data_complete.groupby(["school_name"])
avg_math_scores_df = pd.DataFrame(avg_school_scorem[["math_score"]].mean())

avg_read_scores_df =pd.DataFrame(avg_school_scorem[["reading_score"]].mean())

merge_scores_df = pd.merge(avg_math_scores_df,avg_read_scores_df,on = "school_name")

average_scores_df = merge_scores_df.rename(columns={"math_score":"Average math score","reading_score":"Average reading score"})

In [9]:
final_merge_df = pd.merge(renamed_pass_df, average_scores_df, on= "school_name", how = "outer")

Now cleaning the dataframe, renaming columnns, organizing, deleting unneeded columns
first - indexing by school name

In [10]:
full_school_df = final_merge_df.set_index("school_name")
del full_school_df["School ID"]
renamed_df = full_school_df.rename(columns = {"type":"School Type", "size":"Total Students", "budget":"Total school budget", "math_score":"Average math score",
                                       "reading_score":"Average reading score"})
organized_df=renamed_df[["School Type","Total Students", "Total school budget", "Per student budget", "Average math score", "Average reading score", "% passing math", "% passing reading"]]

formatting values in this data frame
#

In [11]:

organized_df["Total Students"] = organized_df["Total Students"].map("{:,}".format)
organized_df["Total school budget"] = organized_df["Total school budget"].map("${:,}".format)
organized_df["Per student budget"] = organized_df["Per student budget"].map("${:.0f}".format)
organized_df["Average math score"] = organized_df["Average math score"].map("{:.1f}%".format)
organized_df["Average reading score"] = organized_df["Average reading score"].map("{:.1f}%".format)
organized_df["% passing math"] = organized_df["% passing math"].map("{:.1f}%".format)
organized_df["% passing reading"] = organized_df["% passing reading"].map("{:.1f}%".format)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  organized_df["Total Students"] = organized_df["Total Students"].map("{:,}".format)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  organized_df["Total school budget"] = organized_df["Total school budget"].map("${:,}".format)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  organized_df["Per student bu

In [12]:
organized_df["% overall passing"]=organized_df["% passing math"]
organized_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  organized_df["% overall passing"]=organized_df["% passing math"]


Unnamed: 0_level_0,School Type,Total Students,Total school budget,Per student budget,Average math score,Average reading score,% passing math,% passing reading,% overall passing
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Huang High School,District,2917,"$1,910,635",$655,76.6%,81.2%,65.7%,81.3%,65.7%
Figueroa High School,District,2949,"$1,884,411",$639,76.7%,81.2%,66.0%,80.7%,66.0%
Shelton High School,Charter,1761,"$1,056,600",$600,83.4%,83.7%,93.9%,95.9%,93.9%
Hernandez High School,District,4635,"$3,022,020",$652,77.3%,80.9%,66.8%,80.9%,66.8%
Griffin High School,Charter,1468,"$917,500",$625,83.4%,83.8%,93.4%,97.1%,93.4%
Wilson High School,Charter,2283,"$1,319,574",$578,83.3%,84.0%,93.9%,96.5%,93.9%
Cabrera High School,Charter,1858,"$1,081,356",$582,83.1%,84.0%,94.1%,97.0%,94.1%
Bailey High School,District,4976,"$3,124,928",$628,77.0%,81.0%,66.7%,81.9%,66.7%
Holden High School,Charter,427,"$248,087",$581,83.8%,83.8%,92.5%,96.3%,92.5%
Pena High School,Charter,962,"$585,858",$609,83.8%,84.0%,94.6%,95.9%,94.6%
