# PyCity Schools Analysis

In [228]:
# Dependencies and Setup
import pandas as pd
from pathlib import Path

# File to Load (Remember to Change These)
school_data_to_load = Path("../Resources/schools_complete.csv")
student_data_to_load = Path("../Resources/students_complete.csv")

# Read School and Student Data File and store into Pandas DataFrames
schools_df = pd.read_csv(school_data_to_load)
student_df = pd.read_csv(student_data_to_load)

# Combine the data into a single dataset.  
data_complete = pd.merge(student_df, schools_df, how="outer", on=["school_name", "school_name"])

Huang, Figueroa, Shelton, Hernandez, Griffin, Wilson, Cabrera, Bailey, Holden, Pena, Wright, Rodriguez, Johnson, Ford, Thomas

There are 15 schools - store in a variable

In [229]:
school_count = len(schools_df["school_name"])

data_complete["Student ID"].value_counts()
data_complete.describe()

this shows 39,170 students

length of the Student ID column is 39170   same for all columns

In [230]:
student_count = len(student_df["Student ID"].unique())
print(student_count)

student_name_count = len(student_df["student_name"].unique())
print(student_name_count)

39170
32715


there are fewer unique student names than there are student IDs
I'm sure that some students have the same name, but this is a big difference

In [231]:
data_inner = pd.merge(student_df, schools_df, on=["school_name"])


get enrollment count at each school from the size column

In [232]:
enrollments= schools_df["size"].unique()
print(enrollments)
enrollments_total = enrollments.sum()
print(enrollments_total)



[2917 2949 1761 4635 1468 2283 1858 4976  427  962 1800 3999 4761 2739
 1635]
39170


the sum of the individual sizes of the schools is 39170
student_count = 39170

let's do the same thing for the budget

In [233]:
budgets_all = schools_df["budget"].unique()
print(budgets_all)

budgets_total = budgets_all.sum()
print(budgets_total)


[1910635 1884411 1056600 3022020  917500 1319574 1081356 3124928  248087
  585858 1049400 2547363 3094650 1763916 1043130]
24649428


There are 15 unique budget values.  This is as expected
The total is 24649428 stored in budgets_total

for the average math scores, we'll want to look at the merged datafile

In [234]:
avg_reading_score = data_complete["reading_score"].mean()
print(avg_reading_score)

avg_math_score = data_complete["math_score"].mean()
print(avg_math_score)

81.87784018381414
78.98537145774827


overall average math score stored in avg_math_score
overall average reading score stored in ave_reading_score

Creating a new column for pass or fail math score with 60 and higher being pass and lower being fail
and one for reading also

In [235]:

data_complete["math_pass"] = data_complete["math_score"].apply(lambda x:"pass" if x>=60 else "fail")

data_complete["reading_pass"] = data_complete["reading_score"].apply(lambda x:"pass" if x>=60 else "fail")

math_num_pass = data_complete["math_pass"].value_counts()
reading_num_pass = data_complete["reading_pass"].value_counts()

print(math_num_pass)
print(reading_num_pass)

data_complete.head()

pass    36211
fail     2959
Name: math_pass, dtype: int64
pass    39170
Name: reading_pass, dtype: int64


Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget,math_pass,reading_pass
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635,pass,pass
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635,pass,pass
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635,pass,pass
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635,fail,pass
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635,pass,pass


get percents passing
math_percent_pass
reading_percent_pass
overall_percent_pass

In [236]:
math_percent_pass = (math_num_pass / student_count)*100
reading_percent_pass = (reading_num_pass / student_count)*100

def get_status(row):
    if row["math_pass"] == "pass" and row["reading_pass"] == "pass":
        return "pass"
    else:
        return "fail"

data_complete["overall pass"] = data_complete.apply(lambda row: get_status(row), axis=1)

overall_num_pass = data_complete["overall pass"].value_counts()

print(overall_num_pass)

overall_percent_pass = (overall_num_pass / student_count)*100

print(math_percent_pass)
print(reading_percent_pass)
print(overall_percent_pass)


pass    36211
fail     2959
Name: overall pass, dtype: int64
pass    92.445749
fail     7.554251
Name: math_pass, dtype: float64
pass    100.0
Name: reading_pass, dtype: float64
pass    92.445749
fail     7.554251
Name: overall pass, dtype: float64


In [237]:
print("District Summary")
print("There are ", school_count, "schools in the district.")
print("There are ", student_count, "total students.")
print("The total district budget is $", budgets_total, ".")
print("The average math score for the district is ", avg_math_score, ".")
print("The average reading score for the district is ", avg_reading_score, ".")
print("Percentage of students passing math is ", math_percent_pass, "%.")
print("Percentage of students passing reading is ", reading_percent_pass, "%.")
print("Percentage of students passing both math and readig is ", overall_percent_pass, "%.")

District Summary
There are  15 schools in the district.
There are  39170 total students.
The total district budget is $ 24649428 .
The average math score for the district is  78.98537145774827 .
The average reading score for the district is  81.87784018381414 .
Percentage of students passing math is  pass    92.445749
fail     7.554251
Name: math_pass, dtype: float64 %.
Percentage of students passing reading is  pass    100.0
Name: reading_pass, dtype: float64 %.
Percentage of students passing both math and readig is  pass    92.445749
fail     7.554251
Name: overall pass, dtype: float64 %.


Make a dataframe for each individual school

In [238]:
school_name_df = data_complete.set_index("school_name")

Huang_df = school_name_df.loc["Huang High School", :]


In [270]:
grouped_school_totals = data_complete.groupby(["school_name","math_pass"])
math_pass_school_df = pd.DataFrame(grouped_school_totals[["math_pass"]].count())

grouped_school_totals2 = data_complete.groupby(["school_name","reading_pass"])
reading_pass_school_df = pd.DataFrame(grouped_school_totals2[["reading_pass"]].count())

math_pass_school1_df = math_pass_school_df.rename(columns = {"math_pass":"count"})


mpass_lib=[math_pass_school1_df.iloc[1,0],math_pass_school1_df.iloc[2,0],math_pass_school1_df.iloc[4,0],math_pass_school1_df.iloc[6,0],
           math_pass_school1_df.iloc[7,0],math_pass_school1_df.iloc[9,0],math_pass_school1_df.iloc[10,0],math_pass_school1_df.iloc[12,0],
           math_pass_school1_df.iloc[14,0],math_pass_school1_df.iloc[15,0],math_pass_school1_df.iloc[17,0],math_pass_school1_df.iloc[18,0],
           math_pass_school1_df.iloc[19,0],math_pass_school1_df.iloc[20,0],math_pass_school1_df.iloc[21,0]]

math_pass_school1_df


Unnamed: 0_level_0,Unnamed: 1_level_0,count
school_name,math_pass,Unnamed: 2_level_1
Bailey High School,fail,521
Bailey High School,pass,4455
Cabrera High School,pass,1858
Figueroa High School,fail,341
Figueroa High School,pass,2608
Ford High School,fail,293
Ford High School,pass,2446
Griffin High School,pass,1468
Hernandez High School,fail,506
Hernandez High School,pass,4129


In [275]:
pass_merge_df = pd.merge(schools_df, math_pass_school1_df, on= "school_name", how = "inner")
pass_merge_df

Unnamed: 0,School ID,school_name,type,size,budget,count
0,0,Huang High School,District,2917,1910635,325
1,0,Huang High School,District,2917,1910635,2592
2,1,Figueroa High School,District,2949,1884411,341
3,1,Figueroa High School,District,2949,1884411,2608
4,2,Shelton High School,Charter,1761,1056600,1761
5,3,Hernandez High School,District,4635,3022020,506
6,3,Hernandez High School,District,4635,3022020,4129
7,4,Griffin High School,Charter,1468,917500,1468
8,5,Wilson High School,Charter,2283,1319574,2283
9,6,Cabrera High School,Charter,1858,1081356,1858


In [None]:
school_averages_df = pd.DataFrame(grouped_school_totals[["math_score", "reading_score"]].mean())
print("average scores")
school_averages_df["num_pass_math"] = mpass_lib
school_averages_df

average scores


ValueError: Length of values (15) does not match length of index (22)

In [268]:
merge_1_df = pd.merge(schools_df, school_totals_df, on= "school_name", how = "outer")
merge_1_df["% passing math"]=merge_1_df["math_pass"]/merge_1_df["size"]*100
merge_1_df["% passing reading"]=merge_1_df["reading_pass"]/merge_1_df["size"]*100

merge_1_df.head()

Unnamed: 0,School ID,school_name,type,size,budget,math_pass,reading_pass,overall pass,% passing math,% passing reading
0,0,Huang High School,District,2917,1910635,2917,2917,2917,100.0,100.0
1,1,Figueroa High School,District,2949,1884411,2949,2949,2949,100.0,100.0
2,2,Shelton High School,Charter,1761,1056600,1761,1761,1761,100.0,100.0
3,3,Hernandez High School,District,4635,3022020,4635,4635,4635,100.0,100.0
4,4,Griffin High School,Charter,1468,917500,1468,1468,1468,100.0,100.0


In [202]:
merge_2_df = pd.merge(merge_1_df, school_averages_df, on="school_name", how = "outer")
merge_2_df.head()

Unnamed: 0,School ID,school_name,type,size,budget,math_pass,reading_pass,overall pass,% passing math,% passing reading,math_score,reading_score
0,0,Huang High School,District,2917,1910635,2917,2917,2917,100.0,100.0,76.629414,81.182722
1,1,Figueroa High School,District,2949,1884411,2949,2949,2949,100.0,100.0,76.711767,81.15802
2,2,Shelton High School,Charter,1761,1056600,1761,1761,1761,100.0,100.0,83.359455,83.725724
3,3,Hernandez High School,District,4635,3022020,4635,4635,4635,100.0,100.0,77.289752,80.934412
4,4,Griffin High School,Charter,1468,917500,1468,1468,1468,100.0,100.0,83.351499,83.816757


In [203]:
merge_2_df["Per student budget"]=merge_2_df["budget"]/merge_2_df["size"]
merge_2_df.head()

Unnamed: 0,School ID,school_name,type,size,budget,math_pass,reading_pass,overall pass,% passing math,% passing reading,math_score,reading_score,Per student budget
0,0,Huang High School,District,2917,1910635,2917,2917,2917,100.0,100.0,76.629414,81.182722,655.0
1,1,Figueroa High School,District,2949,1884411,2949,2949,2949,100.0,100.0,76.711767,81.15802,639.0
2,2,Shelton High School,Charter,1761,1056600,1761,1761,1761,100.0,100.0,83.359455,83.725724,600.0
3,3,Hernandez High School,District,4635,3022020,4635,4635,4635,100.0,100.0,77.289752,80.934412,652.0
4,4,Griffin High School,Charter,1468,917500,1468,1468,1468,100.0,100.0,83.351499,83.816757,625.0


Now cleaning the dataframe, renaming columnns, organizing, deleting unneeded columns
first - indexing by school name

In [204]:
full_school_df = merge_2_df.set_index("school_name")
del full_school_df["School ID"]
renamed_df = full_school_df.rename(columns = {"type":"School Type", "size":"Total Students", "budget":"Total school budget", "math_score":"Average math score",
                                       "reading_score":"Average reading score"})
organized_df=renamed_df[["School Type","Total Students", "Total school budget", "Per student budget", "Average math score", "Average reading score", "% passing math", "% passing reading"]]
organized_df

Unnamed: 0_level_0,School Type,Total Students,Total school budget,Per student budget,Average math score,Average reading score,% passing math,% passing reading
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Huang High School,District,2917,1910635,655.0,76.629414,81.182722,100.0,100.0
Figueroa High School,District,2949,1884411,639.0,76.711767,81.15802,100.0,100.0
Shelton High School,Charter,1761,1056600,600.0,83.359455,83.725724,100.0,100.0
Hernandez High School,District,4635,3022020,652.0,77.289752,80.934412,100.0,100.0
Griffin High School,Charter,1468,917500,625.0,83.351499,83.816757,100.0,100.0
Wilson High School,Charter,2283,1319574,578.0,83.274201,83.989488,100.0,100.0
Cabrera High School,Charter,1858,1081356,582.0,83.061895,83.97578,100.0,100.0
Bailey High School,District,4976,3124928,628.0,77.048432,81.033963,100.0,100.0
Holden High School,Charter,427,248087,581.0,83.803279,83.814988,100.0,100.0
Pena High School,Charter,962,585858,609.0,83.839917,84.044699,100.0,100.0


formatting values in this data frame
#

def format_int():
    return "{:,}".format()

def format_per():
    return "{:.1f}%".format()

def format_budg():
    return "${:,}".format()

def format_psbudget():
    return "${.0f}".format()

organized_df.loc[:,"Total Students"] = organized_df.loc[:,"Total Students"].apply(format_int)


In [None]:

organized_df["Total Students"] = organized_df["Total Students"].map("{:,}".format)
organized_df["Total school budget"] = organized_df["Total school budget"].map("${:,}".format)
organized_df["Per student budget"] = organized_df["Per student budget"].map("${:.0f}".format)
organized_df["Average math score"] = organized_df["Average math score"].map("{:.1f}%".format)
organized_df["Average reading score"] = organized_df["Average reading score"].map("{:.1f}%".format)
organized_df["% passing math"] = organized_df["% passing math"].map("{:.1f}%".format)
organized_df["% passing reading"] = organized_df["% passing reading"].map("{:.1f}%".format)
organized_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  organized_df["Total Students"] = organized_df["Total Students"].map("{:,}".format)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  organized_df["Total school budget"] = organized_df["Total school budget"].map("${:,}".format)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  organized_df["Per student bu

Unnamed: 0_level_0,School Type,Total Students,Total school budget,Per student budget,Average math score,Average reading score,% passing math,% passing reading
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Huang High School,District,2917,"$1,910,635",$655,76.6%,81.2%,100.0%,100.0%
Figueroa High School,District,2949,"$1,884,411",$639,76.7%,81.2%,100.0%,100.0%
Shelton High School,Charter,1761,"$1,056,600",$600,83.4%,83.7%,100.0%,100.0%
Hernandez High School,District,4635,"$3,022,020",$652,77.3%,80.9%,100.0%,100.0%
Griffin High School,Charter,1468,"$917,500",$625,83.4%,83.8%,100.0%,100.0%
Wilson High School,Charter,2283,"$1,319,574",$578,83.3%,84.0%,100.0%,100.0%
Cabrera High School,Charter,1858,"$1,081,356",$582,83.1%,84.0%,100.0%,100.0%
Bailey High School,District,4976,"$3,124,928",$628,77.0%,81.0%,100.0%,100.0%
Holden High School,Charter,427,"$248,087",$581,83.8%,83.8%,100.0%,100.0%
Pena High School,Charter,962,"$585,858",$609,83.8%,84.0%,100.0%,100.0%
