In [1]:
import pandas as pd

In [2]:
schools_file = "Resources/schools_complete.csv"
schools_df = pd.read_csv(schools_file)

students_file ="Resources/students_complete.csv"
students_df = pd.read_csv(students_file)

merge_df = pd.merge(students_df, schools_df, how="left", on=["school_name","school_name"])
merge_df.head()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635


In [3]:
students_df.columns

Index(['Student ID', 'student_name', 'gender', 'grade', 'school_name',
       'reading_score', 'math_score'],
      dtype='object')

In [163]:
#Calculate the total number of schools
num_schools = merge_df["school_name"].nunique()
#Calculate the total number of students with ","
num_students = merge_df["Student ID"].nunique()
#Calculate the total budget in dollars
dist_budget = schools_df["budget"].sum()
#Calculate the average math score 
avg_score_math = merge_df["math_score"].mean()
#Calculate the average reading score
avg_score_read = merge_df["reading_score"].mean()
#Calculate the percentage of students with a passing math score (70 or greater)
tot_pass_math = merge_df["math_score"][merge_df.loc[:,"math_score"] >= 70].count()
pass_math = (tot_pass_math)/(merge_df["math_score"].count())*100
#Calculate the percentage of students with a passing reading score (70 or greater)
tot_pass_read = merge_df["reading_score"][merge_df.loc[:,"reading_score"] >= 70].count()
pass_read = (tot_pass_read)/(merge_df["reading_score"].count())*100
#Calculate the percentage of students who passed math **and** reading (% Overall Passing)
pass_both = merge_df[(merge_df["math_score"]>= 70) & (merge_df["reading_score"] >= 70)]
pass_both_group = len(pass_both)/num_students*100

In [170]:
#Create chart with info
dist_dict = {"Total Schools": [num_schools],
             "Total Students":[num_students],"Total Budget":[dist_budget],
            "Average Math Score":[avg_score_math],
             "Average Reading Score":[avg_score_read],"% Passing Math":[pass_math],
             "% Passing Reading":[pass_read], "% Overall Passing": [pass_both_group]}
dist_df=pd.DataFrame(data=dist_dict)
dist_df["Total Budget"]=dist_df["Total Budget"].map("${:,.2f}".format)
dist_df["Total Students"]=dist_df["Total Students"].map("{:,}".format)
dist_df
#Total Students should also display with "," 

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
0,15,39170,"$24,649,428.00",78.985371,81.87784,74.980853,85.805463,65.172326


In [179]:
#Create an overview table that summarizes key metrics about each school, including:
# School Name
schools_sort_df = schools_df.sort_values("school_name", ascending = True)
sch_group = merge_df.groupby(["school_name"])

# Average Math Score
sch_math = sch_group["math_score"].mean()
# Average Reading Score
sch_read = sch_group["reading_score"].mean()
# % Passing Math
#https://www.knytes.com/Cleaning-School-Data-with-Pandas/
sch_stud_total = sch_group["Student ID"].count()
students_sort_df = students_df.sort_values("school_name", ascending = True)
stu_pass_math = merge_df[merge_df["math_score"] >= 70].groupby(["school_name"])
sch_pass_math = [(i/j)*100 for i,j in zip(stu_pass_math.math_score.count(),sch_stud_total)]
# % Passing Reading
stu_pass_read = merge_df[merge_df["reading_score"] >= 70].groupby(["school_name"])
sch_pass_read = [(i/j)*100 for i,j in zip(stu_pass_read.reading_score.count(),sch_stud_total)]
# Per Student Budget
stu_budget = schools_sort_df["budget"]/schools_sort_df["size"]
# % Overall Passing (The percentage of students that passed math **and** reading.)
#Assist re: groupby function from:
#https://notebooks.githubusercontent.com/view/ipynb?browser=chrome&color_mode=auto&commit=7189be806a9ac33a129b6d8476f07d84946666d0&device=unknown&enc_url=68747470733a2f2f7261772e67697468756275736572636f6e74656e742e636f6d2f65726d69617367656c6179652f70616e6461732d6368616c6c656e67652f373138396265383036613961633333613132396236643834373666303764383439343636363664302f5079436974795363686f6f6c732f2e6970796e625f636865636b706f696e74732f5079436974795363686f6f6c735f737461727465722d636865636b706f696e742e6970796e62&logged_in=false&nwo=ermiasgelaye%2Fpandas-challenge&path=PyCitySchools%2F.ipynb_checkpoints%2FPyCitySchools_starter-checkpoint.ipynb&platform=android&repository_id=269505066&repository_type=Repository&version=96
pass_both_sch = merge_df[(merge_df["math_score"]>= 70) & (merge_df["reading_score"] >= 70)].\
groupby('school_name')['Student ID'].count()/sch_stud_total*100


In [180]:
sch_dict = {"School Name":schools_sort_df["school_name"],"School Type":schools_sort_df["type"], 
            "Total Students":schools_sort_df["size"],
            "Total School Budget":schools_sort_df["budget"], "Per Student Budget":stu_budget}
sch_df = pd.DataFrame(data=sch_dict)
stu_budget = sch_df["Total School Budget"]/sch_df["Total Students"]

In [181]:
sch_dict_2 = {"Total Students":sch_stud_total,"Average Math Score":sch_math, 
              "Average Reading Score":sch_read, 
              "% Passing Math":sch_pass_math, "% Passing Reading":sch_pass_read, 
              "% Overall Passing":pass_both_sch}
sch_df_2 =pd.DataFrame(data=sch_dict_2)

In [184]:
sch_sum_df = pd.merge(sch_df,sch_df_2, how="left", on=["Total Students","Total Students"])
sch_sum_df["Total Students"]=sch_sum_df["Total Students"].map("{:,}".format)
sch_sum_df["Total School Budget"]=sch_sum_df["Total School Budget"].map("${:,.2f}".format)
sch_sum_df["Per Student Budget"]=sch_sum_df["Per Student Budget"].map("${:,.2f}".format)
sch_sum_df

Unnamed: 0,School Name,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
0,Bailey High School,District,4976,"$3,124,928.00",$628.00,77.048432,81.033963,66.680064,81.93328,54.642283
1,Cabrera High School,Charter,1858,"$1,081,356.00",$582.00,83.061895,83.97578,94.133477,97.039828,91.334769
2,Figueroa High School,District,2949,"$1,884,411.00",$639.00,76.711767,81.15802,65.988471,80.739234,53.204476
3,Ford High School,District,2739,"$1,763,916.00",$644.00,77.102592,80.746258,68.309602,79.299014,54.289887
4,Griffin High School,Charter,1468,"$917,500.00",$625.00,83.351499,83.816757,93.392371,97.138965,90.599455
5,Hernandez High School,District,4635,"$3,022,020.00",$652.00,77.289752,80.934412,66.752967,80.862999,53.527508
6,Holden High School,Charter,427,"$248,087.00",$581.00,83.803279,83.814988,92.505855,96.252927,89.227166
7,Huang High School,District,2917,"$1,910,635.00",$655.00,76.629414,81.182722,65.683922,81.316421,53.513884
8,Johnson High School,District,4761,"$3,094,650.00",$650.00,77.072464,80.966394,66.057551,81.222432,53.539172
9,Pena High School,Charter,962,"$585,858.00",$609.00,83.839917,84.044699,94.594595,95.945946,90.540541


In [185]:
#Sort and display the top five performing schools by % overall passing.
top_five = sch_sum_df.sort_values("% Overall Passing", ascending=False)
top_five.head()

Unnamed: 0,School Name,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
1,Cabrera High School,Charter,1858,"$1,081,356.00",$582.00,83.061895,83.97578,94.133477,97.039828,91.334769
12,Thomas High School,Charter,1635,"$1,043,130.00",$638.00,83.418349,83.84893,93.272171,97.308869,90.948012
4,Griffin High School,Charter,1468,"$917,500.00",$625.00,83.351499,83.816757,93.392371,97.138965,90.599455
13,Wilson High School,Charter,2283,"$1,319,574.00",$578.00,83.274201,83.989488,93.867718,96.539641,90.582567
9,Pena High School,Charter,962,"$585,858.00",$609.00,83.839917,84.044699,94.594595,95.945946,90.540541


In [186]:
#Sort and display the five worst-performing schools by % overall passing.
bottom_five = sch_sum_df.sort_values("% Overall Passing", ascending=True)
bottom_five.head()

Unnamed: 0,School Name,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
10,Rodriguez High School,District,3999,"$2,547,363.00",$637.00,76.842711,80.744686,66.366592,80.220055,52.988247
2,Figueroa High School,District,2949,"$1,884,411.00",$639.00,76.711767,81.15802,65.988471,80.739234,53.204476
7,Huang High School,District,2917,"$1,910,635.00",$655.00,76.629414,81.182722,65.683922,81.316421,53.513884
5,Hernandez High School,District,4635,"$3,022,020.00",$652.00,77.289752,80.934412,66.752967,80.862999,53.527508
8,Johnson High School,District,4761,"$3,094,650.00",$650.00,77.072464,80.966394,66.057551,81.222432,53.539172


In [150]:
students_df.head()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score
0,0,Paul Bradley,M,9th,Huang High School,66,79
1,1,Victor Smith,M,12th,Huang High School,94,61
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58
4,4,Bonnie Ray,F,9th,Huang High School,97,84


In [198]:
#Create a table that lists the average MathScore for students of each grade level (9th, 10th, 11th, 12th) 
#at each school.
#Create a pandas series for each grade. Hint: use a conditional statement.
#Group each series by school

# g_nine_df=students_df.loc[students_df['grade']== '9th'].groupby('school_name').mean()
# g_ten_df=students_df.loc[students_df['grade']== '10th'].groupby('school_name').mean()
# g_elev_df=students_df.loc[students_df['grade']== '11th'].groupby('school_name').mean()
# g_twel_df=students_df.loc[students_df['grade']== '12th']

nine_df=pd.Series([students_df["grade"]=="9th"], name= "9th Grade")
ten_df=pd.Series([students_df["grade"]=="10th"], name= "10th Grade")
elev_df=pd.Series([students_df["grade"]=="11th"], name= "11th Grade")
twel_df=pd.Series([students_df["grade"]=="12th"], name= "12th Grade")
nine_df
#Combine the series into a dataframe

# pass_by_grade = {"9th Grade": nine_df, "10th Grade": ten_df, "11th Grade": elev_df, "12th Grade":twel_df}
# math_by_grade = pd.DataFrame(pass_by_grade, index=(students_df["school_name"]))
# math_by_grade
#Optional: give the displayed data cleaner formatting

0    0         True
1        False
2        False
3...
Name: 9th Grade, dtype: object

In [None]:
#Combine the series into a dataframe
paas_by_grade = pd.DataFrame(data, columns =["9th Grade","10th Grade","11th Grade","12th Grade"])
#Optional: give the displayed data cleaner formatting

In [None]:
#Create a table that lists the average Reading Score for students of each grade level 
#(9th, 10th, 11th, 12th) at each school.

#Create a pandas series for each grade. Hint: use a conditional statement.
  
#Group each series by school
  
#Combine the series into a dataframe
  
#Optional: give the displayed data cleaner formatting

In [None]:
#Create a table that breaks down school performances based on average Spending Ranges (Per Student). 
#Use 4 reasonable bins to group school spending. Include in the table each of the following:
#Average Math Score
#Average Reading Score
#% Passing Math
#% Passing Reading
#Overall Passing Rate (Average of the above two)

In [None]:
#Create a table that breaks down school performances based on school size. 
#Use 4 reasonable bins to group school size. Include in the table each of the following:
#Average Math Score
#Average Reading Score
#% Passing Math
#% Passing Reading
#Overall Passing Rate (Average of the above two)

In [None]:
#Create a table that breaks down school performances based on school type. 
#Include in the table each of the following:
#Average Math Score
#Average Reading Score
#% Passing Math
#% Passing Reading
#Overall Passing Rate (Average of the above two)