In [1]:
import pandas as pd
import numpy as np

In [2]:
# Loading the data files
school_data_to_load = "Resources/schools_complete.csv"
student_data_to_load = "Resources/students_complete.csv"

In [3]:
# Read School and Student Data File and store into Pandas Data Frames
school_data = pd.read_csv(school_data_to_load, encoding = "ISO-8859-1")
student_data = pd.read_csv(student_data_to_load, encoding = "ISO-8859-1")

In [4]:
school_data.head()  # displaying the 1st 5 rows of the school data

Unnamed: 0,School ID,school_name,type,size,budget
0,0,Huang High School,District,2917,1910635
1,1,Figueroa High School,District,2949,1884411
2,2,Shelton High School,Charter,1761,1056600
3,3,Hernandez High School,District,4635,3022020
4,4,Griffin High School,Charter,1468,917500


In [5]:
school_cnt = len(school_data["school_name"].unique()) #counting the # of schools
student_cnt = school_data["size"].sum()               #counting total students
Total_Budget = school_data["budget"].sum()            #totalling the districts budget

#displaying the variable counts
print(school_cnt)
print(student_cnt)
print(Total_Budget)

15
39170
24649428


In [6]:
avg_math = student_data['math_score'].mean()
print(avg_math)

78.98537145774827


In [7]:
avg_reading = student_data['reading_score'].mean()   #calculating the average reading score
print(avg_reading)

81.87784018381414


In [8]:
#identifying students that passed the reading test
student_data['reading_result'] = np.where(student_data['reading_score'] >=60, 1, 0)  

#identifying students that passed the math test
student_data['math_result'] = np.where(student_data['math_score'] >=60, 1, 0)    

student_data.head()            

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,reading_result,math_result
0,0,Paul Bradley,M,9th,Huang High School,66,79,1,1
1,1,Victor Smith,M,12th,Huang High School,94,61,1,1
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,1,1
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,1,0
4,4,Bonnie Ray,F,9th,Huang High School,97,84,1,1


In [9]:
#calculating the % of students passing math
avg_math_total = (student_data["math_result"].mean())

#calculating the % of students passing reading
avg_reading_total = (student_data["reading_result"].mean())

#calculating the overall passing rate of both tests
avg_results = (avg_math_total + avg_reading_total)/2

In [11]:
#creating bins for school size
bins = [0, 1500, 3000, 5000]
group_labels ={'small', 'medium', 'large'}
pd.cut(school_data['size'], bins, labels=group_labels).head()
school_data['school_size'] = pd.cut(school_data["size"], bins, labels=group_labels)
print(school_data)

    School ID            school_name      type  size   budget school_size
0           0      Huang High School  District  2917  1910635      medium
1           1   Figueroa High School  District  2949  1884411      medium
2           2    Shelton High School   Charter  1761  1056600      medium
3           3  Hernandez High School  District  4635  3022020       large
4           4    Griffin High School   Charter  1468   917500       small
5           5     Wilson High School   Charter  2283  1319574      medium
6           6    Cabrera High School   Charter  1858  1081356      medium
7           7     Bailey High School  District  4976  3124928       large
8           8     Holden High School   Charter   427   248087       small
9           9       Pena High School   Charter   962   585858       small
10         10     Wright High School   Charter  1800  1049400      medium
11         11  Rodriguez High School  District  3999  2547363       large
12         12    Johnson High School  

In [12]:
#calculating and adding the budget / student metric
school_data['budget_size'] = school_data['budget']/school_data['size']
#display the 1st 5 rows of the school_data
school_data.head()

Unnamed: 0,School ID,school_name,type,size,budget,school_size,budget_size
0,0,Huang High School,District,2917,1910635,medium,655.0
1,1,Figueroa High School,District,2949,1884411,medium,639.0
2,2,Shelton High School,Charter,1761,1056600,medium,600.0
3,3,Hernandez High School,District,4635,3022020,large,652.0
4,4,Griffin High School,Charter,1468,917500,small,625.0


In [13]:
#creating bins for budget / student 
bins = [0, 600, 625, 650, 700]
group_labels ={'< 600', '600-625', '625-650', '> 675'}
pd.cut(school_data['budget_size'], bins, labels=group_labels).head()
school_data['budget_category'] = pd.cut(school_data["budget_size"], bins, labels=group_labels)
school_data.head()

Unnamed: 0,School ID,school_name,type,size,budget,school_size,budget_size,budget_category
0,0,Huang High School,District,2917,1910635,medium,655.0,625-650
1,1,Figueroa High School,District,2949,1884411,medium,639.0,> 675
2,2,Shelton High School,Charter,1761,1056600,medium,600.0,< 600
3,3,Hernandez High School,District,4635,3022020,large,652.0,625-650
4,4,Griffin High School,Charter,1468,917500,small,625.0,600-625


In [14]:
school_data_indexed = school_data.set_index("school_name")
school_data_indexed.head()

Unnamed: 0_level_0,School ID,type,size,budget,school_size,budget_size,budget_category
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Huang High School,0,District,2917,1910635,medium,655.0,625-650
Figueroa High School,1,District,2949,1884411,medium,639.0,> 675
Shelton High School,2,Charter,1761,1056600,medium,600.0,< 600
Hernandez High School,3,District,4635,3022020,large,652.0,625-650
Griffin High School,4,Charter,1468,917500,small,625.0,600-625


In [15]:
school_group = student_data.groupby("school_name")
avg_reading_school = (school_group["reading_score"].mean())
avg_math_school = (school_group["math_score"].mean())
avg_score_school = (avg_reading_school + avg_math_school) / 2
avg_reading_pass = (school_group["reading_result"].mean())
avg_math_pass = (school_group["math_result"].mean())
avg_pass = (avg_reading_pass + avg_math_pass) / 2
school_sum = pd.DataFrame(
            {"Avg Reading Score" : avg_reading_school,
             "Avg Math Score" : avg_math_school,
             "Reading pass rate" : avg_reading_pass,
             "Math pass rate" : avg_math_pass,
             "Avg Score" : avg_score_school,
             "Pass rate" : avg_pass}
)
school_sum.head()


Unnamed: 0_level_0,Avg Reading Score,Avg Math Score,Reading pass rate,Math pass rate,Avg Score,Pass rate
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Bailey High School,81.033963,77.048432,1,0.895297,79.041198,0.947649
Cabrera High School,83.97578,83.061895,1,1.0,83.518837,1.0
Figueroa High School,81.15802,76.711767,1,0.884368,78.934893,0.942184
Ford High School,80.746258,77.102592,1,0.893027,78.924425,0.946513
Griffin High School,83.816757,83.351499,1,1.0,83.584128,1.0


In [16]:
school_data_final = pd.merge(school_data, school_sum, how="left", on=["school_name", "school_name"])
school_data_final.head()

Unnamed: 0,School ID,school_name,type,size,budget,school_size,budget_size,budget_category,Avg Reading Score,Avg Math Score,Reading pass rate,Math pass rate,Avg Score,Pass rate
0,0,Huang High School,District,2917,1910635,medium,655.0,625-650,81.182722,76.629414,1,0.888584,78.906068,0.944292
1,1,Figueroa High School,District,2949,1884411,medium,639.0,> 675,81.15802,76.711767,1,0.884368,78.934893,0.942184
2,2,Shelton High School,Charter,1761,1056600,medium,600.0,< 600,83.725724,83.359455,1,1.0,83.542589,1.0
3,3,Hernandez High School,District,4635,3022020,large,652.0,625-650,80.934412,77.289752,1,0.890831,79.112082,0.945415
4,4,Griffin High School,Charter,1468,917500,small,625.0,600-625,83.816757,83.351499,1,1.0,83.584128,1.0


In [18]:
data = {'Metric':['# of Schools', '# of Students', 'Total Budget',
                  'Avg Math Score','Avg Reading Score', '% Passing Math',
                  '% Passing Reading', 'Overall Pass Rate'],
        'Value':[[school_cnt], [student_cnt], [Total_Budget], 
                 [avg_math], [avg_reading], [avg_math_total],
                 [avg_reading_total], [avg_results]
                ]}
df = pd.DataFrame(data)
df.head()

Unnamed: 0,Metric,Value
0,# of Schools,[15]
1,# of Students,[39170]
2,Total Budget,[24649428]
3,Avg Math Score,[78.98537145774827]
4,Avg Reading Score,[81.87784018381414]


In [19]:
school_grade_group = student_data.groupby(["school_name", 'grade'])
avg_reading_grade = (school_grade_group["reading_score"].mean())
avg_math_grade = (school_grade_group["math_score"].mean())
school_sum1 = pd.DataFrame(
            {"Avg Reading Score" : avg_reading_grade,
             "Avg Math Score" : avg_math_grade}
)
school_sum1.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Avg Reading Score,Avg Math Score
school_name,grade,Unnamed: 2_level_1,Unnamed: 3_level_1
Bailey High School,10th,80.907183,76.996772
Bailey High School,11th,80.945643,77.515588
Bailey High School,12th,80.912451,76.492218
Bailey High School,9th,81.303155,77.083676
Cabrera High School,10th,84.253219,83.154506


In [20]:
school_sum_reading = pd.DataFrame(
            {"Avg Reading Score" : avg_reading_grade})
school_sum_reading.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Avg Reading Score
school_name,grade,Unnamed: 2_level_1
Bailey High School,10th,80.907183
Bailey High School,11th,80.945643
Bailey High School,12th,80.912451
Bailey High School,9th,81.303155
Cabrera High School,10th,84.253219


In [21]:
school_sum_math = pd.DataFrame(
            {"Avg math Score" : avg_math_grade})
school_sum_math.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Avg math Score
school_name,grade,Unnamed: 2_level_1
Bailey High School,10th,76.996772
Bailey High School,11th,77.515588
Bailey High School,12th,76.492218
Bailey High School,9th,77.083676
Cabrera High School,10th,83.154506


In [22]:
#creating a grouping by school size
size_group = school_data_final.groupby("school_size")


 #calculating avg reading score for the group
avg_reading_size = (size_group["Avg Reading Score"].mean())

 #calculating avg math score for the group
avg_math_size = (size_group["Avg Math Score"].mean())


 #calculating avg score for both tests for the group
avg_total_size = (avg_reading_size + avg_math_size) / 2

 #calculating Passing rate for reading for the group
avg_reading_pass_size = (size_group["Reading pass rate"].mean())

 #calculating Passing rate for math for the group
avg_math_pass_size = (size_group["Math pass rate"].mean())

#creating the dataframe for the group
size_df = pd.DataFrame(
            {"Avg Reading Score" : avg_reading_size,
             "Avg Math Score" : avg_math_size,
             "Avg test Scores" : avg_total_size,
             "% Pass-Reading" : avg_reading_pass_size,
             "% Pass-Math" : avg_math_pass_size}
)

#displaying the data by school size
size_df.head()


Unnamed: 0_level_0,Avg Reading Score,Avg Math Score,Avg test Scores,% Pass-Reading,% Pass-Math
school_size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
small,83.892148,83.664898,83.778523,1,1.0
medium,82.82274,80.904987,81.863863,1,0.958247
large,80.919864,77.06334,78.991602,1,0.890857


In [23]:
#creating a grouping by budget / student size
budget_group = school_data_final.groupby("budget_category")

 #calculating avg reading score for the group
avg_reading_budget = (budget_group["Avg Reading Score"].mean())  

 #calculating avg math score for the group
avg_math_budget = (budget_group["Avg Math Score"].mean())

 #calculating avg score for both tests for the group
avg_total_budget = (avg_reading_budget + avg_math_budget) / 2

 #calculating Passing rate for reading for the group
avg_reading_pass_budget = (budget_group["Reading pass rate"].mean())

 #calculating Passing rate for math for the group
avg_math_pass_budget = (budget_group["Math pass rate"].mean())

#creating the dataframe for the group
budget_df = pd.DataFrame(
            {"Avg Reading Score" : avg_reading_budget,
             "Avg Math Score" : avg_math_budget,
             "Avg test Scores" : avg_total_budget,
             "% Pass-Reading" : avg_reading_pass_budget,
             "% Pass-Math" : avg_math_pass_budget}
)

#displaying the data by budget / student group
budget_df.head()


Unnamed: 0_level_0,Avg Reading Score,Avg Math Score,Avg test Scores,% Pass-Reading,% Pass-Math
budget_category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
< 600,83.892196,83.43621,83.664203,1,1.0
600-625,83.930728,83.595708,83.763218,1,1.0
> 675,81.416375,78.032719,79.724547,1,0.908332
625-650,81.058567,76.959583,79.009075,1,0.889707


In [28]:
#creating a grouping by type of school
type_group = school_data_final.groupby("type")

 #calculating avg reading score for the group
avg_reading_type = (type_group["Avg Reading Score"].mean())

 #calculating avg math score for the group
avg_math_type = (type_group["Avg Math Score"].mean())


 #calculating avg score for both tests for the group
avg_total_type = (avg_reading_type + avg_math_type) / 2


 #calculating Passing rate for reading for the group
avg_reading_pass_type = (type_group["Reading pass rate"].mean())

 #calculating Passing rate for math for the group
avg_math_pass_type = (type_group["Math pass rate"].mean())

#creating the dataframe for the group
type_group = pd.DataFrame(
            {"Avg Reading Score" : avg_reading_type,
             "Avg Math Score" : avg_math_type,
             "Avg test Scores" : avg_total_type,
             "% Pass-Reading" : avg_reading_pass_type,
             "% Pass-Math" : avg_math_pass_type}
)
#displaying the data by school type
type_group.head()

Unnamed: 0_level_0,Avg Reading Score,Avg Math Score,Avg test Scores,% Pass-Reading,% Pass-Math
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Charter,83.896421,83.473852,83.685136,1,1.0
District,80.966636,76.956733,78.961685,1,0.889915


In [25]:
# sorting the dataframe rows to display the schools with the lowest overall test scores
Bottom_schools = school_data_final.sort_values(by='Avg Score')
# displaying the 5 schools with the lowest test scores
Bottom_schools.head()

Unnamed: 0,School ID,school_name,type,size,budget,school_size,budget_size,budget_category,Avg Reading Score,Avg Math Score,Reading pass rate,Math pass rate,Avg Score,Pass rate
11,11,Rodriguez High School,District,3999,2547363,large,637.0,> 675,80.744686,76.842711,1,0.885471,78.793698,0.942736
0,0,Huang High School,District,2917,1910635,medium,655.0,625-650,81.182722,76.629414,1,0.888584,78.906068,0.944292
13,13,Ford High School,District,2739,1763916,medium,644.0,> 675,80.746258,77.102592,1,0.893027,78.924425,0.946513
1,1,Figueroa High School,District,2949,1884411,medium,639.0,> 675,81.15802,76.711767,1,0.884368,78.934893,0.942184
12,12,Johnson High School,District,4761,3094650,large,650.0,> 675,80.966394,77.072464,1,0.891829,79.019429,0.945915


In [26]:
# sorting the dataframe rows to display the schools with the highest overall test scores
Top_schools = school_data_final.sort_values(by='Avg Score', ascending = False)
# displaying the 5 schools with the highest test scores
Top_schools.head()

Unnamed: 0,School ID,school_name,type,size,budget,school_size,budget_size,budget_category,Avg Reading Score,Avg Math Score,Reading pass rate,Math pass rate,Avg Score,Pass rate
9,9,Pena High School,Charter,962,585858,small,609.0,600-625,84.044699,83.839917,1,1.0,83.942308,1.0
10,10,Wright High School,Charter,1800,1049400,medium,583.0,< 600,83.955,83.682222,1,1.0,83.818611,1.0
8,8,Holden High School,Charter,427,248087,small,581.0,< 600,83.814988,83.803279,1,1.0,83.809133,1.0
14,14,Thomas High School,Charter,1635,1043130,medium,638.0,> 675,83.84893,83.418349,1,1.0,83.633639,1.0
5,5,Wilson High School,Charter,2283,1319574,medium,578.0,< 600,83.989488,83.274201,1,1.0,83.631844,1.0


In [27]:
Top_schools.style.format({
    'avg Reading Score1': '{:,.2f}'.format,
    'avg Math Score': '{:,.2f}'.format,
    'Reading pass rate': '{:,.2%}'.format
})

Top_schools.head()

Unnamed: 0,School ID,school_name,type,size,budget,school_size,budget_size,budget_category,Avg Reading Score,Avg Math Score,Reading pass rate,Math pass rate,Avg Score,Pass rate
9,9,Pena High School,Charter,962,585858,small,609.0,600-625,84.044699,83.839917,1,1.0,83.942308,1.0
10,10,Wright High School,Charter,1800,1049400,medium,583.0,< 600,83.955,83.682222,1,1.0,83.818611,1.0
8,8,Holden High School,Charter,427,248087,small,581.0,< 600,83.814988,83.803279,1,1.0,83.809133,1.0
14,14,Thomas High School,Charter,1635,1043130,medium,638.0,> 675,83.84893,83.418349,1,1.0,83.633639,1.0
5,5,Wilson High School,Charter,2283,1319574,medium,578.0,< 600,83.989488,83.274201,1,1.0,83.631844,1.0
