In [1]:
# Dependencies and Setup
import os
import csv
import math
import statistics
import pandas as pd
import numpy as np

In [2]:
# Relative locations of the two CSV inputs read by the loading cell
school_data_row, student_data_row = (
    "Resources/schools_complete.csv",
    "Resources/students_complete.csv",
)

In [3]:
# Read the school file into a pandas DataFrame
school_df = pd.read_csv(school_data_row)

# Read the student file into a pandas DataFrame
student_df = pd.read_csv(student_data_row)

# Combine the data into a single dataset: every student row keeps its place
# (left join) and gains the columns of its school.
# - The join key is given once; the original list repeated "school_name".
# - validate="many_to_one" makes pandas raise MergeError if a school name
#   appears more than once in school_df, which would otherwise silently
#   duplicate student rows and inflate every count computed downstream.
school_data_complete = pd.merge(
    student_df,
    school_df,
    how="left",
    on="school_name",
    validate="many_to_one",
)

# Working frame used by the rest of the notebook; preview the first rows
data_df = pd.DataFrame(school_data_complete)
data_df.head()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635


In [4]:
# Keep only the rows whose math score reaches the passing mark of 70
math_passed_mask = data_df["math_score"].ge(70)
pass_math_df = data_df.loc[math_passed_mask]
pass_math_df.head()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635
5,5,Bryan Miranda,M,9th,Huang High School,94,94,0,District,2917,1910635
6,6,Sheena Carter,F,11th,Huang High School,82,80,0,District,2917,1910635
8,8,Michael Roth,M,10th,Huang High School,95,87,0,District,2917,1910635


In [5]:
# Number of distinct student IDs among the math passers
math_passer_ids = pass_math_df["Student ID"]
math_passer_ids.nunique()

29370

In [6]:
# Distinct math passers as a percentage of all distinct students
math_passer_total = pass_math_df["Student ID"].nunique()
student_total_for_math = data_df["Student ID"].nunique()
percentage_pass_math = (math_passer_total / student_total_for_math) * 100
percentage_pass_math

74.9808526933878

In [7]:
# Keep only the rows whose reading score reaches the passing mark of 70
reading_passed_mask = data_df["reading_score"].ge(70)
pass_reading_df = data_df.loc[reading_passed_mask]
pass_reading_df.head()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635
5,5,Bryan Miranda,M,9th,Huang High School,94,94,0,District,2917,1910635
6,6,Sheena Carter,F,11th,Huang High School,82,80,0,District,2917,1910635


In [8]:
# Number of distinct student IDs among the reading passers
reading_passer_ids = pass_reading_df["Student ID"]
reading_passer_ids.nunique()

33610

In [9]:
# Distinct reading passers as a percentage of all distinct students
reading_passer_total = pass_reading_df["Student ID"].nunique()
student_total_for_reading = data_df["Student ID"].nunique()
percentage_pass_reading = (reading_passer_total / student_total_for_reading) * 100
percentage_pass_reading

85.80546336482001

In [10]:
# Overall passing rate, taken here as the plain average of the math and
# reading pass percentages
pass_percentages = (percentage_pass_math, percentage_pass_reading)
percentage_overall_passing = sum(pass_percentages) / 2
percentage_overall_passing

80.39315802910392

In [11]:
# District-wide budget: sum of the budget column of the school table
school_budgets = school_df["budget"]
school_budgets.sum()

24649428

In [12]:
# Collect the district-level figures as label -> value, in display order
district_metrics = {
    "Total Schools": data_df["school_name"].nunique(),
    "Total Students": data_df["Student ID"].nunique(),
    "Total Budget": school_df["budget"].sum(),
    "Average Math Score": data_df["math_score"].mean(),
    "Average Reading Score": data_df["reading_score"].mean(),
    "% Passing Math": percentage_pass_math,
    "% Passing Reading": percentage_pass_reading,
    "% Overall Passing Rate": percentage_overall_passing,
}

# Wrap every value in a one-element list so the result is a single-row table
district_summary_df = pd.DataFrame(
    {label: [value] for label, value in district_metrics.items()}
)

district_summary_df

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing Rate
0,15,39170,24649428,78.985371,81.87784,74.980853,85.805463,80.393158


In [13]:
# Inspect the dtype pandas assigned to each column of the merged frame
column_dtypes = data_df.dtypes
column_dtypes

Student ID        int64
student_name     object
gender           object
grade            object
school_name      object
reading_score     int64
math_score        int64
School ID         int64
type             object
size              int64
budget            int64
dtype: object

In [14]:
# Group the merged rows by school; "type" is part of the key so that it
# appears in the index of every per-school aggregate
school_group_keys = ["school_name", "type"]
grouped_schools_df = data_df.groupby(by=school_group_keys)

# A GroupBy object has no table view of its own, so apply an aggregation
# (count) to preview it
grouped_schools_df.count().head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Student ID,student_name,gender,grade,reading_score,math_score,School ID,size,budget
school_name,type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Bailey High School,District,4976,4976,4976,4976,4976,4976,4976,4976,4976
Cabrera High School,Charter,1858,1858,1858,1858,1858,1858,1858,1858,1858
Figueroa High School,District,2949,2949,2949,2949,2949,2949,2949,2949,2949
Ford High School,District,2739,2739,2739,2739,2739,2739,2739,2739,2739
Griffin High School,Charter,1468,1468,1468,1468,1468,1468,1468,1468,1468


In [15]:
# Per-school metrics start here.
# Total Students: number of Student ID entries in each school group
student_ids_by_school = grouped_schools_df["Student ID"]
total_students = student_ids_by_school.count()
total_students.head(15)

school_name            type    
Bailey High School     District    4976
Cabrera High School    Charter     1858
Figueroa High School   District    2949
Ford High School       District    2739
Griffin High School    Charter     1468
Hernandez High School  District    4635
Holden High School     Charter      427
Huang High School      District    2917
Johnson High School    District    4761
Pena High School       Charter      962
Rodriguez High School  District    3999
Shelton High School    Charter     1761
Thomas High School     Charter     1635
Wilson High School     Charter     2283
Wright High School     Charter     1800
Name: Student ID, dtype: int64

In [16]:
# Budget per school: the merged frame repeats the budget on each student
# row, so the group maximum is used to pick one value per school
budgets_by_school = grouped_schools_df["budget"]
budget_per_school = budgets_by_school.max()
budget_per_school.head(15)

school_name            type    
Bailey High School     District    3124928
Cabrera High School    Charter     1081356
Figueroa High School   District    1884411
Ford High School       District    1763916
Griffin High School    Charter      917500
Hernandez High School  District    3022020
Holden High School     Charter      248087
Huang High School      District    1910635
Johnson High School    District    3094650
Pena High School       Charter      585858
Rodriguez High School  District    2547363
Shelton High School    Charter     1056600
Thomas High School     Charter     1043130
Wilson High School     Charter     1319574
Wright High School     Charter     1049400
Name: budget, dtype: int64

In [17]:
# Per student budget: school budget divided by that school's student count
# (both Series are indexed by school_name and type, so they align row by row)
budget_per_student = budget_per_school.div(total_students)
budget_per_student.head(15)

school_name            type    
Bailey High School     District    628.0
Cabrera High School    Charter     582.0
Figueroa High School   District    639.0
Ford High School       District    644.0
Griffin High School    Charter     625.0
Hernandez High School  District    652.0
Holden High School     Charter     581.0
Huang High School      District    655.0
Johnson High School    District    650.0
Pena High School       Charter     609.0
Rodriguez High School  District    637.0
Shelton High School    Charter     600.0
Thomas High School     Charter     638.0
Wilson High School     Charter     578.0
Wright High School     Charter     583.0
dtype: float64

In [18]:
# Average Math Score: mean math_score within each school group
math_scores_by_school = grouped_schools_df["math_score"]
school_average_math = math_scores_by_school.mean()
school_average_math.head(15)

school_name            type    
Bailey High School     District    77.048432
Cabrera High School    Charter     83.061895
Figueroa High School   District    76.711767
Ford High School       District    77.102592
Griffin High School    Charter     83.351499
Hernandez High School  District    77.289752
Holden High School     Charter     83.803279
Huang High School      District    76.629414
Johnson High School    District    77.072464
Pena High School       Charter     83.839917
Rodriguez High School  District    76.842711
Shelton High School    Charter     83.359455
Thomas High School     Charter     83.418349
Wilson High School     Charter     83.274201
Wright High School     Charter     83.682222
Name: math_score, dtype: float64

In [19]:
# Average Reading Score: mean reading_score within each school group
reading_scores_by_school = grouped_schools_df["reading_score"]
school_average_reading = reading_scores_by_school.mean()
school_average_reading.head(15)

school_name            type    
Bailey High School     District    81.033963
Cabrera High School    Charter     83.975780
Figueroa High School   District    81.158020
Ford High School       District    80.746258
Griffin High School    Charter     83.816757
Hernandez High School  District    80.934412
Holden High School     Charter     83.814988
Huang High School      District    81.182722
Johnson High School    District    80.966394
Pena High School       Charter     84.044699
Rodriguez High School  District    80.744686
Shelton High School    Charter     83.725724
Thomas High School     Charter     83.848930
Wilson High School     Charter     83.989488
Wright High School     Charter     83.955000
Name: reading_score, dtype: float64

In [20]:
# Looking for the ones that pass math per school.
# Group by the same (school_name, type) keys as grouped_schools_df so the
# numerator and denominator carry an identical index, instead of relying on
# pandas to align a single-level index against a two-level one.
pass_math_per_school = pass_math_df.groupby(["school_name", "type"])

# Students enrolled per school (denominator)
enrolled_for_math = grouped_schools_df["Student ID"].count()

# Students passing math per school (numerator). A school with no passing
# student has no rows in pass_math_df; reindexing against the enrolment
# index gives it a count of 0 rather than a missing value.
math_pass_counts = (
    pass_math_per_school["Student ID"]
    .count()
    .reindex(enrolled_for_math.index, fill_value=0)
)

# In order to get the percentage passing math
perc_pass_math = (math_pass_counts / enrolled_for_math) * 100
perc_pass_math.head(15)

school_name            type    
Bailey High School     District    66.680064
Cabrera High School    Charter     94.133477
Figueroa High School   District    65.988471
Ford High School       District    68.309602
Griffin High School    Charter     93.392371
Hernandez High School  District    66.752967
Holden High School     Charter     92.505855
Huang High School      District    65.683922
Johnson High School    District    66.057551
Pena High School       Charter     94.594595
Rodriguez High School  District    66.366592
Shelton High School    Charter     93.867121
Thomas High School     Charter     93.272171
Wilson High School     Charter     93.867718
Wright High School     Charter     93.333333
Name: Student ID, dtype: float64

In [21]:
# Looking for the ones that pass reading per school.
# Group by the same (school_name, type) keys as grouped_schools_df so the
# numerator and denominator carry an identical index, instead of relying on
# pandas to align a single-level index against a two-level one.
pass_reading_per_school = pass_reading_df.groupby(["school_name", "type"])

# Students enrolled per school (denominator)
enrolled_for_reading = grouped_schools_df["Student ID"].count()

# Students passing reading per school (numerator). A school with no passing
# student has no rows in pass_reading_df; reindexing against the enrolment
# index gives it a count of 0 rather than a missing value.
reading_pass_counts = (
    pass_reading_per_school["Student ID"]
    .count()
    .reindex(enrolled_for_reading.index, fill_value=0)
)

# In order to get the percentage passing reading
perc_pass_reading = (reading_pass_counts / enrolled_for_reading) * 100
perc_pass_reading.head(15)

school_name            type    
Bailey High School     District    81.933280
Cabrera High School    Charter     97.039828
Figueroa High School   District    80.739234
Ford High School       District    79.299014
Griffin High School    Charter     97.138965
Hernandez High School  District    80.862999
Holden High School     Charter     96.252927
Huang High School      District    81.316421
Johnson High School    District    81.222432
Pena High School       Charter     95.945946
Rodriguez High School  District    80.220055
Shelton High School    Charter     95.854628
Thomas High School     Charter     97.308869
Wilson High School     Charter     96.539641
Wright High School     Charter     96.611111
Name: Student ID, dtype: float64

In [22]:
# Overall Passing Rate per school: the mean of the math and reading pass
# percentages, computed element-wise on the two aligned Series
perc_overall_passing = perc_pass_math.add(perc_pass_reading).div(2)
perc_overall_passing.head(15)

school_name            type    
Bailey High School     District    74.306672
Cabrera High School    Charter     95.586652
Figueroa High School   District    73.363852
Ford High School       District    73.804308
Griffin High School    Charter     95.265668
Hernandez High School  District    73.807983
Holden High School     Charter     94.379391
Huang High School      District    73.500171
Johnson High School    District    73.639992
Pena High School       Charter     95.270270
Rodriguez High School  District    73.293323
Shelton High School    Charter     94.860875
Thomas High School     Charter     95.290520
Wilson High School     Charter     95.203679
Wright High School     Charter     94.972222
Name: Student ID, dtype: float64

In [23]:
# Assemble the per-school Series into one table; the dictionary order sets
# the column order and the Series are aligned on their shared index
school_summary_columns = {
    "Total Students": total_students,
    "Total School Budget": budget_per_school,
    "Per Student Budget": budget_per_student,
    "Average Math Score": school_average_math,
    "Average Reading Score": school_average_reading,
    "% Passing Math": perc_pass_math,
    "% Passing Reading": perc_pass_reading,
    "Overall Passing Rate": perc_overall_passing,
}
school_summary = pd.DataFrame(school_summary_columns)

school_summary

Unnamed: 0_level_0,Unnamed: 1_level_0,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,Overall Passing Rate
school_name,type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Bailey High School,District,4976,3124928,628.0,77.048432,81.033963,66.680064,81.93328,74.306672
Cabrera High School,Charter,1858,1081356,582.0,83.061895,83.97578,94.133477,97.039828,95.586652
Figueroa High School,District,2949,1884411,639.0,76.711767,81.15802,65.988471,80.739234,73.363852
Ford High School,District,2739,1763916,644.0,77.102592,80.746258,68.309602,79.299014,73.804308
Griffin High School,Charter,1468,917500,625.0,83.351499,83.816757,93.392371,97.138965,95.265668
Hernandez High School,District,4635,3022020,652.0,77.289752,80.934412,66.752967,80.862999,73.807983
Holden High School,Charter,427,248087,581.0,83.803279,83.814988,92.505855,96.252927,94.379391
Huang High School,District,2917,1910635,655.0,76.629414,81.182722,65.683922,81.316421,73.500171
Johnson High School,District,4761,3094650,650.0,77.072464,80.966394,66.057551,81.222432,73.639992
Pena High School,Charter,962,585858,609.0,83.839917,84.044699,94.594595,95.945946,95.27027


In [None]:
# School Summary
# Create an overview table that summarizes key metrics about each school, including:
# School Name
# School Type - DONE
# Total Students - DONE
# Total School Budget - DONE
# Per Student Budget - DONE
# Average Math Score - DONE
# Average Reading Score - DONE
# % Passing Math - DONE
# % Passing Reading - DONE
# Overall Passing Rate (Average of the above two) - DONE