# School District Analysis Notebook

This is not a report but my stream of consiousness while analyzing the data. I will make comments as I go. First lets look at the data.

In [1]:
import os
import pandas as pd

# save the filepaths for the data
school_data_file = os.path.join("Resources", "schools_complete.csv")
students_data_file = os.path.join("Resources", "students_complete.csv")

In [2]:
# Load the school data
schools_df = pd.read_csv(school_data_file)
schools_df

Unnamed: 0,School ID,school_name,type,size,budget
0,0,Huang High School,District,2917,1910635
1,1,Figueroa High School,District,2949,1884411
2,2,Shelton High School,Charter,1761,1056600
3,3,Hernandez High School,District,4635,3022020
4,4,Griffin High School,Charter,1468,917500
5,5,Wilson High School,Charter,2283,1319574
6,6,Cabrera High School,Charter,1858,1081356
7,7,Bailey High School,District,4976,3124928
8,8,Holden High School,Charter,427,248087
9,9,Pena High School,Charter,962,585858


In [3]:
# Load the student data
students_df = pd.read_csv(students_data_file)

students_df

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score
0,0,Paul Bradley,M,9th,Huang High School,66,79
1,1,Victor Smith,M,12th,Huang High School,94,61
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58
4,4,Bonnie Ray,F,9th,Huang High School,97,84
...,...,...,...,...,...,...,...
39165,39165,Donna Howard,F,12th,Thomas High School,99,90
39166,39166,Dawn Bell,F,10th,Thomas High School,95,70
39167,39167,Rebecca Tanner,F,9th,Thomas High School,73,84
39168,39168,Desiree Kidd,F,10th,Thomas High School,99,90


In [4]:
students_df.notnull().count()

Student ID       39170
student_name     39170
gender           39170
grade            39170
school_name      39170
reading_score    39170
math_score       39170
dtype: int64

In [5]:
schools_df.count()

School ID      15
school_name    15
type           15
size           15
budget         15
dtype: int64

In [6]:
schools_df.notnull()

Unnamed: 0,School ID,school_name,type,size,budget
0,True,True,True,True,True
1,True,True,True,True,True
2,True,True,True,True,True
3,True,True,True,True,True
4,True,True,True,True,True
5,True,True,True,True,True
6,True,True,True,True,True
7,True,True,True,True,True
8,True,True,True,True,True
9,True,True,True,True,True


## Initial Inspection Reults:

  - 39,000+ students, their grade years, 15 schools, budgets and test scores and no null/NaN values.
  - Students_df['student_name'][3] has a prefix that should not be there, these are high school kids not Doctor's.  
  - I need to find and clean the names of improper prefixes and suffixes.


In [7]:

def find_prefixes_suffixes(df, column_name):
    """ a function to filter through names looking for prefixes and suffixes to be returned in seperate lists, 
        prefixes first followed by suffixes.
        
    df  =  pd.DataFrame() that will be searched.
    column_name = the specific column in the DataFrame that may have the prefixes and suffixes in the names.
    
    """
    
    fix_names = df[column_name].tolist()
    prefixes = list()
    suffixes = list()
    for index, name in enumerate(fix_names):
        if len(name.split(" ")) >=3:    
            if len(name.split(' ')[0]) <= 4:  # save the prefixes of the names that need fixed
                prefixes.append(name.split(' ')[0]) 
            else:
                pass
            if len(name.split(' ')[-1]) <= 3:    # save the suffixes of the names that need fixed
                suffixes.append(name.split(' ')[-1])
    prefixes = list(set(prefixes)) 
    suffixes = list(set(suffixes))
    return prefixes, suffixes


find_prefixes_suffixes(students_df, 'student_name')

(['Adam',
  'Greg',
  'Gina',
  'Ms.',
  'Cory',
  'Ruth',
  'Dale',
  'Tara',
  'Erin',
  'John',
  'Dawn',
  'Sean',
  'Lori',
  'Tony',
  'Marc',
  'Leah',
  'Mark',
  'Cody',
  'Eric',
  'Emma',
  'Troy',
  'Kari',
  'Dana',
  'Gail',
  'Seth',
  'Ian',
  'Paul',
  'Erik',
  'Mike',
  'Jodi',
  'Ryan',
  'Luke',
  'Jill',
  'Dr.',
  'Lynn',
  'Joe',
  'Toni',
  'Kim',
  'Sara',
  'Gary',
  'Anna',
  'Mary',
  'Lisa',
  'Mrs.',
  'Miss',
  'Judy',
  'Jon',
  'Mr.',
  'Tina',
  'Todd',
  'Omar',
  'Kyle',
  'Carl',
  'Jose',
  'Chad',
  'Amy',
  'Anne',
  'Kara',
  'Noah'],
 ['Li',
  'III',
  'II',
  'DDS',
  'PhD',
  'Day',
  'Jr.',
  'DVM',
  'Lee',
  'Roy',
  'Kim',
  'Cox',
  'IV',
  'MD',
  'V'])

In [8]:
# unwanted prefixes and suffixes amoung the list of names that were saved by the filter.
prefixes_suffixes = [' MD', ' PhD', ' DDS', ' DVM', 'Dr. ', 'Miss ', 'Mr. ', 'Mrs. ', 'Ms. ']

# find and fix the unwanted prefixes and suffixes
for word in prefixes_suffixes:
    students_df['student_name'] = students_df['student_name'].str.replace(word, '')

students_df.head()

  


Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score
0,0,Paul Bradley,M,9th,Huang High School,66,79
1,1,Victor Smith,M,12th,Huang High School,94,61
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60
3,3,Richard Scott,M,12th,Huang High School,67,58
4,4,Bonnie Ray,F,9th,Huang High School,97,84


In [9]:
#checking to make sure there are no more unwanted prefixes or suffixes.
find_prefixes_suffixes(students_df, 'student_name')    

(['Omar',
  'Jon',
  'Ryan',
  'Todd',
  'Mark',
  'Adam',
  'Noah',
  'Cody',
  'Eric',
  'Greg',
  'Cory',
  'Seth',
  'Sean',
  'Erik',
  'Tony',
  'Juan'],
 ['III', 'II', 'Jr.', 'IV', 'V'])

In [10]:
# merge all the data into one dataframe
all_df = students_df.merge(schools_df, on='school_name')
all_df

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635
3,3,Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635
...,...,...,...,...,...,...,...,...,...,...,...
39165,39165,Donna Howard,F,12th,Thomas High School,99,90,14,Charter,1635,1043130
39166,39166,Dawn Bell,F,10th,Thomas High School,95,70,14,Charter,1635,1043130
39167,39167,Rebecca Tanner,F,9th,Thomas High School,73,84,14,Charter,1635,1043130
39168,39168,Desiree Kidd,F,10th,Thomas High School,99,90,14,Charter,1635,1043130


### Create a District Summary

Create a summary table that includes: 
  - Total Students
  - Total Budget
  - Average Test Scores
  - % Passing Subject
  - % Passing Overall

In [11]:
# total number of schools in the district
total_schools = len(all_df['school_name'].unique())
total_schools

15

In [12]:
# Total Budget
len(all_df['budget'].unique())  # making sure there are 15 different numbers being added together
budget = all_df['budget'].unique().sum()
budget

24649428

In [13]:
# Total Students
total_students = all_df['Student ID'].count()
total_students

39170

In [14]:
# average test scores for math
math_mean = all_df['math_score'].mean()    
math_mean

78.98537145774827

In [15]:
# average test scores for reading
reading_mean = all_df['reading_score'].mean()    
reading_mean

81.87784018381414

In [16]:
# total students passing math
pass_math = all_df[all_df['math_score'] >= 70]['Student ID'].count() 
pass_math

29370

In [17]:
# total students passing reading
pass_reading = all_df[all_df['reading_score'] >= 70]['Student ID'].count()    
pass_reading

33610

In [18]:
# total students passing both subjects
pass_math_reading = all_df[(all_df['math_score'] >= 70) & (all_df['reading_score'] >= 70)]['Student ID'].count() 
pass_math_reading

25528

In [19]:
# percent of students passing math
math_passing_percent = pass_math / total_students*100    
math_passing_percent

74.9808526933878

In [20]:
# percent of students passing reading 
reading_passing_percent = pass_reading / total_students*100    
reading_passing_percent

85.80546336482001

In [21]:
# percent of students passing both subjects
pass_both_percent = pass_math_reading/total_students*100    
pass_both_percent

65.17232575950983

In [22]:
#combine each variabl into the district summary dataframe
district_summary_df = pd.DataFrame([{'Total Schools':total_schools, 'Total Budget':budget, 'Total Students': total_students, 'Average Math Score':math_mean, 'Average Reading Score':reading_mean, '% Passing Math':math_passing_percent, '% Passing Reading':reading_passing_percent, '% Overall Passing':pass_both_percent}])
district_summary_df

Unnamed: 0,Total Schools,Total Budget,Total Students,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
0,15,24649428,39170,78.985371,81.87784,74.980853,85.805463,65.172326


In [23]:
# save a copy of the df for future analysis, 
# str formatting for reports will prevent math functions from working properly
list_of_summary_dfs = [district_summary_df.copy()]

# format the District Summary DataFrame for reporting
district_summary_df["Total Students"] = district_summary_df["Total Students"].map("{:,}".format)
district_summary_df["Total Budget"] = district_summary_df["Total Budget"].map("${:,.2f}".format)
district_summary_df['% Passing Math'] = district_summary_df["% Passing Math"].map("{:.1f}%".format)
district_summary_df['% Passing Reading'] = district_summary_df["% Passing Reading"].map("{:.1f}%".format)
district_summary_df['% Overall Passing'] = district_summary_df["% Overall Passing"].map("{:.1f}%".format)
district_summary_df['Average Math Score'] = district_summary_df['Average Math Score'].map("{:.1f}".format)
district_summary_df['Average Reading Score'] = district_summary_df['Average Reading Score'].map("{:.1f}".format)



### District Summary Results:

In [24]:
district_summary_df

Unnamed: 0,Total Schools,Total Budget,Total Students,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
0,15,"$24,649,428.00",39170,79.0,81.9,75.0%,85.8%,65.2%


# Summary of each school in the district
Use the same columns as the district summary and add a "Budget Per Student" column.
  - Total Students
  - Total Budget
  - Budget Per Student
  - Average Test Scores
  - % Passing Subject
  - % Passing Overall

In [25]:
# New Dataframe to be the summary of all the school information
school_summary_df = pd.DataFrame()

# School type with the school names as the index
school_summary_df['School Type']= schools_df.set_index(['school_name'])['type']
school_summary_df

Unnamed: 0_level_0,School Type
school_name,Unnamed: 1_level_1
Huang High School,District
Figueroa High School,District
Shelton High School,Charter
Hernandez High School,District
Griffin High School,Charter
Wilson High School,Charter
Cabrera High School,Charter
Bailey High School,District
Holden High School,Charter
Pena High School,Charter


In [26]:
# Total students
school_summary_df['Total Students'] = all_df['school_name'].value_counts()
school_summary_df

Unnamed: 0_level_0,School Type,Total Students
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Huang High School,District,2917
Figueroa High School,District,2949
Shelton High School,Charter,1761
Hernandez High School,District,4635
Griffin High School,Charter,1468
Wilson High School,Charter,2283
Cabrera High School,Charter,1858
Bailey High School,District,4976
Holden High School,Charter,427
Pena High School,Charter,962


In [27]:
# Total Budget
school_summary_df['Budget'] = schools_df.set_index(['school_name'])['budget']
school_summary_df

Unnamed: 0_level_0,School Type,Total Students,Budget
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Huang High School,District,2917,1910635
Figueroa High School,District,2949,1884411
Shelton High School,Charter,1761,1056600
Hernandez High School,District,4635,3022020
Griffin High School,Charter,1468,917500
Wilson High School,Charter,2283,1319574
Cabrera High School,Charter,1858,1081356
Bailey High School,District,4976,3124928
Holden High School,Charter,427,248087
Pena High School,Charter,962,585858


In [28]:
# Total Budget per Student
school_summary_df['Budget Per Student'] = school_summary_df['Budget']/school_summary_df['Total Students']
school_summary_df

Unnamed: 0_level_0,School Type,Total Students,Budget,Budget Per Student
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Huang High School,District,2917,1910635,655.0
Figueroa High School,District,2949,1884411,639.0
Shelton High School,Charter,1761,1056600,600.0
Hernandez High School,District,4635,3022020,652.0
Griffin High School,Charter,1468,917500,625.0
Wilson High School,Charter,2283,1319574,578.0
Cabrera High School,Charter,1858,1081356,582.0
Bailey High School,District,4976,3124928,628.0
Holden High School,Charter,427,248087,581.0
Pena High School,Charter,962,585858,609.0


In [29]:
# Average math and reading scores 
per_school_mean = all_df.groupby(['school_name']).mean()
school_summary_df['Average Math Score'] = per_school_mean['math_score'] 
school_summary_df['Average Reading Score'] = per_school_mean['reading_score'] 
school_summary_df

Unnamed: 0_level_0,School Type,Total Students,Budget,Budget Per Student,Average Math Score,Average Reading Score
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Huang High School,District,2917,1910635,655.0,76.629414,81.182722
Figueroa High School,District,2949,1884411,639.0,76.711767,81.15802
Shelton High School,Charter,1761,1056600,600.0,83.359455,83.725724
Hernandez High School,District,4635,3022020,652.0,77.289752,80.934412
Griffin High School,Charter,1468,917500,625.0,83.351499,83.816757
Wilson High School,Charter,2283,1319574,578.0,83.274201,83.989488
Cabrera High School,Charter,1858,1081356,582.0,83.061895,83.97578
Bailey High School,District,4976,3124928,628.0,77.048432,81.033963
Holden High School,Charter,427,248087,581.0,83.803279,83.814988
Pena High School,Charter,962,585858,609.0,83.839917,84.044699


In [30]:
# % passing math
passing_math = all_df[all_df['math_score'] >= 70].groupby(['school_name']).count()
school_summary_df['% Passing Math'] = (passing_math['math_score']/school_summary_df['Total Students'])*100
school_summary_df

Unnamed: 0_level_0,School Type,Total Students,Budget,Budget Per Student,Average Math Score,Average Reading Score,% Passing Math
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Huang High School,District,2917,1910635,655.0,76.629414,81.182722,65.683922
Figueroa High School,District,2949,1884411,639.0,76.711767,81.15802,65.988471
Shelton High School,Charter,1761,1056600,600.0,83.359455,83.725724,93.867121
Hernandez High School,District,4635,3022020,652.0,77.289752,80.934412,66.752967
Griffin High School,Charter,1468,917500,625.0,83.351499,83.816757,93.392371
Wilson High School,Charter,2283,1319574,578.0,83.274201,83.989488,93.867718
Cabrera High School,Charter,1858,1081356,582.0,83.061895,83.97578,94.133477
Bailey High School,District,4976,3124928,628.0,77.048432,81.033963,66.680064
Holden High School,Charter,427,248087,581.0,83.803279,83.814988,92.505855
Pena High School,Charter,962,585858,609.0,83.839917,84.044699,94.594595


In [31]:
# % passing reading
passing_reading = all_df[all_df['reading_score'] >= 70].groupby(['school_name']).count()
school_summary_df['% Passing Reading'] = (passing_reading['reading_score']/school_summary_df['Total Students'])*100
school_summary_df

Unnamed: 0_level_0,School Type,Total Students,Budget,Budget Per Student,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Huang High School,District,2917,1910635,655.0,76.629414,81.182722,65.683922,81.316421
Figueroa High School,District,2949,1884411,639.0,76.711767,81.15802,65.988471,80.739234
Shelton High School,Charter,1761,1056600,600.0,83.359455,83.725724,93.867121,95.854628
Hernandez High School,District,4635,3022020,652.0,77.289752,80.934412,66.752967,80.862999
Griffin High School,Charter,1468,917500,625.0,83.351499,83.816757,93.392371,97.138965
Wilson High School,Charter,2283,1319574,578.0,83.274201,83.989488,93.867718,96.539641
Cabrera High School,Charter,1858,1081356,582.0,83.061895,83.97578,94.133477,97.039828
Bailey High School,District,4976,3124928,628.0,77.048432,81.033963,66.680064,81.93328
Holden High School,Charter,427,248087,581.0,83.803279,83.814988,92.505855,96.252927
Pena High School,Charter,962,585858,609.0,83.839917,84.044699,94.594595,95.945946


In [32]:
# % overall passing
overall_passing = all_df[(all_df['math_score'] >= 70) & (all_df['reading_score'] >= 70)].groupby(['school_name']).count()
school_summary_df['% Passing Overall'] = (overall_passing['math_score']/school_summary_df['Total Students'])*100
school_summary_df

Unnamed: 0_level_0,School Type,Total Students,Budget,Budget Per Student,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Passing Overall
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Huang High School,District,2917,1910635,655.0,76.629414,81.182722,65.683922,81.316421,53.513884
Figueroa High School,District,2949,1884411,639.0,76.711767,81.15802,65.988471,80.739234,53.204476
Shelton High School,Charter,1761,1056600,600.0,83.359455,83.725724,93.867121,95.854628,89.892107
Hernandez High School,District,4635,3022020,652.0,77.289752,80.934412,66.752967,80.862999,53.527508
Griffin High School,Charter,1468,917500,625.0,83.351499,83.816757,93.392371,97.138965,90.599455
Wilson High School,Charter,2283,1319574,578.0,83.274201,83.989488,93.867718,96.539641,90.582567
Cabrera High School,Charter,1858,1081356,582.0,83.061895,83.97578,94.133477,97.039828,91.334769
Bailey High School,District,4976,3124928,628.0,77.048432,81.033963,66.680064,81.93328,54.642283
Holden High School,Charter,427,248087,581.0,83.803279,83.814988,92.505855,96.252927,89.227166
Pena High School,Charter,962,585858,609.0,83.839917,84.044699,94.594595,95.945946,90.540541


In [33]:
# save a copy of the df for future analysis, str formatting will prevent math functions from working
list_of_summary_dfs.append(school_summary_df.copy())

# index 0: District Summary DF
# index 1: School Summary DF

In [34]:
# Format the school summary table
school_summary_df["Total Students"] = school_summary_df["Total Students"].map("{:,}".format)
school_summary_df["Budget"] = school_summary_df["Budget"].map("${:,.2f}".format)
school_summary_df["Budget Per Student"] = school_summary_df["Budget Per Student"].map("${:,.2f}".format)
school_summary_df['% Passing Math'] = school_summary_df["% Passing Math"].map("{:.1f}%".format)
school_summary_df['% Passing Reading'] = school_summary_df["% Passing Reading"].map("{:.1f}%".format)
school_summary_df['% Passing Overall'] = school_summary_df["% Passing Overall"].map("{:.1f}%".format)
school_summary_df['Average Math Score'] = school_summary_df['Average Math Score'].map("{:.1f}".format)
school_summary_df['Average Reading Score'] = school_summary_df['Average Reading Score'].map("{:.1f}".format)

### School Summary Results:

In [35]:
school_summary_df

Unnamed: 0_level_0,School Type,Total Students,Budget,Budget Per Student,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Passing Overall
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Huang High School,District,2917,"$1,910,635.00",$655.00,76.6,81.2,65.7%,81.3%,53.5%
Figueroa High School,District,2949,"$1,884,411.00",$639.00,76.7,81.2,66.0%,80.7%,53.2%
Shelton High School,Charter,1761,"$1,056,600.00",$600.00,83.4,83.7,93.9%,95.9%,89.9%
Hernandez High School,District,4635,"$3,022,020.00",$652.00,77.3,80.9,66.8%,80.9%,53.5%
Griffin High School,Charter,1468,"$917,500.00",$625.00,83.4,83.8,93.4%,97.1%,90.6%
Wilson High School,Charter,2283,"$1,319,574.00",$578.00,83.3,84.0,93.9%,96.5%,90.6%
Cabrera High School,Charter,1858,"$1,081,356.00",$582.00,83.1,84.0,94.1%,97.0%,91.3%
Bailey High School,District,4976,"$3,124,928.00",$628.00,77.0,81.0,66.7%,81.9%,54.6%
Holden High School,Charter,427,"$248,087.00",$581.00,83.8,83.8,92.5%,96.3%,89.2%
Pena High School,Charter,962,"$585,858.00",$609.00,83.8,84.0,94.6%,95.9%,90.5%


### Top 5 Best Performing Schools:

In [36]:
school_summary_df.sort_values(['% Passing Overall'], ascending=False).head(5)

Unnamed: 0_level_0,School Type,Total Students,Budget,Budget Per Student,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Passing Overall
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Cabrera High School,Charter,1858,"$1,081,356.00",$582.00,83.1,84.0,94.1%,97.0%,91.3%
Thomas High School,Charter,1635,"$1,043,130.00",$638.00,83.4,83.8,93.3%,97.3%,90.9%
Griffin High School,Charter,1468,"$917,500.00",$625.00,83.4,83.8,93.4%,97.1%,90.6%
Wilson High School,Charter,2283,"$1,319,574.00",$578.00,83.3,84.0,93.9%,96.5%,90.6%
Pena High School,Charter,962,"$585,858.00",$609.00,83.8,84.0,94.6%,95.9%,90.5%


### Top 5 Worst Performing Schools:

In [37]:
# find the top 5 worst performing schools
school_summary_df.sort_values(['% Passing Overall'], ascending=True).head(5)

Unnamed: 0_level_0,School Type,Total Students,Budget,Budget Per Student,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Passing Overall
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Rodriguez High School,District,3999,"$2,547,363.00",$637.00,76.8,80.7,66.4%,80.2%,53.0%
Figueroa High School,District,2949,"$1,884,411.00",$639.00,76.7,81.2,66.0%,80.7%,53.2%
Huang High School,District,2917,"$1,910,635.00",$655.00,76.6,81.2,65.7%,81.3%,53.5%
Hernandez High School,District,4635,"$3,022,020.00",$652.00,77.3,80.9,66.8%,80.9%,53.5%
Johnson High School,District,4761,"$3,094,650.00",$650.00,77.1,81.0,66.1%,81.2%,53.5%


The best performing schools have fewer students than the worst performing schools. I wonder if this is related to the student teacher ratio. Number of teachers per school is not in the data set.



# How are the different grade years performing at each school?

#### Average Math Scores:

In [38]:
# find the stats about the average math and reading scores for each grade within each school. 9th, 10th, 11th, 12th.

# average math scores by grade
math_grade_level_df = pd.DataFrame()
math_grade_level_df['9th'] = all_df[all_df['grade']=='9th'].groupby(['school_name']).mean()['math_score'].map("{:.1f}".format)
math_grade_level_df['10th'] = all_df[all_df['grade']=='10th'].groupby(['school_name']).mean()['math_score'].map("{:.1f}".format)
math_grade_level_df['11th'] = all_df[all_df['grade']=='11th'].groupby(['school_name']).mean()['math_score'].map("{:.1f}".format)
math_grade_level_df['12th'] = all_df[all_df['grade']=='12th'].groupby(['school_name']).mean()['math_score'].map("{:.1f}".format)
math_grade_level_df.index.name = None
math_grade_level_df

Unnamed: 0,9th,10th,11th,12th
Bailey High School,77.1,77.0,77.5,76.5
Cabrera High School,83.1,83.2,82.8,83.3
Figueroa High School,76.4,76.5,76.9,77.2
Ford High School,77.4,77.7,76.9,76.2
Griffin High School,82.0,84.2,83.8,83.4
Hernandez High School,77.4,77.3,77.1,77.2
Holden High School,83.8,83.4,85.0,82.9
Huang High School,77.0,75.9,76.4,77.2
Johnson High School,77.2,76.7,77.5,76.9
Pena High School,83.6,83.4,84.3,84.1


#### Percent Passing Math:

In [39]:
math_percent_grade_level_df = pd.DataFrame()
math_percent_grade_level_df['9th'] = ((all_df[(all_df['grade']=='9th') & (all_df['math_score']>70)].groupby(['school_name']).count()['math_score']/(all_df[(all_df['grade']=='9th')].groupby(['school_name'])['math_score'].count()))*100).map("{:.1f}%".format)
math_percent_grade_level_df['10th'] = ((all_df[(all_df['grade']=='10th') & (all_df['math_score']>70)].groupby(['school_name']).count()['math_score']/(all_df[(all_df['grade']=='10th')].groupby(['school_name'])['math_score'].count()))*100).map("{:.1f}%".format)
math_percent_grade_level_df['11th'] = ((all_df[(all_df['grade']=='11th') & (all_df['math_score']>70)].groupby(['school_name']).count()['math_score']/(all_df[(all_df['grade']=='11th')].groupby(['school_name'])['math_score'].count()))*100).map("{:.1f}%".format)
math_percent_grade_level_df['12th'] = ((all_df[(all_df['grade']=='12th') & (all_df['math_score']>70)].groupby(['school_name']).count()['math_score']/(all_df[(all_df['grade']=='12th')].groupby(['school_name'])['math_score'].count()))*100).map("{:.1f}%".format)
math_percent_grade_level_df.index.name = None
math_percent_grade_level_df

Unnamed: 0,9th,10th,11th,12th
Bailey High School,65.1%,64.5%,66.5%,61.9%
Cabrera High School,88.6%,90.1%,88.8%,91.1%
Figueroa High School,62.0%,64.5%,62.6%,66.5%
Ford High School,66.5%,66.9%,65.3%,63.6%
Griffin High School,87.0%,91.1%,90.3%,90.8%
Hernandez High School,65.1%,65.4%,64.2%,64.1%
Holden High School,89.8%,93.0%,89.3%,90.4%
Huang High School,65.5%,60.5%,63.0%,64.3%
Johnson High School,64.5%,63.2%,64.8%,62.6%
Pena High School,90.2%,91.6%,92.6%,92.8%


#### Average Reading Scores:

In [40]:
# average reading scores by grade
reading_grade_level_df = pd.DataFrame()
reading_grade_level_df['9th'] = all_df[all_df['grade']=='9th'].groupby(['school_name']).mean()['reading_score'].map("{:.1f}".format)
reading_grade_level_df['10th'] = all_df[all_df['grade']=='10th'].groupby(['school_name']).mean()['reading_score'].map("{:.1f}".format)
reading_grade_level_df['11th'] = all_df[all_df['grade']=='11th'].groupby(['school_name']).mean()['reading_score'].map("{:.1f}".format)
reading_grade_level_df['12th'] = all_df[all_df['grade']=='12th'].groupby(['school_name']).mean()['reading_score'].map("{:.1f}".format)
reading_grade_level_df.index.name = None
reading_grade_level_df

Unnamed: 0,9th,10th,11th,12th
Bailey High School,81.3,80.9,80.9,80.9
Cabrera High School,83.7,84.3,83.8,84.3
Figueroa High School,81.2,81.4,80.6,81.4
Ford High School,80.6,81.3,80.4,80.7
Griffin High School,83.4,83.7,84.3,84.0
Hernandez High School,80.9,80.7,81.4,80.9
Holden High School,83.7,83.3,83.8,84.7
Huang High School,81.3,81.5,81.4,80.3
Johnson High School,81.3,80.8,80.6,81.2
Pena High School,83.8,83.6,84.3,84.6


#### Percent Passing Reading:

In [41]:
reading_percent_grade_level_df = pd.DataFrame()
reading_percent_grade_level_df['9th'] = ((all_df[(all_df['grade']=='9th') & (all_df['reading_score']>70)].groupby(['school_name']).count()['reading_score']/(all_df[(all_df['grade']=='9th')].groupby(['school_name'])['reading_score'].count()))*100).map("{:.1f}%".format)
reading_percent_grade_level_df['10th'] = ((all_df[(all_df['grade']=='10th') & (all_df['reading_score']>70)].groupby(['school_name']).count()['reading_score']/(all_df[(all_df['grade']=='10th')].groupby(['school_name'])['reading_score'].count()))*100).map("{:.1f}%".format)
reading_percent_grade_level_df['11th'] = ((all_df[(all_df['grade']=='11th') & (all_df['reading_score']>70)].groupby(['school_name']).count()['reading_score']/(all_df[(all_df['grade']=='11th')].groupby(['school_name'])['reading_score'].count()))*100).map("{:.1f}%".format)
reading_percent_grade_level_df['12th'] = ((all_df[(all_df['grade']=='12th') & (all_df['reading_score']>70)].groupby(['school_name']).count()['reading_score']/(all_df[(all_df['grade']=='12th')].groupby(['school_name'])['reading_score'].count()))*100).map("{:.1f}%".format)
reading_percent_grade_level_df.index.name = None
reading_percent_grade_level_df

Unnamed: 0,9th,10th,11th,12th
Bailey High School,79.8%,80.5%,78.4%,78.1%
Cabrera High School,93.9%,95.1%,92.9%,93.5%
Figueroa High School,79.6%,78.1%,76.7%,79.2%
Ford High School,77.0%,77.8%,78.5%,76.8%
Griffin High School,92.7%,93.8%,93.4%,93.8%
Hernandez High School,78.3%,77.9%,79.5%,76.9%
Holden High School,91.3%,92.1%,91.3%,97.6%
Huang High School,79.3%,79.7%,78.2%,77.8%
Johnson High School,79.2%,78.8%,76.4%,78.6%
Pena High School,95.3%,90.4%,92.2%,90.1%


#### Percent Overall Passing:

In [42]:
math_reading_grade_level_df = pd.DataFrame()
math_reading_grade_level_df['9th'] = ((all_df[(all_df['grade']=='9th') & (all_df['math_score']>70) & (all_df['reading_score']>70)].groupby(['school_name']).count()['math_score']/(all_df[(all_df['grade']=='9th')].groupby(['school_name'])['math_score'].count()))*100).map("{:.1f}%".format)
math_reading_grade_level_df['10th'] = ((all_df[(all_df['grade']=='10th') & (all_df['math_score']>70) & (all_df['reading_score']>70)].groupby(['school_name']).count()['math_score']/(all_df[(all_df['grade']=='10th')].groupby(['school_name'])['math_score'].count()))*100).map("{:.1f}%".format)
math_reading_grade_level_df['11th'] = ((all_df[(all_df['grade']=='11th') & (all_df['math_score']>70) & (all_df['reading_score']>70)].groupby(['school_name']).count()['math_score']/(all_df[(all_df['grade']=='11th')].groupby(['school_name'])['math_score'].count()))*100).map("{:.1f}%".format)
math_reading_grade_level_df['12th'] = ((all_df[(all_df['grade']=='12th') & (all_df['math_score']>70) & (all_df['reading_score']>70)].groupby(['school_name']).count()['math_score']/(all_df[(all_df['grade']=='12th')].groupby(['school_name'])['math_score'].count()))*100).map("{:.1f}%".format)
math_reading_grade_level_df.index.name = None
math_reading_grade_level_df

Unnamed: 0,9th,10th,11th,12th
Bailey High School,52.2%,51.7%,52.0%,47.9%
Cabrera High School,83.5%,85.6%,82.2%,85.1%
Figueroa High School,50.0%,49.4%,47.4%,53.3%
Ford High School,52.2%,52.4%,51.0%,48.8%
Griffin High School,80.2%,85.5%,84.5%,84.9%
Hernandez High School,50.4%,50.0%,50.8%,49.4%
Holden High School,81.9%,86.0%,81.6%,88.0%
Huang High School,50.8%,48.8%,50.2%,49.7%
Johnson High School,51.0%,49.6%,48.6%,49.9%
Pena High School,86.5%,83.6%,84.8%,84.0%


# Create a Statistical Table

based on the budget per student for average scores and % passing. stat ranges for budget per student will be as follows:  
  - row1 = min upto 25% Quanrtile
  - row2 = 25% to 50% Quartile
  - row3 = 50% to 75% Quartile
  - row4 = 75% to max

In [43]:
total_students_series = all_df.groupby(['school_name']).count()['size']  
budget_stats = (schools_df.set_index('school_name')['budget'] /total_students_series).describe()
budget_stats

count     15.000000
mean     620.066667
std       28.544368
min      578.000000
25%      591.500000
50%      628.000000
75%      641.500000
max      655.000000
dtype: float64

In [44]:
# create the spending bucket ranges for the per student budgets
spending_buckets = [0, budget_stats['25%'], budget_stats['50%'], budget_stats['75%'], budget_stats['max']]
spending_buckets

[0, 591.5, 628.0, 641.5, 655.0]

In [45]:
# Create Labels for the spending buckets
spending_group_names = [f"<{budget_stats['25%']}", 
               f"{budget_stats['25%']}-{budget_stats['50%']}", 
               f"{budget_stats['50%']}-{budget_stats['75%']}", 
               f">{budget_stats['75%']}" ]

spending_group_names

['<591.5', '591.5-628.0', '628.0-641.5', '>641.5']

In [46]:
#add the spending bucket labels to the school_summary_df saved in the list of numerical data
list_of_summary_dfs[1]["Spending Ranges (Per Student)"] = pd.cut(list_of_summary_dfs[1]['Budget Per Student'], spending_buckets, labels=spending_group_names)
list_of_summary_dfs[1]


Unnamed: 0_level_0,School Type,Total Students,Budget,Budget Per Student,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Passing Overall,Spending Ranges (Per Student)
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Huang High School,District,2917,1910635,655.0,76.629414,81.182722,65.683922,81.316421,53.513884,>641.5
Figueroa High School,District,2949,1884411,639.0,76.711767,81.15802,65.988471,80.739234,53.204476,628.0-641.5
Shelton High School,Charter,1761,1056600,600.0,83.359455,83.725724,93.867121,95.854628,89.892107,591.5-628.0
Hernandez High School,District,4635,3022020,652.0,77.289752,80.934412,66.752967,80.862999,53.527508,>641.5
Griffin High School,Charter,1468,917500,625.0,83.351499,83.816757,93.392371,97.138965,90.599455,591.5-628.0
Wilson High School,Charter,2283,1319574,578.0,83.274201,83.989488,93.867718,96.539641,90.582567,<591.5
Cabrera High School,Charter,1858,1081356,582.0,83.061895,83.97578,94.133477,97.039828,91.334769,<591.5
Bailey High School,District,4976,3124928,628.0,77.048432,81.033963,66.680064,81.93328,54.642283,591.5-628.0
Holden High School,Charter,427,248087,581.0,83.803279,83.814988,92.505855,96.252927,89.227166,<591.5
Pena High School,Charter,962,585858,609.0,83.839917,84.044699,94.594595,95.945946,90.540541,591.5-628.0


In [47]:
spending_summary = list_of_summary_dfs[1].groupby(['Spending Ranges (Per Student)']).mean()
del spending_summary['Total Students'], spending_summary['Budget'], spending_summary['Budget Per Student']
spending_summary

Unnamed: 0_level_0,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Passing Overall
Spending Ranges (Per Student),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
<591.5,83.455399,83.933814,93.460096,96.610877,90.369459
591.5-628.0,81.899826,83.155286,87.133538,92.718205,81.418596
628.0-641.5,78.990942,81.917212,75.209078,86.089386,65.713578
>641.5,77.023555,80.957446,66.70101,80.675217,53.717613


In [48]:
# save a copy of the df for future analysis, str formatting will prevent math functions from working
list_of_summary_dfs.append(spending_summary.copy())

# Format the spending table for a report
spending_summary['Average Math Score'] = spending_summary['Average Math Score'].map("{:.1f}".format)
spending_summary['Average Reading Score'] = spending_summary['Average Reading Score'].map("{:.1f}".format)
spending_summary['% Passing Math'] = spending_summary['% Passing Math'].map("{:.1f}%".format)
spending_summary['% Passing Reading'] = spending_summary['% Passing Reading'].map("{:.1f}%".format)
spending_summary['% Passing Overall'] = spending_summary['% Passing Overall'].map("{:.1f}%".format)

spending_summary

Unnamed: 0_level_0,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Passing Overall
Spending Ranges (Per Student),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
<591.5,83.5,83.9,93.5%,96.6%,90.4%
591.5-628.0,81.9,83.2,87.1%,92.7%,81.4%
628.0-641.5,79.0,81.9,75.2%,86.1%,65.7%
>641.5,77.0,81.0,66.7%,80.7%,53.7%


In [49]:
# school_size_summary: small: <= 1k; Medium: 1k - 2k; Large 2k-5k  students
school_stats = list_of_summary_dfs[1]['Total Students'].describe()
school_buckets = [0, school_stats['25%'], school_stats['50%'], school_stats['75%'], school_stats['max']]
school_size = [f'Small: <={school_stats["25%"]:,}', 
               f'Medium: {school_stats["25%"]:,}-{school_stats["50%"]:,}', 
               f'Large: {school_stats["50%"]:,}-{school_stats["75%"]:,}', 
               f"Extra Large: >{school_stats['75%']}" ]


list_of_summary_dfs[1]["School Size"] = pd.cut(list_of_summary_dfs[1]['Total Students'], school_buckets, labels=school_size)
school_size_summary = list_of_summary_dfs[1].groupby('School Size').mean()

In [50]:
del school_size_summary['Budget'], school_size_summary['Total Students'], school_size_summary['Budget Per Student']

# save a copy of the df for future analysis, str formatting will prevent math functions from working
list_of_summary_dfs.append(school_size_summary.copy())

In [51]:
# format spending_summary and school_size_summary

school_size_summary['Average Math Score'] = school_size_summary['Average Math Score'].map("{:.1f}".format)
school_size_summary['Average Reading Score'] = school_size_summary['Average Reading Score'].map("{:.1f}".format)
school_size_summary['% Passing Math'] = school_size_summary['% Passing Math'].map("{:.1f}%".format)
school_size_summary['% Passing Reading'] = school_size_summary['% Passing Reading'].map("{:.1f}%".format)
school_size_summary['% Passing Overall'] = school_size_summary['% Passing Overall'].map("{:.1f}%".format)
school_size_summary

Unnamed: 0_level_0,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Passing Overall
School Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"Small: <=1,698.0",83.6,83.9,93.4%,96.7%,90.3%
"Medium: 1,698.0-2,283.0",83.3,83.9,93.8%,96.5%,90.5%
"Large: 2,283.0-3,474.0",76.8,81.0,66.7%,80.5%,53.7%
Extra Large: >3474.0,77.1,80.9,66.5%,81.1%,53.7%


In [52]:
formatted_summary_dfs.append(spending_summary)
formatted_summary_dfs.append(school_size_summary)

NameError: name 'formatted_summary_dfs' is not defined

In [None]:
school_type_summary = list_of_summary_dfs[1].groupby(['School Type']).mean()
del school_type_summary['Total Students'], school_type_summary['Budget'], school_type_summary['Budget Per Student']
list_of_summary_dfs.append(school_type_summary.copy())

In [None]:
school_type_summary['Average Math Score'] = school_type_summary['Average Math Score'].map("{:.1f}".format)
school_type_summary['Average Reading Score'] = school_type_summary['Average Reading Score'].map("{:.1f}".format)
school_type_summary['% Passing Math'] = school_type_summary['% Passing Math'].map("{:.1f}%".format)
school_type_summary['% Passing Reading'] = school_type_summary['% Passing Reading'].map("{:.1f}%".format)
school_type_summary['% Passing Overall'] = school_type_summary['% Passing Overall'].map("{:.1f}%".format)
school_type_summary

In [None]:
formatted_summary_dfs.append(school_type_summary)