In [1]:
import os
import re

import pandas as pd

# Build the relative paths to both input CSV files inside the "Resources" folder
school_data_file, students_data_file = (
    os.path.join("Resources", csv_name)
    for csv_name in ("schools_complete.csv", "students_complete.csv")
)

In [2]:
# Read both CSV files into DataFrames, then preview the first ten student rows
schools_df = pd.read_csv(school_data_file)
students_df = pd.read_csv(students_data_file)
students_df.head(n=10)

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score
0,0,Paul Bradley,M,9th,Huang High School,66,79
1,1,Victor Smith,M,12th,Huang High School,94,61
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58
4,4,Bonnie Ray,F,9th,Huang High School,97,84
5,5,Bryan Miranda,M,9th,Huang High School,94,94
6,6,Sheena Carter,F,11th,Huang High School,82,80
7,7,Nicole Baker,F,12th,Huang High School,96,69
8,8,Michael Roth,M,10th,Huang High School,95,87
9,9,Matthew Greene,M,10th,Huang High School,96,84


In [3]:
# students_df['student_name'][3] carries a "Dr." title that should not be there; these are
# high school students, not doctors. Find and strip improper prefixes and suffixes from the names.
students_names = students_df['student_name'].tolist()

# Exploratory pass: among names with three or more space-separated parts, collect short first
# parts (<= 4 characters) and short last parts (<= 3 characters) as candidate prefixes/suffixes.
# Sets keep only the unique terms.
prefixes = set()
suffixes = set()
for name in students_names:
    name_parts = name.split(' ')
    if len(name_parts) >= 3:
        if len(name_parts[0]) <= 4:     # candidate prefix
            prefixes.add(name_parts[0])
        if len(name_parts[-1]) <= 3:    # candidate suffix
            suffixes.add(name_parts[-1])

prefixes = list(prefixes)
suffixes = list(suffixes)

# print(suffixes)  # printed the candidates to manually pick the unwanted prefixes and suffixes among the terms saved by the filter.
prefixes_suffixes = [' MD', ' PhD', ' DDS', ' DVM', 'Dr. ', 'Miss ', 'Mr. ', 'Mrs. ', 'Ms. ']

# Strip the unwanted terms without touching real given names or family names that the filter
# above also caught. Terms ending in a space are leading titles and are anchored to the start of
# the name; terms starting with a space are trailing credentials and are anchored to the end, so
# a match in the middle of a name is never removed. re.escape keeps the '.' in 'Dr. ' literal.
title_terms = [re.escape(term) for term in prefixes_suffixes if term.endswith(' ')]
credential_terms = [re.escape(term) for term in prefixes_suffixes if term.startswith(' ')]
affix_pattern = '^(?:{})+|(?:{})+$'.format('|'.join(title_terms), '|'.join(credential_terms))

# regex=True is passed explicitly so the result does not depend on the installed pandas
# version's default for this parameter.
students_df['student_name'] = students_df['student_name'].str.replace(affix_pattern, '', regex=True)

students_df.head()



Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score
0,0,Paul Bradley,M,9th,Huang High School,66,79
1,1,Victor Smith,M,12th,Huang High School,94,61
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60
3,3,Richard Scott,M,12th,Huang High School,67,58
4,4,Bonnie Ray,F,9th,Huang High School,97,84


In [4]:
# merge all the data into one dataframe
all_df = students_df.merge(schools_df, on='school_name')

# Gather data for the whole district and create a summary: school count, total budget,
# student count, average scores, % passed in each subject, % passed both.
PASSING_SCORE = 70    # minimum score counted as a pass in either subject

# After the merge every student row repeats its school's budget, so reduce to one row per
# school before summing. De-duplicating on the school name (rather than on the budget value)
# keeps two schools that happen to have the same budget from being counted only once.
# Assumes schools_df holds a single row per school_name -- TODO confirm.
per_school = all_df.drop_duplicates(subset='school_name')
budget = per_school['budget'].sum()    # total budget
total_schools = len(per_school)    # total number of schools in the district
total_students = all_df['Student ID'].count()    # total students
math_mean = all_df['math_score'].mean()    # average test scores for math
reading_mean = all_df['reading_score'].mean()    # average test scores for reading

# Boolean masks for passing each subject, computed once and reused below
passed_math = all_df['math_score'] >= PASSING_SCORE
passed_reading = all_df['reading_score'] >= PASSING_SCORE

pass_math = all_df.loc[passed_math, 'Student ID'].count()    # total students passing math
pass_reading = all_df.loc[passed_reading, 'Student ID'].count()    # total students passing reading
pass_math_reading = all_df.loc[passed_math & passed_reading, 'Student ID'].count()    # total students passing both subjects

math_passing_percent = pass_math / total_students * 100    # percent of students passing math
reading_passing_percent = pass_reading / total_students * 100    # percent of students passing reading
pass_both_percent = pass_math_reading / total_students * 100    # percent of students passing both subjects

district_summary_df = pd.DataFrame([{
    'Total Schools': total_schools,
    'Total Budget': budget,
    'Total Students': total_students,
    'Average Math Score': math_mean,
    'Average Reading Score': reading_mean,
    '% Passing Math': math_passing_percent,
    '% Passing Reading': reading_passing_percent,
    '% Overall Passing': pass_both_percent,
}])
district_summary_df

Unnamed: 0,Total Schools,Total Budget,Total Students,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
0,15,24649428,39170,78.985371,81.87784,74.980853,85.805463,65.172326


In [5]:

# format the District Summary DataFrame
# Each column is converted to display text with its own format spec. Values that are already
# strings are passed through unchanged, so running this cell a second time is a no-op instead
# of raising ValueError (the numeric format specs below reject str input).
summary_formats = {
    "Total Students": "{:,}",
    "Total Budget": "${:,.2f}",
    "% Passing Math": "{:.1f}%",
    "% Passing Reading": "{:.1f}%",
    "% Overall Passing": "{:.1f}%",
    "Average Math Score": "{:.1f}",
    "Average Reading Score": "{:.1f}",
}

for column, spec in summary_formats.items():
    # spec is bound as a default argument so each lambda keeps its own format string
    district_summary_df[column] = district_summary_df[column].map(
        lambda value, spec=spec: value if isinstance(value, str) else spec.format(value)
    )

district_summary_df

Unnamed: 0,Total Schools,Total Budget,Total Students,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
0,15,"$24,649,428.00",39170,79.0,81.9,75.0%,85.8%,65.2%
