In [1]:
# Dependencies and Setup
import pandas as pd
from pathlib import Path

# File to Load (Remember to Change These)
school_data_to_load = Path("Resources/schools_complete.csv")
student_data_to_load = Path("Resources/students_complete.csv")

# Read School and Student Data File and store into Pandas DataFrames
school_data = pd.read_csv(school_data_to_load)
student_data = pd.read_csv(student_data_to_load)

# Combine the data into a single dataset.  
school_data_complete = pd.merge(student_data, school_data, how="left", on=["school_name", "school_name"])
school_data_complete.head(10000)

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635
...,...,...,...,...,...,...,...,...,...,...,...
9995,9995,Patrick Campbell,M,12th,Hernandez High School,65,83,3,District,4635,3022020
9996,9996,Kristen Barrera,F,11th,Hernandez High School,69,73,3,District,4635,3022020
9997,9997,Eric Hill,M,11th,Hernandez High School,72,85,3,District,4635,3022020
9998,9998,Steven Fox,M,9th,Hernandez High School,85,67,3,District,4635,3022020


In [25]:
school_data_complete["type"].unique()

'District'

In [2]:
# Total number of unique schools
school_count = school_data_complete["school_name"].nunique()
school_count

15

In [3]:
# Total Students
student_count = school_data_complete["student_name"].count()
student_count

39170

In [4]:
# Since the number of budgets is the same as the number of unique schools we can sum the unique values. 
# We wouldn't be able to do this if some schools had the same budget
school_data_complete["budget"].nunique() == school_data_complete["school_name"].nunique()

True

In [5]:
# Total budget
total_budget = school_data_complete["budget"].unique().sum()
total_budget

24649428

In [6]:
# Average math score
average_math_score = school_data_complete["math_score"].mean()
average_math_score

78.98537145774827

In [7]:
# Average reading score
average_reading_score = school_data_complete["reading_score"].mean()
average_reading_score

81.87784018381414

In [8]:
# Percentage of students who passed math
passing_math_students = ((school_data_complete[school_data_complete["math_score"] >= 70].count()["student_name"]) /float(school_data_complete["student_name"].count())) * 100
passing_math_students



# Use the following to calculate the percentage of students who passed math (math scores greather than or equal to 70)
# passing_math_count = school_data_complete[(school_data_complete["math_score"] >= 70)].count()["student_name"]
# passing_math_percentage = passing_math_count / float(student_count) * 100
# passing_math_percentage

74.9808526933878

In [9]:
# Percentage of students who passed reading
passing_reading_students = ((school_data_complete[school_data_complete["reading_score"] >= 70].count()["student_name"])/school_data_complete["student_name"].count()) * 100
passing_reading_students


passing_reading_students = ((school_data_complete[school_data_complete["reading_score"] >= 70].count()["student_name"]) /float(school_data_complete["student_name"].count())) * 100
passing_reading_students

85.80546336482001

In [13]:
# Percentage of students who passed both math and reading (my failed attempt, ask in tutoring)

# All students who passed math
#math_df = school_data_complete[school_data_complete["math_score"] >= 70]["student_name"]

# All students who passed reading 
#reading_df = school_data_complete[school_data_complete["reading_score"] >= 70]["student_name"]
#passing_both_df = pd.merge(math_df, reading_df, on='student_name')
#passing_both_df.nunique()/(school_data_complete["student_name"].count()) * 100

In [11]:
# Percentage of students who passed both math and reading
passing_math_reading_count = school_data_complete[
    (school_data_complete["math_score"] >= 70) & (school_data_complete["reading_score"] >= 70)
].count()["student_name"]

overall_passing_rate = passing_math_reading_count /  float(student_count) * 100
overall_passing_rate

65.17232575950983

In [27]:
# Create a high-level snapshot of the district's key metrics in a DataFrame
district_summary = pd.DataFrame({
    "Total Schools": [school_count],
    "Total Students": [student_count],
    "Total Budget": [total_budget],
    "Average Math Score": [average_math_score],
    "Average Reading Score": [average_reading_score],
    "% passing math": [passing_math_students],
    "% passing reading": [passing_reading_students],
    "% overall passing": [overall_passing_rate],
})

# Formatting
district_summary["Total Students"] = district_summary["Total Students"].map("{:,}".format)
district_summary["Total Budget"] = district_summary["Total Budget"].map("${:,.2f}".format)


for column in district_summary[["% passing math", "% passing reading", "% overall passing"]]:
    district_summary[column] = district_summary[column].map("{:,.2f}%".format)

# Display the DataFrame
district_summary

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% passing math,% passing reading,% overall passing
0,15,39170,"$24,649,428.00",78.985371,81.87784,74.98%,85.81%,65.17%


In [55]:
def school_stats(school_df, summary_df):
    school_name = school_df["school_name"][1]
    print(school_name)
    school_type = school_df["type"][1]
    
    student_count = school_df["student_name"].count()
    
    total_budget = 0 #school_df["budget"].nunique() coming out as an array, figure this out
    
    per_student_budget = 0 #total_budget/student_count coming out as an array, figure this out
    
    average_math_score = school_df["math_score"].mean()
    
    average_reading_score = school_df["reading_score"].mean()
    
    passing_math_students = ((school_df[school_df["math_score"] >= 70].count()["student_name"]) /float(school_df["student_name"].count())) * 100
    
    passing_reading_students = ((school_df[school_df["reading_score"] >= 70].count()["student_name"]) /float(school_df["student_name"].count())) * 100
    
    passing_math_reading_count = school_df[
    (school_df["math_score"] >= 70) & (school_df["reading_score"] >= 70)].count()["student_name"]

    overall_passing_rate = passing_math_reading_count /  float(student_count) * 100 
    
    summary_row = {"School Name": school_name, "School Type": school_type, "Total Students": student_count, "Total Budget": total_budget,\
                  "Per Student Budget": per_student_budget, "Average Math Score": average_math_score, "Average Reading Score": average_reading_score,\
                  "% passing math": passing_math_students, "% passing reading": passing_reading_students, "% overall passing": overall_passing_rate}
    
    #summary_df = summary_df.append(summary_row, ignore_index=True)
    
    summary_df = pd.concat([summary_df, pd.DataFrame.from_records([summary_row])])
    
    return summary_row



#where im at rn: function isnt appending the summary row to the summary dataframe

In [51]:
stats_by_school_df = pd.DataFrame({
    "School Name": [],
    "School Type": [],
    "Total Students": [],
    "Total Budget": [],
    "Per Student Budget": [],
    "Average Math Score": [],
    "Average Reading Score": [],
    "% passing math": [],
    "% passing reading": [],
    "% overall passing": [],
})

stats_by_school_df

Unnamed: 0,School Name,School Type,Total Students,Total Budget,Per Student Budget,Average Math Score,Average Reading Score,% passing math,% passing reading,% overall passing


In [52]:
school_stats(school_data_complete[school_data_complete["school_name"] == "Huang High School"], stats_by_school_df)

Huang High School


{'School Name': 'Huang High School',
 'School Type': 'District',
 'Total Students': 2917,
 'Total Budget': 0,
 'Per Student Budget': 0,
 'Average Math Score': 76.62941378128214,
 'Average Reading Score': 81.18272197463148,
 '% passing math': 65.68392183750429,
 '% passing reading': 81.31642098045938,
 '% overall passing': 53.51388412752828}

In [53]:
stats_by_school_df


Unnamed: 0,School Name,School Type,Total Students,Total Budget,Per Student Budget,Average Math Score,Average Reading Score,% passing math,% passing reading,% overall passing


In [54]:
stats_by_school_df = pd.concat([stats_by_school_df, pd.DataFrame.from_records(school_stats(school_data_complete[school_data_complete["school_name"] == "Huang High School"], stats_by_school_df))])

Huang High School


ValueError: If using all scalar values, you must pass an index

In [45]:
school_data_complete[school_data_complete["school_name"] == "Huang High School"]

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635
...,...,...,...,...,...,...,...,...,...,...,...
2912,2912,Michael Combs,M,9th,Huang High School,98,95,0,District,2917,1910635
2913,2913,Monica Barajas,F,9th,Huang High School,64,81,0,District,2917,1910635
2914,2914,Carlos Garner,M,12th,Huang High School,66,73,0,District,2917,1910635
2915,2915,April Williams,F,12th,Huang High School,70,85,0,District,2917,1910635
