In [1]:
# Dependencies and Setup
import os
import csv
import math
import statistics
import pandas as pd
import numpy as np

In [2]:
# Relative paths of the two CSV inputs: the schools file and the students file
school_data_row, student_data_row = (
    "Resources/schools_complete.csv",
    "Resources/students_complete.csv",
)

In [3]:
# Load the schools table and the students table from their CSV files
school_df = pd.read_csv(school_data_row)
student_df = pd.read_csv(student_data_row)

# Attach each school's columns to every student row. The left join keeps every
# student row; the join key is the single shared "school_name" column (it was
# previously listed twice). validate="many_to_one" makes pandas raise a
# MergeError if a school name occurs more than once in school_df, which would
# otherwise silently duplicate student rows and skew every average below.
school_data_complete = pd.merge(
    student_df,
    school_df,
    how="left",
    on="school_name",
    validate="many_to_one",
)

# The merge result is already a DataFrame, so no re-wrapping is needed;
# data_df is simply the name the following cells use for it.
data_df = school_data_complete
data_df.head()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635


In [4]:
# Keep only the rows whose math_score is at least 70
math_pass_mask = data_df["math_score"].ge(70)
pass_math_df = data_df.loc[math_pass_mask]
pass_math_df.head()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635
5,5,Bryan Miranda,M,9th,Huang High School,94,94,0,District,2917,1910635
6,6,Sheena Carter,F,11th,Huang High School,82,80,0,District,2917,1910635
8,8,Michael Roth,M,10th,Huang High School,95,87,0,District,2917,1910635


In [5]:
# Number of distinct "Student ID" values among the rows of pass_math_df
pass_math_df.loc[:, "Student ID"].nunique()

29370

In [6]:
# Distinct students in pass_math_df as a percentage of all distinct students
n_math_passers = pass_math_df["Student ID"].nunique()
n_students_total = data_df["Student ID"].nunique()
percentage_pass_math = n_math_passers / n_students_total * 100
percentage_pass_math

74.9808526933878

In [7]:
# Keep only the rows whose reading_score is at least 70
reading_pass_mask = data_df["reading_score"].ge(70)
pass_reading_df = data_df.loc[reading_pass_mask]
pass_reading_df.head()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635
5,5,Bryan Miranda,M,9th,Huang High School,94,94,0,District,2917,1910635
6,6,Sheena Carter,F,11th,Huang High School,82,80,0,District,2917,1910635


In [8]:
# Number of distinct "Student ID" values among the rows of pass_reading_df
pass_reading_df.loc[:, "Student ID"].nunique()

33610

In [9]:
# Distinct students in pass_reading_df as a percentage of all distinct students
n_reading_passers = pass_reading_df["Student ID"].nunique()
n_students_total = data_df["Student ID"].nunique()
percentage_pass_reading = n_reading_passers / n_students_total * 100
percentage_pass_reading

85.80546336482001

In [10]:
# Overall passing rate, defined here as the plain average of the math and
# reading passing percentages (not the share of students who passed both)
percentage_overall_passing = (percentage_pass_reading + percentage_pass_math) / 2
percentage_overall_passing

80.39315802910392

In [11]:
# Sum of the "budget" column over all rows of school_df
school_df.loc[:, "budget"].sum()

24649428

In [12]:
# Gather the district-wide figures into a single record, then turn that record
# into a one-row DataFrame; columns= pins the column order to the order below
district_metrics = {
    "Total Schools": data_df["school_name"].nunique(),
    "Total Students": data_df["Student ID"].nunique(),
    "Total Budget": school_df["budget"].sum(),
    "Average Math Score": data_df["math_score"].mean(),
    "Average Reading Score": data_df["reading_score"].mean(),
    "% Passing Math": percentage_pass_math,
    "% Passing Reading": percentage_pass_reading,
    "% Overall Passing Rate": percentage_overall_passing,
}
district_summary_df = pd.DataFrame([district_metrics], columns=list(district_metrics))

district_summary_df

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing Rate
0,15,39170,24649428,78.985371,81.87784,74.980853,85.805463,80.393158


In [None]:
# Inspect the dtype pandas assigned to each column of data_df
data_df.dtypes

In [None]:
# Group the merged student rows by school. "type" is a second grouping key, so
# every per-group result below is indexed by (school_name, type).
grouped_schools_df = data_df.groupby(["school_name", "type"])

# A GroupBy object has no tabular display of its own; an aggregation has to be
# applied first. Here: non-null counts per column, one row per group.
# (The former `grouped_schools_df.agg(["Student ID"].count())` line was removed:
# it called list.count() without an argument and raised a TypeError.)
grouped_schools_df.count().head(15)

In [None]:
# Total students per group: number of non-null "Student ID" entries in each group
total_students = grouped_schools_df["Student ID"].agg("count")
total_students.head(15)

In [None]:
# Average math score per group: mean of math_score within each group
school_average_math = grouped_schools_df["math_score"].agg("mean")
school_average_math.head(15)

In [None]:
# Average reading score per group: mean of reading_score within each group
school_average_reading = grouped_schools_df["reading_score"].agg("mean")
school_average_reading.head(15)

In [None]:
# Budget per group: the largest "budget" value found among the group's rows.
# NOTE(review): presumably the budget is identical on every row of a school
# after the merge, so max() just picks that value -- confirm.
budget_per_school = grouped_schools_df["budget"].agg("max")
budget_per_school.head(15)

In [None]:
# Budget per student: each group's budget divided by its student count
budget_per_student = budget_per_school.div(total_students)
budget_per_student.head(15)

In [None]:
# Group the math-passing rows by the same keys as grouped_schools_df so that
# numerator and denominator carry an identical (school_name, type) index
# instead of relying on implicit alignment between different index shapes.
pass_math_per_school = pass_math_df.groupby(["school_name", "type"])

# All students per group (denominator)
students_per_school = grouped_schools_df["Student ID"].count()

# Passing students per group (numerator). A group without a single passing
# student is absent from pass_math_df, so it is filled in with 0 to yield 0%
# rather than NaN.
math_passers_per_school = (
    pass_math_per_school["Student ID"]
    .count()
    .reindex(students_per_school.index, fill_value=0)
)

# Percentage of students passing math in each group
perc_pass_math = (math_passers_per_school / students_per_school) * 100
perc_pass_math.head(15)

In [None]:
# Group the reading-passing rows by the same keys as grouped_schools_df so that
# numerator and denominator carry an identical (school_name, type) index
# instead of relying on implicit alignment between different index shapes.
pass_reading_per_school = pass_reading_df.groupby(["school_name", "type"])

# All students per group (denominator)
students_per_school = grouped_schools_df["Student ID"].count()

# Passing students per group (numerator). A group without a single passing
# student is absent from pass_reading_df, so it is filled in with 0 to yield 0%
# rather than NaN.
reading_passers_per_school = (
    pass_reading_per_school["Student ID"]
    .count()
    .reindex(students_per_school.index, fill_value=0)
)

# Percentage of students passing reading in each group
perc_pass_reading = (reading_passers_per_school / students_per_school) * 100
perc_pass_reading.head(15)

In [None]:
# Overall passing rate per group: the plain average of the math and reading
# passing percentages
perc_overall_passing = perc_pass_math.add(perc_pass_reading).div(2)
perc_overall_passing.head(15)

In [None]:
# Combine per-group results into one table. The Series are passed directly so
# pandas aligns them on their shared group index and yields one row per group.
# (Previously each value was `[{series}]`, i.e. a set containing a Series, which
# raises "unhashable type", and "Total Students" pointed at the math average.)
school_summary_df = pd.DataFrame({
    "Total Students": total_students,
    "Average Math Score": school_average_math,
})
school_summary_df.head(15)

In [None]:
# Per-group summary of the two average scores. The Series are passed directly
# (not wrapped in lists): wrapping them produced a single-row frame whose cells
# each held an entire Series instead of one row per group.
# TODO: add the remaining per-school columns (student count, budgets, passing
# percentages) once they are finalized.
school_summary_df = pd.DataFrame({
    "Average Math Score": school_average_math,
    "Average Reading Score": school_average_reading,
})

school_summary_df.head()

In [None]:
# Re-group the math-passing rows by school name only and show the non-null
# count of every column for the first 15 schools
pass_math_per_school = pass_math_df.groupby(by=["school_name"])
pass_math_per_school.agg("count").head(15)

In [None]:
# Percentage of students passing math, as a one-column DataFrame.
# Only the "Student ID" counts are divided; the earlier version divided every
# column of two full count tables and then discarded all but one. It also
# re-bound perc_pass_math to a DataFrame of fractions, clobbering the Series of
# percentages that the overall-passing-rate cell reads; that name is now left alone.
math_pass_pct = (
    pass_math_per_school["Student ID"].count()
    / grouped_schools_df["Student ID"].count()
) * 100
perc_pass_math_value = math_pass_pct.to_frame(name="% Passing Math")
perc_pass_math_value.head(15)

In [None]:
# Re-group the reading-passing rows by school name only and show the non-null
# count of every column for the first 15 schools
pass_reading_per_school = pass_reading_df.groupby(by=["school_name"])
pass_reading_per_school.agg("count").head(15)

In [None]:
# Percentage of students passing reading, as a one-column DataFrame.
# Only the "Student ID" counts are divided; the earlier version divided every
# column of two full count tables and then discarded all but one. It also
# re-bound perc_pass_reading to a DataFrame of fractions, clobbering the Series
# of percentages that the overall-passing-rate cell reads; that name is now left alone.
reading_pass_pct = (
    pass_reading_per_school["Student ID"].count()
    / grouped_schools_df["Student ID"].count()
) * 100
perc_pass_reading_value = reading_pass_pct.to_frame(name="% Passing Reading")
perc_pass_reading_value.head(15)

In [None]:
# Check the dtype of the column(s) in the math passing-percentage frame
perc_pass_math_value.dtypes

In [None]:
# Check the dtype of the column(s) in the reading passing-percentage frame
perc_pass_reading_value.dtypes

In [None]:
# Attach each group's average math score to every student row.
# A GroupBy object does not support column assignment, and SeriesGroupBy.mean()
# takes no axis=1, so the previous statement could not run. transform("mean")
# returns the group mean broadcast back onto the original row index, and
# assign() stores it in a new frame, leaving data_df itself unchanged.
data_with_school_avg_df = data_df.assign(
    school_average_math=grouped_schools_df["math_score"].transform("mean")
)
data_with_school_avg_df.head(15)

In [None]:
# Put the two passing percentages side by side. pd.merge only accepts
# DataFrame/Series objects (the lists passed before raised a TypeError); since
# both columns share the same index, a column-wise concat is sufficient.
summary_passing_per_school = pd.concat(
    [
        perc_pass_math_value["% Passing Math"],
        perc_pass_reading_value["% Passing Reading"],
    ],
    axis=1,
)

# Overall passing rate: the plain average of the two percentages, stored in a
# new column
summary_passing_per_school = summary_passing_per_school.assign(
    **{
        "% Overall Passing Rate": (
            summary_passing_per_school["% Passing Math"]
            + summary_passing_per_school["% Passing Reading"]
        ) / 2
    }
)
summary_passing_per_school.head(15)

In [None]:
# Index the merged student table by school name, under its own variable name.
# The result was previously assigned to school_df, which replaced the schools
# table loaded from the CSV; re-running the budget cells afterwards would then
# have summed the budget once per student row instead of once per school.
students_by_school_df = data_df.set_index("school_name")
students_by_school_df.head()

In [None]:
# School Summary
# Create an overview table that summarizes key metrics about each school, including:
# School Name
# School Type - DONE
# Total Students - DONE
# Total School Budget - DONE
# Per Student Budget - DONE
# Average Math Score - DONE
# Average Reading Score - DONE
# % Passing Math - DONE
# % Passing Reading - DONE
# Overall Passing Rate (Average of the above two)- DONE