# PyCity Schools Analysis

In [1]:
# Dependencies and Setup
import pandas as pd
from pathlib import Path

# Input files, given relative to the notebook's working directory.
# Change these if the Resources folder lives somewhere else.
school_data_to_load = Path("../Resources/schools_complete.csv")
student_data_to_load = Path("../Resources/students_complete.csv")

# Read the school and student data files into pandas DataFrames.
schools_df = pd.read_csv(school_data_to_load)
student_df = pd.read_csv(student_data_to_load)

# Combine the data into a single dataset by attaching each school's columns
# to the student rows that carry the same "school_name".
# - The join key is named once; listing it twice adds nothing.
# - how="outer" keeps rows from either frame even when the other has no match.
# - validate="many_to_one" makes the merge raise if a school name appears more
#   than once in schools_df, which would otherwise duplicate student rows
#   without any warning.
data_complete = pd.merge(
    student_df,
    schools_df,
    how="outer",
    on="school_name",
    validate="many_to_one",
)

Huang, Figueroa, Shelton, Hernandez, Griffin, Wilson, Cabrera, Bailey, Holden, Pena, Wright, Rodriguez, Johnson, Ford, Thomas

There are 15 schools - store in a variable

In [2]:
# Number of distinct schools. nunique() is used instead of len() so that a
# repeated row in schools_df cannot inflate the count.
school_count = schools_df["school_name"].nunique()

# Summary statistics of the merged data; as the last expression in the cell
# this is what the notebook displays.
data_complete.describe()

this shows 39,170 students

The length of the Student ID column is 39,170, the same as for every other column.

In [3]:
# Count distinct student IDs. dropna=False keeps a missing value as its own
# entry, which is how unique() treats it as well.
student_count = student_df["Student ID"].nunique(dropna=False)
print(student_count)

# Same count for student names, to compare against the number of IDs.
student_name_count = student_df["student_name"].nunique(dropna=False)
print(student_name_count)

39170
32715


there are fewer unique student names than there are student IDs
I'm sure that some students have the same name, but this is a big difference

In [4]:
# Join students to schools on the school name using merge's default join type.
data_inner = student_df.merge(schools_df, on=["school_name"])


get enrollment count at each school from the size column

In [5]:
# Enrollment of every school, one entry per row of schools_df.
# The sizes are deliberately NOT de-duplicated: if two schools happened to
# have the same enrollment, unique() would drop one of them and the total
# below would come out too low.
enrollments = schools_df["size"].to_numpy()
print(enrollments)

# District-wide enrollment is the sum over all schools.
enrollments_total = enrollments.sum()
print(enrollments_total)



[2917 2949 1761 4635 1468 2283 1858 4976  427  962 1800 3999 4761 2739
 1635]
39170


the sum of the individual sizes of the schools is 39170
student_count = 39170

let's do the same thing for the budget

In [6]:
# Budget of every school, one entry per row of schools_df.
# The values are deliberately NOT de-duplicated: two schools with an identical
# budget must both contribute to the district total.
budgets_all = schools_df["budget"].to_numpy()
print(budgets_all)

# District-wide budget is the sum over all schools.
budgets_total = budgets_all.sum()
print(budgets_total)


[1910635 1884411 1056600 3022020  917500 1319574 1081356 3124928  248087
  585858 1049400 2547363 3094650 1763916 1043130]
24649428


There are 15 unique budget values.  This is as expected
The total is 24649428 stored in budgets_total

for the average math scores, we'll want to look at the merged datafile

In [7]:
# Mean of each score column over the merged data, reading first and then math.
avg_reading_score, avg_math_score = (
    data_complete[column].mean() for column in ("reading_score", "math_score")
)

print(avg_reading_score)
print(avg_math_score)

81.87784018381414
78.98537145774827


overall average math score stored in avg_math_score
overall average reading score stored in avg_reading_score

Create a new pass/fail column for the math score, where a score of 70 or higher is a pass and anything lower is a fail,
and a matching column for the reading score.

In [20]:

# Minimum score (inclusive) that counts as passing a subject.
PASSING_SCORE = 70

# Label every student "pass" or "fail" per subject. The comparison runs on the
# whole column at once; a missing score compares False and is labelled "fail".
data_complete["math_pass"] = (
    data_complete["math_score"] >= PASSING_SCORE
).map({True: "pass", False: "fail"})

data_complete["reading_pass"] = (
    data_complete["reading_score"] >= PASSING_SCORE
).map({True: "pass", False: "fail"})

# Number of students per label for each subject.
math_num_pass = data_complete["math_pass"].value_counts()
reading_num_pass = data_complete["reading_pass"].value_counts()

# Look the count up by its "pass" label. value_counts() sorts by frequency, so
# position 0 is whichever label is most common and would be the "fail" count
# if most students failed. A subject nobody passed yields 0.
dist_num_pass_math = math_num_pass.get("pass", 0)
dist_num_pass_reading = reading_num_pass.get("pass", 0)


print(dist_num_pass_math)
print(dist_num_pass_reading)

pass    36211
fail     2959
Name: math_pass, dtype: int64
pass    39170
Name: reading_pass, dtype: int64


Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget,math_pass,reading_pass
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635,pass,pass
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635,pass,pass
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635,pass,pass
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635,fail,pass
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635,pass,pass


get percents passing
math_percent_pass
reading_percent_pass
overall_percent_pass

In [22]:
# Passing students as a percentage of all students, math first, then reading.
math_percent_pass, reading_percent_pass = (
    passed / student_count * 100
    for passed in (dist_num_pass_math, dist_num_pass_reading)
)

def get_status(row):
    """Return "pass" if the row passed both subjects, otherwise "fail".

    `row` is indexed by the keys "math_pass" and "reading_pass"; each is
    compared against the string "pass". The reading entry is only looked at
    when the math entry is a pass.
    """
    passed_both = all(
        row[subject] == "pass" for subject in ("math_pass", "reading_pass")
    )
    return "pass" if passed_both else "fail"

# Label every student with get_status, which returns "pass" only when both the
# math and the reading label are "pass". The function is passed directly;
# wrapping it in a lambda adds nothing.
data_complete["overall pass"] = data_complete.apply(get_status, axis=1)

# Number of students per overall label.
overall_num_pass = data_complete["overall pass"].value_counts()

# Look the count up by its "pass" label. value_counts() sorts by frequency, so
# position 0 would be the "fail" count whenever fewer than half of the
# students pass both subjects. No passing student at all yields 0.
dist_num_pass_overall = overall_num_pass.get("pass", 0)

# Students passing both subjects as a percentage of all students.
overall_percent_pass = (dist_num_pass_overall / student_count)*100

print(math_percent_pass)
print(reading_percent_pass)
print(overall_percent_pass)


74.9808526933878
85.80546336482001
65.17232575950983


Create District Summary Dataframe

In [23]:
# One-row frame of the district-wide figures. The student count and the budget
# are then replaced by display strings: thousands separators for the count,
# and a dollar sign with two decimals for the budget.
district_summary = pd.DataFrame(
    [
        {
            "Total Schools": school_count,
            "Total Students": student_count,
            "Total Budget": budgets_total,
            "Average Math Score": avg_math_score,
            "Average Reading Score": avg_reading_score,
            "% Passing Math": math_percent_pass,
            "% Passing Reading": reading_percent_pass,
            "% Overall Passing": overall_percent_pass,
        }
    ]
).assign(
    **{
        "Total Students": lambda frame: frame["Total Students"].map("{:,}".format),
        "Total Budget": lambda frame: frame["Total Budget"].map("${:,.2f}".format),
    }
)

# Last expression in the cell, so the notebook renders the frame.
district_summary

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
0,15,39170,"$24,649,428.00",78.985371,81.87784,74.980853,85.805463,65.172326
