In [1]:
# Dependencies and Setup
import pandas as pd
import numpy as np

# Files to load (adjust these relative paths if the data lives elsewhere)
school_data_to_load = "Resources/schools_complete.csv"
student_data_to_load = "Resources/students_complete.csv"

# Read the school and student CSV files into pandas DataFrames
school_data = pd.read_csv(school_data_to_load)
student_data = pd.read_csv(student_data_to_load)

# Combine the data into a single dataset: a left join keeps every student row
# and attaches the attributes of the matching school.
# - The join key is given once; repeating it in the `on` list is redundant.
# - validate="many_to_one" makes pandas raise a MergeError if a school name
#   appears more than once in school_data, instead of silently duplicating
#   student rows.
school_data_complete = pd.merge(
    student_data,
    school_data,
    how="left",
    on="school_name",
    validate="many_to_one",
)
school_data_complete

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635
...,...,...,...,...,...,...,...,...,...,...,...
39165,39165,Donna Howard,F,12th,Thomas High School,99,90,14,Charter,1635,1043130
39166,39166,Dawn Bell,F,10th,Thomas High School,95,70,14,Charter,1635,1043130
39167,39167,Rebecca Tanner,F,9th,Thomas High School,73,84,14,Charter,1635,1043130
39168,39168,Desiree Kidd,F,10th,Thomas High School,99,90,14,Charter,1635,1043130


In [2]:
# DataFrame.describe()
# describe() summarises every numeric column independently: count, mean, standard deviation, minimum, the 25%/50%/75% quartiles and maximum. Looking at the spread (std, min/max, quartiles) next to the mean shows where most of the values in a column sit and how much they vary. Note that identifier columns such as "Student ID" and "School ID" are numeric and therefore included, although their statistics carry no real meaning.

school_data_complete.describe() 

Unnamed: 0,Student ID,reading_score,math_score,School ID,size,budget
count,39170.0,39170.0,39170.0,39170.0,39170.0,39170.0
mean,19584.5,81.87784,78.985371,6.978172,3332.95711,2117241.0
std,11307.549359,10.23958,12.309968,4.444329,1323.914069,874998.7
min,0.0,63.0,55.0,0.0,427.0,248087.0
25%,9792.25,73.0,69.0,3.0,1858.0,1081356.0
50%,19584.5,82.0,79.0,7.0,2949.0,1910635.0
75%,29376.75,91.0,89.0,11.0,4635.0,3022020.0
max,39169.0,99.0,99.0,14.0,4976.0,3124928.0


In [3]:
# DataFrame.groupby()
# groupby() splits the rows into groups that share the same key values -- here
# every (grade, type) combination -- so that an aggregation can be computed once
# per group. The result below is a Series of mean math scores indexed by
# (grade, type).

school_data_group = (
    school_data_complete
    .groupby(by=["grade", "type"])
    .math_score
    .mean()
)
school_data_group

grade  type    
10th   Charter     83.513919
       District    76.844499
11th   Charter     83.516814
       District    77.036037
12th   Charter     83.451482
       District    76.973142
9th    Charter     83.177182
       District    77.080761
Name: math_score, dtype: float64

In [4]:
# Wrap the grouped Series in a DataFrame for tabular display: the (grade, type)
# MultiIndex is kept and "math_score" becomes the single column.
# Note: this rebinds the same name from a Series to a DataFrame.
school_data_group = pd.DataFrame(school_data_group)
school_data_group 

Unnamed: 0_level_0,Unnamed: 1_level_0,math_score
grade,type,Unnamed: 2_level_1
10th,Charter,83.513919
10th,District,76.844499
11th,Charter,83.516814
11th,District,77.036037
12th,Charter,83.451482
12th,District,76.973142
9th,Charter,83.177182
9th,District,77.080761


In [5]:
# DataFrame.query()
# Filter rows with a boolean expression written as a string: keep only the
# students who scored below 70 in math AND below 80 in reading. Inside query()
# the `&` operator binds like `and`, so both comparisons are evaluated first.
school_query = school_data_complete.query(
    expr="math_score < 70 & reading_score < 80"
)
school_query

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635
18,18,Kevin Stevens,M,9th,Huang High School,64,69,0,District,2917,1910635
28,28,Kelly James,F,11th,Huang High School,73,55,0,District,2917,1910635
37,37,Jesse Newton,M,10th,Huang High School,63,66,0,District,2917,1910635
40,40,Matthew Mayer,M,9th,Huang High School,79,69,0,District,2917,1910635
...,...,...,...,...,...,...,...,...,...,...,...
39020,39020,Sandra Mcdowell,F,12th,Thomas High School,76,69,14,Charter,1635,1043130
39058,39058,Kenneth Moore,M,12th,Thomas High School,70,69,14,Charter,1635,1043130
39061,39061,Jason Rodgers,M,9th,Thomas High School,76,69,14,Charter,1635,1043130
39109,39109,Ernest Lawrence,M,11th,Thomas High School,69,69,14,Charter,1635,1043130


In [6]:
# Summary statistics for the filtered (low-scoring) students only, for
# comparison with the describe() of the full frame above.
school_query.describe()

Unnamed: 0,Student ID,reading_score,math_score,School ID,size,budget
count,4400.0,4400.0,4400.0,4400.0,4400.0,4400.0
mean,19718.512727,71.203864,62.415227,7.025682,3915.164545,2510879.0
std,11949.409138,4.893132,4.494557,4.678772,1021.418447,661061.9
min,3.0,63.0,55.0,0.0,427.0,248087.0
25%,8662.5,67.0,58.0,3.0,2917.0,1884411.0
50%,20788.5,71.0,63.0,7.0,3999.0,2547363.0
75%,30600.25,75.0,67.0,12.0,4761.0,3094650.0
max,39116.0,79.0,69.0,14.0,4976.0,3124928.0


In [7]:
# df[conditional_mask]
# Boolean masking: build a True/False Series from a condition on one column,
# then use it to select the matching rows of the whole DataFrame.

# True for every student whose school budget is below 2.5 million.
my_condition = school_data_complete["budget"] < 2_500_000

# Keep only the rows where the mask is True.
conditional_df = school_data_complete.loc[my_condition]
conditional_df

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635
...,...,...,...,...,...,...,...,...,...,...,...
39165,39165,Donna Howard,F,12th,Thomas High School,99,90,14,Charter,1635,1043130
39166,39166,Dawn Bell,F,10th,Thomas High School,95,70,14,Charter,1635,1043130
39167,39167,Rebecca Tanner,F,9th,Thomas High School,73,84,14,Charter,1635,1043130
39168,39168,Desiree Kidd,F,10th,Thomas High School,99,90,14,Charter,1635,1043130


In [8]:
# Sanity check on the mask: the largest budget left in the filtered frame
# must be below the 2,500,000 threshold.
conditional_df["budget"].max()

1910635

In [9]:
# DataFrame.info()
# Prints a concise summary of the frame: index range, column names, non-null
# counts, dtypes and the approximate memory usage.
school_data_complete.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39170 entries, 0 to 39169
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Student ID     39170 non-null  int64 
 1   student_name   39170 non-null  object
 2   gender         39170 non-null  object
 3   grade          39170 non-null  object
 4   school_name    39170 non-null  object
 5   reading_score  39170 non-null  int64 
 6   math_score     39170 non-null  int64 
 7   School ID      39170 non-null  int64 
 8   type           39170 non-null  object
 9   size           39170 non-null  int64 
 10  budget         39170 non-null  int64 
dtypes: int64(6), object(5)
memory usage: 3.6+ MB


In [10]:
# DataFrame.memory_usage()
# Memory footprint of each column in bytes; the first entry is the index.
# With deep=False (the default, spelled out here) object columns are measured
# shallowly rather than by the size of the strings they hold; pass deep=True
# for the real footprint of the text columns.
school_memory_usage = school_data_complete.memory_usage(index=True, deep=False)
school_memory_usage

Index            313360
Student ID       313360
student_name     313360
gender           313360
grade            313360
school_name      313360
reading_score    313360
math_score       313360
School ID        313360
type             313360
size             313360
budget           313360
dtype: int64

In [11]:
# Move the current row index into a regular column named "index" and replace
# it with a fresh 0..n-1 index.
# The guard keeps this cell idempotent: re-running an unguarded reset_index()
# would add a second column ("level_0") and eventually raise once that name
# is taken too. Rebinding instead of inplace=True avoids mutating a frame that
# earlier cells have already displayed.
if "index" not in school_data_complete.columns:
    school_data_complete = school_data_complete.reset_index()

In [12]:
# Series.isin(values) returns a boolean Series: True where the element is one of the given values, False otherwise.

# Display the frame first; after the index reset it carries an extra "index" column.
school_data_complete

Unnamed: 0,index,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
0,0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635
1,1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635
2,2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635
3,3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635
4,4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635
...,...,...,...,...,...,...,...,...,...,...,...,...
39165,39165,39165,Donna Howard,F,12th,Thomas High School,99,90,14,Charter,1635,1043130
39166,39166,39166,Dawn Bell,F,10th,Thomas High School,95,70,14,Charter,1635,1043130
39167,39167,39167,Rebecca Tanner,F,9th,Thomas High School,73,84,14,Charter,1635,1043130
39168,39168,39168,Desiree Kidd,F,10th,Thomas High School,99,90,14,Charter,1635,1043130


In [14]:
# Flag the rows whose student_name is exactly "Rebecca Tanner".
# isin() tests for exact equality with the listed values, so passing only a
# last name (e.g. "Tanner") would not match "Rebecca Tanner"; a substring
# search such as .str.contains() is needed for partial matches.
school_data_complete["student_name"].isin(["Rebecca Tanner"])

0        False
1        False
2        False
3        False
4        False
         ...  
39165    False
39166    False
39167     True
39168    False
39169    False
Name: student_name, Length: 39170, dtype: bool

In [15]:
# Series.where()
# where() keeps a value wherever the condition is True and replaces it with
# NaN wherever it is False. Here every "Huang High School" entry is blanked
# out and all other school names are kept.
school_where = school_data_complete["school_name"].where(
    cond=school_data_complete["school_name"] != "Huang High School"
)

# Wrap the resulting Series in a one-column DataFrame for display.
school_where_df = pd.DataFrame(data=school_where)

In [16]:
# Display the result: rows that belonged to "Huang High School" now hold a
# missing value, all other rows keep their school name.
school_where_df

Unnamed: 0,school_name
0,
1,
2,
3,
4,
...,...
39165,Thomas High School
39166,Thomas High School
39167,Thomas High School
39168,Thomas High School
