#### Project 4: explore Stanford's school enrollment data

##### 1) create BQ dataset:

In [7]:
daetaset_id = "school_enrollments"

In [8]:
!bq --location=US mk --dataset {dataset_id}

BigQuery error in mk operation: Invalid dataset ID "{dataset_id}". Dataset IDs
must be alphanumeric (plus underscores and dashes) and must be at most 1024
characters long.


##### 2) create and populate BQ tables:

In [5]:
import os

In [6]:
gsutil_cmd = "gsutil ls gs://cs327e-open-access/school-enrollments/*"
file_listings = os.popen(gsutil_cmd)

for file in file_listings:

    start_index = file.rindex("_") + 1
    end_index = file.rindex(".")
    table = file[start_index:end_index]
    
    if table in ("co", "district", "fl", "ga", "id", "ma", "mo", "ne", "ny", "or", "school", "sd", "wi"):
        print("skipping " + table)
        continue

    bq_cmd = "bq --location=US load --autodetect --skip_leading_rows=1 "\
             "--source_format=CSV " + dataset_id + "." + table + ' ' + file
    print(bq_cmd)
    
    os.system(bq_cmd)

NameError: name 'dataset_id' is not defined

#### 3) Find school enrollments by school name, grade level, and year for Texas 

In [None]:
!bq show --schema=true {dataset_id}.tx

#### Note: STRING type for "total" column

In [49]:
%%bigquery
select school, grade, year, sum(cast(replace(replace(total, ",", ""), "<", "") as int64)) as total
from `speedy-volt-324118.school_enrollments.tx`
where total != "" and school != "" and district like '%AUSTIN%'
group by school, grade, year
order by school, grade, year

Query complete after 0.01s: 100%|██████████| 3/3 [00:00<00:00, 1968.85query/s]                        
Downloading: 100%|██████████| 1739/1739 [00:01<00:00, 1319.61rows/s]


Unnamed: 0,school,grade,year,total
0,AKINS H S,grade_10,2020,2880
1,AKINS H S,grade_10,2021,2988
2,AKINS H S,grade_11,2020,2656
3,AKINS H S,grade_11,2021,2652
4,AKINS H S,grade_12,2020,2436
...,...,...,...,...
1734,ZILKER EL,grade_5,2021,276
1735,ZILKER EL,kindergarten,2020,308
1736,ZILKER EL,kindergarten,2021,252
1737,ZILKER EL,pre_kindergarten,2020,120


##### Note: BQ string functions documentation: 
##### https://cloud.google.com/bigquery/docs/reference/standard-sql/string_functions

In [50]:
!bq --location=US mk --dataset "views"

BigQuery error in mk operation: Dataset 'speedy-volt-324118:views' already
exists.


In [62]:
%%bigquery
create view views.austin_enrollments_by_school_grade_year as
select school, grade, year, sum(cast(replace(replace(total, ",", ""), "<", "") as int64)) as total
from `speedy-volt-324118.school_enrollments.tx`
where total != "" and school != "" and district like '%AUSTIN%'
group by school, grade, year
order by school, grade, year

Executing query with job ID: c297ecba-3558-46d4-9bfd-395b15780776
Query executing: 0.23s


ERROR:
 409 Already Exists: Table speedy-volt-324118:views.austin_enrollments_by_school_grade_year

(job ID: c297ecba-3558-46d4-9bfd-395b15780776)

                                    -----Query Job SQL Follows-----                                    

    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |
   1:create view views.austin_enrollments_by_school_grade_year as
   2:select school, grade, year, sum(cast(replace(replace(total, ",", ""), "<", "") as int64)) as total
   3:from `speedy-volt-324118.school_enrollments.tx`
   4:where total != "" and school != "" and district like '%AUSTIN%'
   5:group by school, grade, year
   6:order by school, grade, year
    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |


In [55]:
%%bigquery
select * from views.austin_enrollments_by_school_grade_year

Query complete after 0.00s: 100%|██████████| 1/1 [00:00<00:00, 1053.58query/s]
Downloading: 100%|██████████| 1739/1739 [00:02<00:00, 787.83rows/s]


Unnamed: 0,school,grade,year,total
0,AKINS H S,grade_10,2020,2880
1,AKINS H S,grade_10,2021,2988
2,AKINS H S,grade_11,2020,2656
3,AKINS H S,grade_11,2021,2652
4,AKINS H S,grade_12,2020,2436
...,...,...,...,...
1734,ZILKER EL,grade_5,2021,276
1735,ZILKER EL,kindergarten,2020,308
1736,ZILKER EL,kindergarten,2021,252
1737,ZILKER EL,pre_kindergarten,2020,120


#### 4) Open Data Studio and create chart from view:
##### https://datastudio.google.com/

#### Project 4 homework marker. Start homework from here. 

In [None]:
# the following takes the average of each grade over the years at zilker elementary

In [23]:
%%bigquery
select grade, avg(cast(replace(replace(total, ",", ""), "<", "") as int64)) as total 
from `speedy-volt-324118.school_enrollments.tx`
where school = "ZILKER EL" and school != '' and total != ''
group by grade
order by total

Query complete after 0.00s: 100%|██████████| 1/1 [00:00<00:00, 889.19query/s] 
Downloading: 100%|██████████| 8/8 [00:01<00:00,  6.49rows/s]


Unnamed: 0,grade,total
0,early_education,10.0
1,pre_kindergarten,21.5
2,kindergarten,70.0
3,grade_2,70.0
4,grade_1,74.0
5,grade_3,75.5
6,grade_5,78.5
7,grade_4,80.5


In [None]:
# the following takes the average school district population over the years for non charter schools and orders them by size

In [53]:
%%bigquery
select district, avg(cast(replace(replace(total, ",", ""), "<", "") as int64)) as total
from `speedy-volt-324118.school_enrollments.tx`
where ccd_charter_school = false and total != ''
group by district
order by total

Query complete after 0.00s: 100%|██████████| 3/3 [00:00<00:00, 1073.17query/s]                        
Downloading: 100%|██████████| 1008/1008 [00:00<00:00, 1124.06rows/s]


Unnamed: 0,district,total
0,DELL CITY ISD,10.000000
1,MARATHON ISD,10.000000
2,HIGGINS ISD,10.000000
3,VALENTINE ISD,10.000000
4,DIVIDE ISD,10.000000
...,...,...
1003,KILGORE ISD,267.300000
1004,FRIENDSWOOD ISD,268.847826
1005,LUMBERTON ISD,276.366667
1006,FLOUR BLUFF ISD,316.416667


In [None]:
# the following takes average number of students per county excluding non charter schools over the years and orders them by number of students.

In [55]:
%%bigquery
select county, avg(cast(replace(replace(total, ",", ""), "<", "") as int64)) as total
from `speedy-volt-324118.school_enrollments.tx`
where ccd_charter_school = true and total != ''
group by county
order by total

Query complete after 0.00s: 100%|██████████| 3/3 [00:00<00:00, 1602.92query/s]                        
Downloading: 100%|██████████| 51/51 [00:00<00:00, 56.11rows/s] 


Unnamed: 0,county,total
0,KENDALL COUNTY,10.65625
1,REAL COUNTY,11.596154
2,THROCKMORTON COUNTY,11.642857
3,COMAL COUNTY,12.153153
4,JOHNSON COUNTY,13.5
5,VAN ZANDT COUNTY,13.785714
6,PANOLA COUNTY,14.9
7,HAYS COUNTY,17.161765
8,HOOD COUNTY,17.25
9,HARRISON COUNTY,19.25


In [None]:
# the following takes the average number of students for each grade for non charter students and orders them by student number.

In [74]:
%%bigquery
select grade, avg(cast(replace(replace(total, ",", ""), "<", "") as int64)) as total
from `speedy-volt-324118.school_enrollments.tx`
where ccd_charter_school = false and school != '' and total != ''
group by grade
order by total

Query complete after 0.00s: 100%|██████████| 1/1 [00:00<00:00, 1161.21query/s]
Downloading: 100%|██████████| 15/15 [00:01<00:00, 14.75rows/s]


Unnamed: 0,grade,total
0,early_education,13.177737
1,pre_kindergarten,61.952631
2,kindergarten,82.952497
3,grade_2,84.485615
4,grade_1,84.719317
5,grade_3,85.347177
6,grade_4,87.250568
7,grade_5,95.799594
8,grade_6,177.588742
9,grade_8,192.978473


In [None]:
# the following takes the average number of students over the years for charter schools in each district and it only shows those with an average greater then 200 for schools in texas.

In [164]:
%%bigquery
select district, avg(cast(replace(replace(total, ",", ""), "<", "") as int64)) as total
from `speedy-volt-324118.school_enrollments.tx`
where ccd_charter_school = true and school != '' and total != ''
group by district
having total > 200
order by total

Query complete after 0.00s: 100%|██████████| 1/1 [00:00<00:00, 724.03query/s] 
Downloading: 100%|██████████| 3/3 [00:00<00:00,  3.48rows/s]


Unnamed: 0,district,total
0,HOUSTON ISD,247.010204
1,RICHLAND COLLEGIATE HIGH SCHOOL,256.25
2,ECTOR COUNTY ISD,451.833333


In [None]:
# the following takes the average number of students over the years for charter schools in each district and it only shows those with an average greater then 200 for schools in california

In [165]:
%%bigquery
select district, avg(total) as total
from `speedy-volt-324118.school_enrollments.ca`
where ccd_charter_school = true and school != ''
group by district
having total > 250
order by total

Query complete after 0.00s: 100%|██████████| 3/3 [00:00<00:00, 1140.48query/s]                        
Downloading: 100%|██████████| 19/19 [00:01<00:00, 18.19rows/s]


Unnamed: 0,district,total
0,Roseland,206.928571
1,Apple Valley Unified,207.071429
2,Roseville Joint Union High,211.928571
3,Western Placer Unified,214.107143
4,San Jacinto Unified,215.571429
5,Spencer Valley Elementary,231.0
6,Westside Elementary,231.785714
7,Menifee Union Elementary,235.214286
8,Grossmont Union High,237.047619
9,San Lorenzo Valley Unified,238.714286


In [None]:
# the following takes the average number of students over the years for non charter schools that average a total greater then 250 and orders them by average number of students.

In [184]:
%%bigquery
select district, avg(cast(replace(replace(total, ",", ""), "<", "") as int64)) as total
from `speedy-volt-324118.school_enrollments.tx`
where ccd_charter_school = false and school != '' and total != ''
group by district
having total > 250
order by total

Query complete after 0.00s: 100%|██████████| 1/1 [00:00<00:00, 925.08query/s] 
Downloading: 100%|██████████| 6/6 [00:01<00:00,  5.26rows/s]


Unnamed: 0,district,total
0,HUNTSVILLE ISD,250.3375
1,KILGORE ISD,267.3
2,FRIENDSWOOD ISD,268.847826
3,LUMBERTON ISD,276.366667
4,FLOUR BLUFF ISD,316.416667
5,HALLSVILLE ISD,460.844828


In [None]:
# the following takes the average student number over the years for non charter schools that have a average student population greater then 250

In [185]:
%%bigquery
select district, avg(total) as total
from `speedy-volt-324118.school_enrollments.ca`
where ccd_charter_school = false and school != ''
group by district
having total > 250
order by total

Query complete after 0.00s: 100%|██████████| 1/1 [00:00<00:00, 843.92query/s] 
Downloading: 100%|██████████| 7/7 [00:00<00:00,  7.24rows/s]


Unnamed: 0,district,total
0,Antelope Valley Union High,253.244048
1,Kern High,255.279503
2,Fremont Union High,259.833333
3,Modesto City High,270.357143
4,Huntington Beach Union High,280.3125
5,Chaffey Joint Union High,307.636364
6,Santa Maria Joint Union High,314.464286


In [None]:
# the following does a union between charter schools in texas and california for districts with an average of more then 250 students 

In [201]:
%%bigquery
select year, district, avg(cast(replace(replace(total, ",", ""), "<", "") as int64)) as total
from `speedy-volt-324118.school_enrollments.tx`
where ccd_charter_school = true and school != '' and total != ''
group by year, district
having total > 250
union all
select year, district, avg(total) as total
from `speedy-volt-324118.school_enrollments.ca`
where ccd_charter_school = true and school != ''
group by year, district
having total > 250
order by total

Query complete after 0.00s: 100%|██████████| 1/1 [00:00<00:00, 858.08query/s] 
Downloading: 100%|██████████| 25/25 [00:00<00:00, 25.14rows/s]


Unnamed: 0,year,district,total
0,2021,Western Placer Unified,253.0
1,2021,San Lorenzo Valley Unified,254.857143
2,2020,RICHLAND COLLEGIATE HIGH SCHOOL,263.5
3,2020,Dehesa Elementary,273.685714
4,2021,Spencer Valley Elementary,275.071429
5,2021,Palmdale Elementary,276.5
6,2021,Dehesa Elementary,279.914286
7,2020,Palmdale Elementary,281.214286
8,2020,Beaumont Unified,283.571429
9,2021,HOUSTON ISD,293.190476


In [202]:
# the following does a union between non charter schools in texas and california for districts with an average of more then 250 students

In [203]:
%%bigquery
select year, state, district, avg(cast(replace(replace(total, ",", ""), "<", "") as int64)) as total
from `speedy-volt-324118.school_enrollments.tx`
where ccd_charter_school = false and school != '' and total != ''
group by year, state, district
having total > 250
union all
select year, state, district, avg(total) as total
from `speedy-volt-324118.school_enrollments.ca`
where ccd_charter_school = false and school != ''
group by year, state, district
having total > 250
order by total

Query complete after 0.00s: 100%|██████████| 4/4 [00:00<00:00, 2158.67query/s]                        
Downloading: 100%|██████████| 27/27 [00:00<00:00, 30.24rows/s]


Unnamed: 0,year,state,district,total
0,2020,CA,Antelope Valley Union High,251.440476
1,2020,CA,Kern High,252.409938
2,2020,CA,Los Gatos-Saratoga Union High,252.428571
3,2021,CA,Antelope Valley Union High,255.047619
4,2020,TX,STEPHENVILLE ISD,256.066667
5,2021,CA,Fremont Union High,257.0
6,2021,CA,Kern High,258.149068
7,2020,CA,Fremont Union High,262.666667
8,2021,TX,KILGORE ISD,263.733333
9,2021,TX,FRIENDSWOOD ISD,266.956522


In [204]:
%%bigquery
create view views.tx_ca_charter_enrollments_by_district as
select year, state, district, avg(cast(replace(replace(total, ",", ""), "<", "") as int64)) as total
from `speedy-volt-324118.school_enrollments.tx`
where ccd_charter_school = true and school != '' and total != ''
group by year, state, district
having total > 250
union all
select year, state, district, avg(total) as total
from `speedy-volt-324118.school_enrollments.ca`
where ccd_charter_school = true and school != ''
group by year, state, district
having total > 250
order by total

Query complete after 0.00s: 100%|██████████| 1/1 [00:00<00:00, 1193.60query/s]


In [205]:
%%bigquery
create view views.tx_ca_noncharter_enrollments_by_district as
select year, state, district, avg(cast(replace(replace(total, ",", ""), "<", "") as int64)) as total
from `speedy-volt-324118.school_enrollments.tx`
where ccd_charter_school = false and school != '' and total != ''
group by year, state, district
having total > 250
union all
select year, state, district, avg(total) as total
from `speedy-volt-324118.school_enrollments.ca`
where ccd_charter_school = false and school != ''
group by year, state, district
having total > 250
order by total

Executing query with job ID: b68956f0-1ef8-4d72-a7ad-bfcaa4474b80
Query executing: 0.21s


ERROR:
 409 Already Exists: Table speedy-volt-324118:views.tx_ca_noncharter_enrollments_by_district

(job ID: b68956f0-1ef8-4d72-a7ad-bfcaa4474b80)

                                     -----Query Job SQL Follows-----                                     

    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |
   1:create view views.tx_ca_noncharter_enrollments_by_district as
   2:select year, state, district, avg(cast(replace(replace(total, ",", ""), "<", "") as int64)) as total
   3:from `speedy-volt-324118.school_enrollments.tx`
   4:where ccd_charter_school = false and school != '' and total != ''
   5:group by year, state, district
   6:having total > 250
   7:union all
   8:select year, state, district, avg(total) as total
   9:from `speedy-volt-324118.school_enrollments.ca`
  10:where ccd_charter_school = false and school != ''
  11:group by year, state, district
  12:having total > 250
  13:order by total
    |    .    |    .  