In [1]:
import os
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine, inspect

## Extracting the Data into Pandas

In [15]:
# Load the 2006-2012 NYC school demographics / accountability snapshot from CSV.
demo_file = "Resources/2006_-_2012_School_Demographics_and_Accountability_Snapshot.csv"
demo_data_df = pd.read_csv(filepath_or_buffer=demo_file)

# Preview the first rows to sanity-check the load.
demo_data_df.head(n=5)

Unnamed: 0,DBN,Name,schoolyear,fl_percent,frl_percent,total_enrollment,prek,k,grade1,grade2,...,black_num,black_per,hispanic_num,hispanic_per,white_num,white_per,male_num,male_per,female_num,female_per
0,01M015,P.S. 015 ROBERTO CLEMENTE,20052006,89.4,,281,15,36,40,33,...,74,26.3,189,67.3,5,1.8,158.0,56.2,123.0,43.8
1,01M015,P.S. 015 ROBERTO CLEMENTE,20062007,89.4,,243,15,29,39,38,...,68,28.0,153,63.0,4,1.6,140.0,57.6,103.0,42.4
2,01M015,P.S. 015 ROBERTO CLEMENTE,20072008,89.4,,261,18,43,39,36,...,77,29.5,157,60.2,7,2.7,143.0,54.8,118.0,45.2
3,01M015,P.S. 015 ROBERTO CLEMENTE,20082009,89.4,,252,17,37,44,32,...,75,29.8,149,59.1,7,2.8,149.0,59.1,103.0,40.9
4,01M015,P.S. 015 ROBERTO CLEMENTE,20092010,,96.5,208,16,40,28,32,...,67,32.2,118,56.7,6,2.9,124.0,59.6,84.0,40.4


In [11]:
# Load the 2012 SAT results for NYC schools from CSV.
SAT_file = "Resources/2012_SAT_Results.csv"
SAT_data_df = pd.read_csv(filepath_or_buffer=SAT_file)

# Preview the first rows to sanity-check the load.
SAT_data_df.head(n=5)



Unnamed: 0,DBN,SCHOOL NAME,Num of SAT Test Takers,SAT Critical Reading Avg. Score,SAT Math Avg. Score,SAT Writing Avg. Score
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29,355,404,363
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91,383,423,366
2,01M450,EAST SIDE COMMUNITY SCHOOL,70,377,402,370
3,01M458,FORSYTH SATELLITE ACADEMY,7,414,401,359
4,01M509,MARTA VALLE HIGH SCHOOL,44,390,433,384


In [8]:
# Load the AP scores from JSON.
#
# Provenance: the original dataset was saved as CSV, the suppressed-value
# markers ("s") were removed by hand, and the CSV was then converted to JSON
# with https://www.csvjson.com/csv2json
#
# NOTE(review): the resulting column names ("Num of AP Exam Paed",
# "Num of AP Tet Taker") look like the "s" removal also stripped letters from
# the headers -- confirm against the raw file before relying on these names.
AP_file = "Resources/AP2.json"
AP_data_df = pd.read_json(path_or_buf=AP_file)

# Preview the first rows to sanity-check the load.
AP_data_df.head(n=5)


Unnamed: 0,DBN,Num of AP Exam Paed,Num of AP Tet Taker,Num of AP Total Exam Taken,SCHOOL NAME
0,01M292,,,,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES
1,01M448,21.0,37.0,53.0,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL
2,01M450,,12.0,12.0,EAST SIDE COMMUNITY SCHOOL
3,01M458,,,,FORSYTH SATELLITE ACADEMY
4,01M509,,14.0,15.0,MARTA VALLE HIGH SCHOOL


## Transformation of the data

In [16]:
# Keep only the identification, enrollment and race/ethnicity columns of the
# demographic data; .copy() detaches the result from the raw frame.
new_demo_df = demo_data_df.loc[
    :,
    [
        "DBN",
        "Name",
        "schoolyear",
        "total_enrollment",
        "grade12",
        "black_num",
        "black_per",
        "hispanic_num",
        "hispanic_per",
        "white_num",
        "white_per",
    ],
].copy()
new_demo_df.head(n=5)

Unnamed: 0,DBN,Name,schoolyear,total_enrollment,grade12,black_num,black_per,hispanic_num,hispanic_per,white_num,white_per
0,01M015,P.S. 015 ROBERTO CLEMENTE,20052006,281,,74,26.3,189,67.3,5,1.8
1,01M015,P.S. 015 ROBERTO CLEMENTE,20062007,243,,68,28.0,153,63.0,4,1.6
2,01M015,P.S. 015 ROBERTO CLEMENTE,20072008,261,,77,29.5,157,60.2,7,2.7
3,01M015,P.S. 015 ROBERTO CLEMENTE,20082009,252,,75,29.8,149,59.1,7,2.8
4,01M015,P.S. 015 ROBERTO CLEMENTE,20092010,208,,67,32.2,118,56.7,6,2.9


In [18]:
# Restrict the demographic data to the 2011-2012 school year
# (schoolyear is encoded as the two years concatenated, e.g. 20112012).
filtered_demo_df = new_demo_df.query("schoolyear == 20112012")
filtered_demo_df.head(n=5)

Unnamed: 0,DBN,Name,schoolyear,total_enrollment,grade12,black_num,black_per,hispanic_num,hispanic_per,white_num,white_per
6,01M015,P.S. 015 ROBERTO CLEMENTE,20112012,189,,63,33.3,109,57.7,4,2.1
13,01M019,P.S. 019 ASHER LEVY,20112012,328,,81,24.7,158,48.2,28,8.5
20,01M020,PS 020 ANNA SILVER,20112012,626,,55,8.8,357,57.0,16,2.6
27,01M034,PS 034 FRANKLIN D ROOSEVELT,20112012,401,,90,22.4,275,68.6,8,2.0
35,01M063,PS 063 WILLIAM MCKINLEY,20112012,176,,41,23.3,110,62.5,15,8.5


In [6]:
# Drop SAT rows whose test-taker count is the placeholder "s",
# which this dataset uses in place of a null value.
Cleaned_SAT_df = SAT_data_df[SAT_data_df['Num of SAT Test Takers'].ne("s")]
Cleaned_SAT_df.head(n=5)

Unnamed: 0,DBN,SCHOOL NAME,Num of SAT Test Takers,SAT Critical Reading Avg. Score,SAT Math Avg. Score,SAT Writing Avg. Score
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29,355,404,363
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91,383,423,366
2,01M450,EAST SIDE COMMUNITY SCHOOL,70,377,402,370
3,01M458,FORSYTH SATELLITE ACADEMY,7,414,401,359
4,01M509,MARTA VALLE HIGH SCHOOL,44,390,433,384


In [23]:
# Clean up the null values from the AP test dataframe.
#
# Missing exam totals can reach this point either as empty strings or as NaN,
# depending on how the JSON export encoded them. Comparing only against ""
# keeps every NaN row (NaN != "" is True), so both representations are
# filtered out here.
Cleaned_AP_df = AP_data_df.loc[
    AP_data_df['Num of AP Total Exam Taken'].notna()
    & AP_data_df['Num of AP Total Exam Taken'].ne("")
]
Cleaned_AP_df.head()

Unnamed: 0,DBN,Num of AP Exam Paed,Num of AP Tet Taker,Num of AP Total Exam Taken,SCHOOL NAME
1,01M448,21.0,37,53,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL
2,01M450,,12,12,EAST SIDE COMMUNITY SCHOOL
4,01M509,,14,15,MARTA VALLE HIGH SCHOOL
5,01M515,54.0,50,60,LOWER EAST SIDE PREPARATORY HIGH SCHOOL
6,01M539,323.0,306,587,"NEW EXPLORATIONS INTO SCIENCE, TECHNOLOGY AND ..."


In [24]:
# Connect to the local MySQL database.
#
# Credentials are read from the environment so that no password is stored in
# the notebook:
#   MYSQL_USER      (optional, defaults to "root")
#   MYSQL_PASSWORD  (required; a KeyError is raised if it is not set)
#   MYSQL_HOST      (optional, defaults to "127.0.0.1")

# Name of the database (schema) to connect to; fill in before running.
SQL_Table = ''

mysql_user = os.environ.get("MYSQL_USER", "root")
# URL-escape the password so characters such as '#' or '@' cannot corrupt the URL.
mysql_password = quote_plus(os.environ["MYSQL_PASSWORD"])
mysql_host = os.environ.get("MYSQL_HOST", "127.0.0.1")

# This must be an f-string: as a plain string the literal text "{SQL_Table}"
# would be used as the database name.
rds_connection_string = f"{mysql_user}:{mysql_password}@{mysql_host}/{SQL_Table}"
engine = create_engine(f'mysql://{rds_connection_string}')

In [None]:
# Check for tables.
# Engine.table_names() was deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the Inspector API returns the same list of table names on old and new versions.
inspect(engine).get_table_names()