In [1]:
import json
import os
import re
from getpass import getpass
from urllib.parse import quote_plus

import pandas as pd
import pymysql
from pandas.io.json import json_normalize
from sqlalchemy import create_engine, inspect

## Extracting the Data into Pandas

In [7]:
# Load the 2006-2012 NYC school demographics and accountability snapshot
# from CSV into a DataFrame, then preview the first rows.

demo_file = "Resources/2006_-_2012_School_Demographics_and_Accountability_Snapshot.csv"
demo_data_df = pd.read_csv(filepath_or_buffer=demo_file)
demo_data_df.head(n=5)

Unnamed: 0,DBN,Name,schoolyear,fl_percent,frl_percent,total_enrollment,prek,k,grade1,grade2,...,black_num,black_per,hispanic_num,hispanic_per,white_num,white_per,male_num,male_per,female_num,female_per
0,01M015,P.S. 015 ROBERTO CLEMENTE,20052006,89.4,,281,15,36,40,33,...,74,26.3,189,67.3,5,1.8,158.0,56.2,123.0,43.8
1,01M015,P.S. 015 ROBERTO CLEMENTE,20062007,89.4,,243,15,29,39,38,...,68,28.0,153,63.0,4,1.6,140.0,57.6,103.0,42.4
2,01M015,P.S. 015 ROBERTO CLEMENTE,20072008,89.4,,261,18,43,39,36,...,77,29.5,157,60.2,7,2.7,143.0,54.8,118.0,45.2
3,01M015,P.S. 015 ROBERTO CLEMENTE,20082009,89.4,,252,17,37,44,32,...,75,29.8,149,59.1,7,2.8,149.0,59.1,103.0,40.9
4,01M015,P.S. 015 ROBERTO CLEMENTE,20092010,,96.5,208,16,40,28,32,...,67,32.2,118,56.7,6,2.9,124.0,59.6,84.0,40.4


In [21]:
# Load the 2012 SAT results for NYC schools from CSV into a DataFrame,
# then preview the first rows.

SAT_file = "Resources/2012_SAT_Results.csv"
SAT_data_df = pd.read_csv(filepath_or_buffer=SAT_file)
SAT_data_df.head(n=5)



Unnamed: 0,DBN,SCHOOL NAME,Num of SAT Test Takers,SAT Critical Reading Avg. Score,SAT Math Avg. Score,SAT Writing Avg. Score
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29,355,404,363
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91,383,423,366
2,01M450,EAST SIDE COMMUNITY SCHOOL,70,377,402,370
3,01M458,FORSYTH SATELLITE ACADEMY,7,414,401,359
4,01M509,MARTA VALLE HIGH SCHOOL,44,390,433,384


In [2]:
# Import the JSON AP test data.
#
# The file is parsed with json.load and only its "data" section is handed to
# pandas; reading the whole document directly fails because of the "s" values
# in the dataset
# (ValueError: Mixing dicts with non-Series may lead to ambiguous ordering).

# Open the file in a context manager so the handle is closed after parsing.
with open('Resources/NYC_AP_Scores.json', encoding='utf-8') as ap_file:
    AP_json = json.load(ap_file)

# Rows of the "data" section become DataFrame rows with positional column labels.
AP_raw_df = pd.DataFrame(AP_json["data"])

# Keep only the columns of interest and give them readable names.
# rename() without inplace returns a new frame, which avoids mutating a slice
# of AP_raw_df (and the SettingWithCopyWarning that comes with it).
AP_data_df = AP_raw_df[[8, 10, 11, 12]].rename(
    columns={
        8: 'DBN',
        10: 'Num of AP Test Takers',
        11: 'Num of AP Total Exams Taken',
        12: 'Num of AP Exams Passed',
    }
)

AP_data_df.head()

Unnamed: 0,DBN,Num of AP Test Takers,Num of AP Total Exams Taken,Num of AP Exams Passed
0,01M292,s,s,s
1,01M448,37,53,21
2,01M450,12,12,s
3,01M458,s,s,s
4,01M509,14,15,s


## Transformation of the data

In [16]:
# Narrow the demographic data to the identifying, enrollment, high-school
# grade, ethnicity and gender columns. The result is an independent copy.

demo_columns = [
    "DBN", "Name", "schoolyear", "total_enrollment",
    "grade9", "grade10", "grade11", "grade12",
    "black_num", "black_per",
    "hispanic_num", "hispanic_per",
    "white_num", "white_per",
    "male_num", "male_per",
    "female_num", "female_per",
]
new_demo_df = demo_data_df.loc[:, demo_columns].copy()
new_demo_df.head()

Unnamed: 0,DBN,Name,schoolyear,total_enrollment,grade9,grade10,grade11,grade12,black_num,black_per,hispanic_num,hispanic_per,white_num,white_per,male_num,male_per,female_num,female_per
0,01M015,P.S. 015 ROBERTO CLEMENTE,20052006,281,,,,,74,26.3,189,67.3,5,1.8,158.0,56.2,123.0,43.8
1,01M015,P.S. 015 ROBERTO CLEMENTE,20062007,243,,,,,68,28.0,153,63.0,4,1.6,140.0,57.6,103.0,42.4
2,01M015,P.S. 015 ROBERTO CLEMENTE,20072008,261,,,,,77,29.5,157,60.2,7,2.7,143.0,54.8,118.0,45.2
3,01M015,P.S. 015 ROBERTO CLEMENTE,20082009,252,,,,,75,29.8,149,59.1,7,2.8,149.0,59.1,103.0,40.9
4,01M015,P.S. 015 ROBERTO CLEMENTE,20092010,208,,,,,67,32.2,118,56.7,6,2.9,124.0,59.6,84.0,40.4


In [17]:
# Keep only the demographic rows whose schoolyear is 20112012 (2011-2012).

is_2011_2012 = new_demo_df['schoolyear'] == 20112012
filtered_demo_df = new_demo_df.loc[is_2011_2012]
filtered_demo_df.head()

Unnamed: 0,DBN,Name,schoolyear,total_enrollment,grade9,grade10,grade11,grade12,black_num,black_per,hispanic_num,hispanic_per,white_num,white_per,male_num,male_per,female_num,female_per
6,01M015,P.S. 015 ROBERTO CLEMENTE,20112012,189,,,,,63,33.3,109,57.7,4,2.1,97.0,51.3,92.0,48.7
13,01M019,P.S. 019 ASHER LEVY,20112012,328,,,,,81,24.7,158,48.2,28,8.5,147.0,44.8,181.0,55.2
20,01M020,PS 020 ANNA SILVER,20112012,626,,,,,55,8.8,357,57.0,16,2.6,330.0,52.7,296.0,47.3
27,01M034,PS 034 FRANKLIN D ROOSEVELT,20112012,401,,,,,90,22.4,275,68.6,8,2.0,204.0,50.9,197.0,49.1
35,01M063,PS 063 WILLIAM MCKINLEY,20112012,176,,,,,41,23.3,110,62.5,15,8.5,97.0,55.1,79.0,44.9


In [23]:
# Insert the school year into the SAT dataframe.
#
# The 4-digit year at the start of the SAT file name is treated as the year the
# school year ends in, so "2012_SAT_Results.csv" yields "20112012".
# The user is asked to confirm the derived school year or manually correct it.

# Read the year from the file's base name instead of fixed character offsets,
# so the cell keeps working when the folder part of SAT_file changes.
year_match = re.match(r"\d{4}", os.path.basename(SAT_file))
if year_match is None:
    raise ValueError(f"No 4-digit year found at the start of the file name: {SAT_file!r}")

SAT_file_date = int(year_match.group())
SAT_append = SAT_file_date - 1
SAT_schoolyear = str(SAT_append) + str(SAT_file_date)

user_schoolyear = input(f"Is this school year correct(Y/N)? {SAT_schoolyear} ")

if user_schoolyear.strip().lower() == "y":
    confirmed_schoolyear = SAT_schoolyear
else:
    user_correction = input("Please input the schoolyear (YYYYYYYY) ").strip()
    # Reject anything that is not exactly eight digits before it reaches the data.
    if not re.fullmatch(r"\d{8}", user_correction):
        raise ValueError(f"School year must be 8 digits (YYYYYYYY), got {user_correction!r}")
    confirmed_schoolyear = user_correction

# Stored as a string, the same value on every row.
SAT_data_df['School Year'] = confirmed_schoolyear

SAT_data_df.head()

20112012


Unnamed: 0,DBN,SCHOOL NAME,Num of SAT Test Takers,SAT Critical Reading Avg. Score,SAT Math Avg. Score,SAT Writing Avg. Score,School Year
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29,355,404,363,20122013
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91,383,423,366,20122013
2,01M450,EAST SIDE COMMUNITY SCHOOL,70,377,402,370,20122013
3,01M458,FORSYTH SATELLITE ACADEMY,7,414,401,359,20122013
4,01M509,MARTA VALLE HIGH SCHOOL,44,390,433,384,20122013


### Use SQLAlchemy to connect to the SQL database

In [19]:
# Connect to the local MySQL database.
#
# Credentials are taken from the environment rather than written into the
# notebook, so they are not saved with it:
#   NYCSCHOOL_DB_USER      (default "root")
#   NYCSCHOOL_DB_PASSWORD  (prompted for when unset)
#   NYCSCHOOL_DB_HOST      (default "127.0.0.1")
#   NYCSCHOOL_DB_NAME      (default "nycschool_db")

# Register PyMySQL under the MySQLdb name so the plain "mysql://" URL can use it.
pymysql.install_as_MySQLdb()

db_user = os.environ.get("NYCSCHOOL_DB_USER", "root")
db_password = os.environ.get("NYCSCHOOL_DB_PASSWORD") or getpass("MySQL password: ")
db_host = os.environ.get("NYCSCHOOL_DB_HOST", "127.0.0.1")
db_name = os.environ.get("NYCSCHOOL_DB_NAME", "nycschool_db")

# Percent-encode user and password so characters such as "#" or "@" cannot be
# misread as URL delimiters.
rds_connection_string = f"{quote_plus(db_user)}:{quote_plus(db_password)}@{db_host}/{db_name}"
engine = create_engine(f'mysql://{rds_connection_string}')

In [20]:
# Check for table names.
# Engine.table_names() is deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the inspector API returns the same list of table names.

inspect(engine).get_table_names()

['ap_2012', 'demographic_data', 'sat_2012']

In [15]:
# Write the SAT scores dataframe to the 'sat_2012' table, replacing the table
# if it already exists and leaving out the DataFrame index.

SAT_data_df.to_sql(
    name="sat_2012",
    con=engine,
    index=False,
    if_exists="replace",
)

In [21]:
# Write the AP scores dataframe to the 'ap_2012' table, replacing the table
# if it already exists and leaving out the DataFrame index.

AP_data_df.to_sql(
    name="ap_2012",
    con=engine,
    index=False,
    if_exists="replace",
)

In [22]:
# Write the 2011-2012 demographic dataframe to the 'demographic_data' table,
# replacing the table if it already exists and leaving out the DataFrame index.

filtered_demo_df.to_sql(
    name="demographic_data",
    con=engine,
    index=False,
    if_exists="replace",
)

In [23]:
# Read the 'sat_2012' table back from the database and preview the first rows.
sat_from_sql_df = pd.read_sql_query('select * from sat_2012', con=engine)
sat_from_sql_df.head()

Unnamed: 0,DBN,SCHOOL NAME,Num of SAT Test Takers,SAT Critical Reading Avg. Score,SAT Math Avg. Score,SAT Writing Avg. Score
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29,355,404,363
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91,383,423,366
2,01M450,EAST SIDE COMMUNITY SCHOOL,70,377,402,370
3,01M458,FORSYTH SATELLITE ACADEMY,7,414,401,359
4,01M509,MARTA VALLE HIGH SCHOOL,44,390,433,384
