
# =========================================================
# Analysis of NYC High Schools: Enrollment, Language Learners, and Special Education
# =========================================================

In [145]:
import pandas as pd
import psycopg2

# =========================================================
## 1. DATABASE CONNECTION 
# =========================================================

In [146]:
# DB connection setup using hardcoded credentials (for onboarding only)
conn = psycopg2.connect(
    dbname="neondb",
    user="neondb_owner",
    password="a9Am7Yy5r9_T7h4OF2GN",
    host="ep-falling-glitter-a5m0j5gk-pooler.us-east-2.aws.neon.tech",
    port="5432",
    sslmode="require"
)
cur = conn.cursor()

# =========================================================
## 2. SCHOOL DISTRIBUTION PER BOROUGH
# =========================================================

In [None]:
# --- Query 1: Number of schools per borough ---
sql_query_school_distribution = """
SELECT borough, COUNT(DISTINCT dbn) AS schools_count
FROM nyc_schools.high_school_directory
GROUP BY borough
ORDER BY borough;
"""
# Explanation:
# Counts the number of distinct high schools in each NYC borough.
# GROUP BY aggregates the data by borough, and ORDER BY ensures the results are sorted alphabetically by borough.

df_school_distribution = pd.read_sql(sql_query_school_distribution, conn)

print("===== Number of schools in each borough =====")
print(df_school_distribution)

  df_school_distribution = pd.read_sql(sql_query_school_distribution, conn)


===== Number of schools in each borough =====
         borough  schools_count
0          Bronx            118
1       Brooklyn            121
2      Manhattan            106
3         Queens             80
4  Staten Island             10


# =========================================================
## 3. AVERAGE PERCENTAGE OF ENGLISH LANGUAGE LEARNERS (ELL)
# =========================================================


In [None]:
# --- Query 2: Average % of English Language Learners (ELL) per borough ---
sql_query_ell_avg_per_borough = """
SELECT hsd.borough, AVG(sd.ell_percent) AS avg_ell_percent
FROM nyc_schools.high_school_directory AS hsd
LEFT JOIN nyc_schools.school_demographics AS sd
ON hsd.dbn = sd.dbn
GROUP BY hsd.borough
ORDER BY hsd.borough;
"""
# Explanation:
# Calculates the average percentage of English Language Learners (ELL) in each borough.
# LEFT JOIN ensures all schools from the directory are included, even if demographic data is missing.
# The AVG() function computes the mean ELL percentage per borough.

df_ell_avg_per_borough = pd.read_sql(sql_query_ell_avg_per_borough, conn)
print("\n===== Average % of English Language Learners (ELL) per borough: =====")
display(df_ell_avg_per_borough)


===== Average % of English Language Learners (ELL) per borough: =====


  df_ell_avg_per_borough = pd.read_sql(sql_query_ell_avg_per_borough, conn)


Unnamed: 0,borough,avg_ell_percent
0,Bronx,
1,Brooklyn,
2,Manhattan,7.5725
3,Queens,
4,Staten Island,


# =========================================================
## 4. TOP 3 SCHOOLS PER BOROUGH BY SPECIAL EDUCATION %
# =========================================================

In [None]:
# --- Query 3: Top 3 schools per borough with highest % of special education students (SPED) ---
sql_query_top3_sped = """
WITH school_sped AS (
    SELECT
        dbn,
        MAX(sped_percent) AS sped_percent
    FROM nyc_schools.school_demographics
    WHERE sped_percent IS NOT NULL
    GROUP BY dbn
),
ranked_schools AS (
    SELECT
        hs.borough,
        hs.dbn,
        hs.school_name,
        ss.sped_percent,
        ROW_NUMBER() OVER (
            PARTITION BY hs.borough
            ORDER BY ss.sped_percent DESC
        ) AS rank
    FROM school_sped ss
     JOIN nyc_schools.high_school_directory hs
      ON ss.dbn = hs.dbn
)
SELECT
    borough,
    dbn,
    school_name,
    sped_percent,
    rank
FROM ranked_schools
WHERE rank <= 3

ORDER BY borough, rank;
"""
# Explanation:
# 1. school_sped CTE: selects the maximum SPED percentage per school (dbn), ignoring NULLs.
# 2. ranked_schools CTE: joins the SPED data to the high school directory to get borough and school names,
#    and assigns a rank within each borough using ROW_NUMBER() ordered by SPED percentage descending.
# 3. Final SELECT: filters to the top 3 schools per borough based on SPED percentage.

df = pd.read_sql(sql_query_top3_sped, conn)
print("===== Top 3 schools per borough with highest % of special education students =====")
df

===== Top 3 schools per borough with highest % of special education students =====


  df = pd.read_sql(query, conn)


Unnamed: 0,borough,dbn,school_name,sped_percent,rank
0,Manhattan,01M450,East Side Community School,28.8,1
1,Manhattan,01M509,Marta Valle High School,25.9,2
2,Manhattan,01M292,Henry Street School for International Studies,25.1,3


# =========================================================
## 5. VERIFICATION OF RESULTS WITH PYTHON 
# =========================================================

In [150]:
# --- Load high school directory data ---
# - Execute SQL query and read the result into a pandas DataFrame
# - Standardize the 'dbn' column: remove leading/trailing spaces and convert to uppercase
sql_1 = """
SELECT *
FROM nyc_schools.high_school_directory 
"""
df_school_dir = pd.read_sql(sql_1, conn)
df_school_dir['dbn'] = df_school_dir['dbn'].str.strip().str.upper()


# --- Load school demographics data ---
# - Execute SQL query and read the result into a pandas DataFrame
# - Standardize the 'dbn' column for consistent merging
sql_2 = """
SELECT *
FROM nyc_schools.school_demographics
"""
df_school_dem = pd.read_sql(sql_2, conn)
df_school_dem['dbn'] = df_school_dem['dbn'].str.strip().str.upper()


# --- Merge the two datasets on 'dbn' ---
# A left join is used to keep all schools from the directory, even if demographic data is missing
df_full = pd.merge(df_school_dir, df_school_dem, on='dbn', how='left')


# --- Prepare data for Top 3 SPED schools per borough ---
# Select relevant columns and sort first by 'borough' (ascending) and then by 'sped_percent' (descending)
df_full_distr_sorted = df_full[['dbn', 'school_name', 'borough', 'sped_percent']]\
    .sort_values(by=['borough', 'sped_percent'], ascending=[True, False])

# Group by borough and take the top 3 schools per borough
df_top3_per_borough = df_full_distr_sorted.groupby('borough').head(3).reset_index(drop=True)

# Display the resulting DataFrame
display(df_top3_per_borough)

  df_school_dir = pd.read_sql(sql_1, conn)
  df_school_dem = pd.read_sql(sql_2, conn)


Unnamed: 0,dbn,school_name,borough,sped_percent
0,08X305,Pablo Neruda Academy,Bronx,
1,11X509,High School of Language and Innovation,Bronx,
2,08X348,Schuylerville Preparatory High School,Bronx,
3,21K559,Life Academy High School for Film and Music,Brooklyn,
4,16K393,Frederick Douglass Academy IV Secondary School,Brooklyn,
5,17K122,Pathways in Technology Early College High Scho...,Brooklyn,
6,01M450,East Side Community School,Manhattan,28.8
7,01M450,East Side Community School,Manhattan,27.7
8,01M450,East Side Community School,Manhattan,26.7
9,27Q260,Frederick Douglass Academy VI High School,Queens,


# =========================================================
# Key Insights from the Analysis of NYC High School Data
# =========================================================

An analysis of NYC high school data was conducted to examine the distribution of schools across boroughs, the average percentage of English Language Learners (ELL), and the top schools supporting special education students.

## 1. School Distribution by Borough

The number of distinct high schools per borough was counted using the high school directory:

| Borough         | Schools Count |
|-----------------|---------------|
| Bronx           | 118           |
| Brooklyn        | 121           |
| Manhattan       | 106           |
| Queens          | 80            |
| Staten Island   | 10            |

- Brooklyn has the largest number of high schools, while Staten Island has the fewest.  
- This distribution aligns with expectations based on population density and borough size.

## 2. Average % of English Language Learners (ELL) per Borough

| Borough         | Avg ELL % |
|-----------------|------------|
| Bronx           | NaN        |
| Brooklyn        | NaN        |
| Manhattan       | 7.57       |
| Queens          | NaN        |
| Staten Island   | NaN        |

- Only Manhattan returned a numeric average (7.57%).  
- All other boroughs returned `NaN`, even though the `dbn` values were verified and found to be consistent between tables.  
- This indicates that demographic data for ELL percentages may be missing or incomplete for these boroughs.

## 3. Top 3 Schools in Manhattan with Highest % of Special Education Students (SPED)

| Borough    | DBN     | School Name                                        | SPED % | Rank |
|------------|---------|---------------------------------------------------|--------|------|
| Manhattan  | 01M450  | East Side Community School                        | 28.8   | 1    |
| Manhattan  | 01M509  | Marta Valle High School                            | 25.9   | 2    |
| Manhattan  | 01M292  | Henry Street School for International Studies    | 25.1   | 3    |

- The three Manhattan schools show SPED percentages ranging from 25.1% to 28.8%.  
- The absence of schools from other boroughs suggests that either demographic data for these boroughs is missing or the `dbn` values recorded in the demographic dataset do not have corresponding entries in the `high_school_directory`.
  
## Key Insights and Recommendations

1. **Data Completeness Issue**  
   - Although `dbn` values are consistent, missing ELL and SPED data for other boroughs indicate incomplete coverage of demographic data in the dataset.

2. **Distribution Observations**  
   - Manhattan shows a moderate ELL percentage (~7.6%).  
   - The highest SPED percentages are observed in Manhattan schools.

3. **Next Steps**  
   - Investigate the absence of demographic data for Bronx, Brooklyn, Queens, and Staten Island.  
   - Verify that all schools in `school_demographics` have corresponding entries in `high_school_directory`.  
   - Once data coverage is confirmed, re-run aggregations to obtain complete borough-level insights.
