# Introduction
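An analysis of how languages are represented among New York City's Limited English Proficiency (LEP) population, broken down by borough and district.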

# Setup

## Dependencies

In [None]:
%pip install pandas==2.3.1 pandasql==0.7.3

Collecting pandasql
  Downloading pandasql-0.7.3.tar.gz (26 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting sqlalchemy (from pandasql)
  Using cached sqlalchemy-2.0.41-cp313-cp313-macosx_11_0_arm64.whl.metadata (9.6 kB)
Collecting typing-extensions>=4.6.0 (from sqlalchemy->pandasql)
  Using cached typing_extensions-4.14.1-py3-none-any.whl.metadata (3.0 kB)
Using cached sqlalchemy-2.0.41-cp313-cp313-macosx_11_0_arm64.whl (2.1 MB)
Using cached typing_extensions-4.14.1-py3-none-any.whl (43 kB)
Building wheels for collected packages: pandasql
  Building wheel for pandasql (pyproject.toml) ... [?25ldone
[?25h  Created wheel for pandasql: filename=pandasql-0.7.3-py3-none-any.whl size=26867 sha256=2e720bebb9c23eca0fb952e97913bb4f1788e27ceba26e253584723d3ef2f322
  Stored in directory: /Users/alex.bieniek/Library/Caches/pip/wheels/b4/d0/8c/a6b366870bf041849cd96e0

In [16]:
import json

import pandas as pd
import pandasql as ps

## Data Collection
Download each of the following datasets as a CSV export and save it to the path shown next to it (relative to this notebook):

- NYC OpenData
    - [LEP Presence by Language](https://data.cityofnewyork.us/City-Government/Population-and-Languages-of-the-Limited-English-Pr/ajin-gkbp/about_data) -> `data/lep.csv`

In [59]:
# Load the LEP dump. `header=0` consumes the file's own header row and
# `names` replaces it with the short column names listed here.
lep = pd.read_csv(
    "data/lep.csv",
    delimiter=",",
    quotechar='"',
    header=0,
    names=[
        "years",
        "borough",
        "districtCode",
        "district",
        "language",
        "lepPopulation",
        "lepPercent",
        "cvalepPopulation",
        "cvalepPercent",
    ],
)

# Later cells interpolate this attribute into their SQL as the table name,
# so it has to match the variable name `lep`.
lep.Name = "lep"

# Summary statistics for every column, numeric and non-numeric alike.
lep.describe(include="all")

Unnamed: 0,years,borough,districtCode,district,language,lepPopulation,lepPercent,cvalepPopulation,cvalepPercent
count,8024,8024,8024.0,8024,8024,8024.0,8024.0,8024.0,8024.0
unique,1,5,,59,136,,,,
top,2015-2019,Brooklyn,,"Battery Park City, Tribeca",Afrikaans,,,,
freq,8024,2448,,136,59,,,,
mean,,,280.305085,,,221.569167,0.73507,111.514706,0.735456
std,,,117.76553,,,2122.544665,5.380983,992.67024,5.438624
min,,,101.0,,,0.0,0.0,0.0,0.0
25%,,,203.0,,,0.0,0.0,0.0,0.0
50%,,,306.0,,,0.0,0.0,0.0,0.0
75%,,,403.0,,,14.0,0.0,0.0,0.0


# Analysis

In [None]:
# Districts by Borough
# Note: districts mention multiple neighborhoods
#
# One row per borough; `districts` is a JSON array (as text) of
# {"district": <name>} objects holding that borough's distinct districts.
districts_by_borough_df = ps.sqldf(f"""
    SELECT
        json_group_array(distinct json_object('district', district)) AS districts,
        borough
    FROM {lep.Name}
    GROUP BY borough
""")

# Print each borough followed by its districts, one tab-indented name per line.
# The join is built outside the f-string: backslashes and reused quote
# characters inside an f-string replacement field are only accepted from
# Python 3.12 onwards, so this form also runs on older kernels.
for row in districts_by_borough_df.itertuples(index=False):
    district_lines = "\n".join(
        "\t" + entry["district"] for entry in json.loads(row.districts)
    )
    print(f"{row.borough}: \n{district_lines}")

Bronx: 
	Melrose, Mott Haven, Port Morris
	Hunts Point, Longwood
	Morrisania, Crotona Park East
	Highbridge, Concourse Village
	University Hts., Fordham, Mt. Hope
	East Tremont, Belmont
	Bedford Park, Norwood, Fordham
	Riverdale, Kingsbridge, Marble Hill
	Soundview, Parkchester
	Throgs Nk., Co-op City, Pelham Bay
	Pelham Pkwy, Morris Park, Laconia
	Wakefield, Williamsbridge
Brooklyn: 
	Williamsburg, Greenpoint
	Brooklyn Heights, Fort Greene
	Bedford Stuyvesant
	Bushwick
	East New York, Starrett City
	Park Slope, Carroll Gardens
	Sunset Park, Windsor Terrace
	Crown Heights North
	Crown Heights South, Wingate
	Bay Ridge, Dyker Heights
	Bensonhurst, Bath Beach
	Borough Park, Ocean Parkway
	Coney Island, Brighton Beach
	Flatbush, Midwood
	Sheepshead Bay, Gerritsen Beach
	Brownsville, Ocean Hill
	East Flatbush, Rugby, Farragut
	Canarsie, Flatlands
Manhattan: 
	Battery Park City, Tribeca
	Greenwich Village, Soho
	Lower East Side, Chinatown
	Chelsea, Clinton
	Midtown Business District
	Stuyve

In [87]:
# Sum the `cvalepPopulation` column over all rows for each language.
cvalep_by_language_query = f"""
    SELECT language, SUM(cvalepPopulation)
    FROM {lep.Name}
    GROUP BY language
"""
ps.sqldf(cvalep_by_language_query)

Unnamed: 0,language,SUM(cvalepPopulation)
0,Afrikaans,0
1,Akan (incl. Twi),2661
2,Albanian,7048
3,Aleut languages,21
4,Amharic,516
...,...,...
131,Vietnamese,4553
132,Wolof,183
133,Yiddish,13360
134,Yoruba,1060
