In [1]:
from sklearn.neighbors import NearestNeighbors
from kneed import KneeLocator
import numpy as np
from plotly.subplots import make_subplots
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN
import functions as f
from sklearn.cluster import KMeans
import plotly.express as px
import plotly.graph_objects as go
import scipy.cluster.hierarchy as sch
from sklearn.cluster import AgglomerativeClustering
import geopandas as gpd
from pyproj import CRS
from shapely.geometry import Point, Polygon
import matplotlib.pyplot as plt

In [2]:
# Define features want from dataset
demo_features = ['White','Black or African American','American Indian and Alaska Native', 'Asian'
                ,'Native Hawaiian and Other Pacific Islander','Some other race','Two or more races', 'Total population']

# Read dataset for data and drop where population = 0 and unwanted feature
demo_df = f.demo_data(demo_features,'Raw Data/acs_combined.xlsx')
demo_df.drop(['Name'],axis = 1,inplace = True)
demo_df.drop(list(demo_df[demo_df['Total population']==0].index),inplace = True)

# Feature Engineering
col = demo_df.columns
demo_df['% White'] = demo_df['White']/demo_df['Total population']*100
demo_df['% African American'] = demo_df['Black or African American']/demo_df['Total population']*100
demo_df['% Asian'] = demo_df['Asian']/demo_df['Total population']*100
demo_df['% Other'] = (demo_df['Native Hawaiian and Other Pacific Islander'] + demo_df['American Indian and Alaska Native'] + demo_df['Some other race'])/demo_df['Total population']*100
demo_df['% Mixed'] = demo_df['Two or more races']/demo_df['Total population']*100
demo_df.drop(col,axis =1, inplace = True)


In [18]:
#Read CSV for constructed dataset
cluster = pd.read_csv('Clustering/NTA Code Cluster Match.csv',index_col =0)

In [19]:
cluster

Unnamed: 0,Cluster
BK09,1
BK17,4
BK19,1
BK21,1
BK23,1
...,...
SI37,1
SI45,1
SI48,1
SI54,1


In [20]:
# Merge Cluster data and demo data
merge = demo_df.merge(cluster, how = 'outer',left_index = True, right_index = True)
merge.fillna(0,inplace = True)

In [21]:
#Separating the Clsuters form the DataFrame
cluster_1 = merge[merge['Cluster']==1]
cluster_2 = merge[merge['Cluster']==2]
cluster_3 = merge[merge['Cluster']==3]
cluster_4 = merge[merge['Cluster']==4]
cluster_5 = merge[merge['Cluster']==5]

In [22]:
cluster_1.describe()

Unnamed: 0,% White,% African American,% Asian,% Other,% Mixed,Cluster
count,120.0,120.0,120.0,120.0,120.0,120.0
mean,46.842409,24.360222,11.535634,12.932165,2.662904,1.0
std,29.718659,27.477597,13.531677,14.512525,1.769278,0.0
min,0.0,0.0,0.0,0.0,0.0,1.0
25%,19.78402,2.760991,2.300021,2.74855,1.346084,1.0
50%,46.522356,10.833493,6.49811,6.795341,2.255161,1.0
75%,75.257034,35.434156,14.647613,16.776466,3.295574,1.0
max,97.653826,93.842727,63.590452,56.775747,11.344097,1.0


In [23]:
cluster_2.describe()

Unnamed: 0,% White,% African American,% Asian,% Other,% Mixed,Cluster
count,23.0,23.0,23.0,23.0,23.0,23.0
mean,24.712611,41.950296,8.285721,21.602274,3.449098,2.0
std,14.539614,23.876558,10.557888,14.777549,1.331435,0.0
min,5.796578,10.466418,0.607309,5.654423,1.381095,2.0
25%,12.447405,22.910751,1.970077,9.277095,2.265795,2.0
50%,21.18581,38.565056,2.856473,14.560265,3.424423,2.0
75%,34.827864,64.671968,9.302106,31.520121,4.359635,2.0
max,56.645069,82.774489,32.237862,48.611138,5.672745,2.0


In [24]:
cluster_3.describe()

Unnamed: 0,% White,% African American,% Asian,% Other,% Mixed,Cluster
count,6.0,6.0,6.0,6.0,6.0,6.0
mean,67.506142,16.783756,8.724388,4.130136,2.855579,3.0
std,28.163229,29.647074,7.462349,3.389564,0.930035,0.0
min,12.416858,1.352959,2.125237,0.812455,1.580719,3.0
25%,70.0447,2.009012,5.62058,1.922317,2.154479,3.0
50%,73.979782,4.639104,6.44437,2.868817,3.076771,3.0
75%,82.265648,10.27593,8.108544,5.976844,3.487415,3.0
max,90.89982,76.784181,23.323098,9.611056,3.927866,3.0


In [25]:
cluster_4.describe()

Unnamed: 0,% White,% African American,% Asian,% Other,% Mixed,Cluster
count,39.0,39.0,39.0,39.0,39.0,39.0
mean,44.414392,18.033459,17.052044,17.464636,3.03547,4.0
std,24.438281,23.837896,16.513112,16.034275,2.594918,0.0
min,2.825942,0.848375,0.595062,0.965016,0.358747,4.0
25%,23.593897,2.51522,4.196289,5.098455,1.75528,4.0
50%,43.665305,6.153693,11.916755,9.353694,2.347884,4.0
75%,64.14127,23.705591,24.36116,28.932269,3.299367,4.0
max,83.507132,89.805863,66.897754,54.667148,16.2148,4.0


In [26]:
cluster_5.describe()

Unnamed: 0,% White,% African American,% Asian,% Other,% Mixed,Cluster
count,7.0,7.0,7.0,7.0,7.0,7.0
mean,65.272292,5.702025,20.024433,5.875695,3.125556,5.0
std,19.838995,1.115743,20.027485,2.51813,0.782666,0.0
min,20.827068,4.401719,4.445409,2.245997,2.433942,5.0
25%,69.332807,5.069623,12.715136,3.928962,2.74563,5.0
50%,72.81225,5.654439,13.605553,6.377676,2.795097,5.0
75%,73.074378,5.972916,16.08711,8.05129,3.212018,5.0
max,78.452355,7.772938,64.515575,8.545691,4.734556,5.0
