# Analysis of US Demographics From 2010 to 2020

In [None]:
#IMPORT THESE IMPORTANT MODULES
import matplotlib.pylab as plt
import plotly.express as px
import matplotlib
import numpy as np
import pandas as pd
import seaborn as sns
import utilities
%load_ext autoreload
%autoreload 2

## Reading in the Datasets and Turning Them into Dataframes

#### Income

In [None]:
# Parse the cleaned 2000 income CSV with the project's loader.
infilename = "Data Sets/2000 Income Cleaned.csv"
df2000Income = utilities.parse_csv_file_to_dataframe(
    infilename,
    filetype='income',
    fileyear='2000',
)

print('2000 Income Dataframe')

In [None]:
# Parse the cleaned 2010 income CSV with the project's loader.
infilename = "Data Sets/2010 Income Cleaned.csv"
df2010Income = utilities.parse_csv_file_to_dataframe(
    infilename,
    filetype='income',
    fileyear='2010',
)

print('2010 Income Dataframe')

In [None]:
# Parse the cleaned 2020 income CSV with the project's loader.
infilename = "Data Sets/2020 Income Cleaned.csv"
df2020Income = utilities.parse_csv_file_to_dataframe(
    infilename,
    filetype='income',
    fileyear='2020',
)

print('2020 Income Dataframe')

#### Population and Demographics

In [None]:
# Parse the cleaned 2000 population CSV with the project's loader.
infilename = "Data Sets/2000 Population Cleaned.csv"
df2000Demographics = utilities.parse_csv_file_to_dataframe(
    infilename,
    filetype='demographics',
    fileyear='2000',
)

print('2000 Demographics Dataframe')

In [None]:
# Parse the cleaned 2010 population/demographics CSV with the project's loader.
infilename = "Data Sets/2010 Population and Demographics Cleaned.csv"
df2010Demographics = utilities.parse_csv_file_to_dataframe(
    infilename,
    filetype='demographics',
    fileyear='2010',
)

print('2010 Demographics Dataframe')

In [None]:
# Parse the cleaned 2020 population/demographics CSV with the project's loader.
infilename = "Data Sets/2020 Population and Demographics Cleaned.csv"
df2020Demographics = utilities.parse_csv_file_to_dataframe(
    infilename,
    filetype='demographics',
    fileyear='2020',
)

print('2020 Demographics Dataframe')

#### Removing Non-States from the Dataset

In [None]:
def _drop_non_states(frame, non_states=('DC', 'PR')):
    """Return `frame` without the rows whose 'State' code is in `non_states`.

    The original row index is kept as-is (no reset), exactly as the
    boolean-mask filtering it replaces did.
    """
    return frame[~frame['State'].isin(list(non_states))]


# Remove DC and PR from every table so only the states remain.
df2000Income = _drop_non_states(df2000Income)
df2010Income = _drop_non_states(df2010Income)
df2020Income = _drop_non_states(df2020Income)

df2000Demographics = _drop_non_states(df2000Demographics)
df2010Demographics = _drop_non_states(df2010Demographics)
df2020Demographics = _drop_non_states(df2020Demographics)

## Graphs of Features by Year

### Making a New Dataframe With the Calculated Differences and Fractions

In [None]:
# Start the comparison table from the 2010 'State' column only.
dfnew = df2010Demographics.loc[:, ['State']].copy()

In [None]:
# Add every calculated 2010-vs-2020 column to dfnew.
#
# Each source column is matched to dfnew's rows through the 'State' value
# rather than through the row index, so the result does not depend on the
# income and demographics CSVs listing the states in the same order.


def _by_state(table, frame, column):
    """Return frame[column] re-ordered to table's rows, matched on 'State'."""
    return table['State'].map(frame.set_index('State')[column])


def _add_change(table, label, before, after, year_label=None):
    """Store change, fractional change and both yearly values of one measure.

    `label` prefixes the two change columns; `year_label` (defaults to
    `label`) prefixes the ' 2010' / ' 2020' columns.
    """
    year_label = label if year_label is None else year_label
    change = after - before
    table[f'{label} Change'] = change
    table[f'{label} Change Fraction'] = change / before
    table[f'{year_label} 2010'] = before
    table[f'{year_label} 2020'] = after


def _add_population_share(table, label, before, after, total_before, total_after):
    """Store a group's share of the total population for each year."""
    table[f'{label} Fraction 2010'] = before / total_before
    table[f'{label} Fraction 2020'] = after / total_after


def _add_ratio(table, label, ratio_2010, ratio_2020):
    """Store a ratio for each year and its 2020-minus-2010 difference."""
    table[f'{label} 2010'] = ratio_2010
    table[f'{label} 2020'] = ratio_2020
    table[f'{label} Change'] = ratio_2020 - ratio_2010


# Total population
pop_2010 = _by_state(dfnew, df2010Demographics, 'Population')
pop_2020 = _by_state(dfnew, df2020Demographics, 'Population')
_add_change(dfnew, 'Population', pop_2010, pop_2020)

# Number of males
male_2010 = _by_state(dfnew, df2010Demographics, 'Male')
male_2020 = _by_state(dfnew, df2020Demographics, 'Male')
_add_change(dfnew, 'Male', male_2010, male_2020)
_add_population_share(dfnew, 'Male', male_2010, male_2020, pop_2010, pop_2020)

# Number of females
female_2010 = _by_state(dfnew, df2010Demographics, 'Female')
female_2020 = _by_state(dfnew, df2020Demographics, 'Female')
_add_change(dfnew, 'Female', female_2010, female_2020)
_add_population_share(dfnew, 'Female', female_2010, female_2020, pop_2010, pop_2020)

_add_ratio(dfnew, 'Male/Female', male_2010 / female_2010, male_2020 / female_2020)

# Median income per household
median_2010 = _by_state(dfnew, df2010Income, 'Median')
median_2020 = _by_state(dfnew, df2020Income, 'Median')
_add_change(dfnew, 'Median Income', median_2010, median_2020)

# Mean income per household
mean_2010 = _by_state(dfnew, df2010Income, 'Mean')
mean_2020 = _by_state(dfnew, df2020Income, 'Mean')
_add_change(dfnew, 'Mean Income', mean_2010, mean_2020)

# Number of White as only race
white_2010 = _by_state(dfnew, df2010Demographics, 'White')
white_2020 = _by_state(dfnew, df2020Demographics, 'White')
_add_change(dfnew, 'White', white_2010, white_2020)
_add_population_share(dfnew, 'White', white_2010, white_2020, pop_2010, pop_2020)

# Number of Black or African American as only race.
# The change columns use the long name, the yearly/share columns the short one.
black_2010 = _by_state(dfnew, df2010Demographics, 'Black or African American')
black_2020 = _by_state(dfnew, df2020Demographics, 'Black or African American')
_add_change(dfnew, 'Black or African American', black_2010, black_2020, year_label='Black')
_add_population_share(dfnew, 'Black', black_2010, black_2020, pop_2010, pop_2020)

_add_ratio(dfnew, 'White/Black', white_2010 / black_2010, white_2020 / black_2020)

# Show a snippet of the new dataframe
dfnew[0:10]

### Creating the Graphs

#### Population

In [None]:
# Order the rows from smallest to largest population change for plotting.
dfnew = dfnew.sort_values('Population Change')

In [None]:
# Bar chart of the absolute population change, one bar per row of dfnew.
axes = plt.figure(figsize=(16, 8)).gca()
dfnew.plot.bar(x='State', y='Population Change', ax=axes)
axes.set_title("2010 vs 2020 U.S. Population Change", fontsize=20, fontweight='bold')
# Plain numbers on the y axis instead of scientific notation / offsets.
axes.ticklabel_format(style='plain', useOffset=False, axis='y')
axes.grid()

In [None]:
# Order the rows from smallest to largest fractional population change.
dfnew = dfnew.sort_values('Population Change Fraction')

In [None]:
# Bar chart of the fractional population change, one bar per row of dfnew.
axes = plt.figure(figsize=(16, 8)).gca()
dfnew.plot.bar(x='State', y='Population Change Fraction', ax=axes)
axes.set_title("2010 vs 2020 U.S. Population Change (Fraction)", fontsize=20, fontweight='bold')
axes.grid()

#### Sex (Female, Male)

##### Male

In [None]:
# Order the rows from smallest to largest change in male population.
dfnew = dfnew.sort_values('Male Change')

In [None]:
# Bar chart of the absolute male population change, one bar per row of dfnew.
axes = plt.figure(figsize=(16, 8)).gca()
dfnew.plot.bar(x='State', y='Male Change', ax=axes)
axes.set_title("2010 vs 2020 U.S. Male Population Change", fontsize=20, fontweight='bold')
# Plain numbers on the y axis instead of scientific notation / offsets.
axes.ticklabel_format(style='plain', useOffset=False, axis='y')
axes.grid()

In [None]:
# Order the rows from smallest to largest fractional change in male population.
dfnew = dfnew.sort_values('Male Change Fraction')

In [None]:
# Bar chart of the fractional male population change, one bar per row of dfnew.
axes = plt.figure(figsize=(16, 8)).gca()
dfnew.plot.bar(x='State', y='Male Change Fraction', ax=axes)
axes.set_title("2010 vs 2020 U.S. Male Population Change (Fraction)", fontsize=20, fontweight='bold')
axes.grid()

##### Female

In [None]:
# Order the rows from smallest to largest change in female population.
dfnew = dfnew.sort_values('Female Change')

In [None]:
# Bar chart of the absolute female population change, one bar per row of dfnew.
axes = plt.figure(figsize=(16, 8)).gca()
dfnew.plot.bar(x='State', y='Female Change', ax=axes)
axes.set_title("2010 vs 2020 U.S. Female Population Change", fontsize=20, fontweight='bold')
# Plain numbers on the y axis instead of scientific notation / offsets.
axes.ticklabel_format(style='plain', useOffset=False, axis='y')
axes.grid()

In [None]:
# Order the rows from smallest to largest fractional change in female population.
dfnew = dfnew.sort_values('Female Change Fraction')

In [None]:
# Bar chart of the fractional female population change, one bar per row of dfnew.
axes = plt.figure(figsize=(16, 8)).gca()
dfnew.plot.bar(x='State', y='Female Change Fraction', ax=axes)
axes.set_title("2010 vs 2020 U.S. Female Population Change (Fraction)", fontsize=20, fontweight='bold')
axes.grid()

##### Male/Female Same Graph

In [None]:
# Order by male change first; female change only breaks ties.
dfnew = dfnew.sort_values(['Male Change', 'Female Change'])

In [None]:
# Male and female change drawn on the same axes; the two series use
# different bar alignments ('edge' vs 'center') so they sit side by side
# instead of fully overlapping.
axes = plt.figure(figsize=(16, 8)).gca()
dfnew.plot.bar(x='State', y='Male Change', ax=axes, color='blue', alpha=0.8, align='edge')
dfnew.plot.bar(x='State', y='Female Change', ax=axes, color='pink', alpha=1.0, align='center')
axes.set_title("2010 vs 2020 U.S. Male and Female Population Change", fontsize=20, fontweight='bold')
axes.ticklabel_format(style='plain', useOffset=False, axis='y')
axes.legend(fontsize=24)
axes.grid()

In [None]:
# Order by fractional male change first; fractional female change breaks ties.
dfnew = dfnew.sort_values(['Male Change Fraction', 'Female Change Fraction'])

In [None]:
# Fractional male and female change on the same axes, offset from each
# other through the bar alignment ('edge' vs 'center').
axes = plt.figure(figsize=(16, 8)).gca()
dfnew.plot.bar(x='State', y='Male Change Fraction', ax=axes, color='blue', alpha=0.8, align='edge')
dfnew.plot.bar(x='State', y='Female Change Fraction', ax=axes, color='pink', alpha=1.0, align='center')
axes.set_title("2010 vs 2020 U.S. Male and Female Population Change (Fraction)", fontsize=20, fontweight='bold')
axes.ticklabel_format(style='plain', useOffset=False, axis='y')
axes.grid()

In [None]:
# Order by the 2010 male/female ratio; the 2020 ratio only breaks ties.
dfnew = dfnew.sort_values(['Male/Female 2010', 'Male/Female 2020'])

# 2010 (black) and 2020 (red) ratios side by side for every state.
ratio_axes = plt.figure(figsize=(16, 8)).gca()
dfnew.plot.bar(x='State', y='Male/Female 2010', ax=ratio_axes, color='black', alpha=1.0, align='center')
dfnew.plot.bar(x='State', y='Male/Female 2020', ax=ratio_axes, color='red', alpha=1.0, align='edge')
ratio_axes.set_title("2010 vs 2020 U.S. Male/Female Ratio", fontsize=20, fontweight='bold')
ratio_axes.ticklabel_format(style='plain', useOffset=False, axis='y')
ratio_axes.legend(fontsize=24)
# Start the y axis at 0.85 so the small differences around 1.0 are visible.
ratio_axes.set_ylim(0.85)
# Dashed reference line at parity (as many males as females).
ratio_axes.axhline(y=1.0, color='k', linestyle='--', linewidth=3)
ratio_axes.grid()

# Black --> Red, increases --> % of men increased
# Black --> Red, decreases --> % of women increased

#### Median Income

In [None]:
# Order the rows from smallest to largest median income change.
dfnew = dfnew.sort_values('Median Income Change')

In [None]:
# Bar chart of the absolute median income change, one bar per row of dfnew.
axes = plt.figure(figsize=(16, 8)).gca()
dfnew.plot.bar(x='State', y='Median Income Change', ax=axes)
axes.set_title("2010 vs 2020 U.S. Median Income Change", fontsize=20, fontweight='bold')
# Plain numbers on the y axis instead of scientific notation / offsets.
axes.ticklabel_format(style='plain', useOffset=False, axis='y')
axes.grid()

In [None]:
# Order the rows from smallest to largest fractional median income change.
dfnew = dfnew.sort_values('Median Income Change Fraction')

In [None]:
# Bar chart of the fractional median income change, one bar per row of dfnew.
axes = plt.figure(figsize=(16, 8)).gca()
dfnew.plot.bar(x='State', y='Median Income Change Fraction', ax=axes)
axes.set_title("2010 vs 2020 U.S. Median Income Change (Fraction)", fontsize=20, fontweight='bold')
# NOTE(review): 0.19 is an undocumented reference level - presumably a
# national benchmark such as 2010-2020 inflation; confirm and label it.
axes.axhline(y=0.19, color='r', linewidth=3)
axes.grid()

#### Mean Income

In [None]:
# Order the rows from smallest to largest mean income change.
dfnew = dfnew.sort_values('Mean Income Change')

In [None]:
# Bar chart of the absolute mean income change, one bar per row of dfnew.
axes = plt.figure(figsize=(16, 8)).gca()
dfnew.plot.bar(x='State', y='Mean Income Change', ax=axes)
axes.set_title("2010 vs 2020 U.S. Mean Income Change", fontsize=20, fontweight='bold')
# Plain numbers on the y axis instead of scientific notation / offsets.
axes.ticklabel_format(style='plain', useOffset=False, axis='y')
axes.grid()

In [None]:
# Order the rows from smallest to largest fractional mean income change.
dfnew = dfnew.sort_values('Mean Income Change Fraction')

In [None]:
# Bar chart of the fractional mean income change, one bar per row of dfnew.
axes = plt.figure(figsize=(16, 8)).gca()
dfnew.plot.bar(x='State', y='Mean Income Change Fraction', ax=axes)
axes.set_title("2010 vs 2020 U.S. Mean Income Change (Fraction)", fontsize=20, fontweight='bold')
# NOTE(review): 0.19 is an undocumented reference level - presumably a
# national benchmark such as 2010-2020 inflation; confirm and label it.
axes.axhline(y=0.19, color='r', linewidth=3)
axes.grid()

#### Racial (Black or African American, White)

##### Black or African American

In [None]:
# Order the rows from smallest to largest change in Black or African American population.
dfnew = dfnew.sort_values('Black or African American Change')

In [None]:
# Bar chart of the absolute Black or African American population change.
axes = plt.figure(figsize=(16, 8)).gca()
dfnew.plot.bar(x='State', y='Black or African American Change', ax=axes)
axes.set_title("2010 vs 2020 U.S. Black or African American Population Change", fontsize=20, fontweight='bold')
# Plain numbers on the y axis instead of scientific notation / offsets.
axes.ticklabel_format(style='plain', useOffset=False, axis='y')
axes.grid()

In [None]:
# Order the rows from smallest to largest fractional change in Black or African American population.
dfnew = dfnew.sort_values('Black or African American Change Fraction')

In [None]:
# Bar chart of the fractional Black or African American population change.
axes = plt.figure(figsize=(16, 8)).gca()
dfnew.plot.bar(x='State', y='Black or African American Change Fraction', ax=axes)
axes.set_title("2010 vs 2020 U.S. Black or African American Population Change (Fraction)", fontsize=20, fontweight='bold')
axes.grid()

##### White

In [None]:
# Order the rows from smallest to largest change in White population.
dfnew = dfnew.sort_values('White Change')

In [None]:
# Bar chart of the absolute White population change, one bar per row of dfnew.
axes = plt.figure(figsize=(16, 8)).gca()
dfnew.plot.bar(x='State', y='White Change', ax=axes)
axes.set_title("2010 vs 2020 U.S. White Population Change", fontsize=20, fontweight='bold')
# Plain numbers on the y axis instead of scientific notation / offsets.
axes.ticklabel_format(style='plain', useOffset=False, axis='y')
axes.grid()

In [None]:
# Order the rows from smallest to largest fractional change in White population.
dfnew = dfnew.sort_values('White Change Fraction')

In [None]:
# Bar chart of the fractional White population change, one bar per row of dfnew.
axes = plt.figure(figsize=(16, 8)).gca()
dfnew.plot.bar(x='State', y='White Change Fraction', ax=axes)
axes.set_title("2010 vs 2020 U.S. White Population Change (Fraction)", fontsize=20, fontweight='bold')
axes.grid()

##### White/Black Same Graph

In [None]:
# Order by the 2020 White/Black ratio; the 2010 ratio only breaks ties.
dfnew = dfnew.sort_values(['White/Black 2020', 'White/Black 2010'])

# NOTE(review): here black = 2020 and red = 2010, the reverse of the colour
# coding used in the Male/Female ratio chart - confirm this is intended.
ratio_axes = plt.figure(figsize=(16, 8)).gca()
dfnew.plot.bar(x='State', y='White/Black 2020', ax=ratio_axes, color='black', alpha=1.0, align='center')
dfnew.plot.bar(x='State', y='White/Black 2010', ax=ratio_axes, color='red', alpha=1.0, align='edge')
ratio_axes.set_title("2010 vs 2020 U.S. White/Black Ratio", fontsize=20, fontweight='bold')
# Must stay before the switch to a log scale, exactly as in the original order.
ratio_axes.ticklabel_format(style='plain', useOffset=False, axis='y')
ratio_axes.legend(fontsize=24)

# Log scale with hand-picked ticks, printed as plain numbers rather than powers of ten.
ratio_axes.set_yscale('log')
ratio_axes.set_yticks([2, 4, 6, 8, 10, 20, 40, 60, 80, 100])
ratio_axes.yaxis.set_major_formatter(matplotlib.ticker.ScalarFormatter())
ratio_axes.set_ylabel('Ratio of # of White People to # of Black People', fontsize=18)
ratio_axes.grid()

## State Clustering

### 2020 Clustering Graph

In [None]:
# Subset of the 2020 columns of dfnew that is used for clustering.
features = [
    'Male/Female 2020',
    'Black Fraction 2020',
    'White Fraction 2020',
    'Median Income 2020',
    'Mean Income 2020',
]

print(features)

In [None]:
# Pairwise overview of the selected features: histograms on the diagonal,
# scatter plots everywhere else.
# PairGrid creates its own figure, so no plt.figure() call is made first
# (the previous one only produced an extra, empty figure).
g = sns.PairGrid(dfnew[features])

g.map_diag(sns.histplot)
g.map_offdiag(sns.scatterplot)
g.add_legend()

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Feature matrix: one row per state, one column per entry of `features`.
X = dfnew[features].to_numpy()

In [None]:
# K-means with 3 clusters, random initial centroids, 10 restarts and a
# fixed seed (0) so the result is reproducible.
# NOTE(review): X is clustered unscaled although MinMaxScaler is imported;
# confirm that clustering on the raw magnitudes is intended.
kmeans_options = {
    'n_clusters': 3,
    'init': 'random',
    'n_init': 10,
    'random_state': 0,
}
model = KMeans(**kmeans_options)

# Fit the model and keep the cluster label assigned to every row of X.
labels_km = model.fit_predict(X)

In [None]:
labels_km

In [None]:
print(model.cluster_centers_)

In [None]:
plt.figure(figsize=(10,8))

# Columns of X (and entries of `features`) shown on the x and y axis.
idx0 = 2
idx1 = 4

label0 = features[idx0]
label1 = features[idx1]

# One scatter series per cluster label: (label, colour, marker).
cluster_styles = [(0, 'blue', 's'), (1, 'orange', 'o'), (2, 'green', 'v')]
for cluster_id, colour, shape in cluster_styles:
    in_cluster = labels_km == cluster_id
    plt.scatter(
        X[in_cluster, idx0], X[in_cluster, idx1],
        s=100, c=colour,
        marker=shape, edgecolor='black',
        label=f'cluster {cluster_id + 1}'
    )

# Centroids, projected onto the same two columns.
centroids = model.cluster_centers_
plt.scatter(
    centroids[:, idx0], centroids[:, idx1],
    s=750, marker='*',
    c='red', edgecolor='black',
    label='centroids'
)
plt.xlabel(label0, fontsize=18)
plt.ylabel(label1, fontsize=18)
plt.legend(scatterpoints=1, fontsize=20)
plt.grid()

In [None]:
# Print the states that ended up in each cluster.
states = dfnew['State']

vals = np.unique(labels_km)
print(vals)

for cluster_id in vals:
    members = states[labels_km == cluster_id]
    print(f"{cluster_id} ------\n")
    print(members.values)

In [None]:
# Copy of dfnew carrying the cluster label of every row, used for the map.
dfnewMap = dfnew.assign(**{'KM Labels': labels_km})

In [None]:
# US map coloured by the cluster label of each state.
map_options = dict(
    locations='State',
    locationmode="USA-states",
    scope="usa",
    color='KM Labels',
    color_continuous_scale=["blue", "orange", "green"],
)
fig = px.choropleth(dfnewMap, **map_options)

title_options = dict(
    title_text='2020 State Clustering (Mean Income by White Fraction)',
    title_font_size=20,
    title_font_color="black",
    title_x=0.45,
)
fig.update_layout(**title_options)
fig.show()

### 2010 - 2020 Change Clustering Graph (Not Using)

In [None]:
# All absolute and fractional 2010-2020 change columns of dfnew.
features = [
    'Population Change', 'Population Change Fraction',
    'Male Change', 'Male Change Fraction',
    'Female Change', 'Female Change Fraction',
    'Median Income Change', 'Median Income Change Fraction',
    'Mean Income Change', 'Mean Income Change Fraction',
    'White Change', 'White Change Fraction',
    'Black or African American Change', 'Black or African American Change Fraction',
]

print(features)

In [None]:
# Pairwise overview of the selected features: histograms on the diagonal,
# scatter plots everywhere else.
# PairGrid creates its own figure, so no plt.figure() call is made first
# (the previous one only produced an extra, empty figure).
g = sns.PairGrid(dfnew[features])

g.map_diag(sns.histplot)
g.map_offdiag(sns.scatterplot)
g.add_legend()

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Feature matrix: one row per state, one column per entry of `features`.
X = dfnew[features].to_numpy()

In [None]:
# K-means with 3 clusters, random initial centroids, 10 restarts and a
# fixed seed (0) so the result is reproducible.
# NOTE(review): X mixes absolute counts with fractions and is clustered
# unscaled although MinMaxScaler is imported; confirm this is intended.
kmeans_options = {
    'n_clusters': 3,
    'init': 'random',
    'n_init': 10,
    'random_state': 0,
}
model = KMeans(**kmeans_options)

# Fit the model and keep the cluster label assigned to every row of X.
labels_km = model.fit_predict(X)

In [None]:
labels_km

In [None]:
print(model.cluster_centers_)

In [None]:
features[11]

In [None]:
plt.figure(figsize=(10,8))

# Columns of X (and entries of `features`) shown on the x and y axis.
idx0 = 1
idx1 = 11

label0 = features[idx0]
label1 = features[idx1]

# One scatter series per cluster label: (label, colour, marker).
cluster_styles = [(0, 'blue', 's'), (1, 'orange', 'o'), (2, 'green', 'v')]
for cluster_id, colour, shape in cluster_styles:
    in_cluster = labels_km == cluster_id
    plt.scatter(
        X[in_cluster, idx0], X[in_cluster, idx1],
        s=100, c=colour,
        marker=shape, edgecolor='black',
        label=f'cluster {cluster_id + 1}'
    )

# Centroids, projected onto the same two columns.
centroids = model.cluster_centers_
plt.scatter(
    centroids[:, idx0], centroids[:, idx1],
    s=750, marker='*',
    c='red', edgecolor='black',
    label='centroids'
)
plt.xlabel(label0, fontsize=18)
plt.ylabel(label1, fontsize=18)
plt.legend(scatterpoints=1, fontsize=20)
plt.grid()

In [None]:
# Print the states that ended up in each cluster.
states = dfnew['State']

vals = np.unique(labels_km)
print(vals)

for cluster_id in vals:
    members = states[labels_km == cluster_id]
    print(f"{cluster_id} ------\n")
    print(members.values)

In [None]:
# Copy of dfnew carrying the cluster label of every row, used for the map.
dfnewMap = dfnew.assign(**{'KM Labels': labels_km})

In [None]:
# US map coloured by the cluster label of each state.
map_options = dict(
    locations='State',
    locationmode="USA-states",
    scope="usa",
    color='KM Labels',
    color_continuous_scale=["blue", "orange", "green"],
)
fig = px.choropleth(dfnewMap, **map_options)

title_options = dict(
    title_text='2010-2020 State Change Clustering (???)',
    title_font_size=20,
    title_font_color="black",
    title_x=0.45,
)
fig.update_layout(**title_options)
fig.show()

### 3-D Graph if Wanted

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import plotly.express as px

# Standardise the selected features once (zero mean, unit variance per
# column) and reuse the result for both the clustering and the plot.
X_std = StandardScaler().fit_transform(dfnew[features])
X = X_std  # kept so that `X` is still defined by this cell

km = KMeans(
    n_clusters=3, init='random',
    n_init=10, max_iter=300,
    tol=1e-04, random_state=0
)
y_km = km.fit_predict(X_std)

# Wrap the scaled matrix in a dataframe so the 3-D axes are titled with the
# feature names (features 1, 11 and 2) instead of bare column numbers.
scaled = pd.DataFrame(X_std, columns=features)
fig = px.scatter_3d(scaled, x=features[1], y=features[11], z=features[2], color=y_km)
fig.show()

### 2010-2020 Change With Fewer Features

In [None]:
# Reduced set of change columns used for the final clustering.
features = [
    'Population Change Fraction',
    'Mean Income Change',
    'Mean Income Change Fraction',
]

print(features)

In [None]:
# Pairwise overview of the selected features: histograms on the diagonal,
# scatter plots everywhere else.
# PairGrid creates its own figure, so no plt.figure() call is made first
# (the previous one only produced an extra, empty figure).
g = sns.PairGrid(dfnew[features])

g.map_diag(sns.histplot)
g.map_offdiag(sns.scatterplot)
g.add_legend()

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Feature matrix: one row per state, one column per entry of `features`.
X = dfnew[features].to_numpy()

In [None]:
# K-means with 3 clusters, random initial centroids, 10 restarts and a
# fixed seed (0) so the result is reproducible.
# NOTE(review): X mixes an absolute income change with fractions and is
# clustered unscaled although MinMaxScaler is imported; confirm this is intended.
kmeans_options = {
    'n_clusters': 3,
    'init': 'random',
    'n_init': 10,
    'random_state': 0,
}
model = KMeans(**kmeans_options)

# Fit the model and keep the cluster label assigned to every row of X.
labels_km = model.fit_predict(X)

In [None]:
labels_km

In [None]:
plt.figure(figsize=(10,8))

# Columns of X (and entries of `features`) shown on the x and y axis.
idx0 = 0
idx1 = 1

label0 = features[idx0]
label1 = features[idx1]

# One scatter series per cluster label: (label, colour, marker).
cluster_styles = [(0, 'blue', 's'), (1, 'orange', 'o'), (2, 'green', 'v')]
for cluster_id, colour, shape in cluster_styles:
    in_cluster = labels_km == cluster_id
    plt.scatter(
        X[in_cluster, idx0], X[in_cluster, idx1],
        s=100, c=colour,
        marker=shape, edgecolor='black',
        label=f'cluster {cluster_id + 1}'
    )

# Centroids, projected onto the same two columns.
centroids = model.cluster_centers_
plt.scatter(
    centroids[:, idx0], centroids[:, idx1],
    s=750, marker='*',
    c='red', edgecolor='black',
    label='centroids'
)
plt.xlabel(label0, fontsize=18)
plt.ylabel(label1, fontsize=18)
plt.legend(scatterpoints=1, fontsize=15, loc='upper left')
plt.grid()

In [None]:
# Print the states that ended up in each cluster.
states = dfnew['State']

vals = np.unique(labels_km)
print(vals)

for cluster_id in vals:
    members = states[labels_km == cluster_id]
    print(f"{cluster_id} ------\n")
    print(members.values)

In [None]:
# Copy of dfnew carrying the cluster label of every row, used for the map.
dfnewMap = dfnew.assign(**{'KM Labels': labels_km})

In [None]:
# US map coloured by the cluster label of each state.
map_options = dict(
    locations='State',
    locationmode="USA-states",
    scope="usa",
    color='KM Labels',
    color_continuous_scale=["blue", "orange", "green"],
)
fig = px.choropleth(dfnewMap, **map_options)

title_options = dict(
    title_text='2010-2020 (Mean Income Change by Pop. Change Frac.)',
    title_font_size=20,
    title_font_color="black",
    title_x=0.45,
)
fig.update_layout(**title_options)
fig.show()

### Other Clustering

In [None]:
# Re-cluster the current feature matrix X into 2 groups
# (random initial centroids, 10 restarts, fixed seed 0).
model2 = KMeans(n_clusters=2, init='random', n_init=10, random_state=0)

# Fit the model and keep the cluster label assigned to every row of X.
labels_km2 = model2.fit_predict(X)

plt.figure(figsize=(10,8))

# One scatter series per cluster, showing columns 0 and 1 of X.
for i in range(0,2):
    plt.scatter(
        X[labels_km2 == i, 0], X[labels_km2 == i, 1],
        s=100,
        marker='s', edgecolor='black',
        label=f'cluster {i+1}'
    )

# Plot the centroids
plt.scatter(
    model2.cluster_centers_[:, 0], model2.cluster_centers_[:, 1],
    s=750, marker='*',
    c='red', edgecolor='black',
    label='centroids'
)
# X is built from dfnew[features], so columns 0 and 1 are features[0] and
# features[1]; the previous 'Sepal length'/'Sepal width' labels were
# left over from an unrelated example.
plt.xlabel(features[0], fontsize=18)
plt.ylabel(features[1], fontsize=18)
plt.legend(scatterpoints=1)
plt.grid()


# Table of which state landed in which cluster.
pd.crosstab(dfnew['State'], labels_km2)

In [None]:
# Re-cluster the current feature matrix X into 6 groups
# (random initial centroids, 10 restarts, fixed seed 0).
model3 = KMeans(n_clusters=6, init='random', n_init=10, random_state=0)

# Fit the model and keep the cluster label assigned to every row of X.
labels_km3 = model3.fit_predict(X)

plt.figure(figsize=(10,8))

# One scatter series per cluster, showing columns 0 and 1 of X.
for i in range(0,6):
    plt.scatter(
        X[labels_km3 == i, 0], X[labels_km3 == i, 1],
        s=100,
        marker='s', edgecolor='black',
        label=f'cluster {i+1}'
    )

# Plot the centroids
plt.scatter(
    model3.cluster_centers_[:, 0], model3.cluster_centers_[:, 1],
    s=750, marker='*',
    c='red', edgecolor='black',
    label='centroids'
)
# X is built from dfnew[features], so columns 0 and 1 are features[0] and
# features[1]; the previous 'Sepal length'/'Sepal width' labels were
# left over from an unrelated example.
plt.xlabel(features[0], fontsize=18)
plt.ylabel(features[1], fontsize=18)
plt.legend(scatterpoints=1)
plt.grid()


# Table of which state landed in which cluster.
pd.crosstab(dfnew['State'], labels_km3)

In [None]:
# Inertia of the 3-, 2- and 6-cluster models, in that order.
for fitted in (model, model2, model3):
    print(fitted.inertia_)

In [None]:
# Inertia for 1 to 10 clusters, each fitted on X with the same settings as above.
nclusters = list(range(1, 11))
inertias = [
    KMeans(n_clusters=k, init='random', n_init=10, random_state=0).fit(X).inertia_
    for k in nclusters
]

# Inertia against the number of clusters.
plt.figure(figsize=(8,6))
plt.plot(nclusters, inertias)
plt.xlabel('# of clusters', fontsize=18)
plt.ylabel('inertia', fontsize=18)

## Specific State Graphs

#### AZ/PA: Percentages (%)

In [None]:
# Age-bracket columns compared between the two states, youngest to oldest.
age_columns = ['Under 5 yrs', '5-9 yrs', '10-14 yrs', '15-19 yrs', '20-24 yrs', '25-34 yrs',
               '35-44 yrs', '45-54 yrs', '55-59 yrs', '60-64 yrs', '65-74 yrs', '75-84 yrs',
               '85 yrs and Older']

# Getting the Arizona data
AZdf2020 = df2020Demographics.loc[df2020Demographics['State'] == 'AZ']
AZ2020Ages = AZdf2020[age_columns]
# Getting the Pennsylvania data
PAdf2020 = df2020Demographics.loc[df2020Demographics['State'] == 'PA']
PA2020Ages = PAdf2020[age_columns]

# Stack the two states: row 0 is Arizona, row 1 is Pennsylvania.
AZPAdf2020 = pd.concat([AZ2020Ages, PA2020Ages], ignore_index=True)

# Sum over all age brackets for each state.
# NOTE: despite their names, AZMean / PAMean hold these totals, not means.
df2 = AZPAdf2020.sum(axis=1)
AZMean = df2[0]
PAMean = df2[1]

# Divide every bracket by its state's total in one row-wise operation, then
# transpose so the age brackets become the index and the columns are
# 0 (Arizona) and 1 (Pennsylvania). The values are fractions between 0 and 1.
AZPAdf2020 = AZPAdf2020.div(df2, axis=0).transpose()
AZPAdf2020

In [None]:
# Grouped bar chart: share of each age bracket in Arizona vs Pennsylvania.
barWidth = 0.35
fig, ax = plt.subplots(figsize=(20, 10))

# Column 0 is Arizona, column 1 is Pennsylvania. The table holds fractions
# (bracket / state total), so scale by 100 to match the "%" title and axis.
StateAZ = AZPAdf2020[0] * 100
StatePA = AZPAdf2020[1] * 100

# x positions: Arizona at 0, 1, 2, ... and Pennsylvania one bar width to the right.
br1 = np.arange(len(AZPAdf2020))
br2 = br1 + barWidth

# Make the plot
ax.bar(br1, StateAZ, color='red', width=barWidth,
       edgecolor='grey', label='Arizona')
ax.bar(br2, StatePA, color='black', width=barWidth,
       edgecolor='grey', label='Pennsylvania')

ax.set_title('Arizona vs Pennsylvania Ages by Range (%)', fontweight='bold', fontsize=20)
ax.set_xlabel('Age Range', fontsize=15)
ax.set_ylabel('% Number of People', fontsize=15)
# Put each tick midway between the two bars of its age bracket; the labels
# are the age brackets stored in the table's index.
ax.set_xticks(br1 + barWidth / 2)
ax.set_xticklabels(AZPAdf2020.index)

ax.legend(fontsize=20)
ax.ticklabel_format(style='plain', useOffset=False, axis='y')
plt.show()

#### AZ/PA: Actual Numbers

In [None]:
# Age-bracket columns compared between the two states, youngest to oldest.
age_columns = ['Under 5 yrs', '5-9 yrs', '10-14 yrs', '15-19 yrs', '20-24 yrs', '25-34 yrs',
               '35-44 yrs', '45-54 yrs', '55-59 yrs', '60-64 yrs', '65-74 yrs', '75-84 yrs',
               '85 yrs and Older']

# Arizona rows and their age columns
AZdf2020 = df2020Demographics[df2020Demographics['State'] == 'AZ']
AZ2020Ages = AZdf2020[age_columns]

# Pennsylvania rows and their age columns
PAdf2020 = df2020Demographics[df2020Demographics['State'] == 'PA']
PA2020Ages = PAdf2020[age_columns]

# Stack both states, flip so the age brackets are the index (column 0 is
# Arizona, column 1 is Pennsylvania) and store the counts as integers.
stacked = pd.concat([AZ2020Ages, PA2020Ages], ignore_index=True)
AZPAdf2020 = stacked.transpose().astype(int)
AZPAdf2020

In [None]:
# Column totals of the age table (column 0 is Arizona, column 1 is Pennsylvania).
# NOTE: despite their names, AZMean / PAMean hold these totals, not means.
df2 = AZPAdf2020.sum()
AZMean, PAMean = df2[0], df2[1]

# Arizona's youngest bracket divided by Arizona's total, printed as a check.
Group1 = AZdf2020['Under 5 yrs'] / AZMean
print(Group1)
AZPAdf2020

In [None]:
# Grouped bar chart: number of people per age bracket in Arizona vs Pennsylvania.
barWidth = 0.35
fig, ax = plt.subplots(figsize=(20, 10))

# Column 0 is Arizona, column 1 is Pennsylvania.
StateAZ = AZPAdf2020[0]
StatePA = AZPAdf2020[1]

# x positions: Arizona at 0, 1, 2, ... and Pennsylvania one bar width to the right.
br1 = np.arange(len(AZPAdf2020))
br2 = br1 + barWidth

# Make the plot
ax.bar(br1, StateAZ, color='teal', width=barWidth,
       edgecolor='grey', label='Arizona')
ax.bar(br2, StatePA, color='purple', width=barWidth,
       edgecolor='grey', label='Pennsylvania')

ax.set_title('AZ vs PA Ages by Range', fontweight='bold', fontsize=20)
ax.set_xlabel('Age Range', fontsize=15)
ax.set_ylabel('Number of People', fontsize=15)
# Put each tick midway between the two bars of its age bracket; the labels
# are the age brackets stored in the table's index.
ax.set_xticks(br1 + barWidth / 2)
ax.set_xticklabels(AZPAdf2020.index)

ax.legend(fontsize=20)
ax.ticklabel_format(style='plain', useOffset=False, axis='y')
plt.show()

#### LA/ME: Percentages(%)

In [None]:
# Getting the Louisiana Data
LAdf2020 = df2020Demographics.loc[df2020Demographics['State'] == 'LA']
LA2020Ages = LAdf2020[['Under 5 yrs', '5-9 yrs', '10-14 yrs', '15-19 yrs', '20-24 yrs', '25-34 yrs', 
                       '35-44 yrs', '45-54 yrs', '55-59 yrs', '60-64 yrs', '65-74 yrs', '75-84 yrs', '85 yrs and Older']]
# Getting the Maine Data
MEdf2020 = df2020Demographics.loc[df2020Demographics['State'] == 'ME']
ME2020Ages = MEdf2020[['Under 5 yrs', '5-9 yrs', '10-14 yrs', '15-19 yrs', '20-24 yrs', '25-34 yrs', 
                       '35-44 yrs', '45-54 yrs', '55-59 yrs', '60-64 yrs', '65-74 yrs', '75-84 yrs', '85 yrs and Older']]
# Adding the 2 dataframes together
LAMEdf2020 = pd.concat([LA2020Ages, ME2020Ages], ignore_index = True)
LAMEdf2020 = LAMEdf2020.transpose()
#LAMEdf2020

# Getting the sum of each State
df2 = LAMEdf2020.sum()
LAMean = df2[0]
MEMean = df2[1]
#print(LAMean)
#print(MEMean)

# Transposing the dataframe again
LAMEdf2020 = LAMEdf2020.transpose()
#LAMEdf2020

# ARIZONA TO %
# Gets all of the percentages for LA Age Groups
Group1 = LAdf2020['Under 5 yrs'].div(LAMean)
Group2 = LAdf2020['5-9 yrs'].div(LAMean)
Group3 = LAdf2020['10-14 yrs'].div(LAMean)
Group4 = LAdf2020['15-19 yrs'].div(LAMean)
Group5 = LAdf2020['20-24 yrs'].div(LAMean)
Group6 = LAdf2020['25-34 yrs'].div(LAMean)
Group7 = LAdf2020['35-44 yrs'].div(LAMean)
Group8 = LAdf2020['45-54 yrs'].div(LAMean)
Group9 = LAdf2020['55-59 yrs'].div(LAMean)
Group10 = LAdf2020['60-64 yrs'].div(LAMean)
Group11 = LAdf2020['65-74 yrs'].div(LAMean)
Group12 = LAdf2020['75-84 yrs'].div(LAMean)
Group13 = LAdf2020['85 yrs and Older'].div(LAMean)
# Replaces LA nmumbers with percentages
LAMEdf2020.at[0,'Under 5 yrs']=Group1
LAMEdf2020.at[0,'5-9 yrs']=Group2
LAMEdf2020.at[0,'10-14 yrs']=Group3
LAMEdf2020.at[0,'15-19 yrs']=Group4
LAMEdf2020.at[0,'20-24 yrs']=Group5
LAMEdf2020.at[0,'25-34 yrs']=Group6
LAMEdf2020.at[0,'35-44 yrs']=Group7
LAMEdf2020.at[0,'45-54 yrs']=Group8
LAMEdf2020.at[0,'55-59 yrs']=Group9
LAMEdf2020.at[0,'60-64 yrs']=Group10
LAMEdf2020.at[0,'65-74 yrs']=Group11
LAMEdf2020.at[0,'75-84 yrs']=Group12
LAMEdf2020.at[0,'85 yrs and Older']=Group13

#PENNSYLVANIA TO %
# Gets all of the percentages for ME Age Groups
Group1 = MEdf2020['Under 5 yrs'].div(MEMean)
Group2 = MEdf2020['5-9 yrs'].div(MEMean)
Group3 = MEdf2020['10-14 yrs'].div(MEMean)
Group4 = MEdf2020['15-19 yrs'].div(MEMean)
Group5 = MEdf2020['20-24 yrs'].div(MEMean)
Group6 = MEdf2020['25-34 yrs'].div(MEMean)
Group7 = MEdf2020['35-44 yrs'].div(MEMean)
Group8 = MEdf2020['45-54 yrs'].div(MEMean)
Group9 = MEdf2020['55-59 yrs'].div(MEMean)
Group10 = MEdf2020['60-64 yrs'].div(MEMean)
Group11 = MEdf2020['65-74 yrs'].div(MEMean)
Group12 = MEdf2020['75-84 yrs'].div(MEMean)
Group13 = MEdf2020['85 yrs and Older'].div(MEMean)
# Replaces ME nmumbers with percentages
LAMEdf2020.at[1,'Under 5 yrs']=Group1
LAMEdf2020.at[1,'5-9 yrs']=Group2
LAMEdf2020.at[1,'10-14 yrs']=Group3
LAMEdf2020.at[1,'15-19 yrs']=Group4
LAMEdf2020.at[1,'20-24 yrs']=Group5
LAMEdf2020.at[1,'25-34 yrs']=Group6
LAMEdf2020.at[1,'35-44 yrs']=Group7
LAMEdf2020.at[1,'45-54 yrs']=Group8
LAMEdf2020.at[1,'55-59 yrs']=Group9
LAMEdf2020.at[1,'60-64 yrs']=Group10
LAMEdf2020.at[1,'65-74 yrs']=Group11
LAMEdf2020.at[1,'75-84 yrs']=Group12
LAMEdf2020.at[1,'85 yrs and Older']=Group13

# Transpose the data one last time
LAMEdf2020 = LAMEdf2020.transpose()
LAMEdf2020

In [None]:
# Grouped bar chart comparing the two columns of LAMEdf2020 (one bar pair per row).
# Width of a single bar; each pair therefore spans 2 * barWidth.
barWidth = 0.35
# plt.subplots returns a (figure, axes) pair, so unpack both.
fig, ax = plt.subplots(figsize=(20, 10))

# Values for each bar: column 0 is drawn as Louisiana, column 1 as Maine.
StateLA = LAMEdf2020[0]
StateME = LAMEdf2020[1]

# X positions of the bars: the second series sits one bar width to the right.
br1 = np.arange(len(LAMEdf2020))
br2 = br1 + barWidth

# Labels for the bar pairs, in row order.
ageLabels = ['Under 5 yrs', '5-9 yrs', '10-14 yrs', '15-19 yrs', '20-24 yrs', '25-34 yrs', 
             '35-44 yrs', '45-54 yrs', '55-59 yrs', '60-64 yrs', '65-74 yrs', '75-84 yrs', '85 yrs and Older']

# Make the plot
ax.bar(br1, StateLA, color='red', width=barWidth,
       edgecolor='grey', label='Louisiana')
ax.bar(br2, StateME, color='black', width=barWidth,
       edgecolor='grey', label='Maine')

# Titles and axis labels
plt.title('Louisiana vs Maine Ages by Range (%)', fontweight ='bold', fontsize = 20)
plt.xlabel('Age Range', fontsize = 15)
plt.ylabel('% Number of People', fontsize = 15)
# Centre each tick between its two bars (bar centres are r and r + barWidth),
# so the label reads as belonging to the pair rather than to the second bar.
plt.xticks(br1 + barWidth / 2, ageLabels)

plt.legend(fontsize=20)
# Show plain numbers on the y axis instead of scientific/offset notation.
plt.ticklabel_format(style='plain', useOffset=False, axis='y')
plt.show()