# Analysis of US Demographics From 2010 to 2020

In [None]:
#IMPORT THESE IMPORTANT MODULES
import matplotlib.pylab as plt
import plotly.express as px
import numpy as np
import pandas as pd
import seaborn as sns
import utilities
%load_ext autoreload
%autoreload 2

## Reading in the Datasets and Turning Them into Dataframes

#### Income

In [None]:
# Parse the cleaned 2000 income CSV into a dataframe.
infilename = "Data Sets/2000 Income Cleaned.csv"
df2000Income = utilities.parse_csv_file_to_dataframe(
    infilename,
    filetype='income',
    fileyear='2000',
)
print('2000 Income Dataframe')

In [None]:
# Parse the cleaned 2010 income CSV into a dataframe.
infilename = "Data Sets/2010 Income Cleaned.csv"
df2010Income = utilities.parse_csv_file_to_dataframe(
    infilename,
    filetype='income',
    fileyear='2010',
)
print('2010 Income Dataframe')

In [None]:
# Parse the cleaned 2020 income CSV into a dataframe.
infilename = "Data Sets/2020 Income Cleaned.csv"
df2020Income = utilities.parse_csv_file_to_dataframe(
    infilename,
    filetype='income',
    fileyear='2020',
)
print('2020 Income Dataframe')

#### Population and Demographics

In [None]:
# Parse the cleaned 2000 population CSV into a dataframe.
infilename = "Data Sets/2000 Population Cleaned.csv"
df2000Demographics = utilities.parse_csv_file_to_dataframe(
    infilename,
    filetype='demographics',
    fileyear='2000',
)
print('2000 Demographics Dataframe')

In [None]:
# Parse the cleaned 2010 population and demographics CSV into a dataframe.
infilename = "Data Sets/2010 Population and Demographics Cleaned.csv"
df2010Demographics = utilities.parse_csv_file_to_dataframe(
    infilename,
    filetype='demographics',
    fileyear='2010',
)
print('2010 Demographics Dataframe')

In [None]:
# Parse the cleaned 2020 population and demographics CSV into a dataframe.
infilename = "Data Sets/2020 Population and Demographics Cleaned.csv"
df2020Demographics = utilities.parse_csv_file_to_dataframe(
    infilename,
    filetype='demographics',
    fileyear='2020',
)
print('2020 Demographics Dataframe')

#### Removing Non-States from the Dataset

In [None]:
# 'DC' and 'PR' are not states; any row carrying one of these codes in the
# 'State' column is removed from every dataframe.
NON_STATES = ['DC', 'PR']


def _drop_non_states(df):
    """Return the rows of *df* whose 'State' value is not listed in NON_STATES.

    The surviving rows keep their original index labels.
    """
    return df[~df['State'].isin(NON_STATES)]


# Rebind each dataframe to its filtered version, keeping the same names.
df2000Income = _drop_non_states(df2000Income)
df2010Income = _drop_non_states(df2010Income)
df2020Income = _drop_non_states(df2020Income)
df2000Demographics = _drop_non_states(df2000Demographics)
df2010Demographics = _drop_non_states(df2010Demographics)
df2020Demographics = _drop_non_states(df2020Demographics)

## Graphs of Features by Year

### Making a New Dataframe With the Calculated Differences and Fractions

In [None]:
# Start the results dataframe as an independent copy of the 2010 'State' column.
dfnew = df2010Demographics.loc[:, ['State']].copy()

In [None]:
# Add the 2010 -> 2020 change of every tracked measure to dfnew.
# For each measure two columns are created:
#   '<prefix> Change'          = 2020 value - 2010 value
#   '<prefix> Change Fraction' = (2020 value - 2010 value) / 2010 value
#
# NOTE(review): the arithmetic below is aligned on the dataframe index, not on
# the 'State' value, so it assumes the same state carries the same index label
# in every dataframe - confirm against utilities.parse_csv_file_to_dataframe.
#
# Each entry: (column prefix in dfnew, 2010 dataframe, 2020 dataframe, source column)
measures = [
    # Total population
    ('Population', df2010Demographics, df2020Demographics, 'Population'),
    # Number of males
    ('Male', df2010Demographics, df2020Demographics, 'Male'),
    # Number of females
    ('Female', df2010Demographics, df2020Demographics, 'Female'),
    # Median income per household
    ('Median Income', df2010Income, df2020Income, 'Median'),
    # Mean income per household
    ('Mean Income', df2010Income, df2020Income, 'Mean'),
    # Number of White as only race
    ('White', df2010Demographics, df2020Demographics, 'White'),
    # Number of Black or African American as only race
    ('Black or African American', df2010Demographics, df2020Demographics,
     'Black or African American'),
]

for prefix, frame_2010, frame_2020, column in measures:
    baseline = frame_2010[column]
    change = frame_2020[column] - baseline
    dfnew[f'{prefix} Change'] = change
    dfnew[f'{prefix} Change Fraction'] = change / baseline

# Display a snippet (first ten rows) of the new dataframe.
dfnew.head(10)

### Creating the Graphs

#### Population

In [None]:
# Order the rows by ascending 'Population Change' so the bars rise left to right.
dfnew = dfnew.sort_values('Population Change')

In [None]:
# Bar chart of 'Population Change' per state on a fresh 16x8 figure.
ax = plt.figure(figsize=(16, 8)).gca()
dfnew.plot.bar(x='State', y='Population Change', ax=ax)
ax.set_title("2010 vs 2020 U.S. Population Change", fontsize=20, fontweight='bold')
# Plain y tick labels: no scientific notation and no offset.
ax.ticklabel_format(style='plain', useOffset=False, axis='y')
ax.grid()

In [None]:
# Order the rows by ascending 'Population Change Fraction' so the bars rise left to right.
dfnew = dfnew.sort_values('Population Change Fraction')

In [None]:
# Bar chart of 'Population Change Fraction' per state on a fresh 16x8 figure.
ax = plt.figure(figsize=(16, 8)).gca()
dfnew.plot.bar(x='State', y='Population Change Fraction', ax=ax)
ax.set_title("2010 vs 2020 U.S. Population Change (Fraction)", fontsize=20, fontweight='bold')
ax.grid()

#### Sex (Female, Male)

##### Male

In [None]:
# Order the rows by ascending 'Male Change' so the bars rise left to right.
dfnew = dfnew.sort_values('Male Change')

In [None]:
# Bar chart of 'Male Change' per state on a fresh 16x8 figure.
ax = plt.figure(figsize=(16, 8)).gca()
dfnew.plot.bar(x='State', y='Male Change', ax=ax)
ax.set_title("2010 vs 2020 U.S. Male Population Change", fontsize=20, fontweight='bold')
# Plain y tick labels: no scientific notation and no offset.
ax.ticklabel_format(style='plain', useOffset=False, axis='y')
ax.grid()

In [None]:
# Order the rows by ascending 'Male Change Fraction' so the bars rise left to right.
dfnew = dfnew.sort_values('Male Change Fraction')

In [None]:
# Bar chart of 'Male Change Fraction' per state on a fresh 16x8 figure.
ax = plt.figure(figsize=(16, 8)).gca()
dfnew.plot.bar(x='State', y='Male Change Fraction', ax=ax)
ax.set_title("2010 vs 2020 U.S. Male Population Change (Fraction)", fontsize=20, fontweight='bold')
ax.grid()

##### Female

In [None]:
# Order the rows by ascending 'Female Change' so the bars rise left to right.
dfnew = dfnew.sort_values('Female Change')

In [None]:
# Bar chart of 'Female Change' per state on a fresh 16x8 figure.
ax = plt.figure(figsize=(16, 8)).gca()
dfnew.plot.bar(x='State', y='Female Change', ax=ax)
ax.set_title("2010 vs 2020 U.S. Female Population Change", fontsize=20, fontweight='bold')
# Plain y tick labels: no scientific notation and no offset.
ax.ticklabel_format(style='plain', useOffset=False, axis='y')
ax.grid()

In [None]:
# Order the rows by ascending 'Female Change Fraction' so the bars rise left to right.
dfnew = dfnew.sort_values('Female Change Fraction')

In [None]:
# Bar chart of 'Female Change Fraction' per state on a fresh 16x8 figure.
ax = plt.figure(figsize=(16, 8)).gca()
dfnew.plot.bar(x='State', y='Female Change Fraction', ax=ax)
ax.set_title("2010 vs 2020 U.S. Female Population Change (Fraction)", fontsize=20, fontweight='bold')
ax.grid()

##### Male/Female Same Graph

In [None]:
# Order the rows by ascending 'Male Change', breaking ties on 'Female Change'.
dfnew = dfnew.sort_values(['Male Change', 'Female Change'])

In [None]:
# Overlay both sexes on one set of axes: male bars in blue (edge-aligned,
# slightly transparent) and female bars in pink (centre-aligned, opaque).
ax = plt.figure(figsize=(16, 8)).gca()
dfnew.plot.bar(x='State', y='Male Change', ax=ax, color='blue', alpha=0.8, align='edge')
dfnew.plot.bar(x='State', y='Female Change', ax=ax, color='pink', alpha=1.0, align='center')
ax.set_title("2010 vs 2020 U.S. Male and Female Population Change", fontsize=20, fontweight='bold')
# Plain y tick labels: no scientific notation and no offset.
ax.ticklabel_format(style='plain', useOffset=False, axis='y')
ax.grid()

In [None]:
# Order the rows by ascending 'Male Change Fraction', breaking ties on 'Female Change Fraction'.
dfnew = dfnew.sort_values(['Male Change Fraction', 'Female Change Fraction'])

In [None]:
# Overlay both sexes on one set of axes: male bars in blue (edge-aligned,
# slightly transparent) and female bars in pink (centre-aligned, opaque).
ax = plt.figure(figsize=(16, 8)).gca()
dfnew.plot.bar(x='State', y='Male Change Fraction', ax=ax, color='blue', alpha=0.8, align='edge')
dfnew.plot.bar(x='State', y='Female Change Fraction', ax=ax, color='pink', alpha=1.0, align='center')
ax.set_title("2010 vs 2020 U.S. Male and Female Population Change (Fraction)", fontsize=20, fontweight='bold')
# Plain y tick labels: no scientific notation and no offset.
ax.ticklabel_format(style='plain', useOffset=False, axis='y')
ax.grid()

#### Median Income

In [None]:
# Order the rows by ascending 'Median Income Change' so the bars rise left to right.
dfnew = dfnew.sort_values('Median Income Change')

In [None]:
# Bar chart of 'Median Income Change' per state on a fresh 16x8 figure.
ax = plt.figure(figsize=(16, 8)).gca()
dfnew.plot.bar(x='State', y='Median Income Change', ax=ax)
ax.set_title("2010 vs 2020 U.S. Median Income Change", fontsize=20, fontweight='bold')
# Plain y tick labels: no scientific notation and no offset.
ax.ticklabel_format(style='plain', useOffset=False, axis='y')
ax.grid()

In [None]:
# Order the rows by ascending 'Median Income Change Fraction' so the bars rise left to right.
dfnew = dfnew.sort_values('Median Income Change Fraction')

In [None]:
# Bar chart of 'Median Income Change Fraction' per state on a fresh 16x8 figure.
ax = plt.figure(figsize=(16, 8)).gca()
dfnew.plot.bar(x='State', y='Median Income Change Fraction', ax=ax)
ax.set_title("2010 vs 2020 U.S. Median Income Change (Fraction)", fontsize=20, fontweight='bold')
# Red horizontal reference line at 0.19.
# NOTE(review): presumably a benchmark such as cumulative inflation over the
# period - confirm the value and document its source.
ax.axhline(y=0.19, color='r', linewidth=3)
ax.grid()

#### Mean Income

In [None]:
# Order the rows by ascending 'Mean Income Change' so the bars rise left to right.
dfnew = dfnew.sort_values('Mean Income Change')

In [None]:
# Bar chart of 'Mean Income Change' per state on a fresh 16x8 figure.
ax = plt.figure(figsize=(16, 8)).gca()
dfnew.plot.bar(x='State', y='Mean Income Change', ax=ax)
ax.set_title("2010 vs 2020 U.S. Mean Income Change", fontsize=20, fontweight='bold')
# Plain y tick labels: no scientific notation and no offset.
ax.ticklabel_format(style='plain', useOffset=False, axis='y')
ax.grid()

In [None]:
# Order the rows by ascending 'Mean Income Change Fraction' so the bars rise left to right.
dfnew = dfnew.sort_values('Mean Income Change Fraction')

In [None]:
# Bar chart of 'Mean Income Change Fraction' per state on a fresh 16x8 figure.
ax = plt.figure(figsize=(16, 8)).gca()
dfnew.plot.bar(x='State', y='Mean Income Change Fraction', ax=ax)
ax.set_title("2010 vs 2020 U.S. Mean Income Change (Fraction)", fontsize=20, fontweight='bold')
# Red horizontal reference line at 0.19.
# NOTE(review): presumably a benchmark such as cumulative inflation over the
# period - confirm the value and document its source.
ax.axhline(y=0.19, color='r', linewidth=3)
ax.grid()

#### Racial (Black or African American, White)

##### Black or African American

In [None]:
# Order the rows by ascending 'Black or African American Change' so the bars rise left to right.
dfnew = dfnew.sort_values('Black or African American Change')

In [None]:
# Bar chart of 'Black or African American Change' per state on a fresh 16x8 figure.
ax = plt.figure(figsize=(16, 8)).gca()
dfnew.plot.bar(x='State', y='Black or African American Change', ax=ax)
ax.set_title("2010 vs 2020 U.S. Black or African American Population Change", fontsize=20, fontweight='bold')
# Plain y tick labels: no scientific notation and no offset.
ax.ticklabel_format(style='plain', useOffset=False, axis='y')
ax.grid()

In [None]:
# Order the rows by ascending 'Black or African American Change Fraction' so the bars rise left to right.
dfnew = dfnew.sort_values('Black or African American Change Fraction')

In [None]:
# Bar chart of 'Black or African American Change Fraction' per state on a fresh 16x8 figure.
ax = plt.figure(figsize=(16, 8)).gca()
dfnew.plot.bar(x='State', y='Black or African American Change Fraction', ax=ax)
ax.set_title("2010 vs 2020 U.S. Black or African American Population Change (Fraction)", fontsize=20, fontweight='bold')
ax.grid()

##### White

In [None]:
# Order the rows by ascending 'White Change' so the bars rise left to right.
dfnew = dfnew.sort_values('White Change')

In [None]:
# Bar chart of 'White Change' per state on a fresh 16x8 figure.
ax = plt.figure(figsize=(16, 8)).gca()
dfnew.plot.bar(x='State', y='White Change', ax=ax)
ax.set_title("2010 vs 2020 U.S. White Population Change", fontsize=20, fontweight='bold')
# Plain y tick labels: no scientific notation and no offset.
ax.ticklabel_format(style='plain', useOffset=False, axis='y')
ax.grid()

In [None]:
# Order the rows by ascending 'White Change Fraction' so the bars rise left to right.
dfnew = dfnew.sort_values('White Change Fraction')

In [None]:
# Bar chart of 'White Change Fraction' per state on a fresh 16x8 figure.
ax = plt.figure(figsize=(16, 8)).gca()
dfnew.plot.bar(x='State', y='White Change Fraction', ax=ax)
ax.set_title("2010 vs 2020 U.S. White Population Change (Fraction)", fontsize=20, fontweight='bold')
ax.grid()

## State Clustering

In [None]:
# Pairwise overview of the numeric columns of dfnew: a histogram of each
# column on the diagonal and a scatter plot for every pair off the diagonal.
# No `hue` variable is passed to the grid, so there are no groups for a
# legend to describe and none is added.
g = sns.PairGrid(dfnew)
g.map_diag(sns.histplot)
g.map_offdiag(sns.scatterplot)

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Columns of dfnew used as clustering features; their order here fixes the
# column order of X (column 0 is 'Population Change', column 1 is
# 'Population Change Fraction', and so on).
cluster_features = [
    'Population Change',
    'Population Change Fraction',
    'Male Change',
    'Male Change Fraction',
    'Female Change',
    'Female Change Fraction',
    'Median Income Change',
    'Median Income Change Fraction',
    'Mean Income Change',
    'Mean Income Change Fraction',
    'White Change',
    'White Change Fraction',
    'Black or African American Change',
    'Black or African American Change Fraction',
]

# NOTE(review): the features are passed on unscaled although MinMaxScaler is
# imported; raw counts and fractions differ by orders of magnitude - confirm
# whether scaling was intended before clustering.
X = dfnew.loc[:, cluster_features].to_numpy()

In [None]:
# Three-cluster k-means: randomly chosen initial centroids, 10 different
# centroid seeds, and random_state pinned to 0 so the run is deterministic.
model = KMeans(
    n_clusters=3,
    init='random',
    n_init=10,
    random_state=0,
)

# Fit on X and keep the cluster index assigned to every row.
labels_km = model.fit(X).labels_

In [None]:
# Display the cluster index that the 3-cluster model assigned to each row of X.
labels_km

In [None]:
# Print the fitted centroid coordinates of the 3-cluster model.
print(model.cluster_centers_)

In [None]:
plt.figure(figsize=(10,8))

# Plot the 3 clusters, masking on the cluster label and showing the first two
# features of X (column 0 'Population Change' versus column 1
# 'Population Change Fraction').
# Each entry: (cluster index, colour, marker, legend label)
cluster_styles = [
    (0, 'blue', 's', 'cluster 1'),
    (1, 'orange', 'o', 'cluster 2'),
    (2, 'green', 'v', 'cluster 3'),
]
for cluster_id, color, marker, label in cluster_styles:
    in_cluster = labels_km == cluster_id
    plt.scatter(
        X[in_cluster, 0], X[in_cluster, 1],
        s=100, c=color,
        marker=marker, edgecolor='black',
        label=label
    )

# Plot the centroids
plt.scatter(
    model.cluster_centers_[:, 0], model.cluster_centers_[:, 1],
    s=750, marker='*',
    c='red', edgecolor='black',
    label='centroids'
)
# Label the axes with the features that are actually plotted.
plt.xlabel('Population Change', fontsize=18)
plt.ylabel('Population Change Fraction', fontsize=18)
plt.legend(scatterpoints=1)
plt.grid()

In [None]:
# Create the model with some number of clusters
# Try 2 clusters
model2 = KMeans(n_clusters=2, init='random', n_init=10, random_state=0)

# Run the clustering algorithm!
labels_km2 = model2.fit_predict(X)

plt.figure(figsize=(10,8))

# Plot the clusters, masking on the cluster label and showing the first two
# features of X (column 0 'Population Change' versus column 1
# 'Population Change Fraction').
for cluster_id in range(model2.n_clusters):
    in_cluster = labels_km2 == cluster_id
    plt.scatter(
        X[in_cluster, 0], X[in_cluster, 1],
        s=100,
        marker='s', edgecolor='black',
        label=f'cluster {cluster_id + 1}'
    )

# Plot the centroids
plt.scatter(
    model2.cluster_centers_[:, 0], model2.cluster_centers_[:, 1],
    s=750, marker='*',
    c='red', edgecolor='black',
    label='centroids'
)
# Label the axes with the features that are actually plotted.
plt.xlabel('Population Change', fontsize=18)
plt.ylabel('Population Change Fraction', fontsize=18)
plt.legend(scatterpoints=1)
plt.grid()

# Tabulate which cluster each state was assigned to.
pd.crosstab(dfnew['State'], labels_km2)

In [None]:
# Create the model with some number of clusters
# Try 6 clusters
model3 = KMeans(n_clusters=6, init='random', n_init=10, random_state=0)

# Run the clustering algorithm!
labels_km3 = model3.fit_predict(X)

plt.figure(figsize=(10,8))

# Plot the clusters, masking on the cluster label and showing the first two
# features of X (column 0 'Population Change' versus column 1
# 'Population Change Fraction').
for cluster_id in range(model3.n_clusters):
    in_cluster = labels_km3 == cluster_id
    plt.scatter(
        X[in_cluster, 0], X[in_cluster, 1],
        s=100,
        marker='s', edgecolor='black',
        label=f'cluster {cluster_id + 1}'
    )

# Plot the centroids
plt.scatter(
    model3.cluster_centers_[:, 0], model3.cluster_centers_[:, 1],
    s=750, marker='*',
    c='red', edgecolor='black',
    label='centroids'
)
# Label the axes with the features that are actually plotted.
plt.xlabel('Population Change', fontsize=18)
plt.ylabel('Population Change Fraction', fontsize=18)
plt.legend(scatterpoints=1)
plt.grid()

# Tabulate which cluster each state was assigned to.
pd.crosstab(dfnew['State'], labels_km3)

In [None]:
# Print the inertia of the 3-, 2- and 6-cluster models, in that order.
for fitted_model in (model, model2, model3):
    print(fitted_model.inertia_)

In [None]:
# Fit k-means for every cluster count from 1 to 10 (same settings as the
# models above) and record the inertia of each fit.
nclusters = list(range(1, 11))
inertias = [
    KMeans(n_clusters=k, init='random', n_init=10, random_state=0).fit(X).inertia_
    for k in nclusters
]

# Plot inertia against the number of clusters.
plt.figure(figsize=(8, 6))
plt.plot(nclusters, inertias)
plt.xlabel('# of clusters', fontsize=18)
plt.ylabel('inertia', fontsize=18)

## HERE

TODO: Do the next set of work here.