In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans

In [None]:
### Demographic Clustering ###

In [None]:
"""
In this section, I will perform a cluster analysis using the fields listed below:

age
gender
engagement
account.age

"""

In [None]:
### Reading in the data ### 

In [3]:
# Load the tab-separated survey export used for the demographic clustering.
# NOTE(review): the path is absolute and machine-specific; the notebook only
# runs where this exact location exists.
cluster_df = pd.read_csv(
    r"C:\Users\Epicm\Desktop\data\k_means\washington_survey_data.txt",
    sep="\t",
)

In [4]:
# Replace the space in the two multi-word engagement labels with an underscore.
# The substitution is frame-wide (any cell holding exactly one of these strings
# is rewritten); the later engagement encoding keys on the underscore forms.
cluster_df = cluster_df.replace(
    {'Highly Engaged': 'Highly_Engaged', 'Not Engaged': 'Not_Engaged'}
)

In [None]:
### Convert the data to numeric ###

In [5]:
# Encode gender as an integer code; any label absent from the mapping becomes NaN.
cluster_df['gender'] = cluster_df['gender'].map({'other': 0, 'male': 1, 'female': 2})

In [6]:
# Encode engagement on a 0-2 scale; the keys are the underscore label forms,
# and any label absent from the mapping becomes NaN.
cluster_df['engagement'] = cluster_df['engagement'].map(
    {'Not_Engaged': 0, 'Engaged': 1, 'Highly_Engaged': 2}
)

In [None]:
### Normalize the Data ### 

In [7]:
# Columns fed to the demographic clustering run.
required_cols = [
    'age',
    'gender',
    'engagement',
    'account.age',
]

In [8]:
# Keep every row but only the clustering columns.
numeric_df = cluster_df[required_cols]

In [9]:
# Min-max scale each clustering column to the [0, 1] interval.
demo_min = numeric_df.min()
demo_range = numeric_df.max() - demo_min
# A column holding a single distinct value has zero range; dividing by it
# would produce NaN (0/0). Substituting 1 makes such a column scale to all
# zeros, while every other column is scaled exactly as before.
demo_range = demo_range.where(demo_range != 0, 1)
numeric_df_normalized = (numeric_df - demo_min) / demo_range

In [None]:
### Building our K Means Model ###

In [10]:
# Three-cluster KMeans estimator for the demographic fields.
km_1 = KMeans(
    n_clusters=3,
    init='random',    # random initial centroids
    n_init=10,        # number of initialisations to try
    max_iter=300,     # iteration cap per initialisation
    tol=1e-4,         # convergence tolerance
    random_state=0,   # fixed seed so reruns give the same labels
)

In [11]:
# Fit the model on the scaled demographic features and keep one label per row.
clustering_1 = km_1.fit_predict(X=numeric_df_normalized)

In [12]:
# Attach the labels to the survey frame, one per row in the frame's row order.
cluster_df['cluster'] = pd.Series(clustering_1, index=cluster_df.index)

In [None]:
### Results ###

In [13]:
# Print the member rows of each demographic cluster.
# Labels are visited in ascending order: a bare set has no guaranteed
# iteration order, so sorting keeps the report layout reproducible.
for cluster in sorted(set(clustering_1)):
    print(f'Printing Results for Cluster {cluster}.\n\n')

    # Boolean mask selects the rows assigned to the current label.
    print(cluster_df[cluster_df['cluster'] == cluster])

    print("\n\n")

Printing Results for Cluster 0.


         id  age  gender  engagement  mem.edu      zip channel  progressivism  \
1       348   66       2           2        3  98012.0  Branch       2.291667   
4       358   50       1           2        4  98233.0  Branch      -0.500000   
8       369   61       1           2        3  98133.0  Branch       0.500000   
9       371   71       2           2        6      NaN  Branch       3.166667   
10      372   73       2           2        6  98506.0  Branch       1.958333   
...     ...  ...     ...         ...      ...      ...     ...            ...   
2409  21255   45       2           2        4  98503.0  Branch       1.208333   
2410  21767   25       2           2        5      NaN  Branch       0.375000   
2411  21768   55       2           2        4  98579.0  Branch       1.916667   
2412  21872   64       2           2        4  99205.0  Branch       1.458333   
2415  22002   71       1           2        5  99224.0  Branch       0.1250

In [None]:
### Values Clustering ###

In [None]:
"""

Similarly to the previous section, perform a cluster analysis, this time on the values questions:

fair
harm
in.group
authority
purity
public.sector
sustainability
localism
After you’ve built your clusters, report the following information on each cluster:

Predominant region
Average age and account age
Most common focal value
Mean results on the questions of pub.greater.priv, experience.more.important, and teachers.underpaid.


"""

In [None]:
### Read in the data ### 

In [28]:
# Reload the raw survey export for the values clustering and rename
# 'public.sector' to 'public_sector' in the same expression.
# NOTE(review): the path is absolute and machine-specific.
cluster_df_2 = pd.read_csv(
    r"C:\Users\Epicm\Desktop\data\k_means\washington_survey_data.txt",
    sep="\t",
).rename(columns={'public.sector': 'public_sector'})

In [None]:
### Convert Data to Numeric ### 

In [29]:
# Encode the public-sector answer as 0/1; any other value becomes NaN.
cluster_df_2['public_sector'] = cluster_df_2['public_sector'].map({'no': 0, 'yes': 1})

In [None]:
### Normalize our data ###

In [30]:
# Columns fed to the values clustering run.
required_cols_2 = [
    'fair',
    'harm',
    'in.group',
    'authority',
    'purity',
    'public_sector',
    'sustainability',
    'localism',
]


In [31]:
# Keep every row but only the values-question columns.
numeric_df_2 = cluster_df_2[required_cols_2]

In [32]:
# Min-max scale each values column to the [0, 1] interval.
values_min = numeric_df_2.min()
values_range = numeric_df_2.max() - values_min
# A column holding a single distinct value has zero range; dividing by it
# would produce NaN (0/0). Substituting 1 makes such a column scale to all
# zeros, while every other column is scaled exactly as before.
values_range = values_range.where(values_range != 0, 1)
numeric_df_2_norm = (numeric_df_2 - values_min) / values_range

In [None]:
### Building K Means Model ###

In [33]:
# Three-cluster KMeans estimator for the values questions.
km_2 = KMeans(
    n_clusters=3,
    init='random',    # random initial centroids
    n_init=10,        # number of initialisations to try
    max_iter=300,     # iteration cap per initialisation
    tol=1e-4,         # convergence tolerance
    random_state=0,   # fixed seed so reruns give the same labels
)

In [34]:
# Fit the model on the scaled values features and keep one label per row.
clustering_2 = km_2.fit_predict(X=numeric_df_2_norm)

In [35]:
# Attach the labels to the survey frame, one per row in the frame's row order.
cluster_df_2['cluster'] = pd.Series(clustering_2, index=cluster_df_2.index)

In [None]:
### Results ###

In [36]:
# Print the member rows of each values cluster.
# Labels are visited in ascending order: a bare set has no guaranteed
# iteration order, so sorting keeps the report layout reproducible.
for cluster in sorted(set(clustering_2)):
    print(f'Printing Results for Cluster {cluster}.\n\n')

    # Boolean mask selects the rows assigned to the current label.
    print(cluster_df_2[cluster_df_2['cluster'] == cluster])

    print("\n\n")

Printing Results for Cluster 0.


         id  age  gender      engagement  mem.edu      zip channel  \
3       352   66    male         Engaged        7      NaN  Branch   
13      375   79  female         Engaged        2  98532.0  Branch   
14      399   60    male     Not Engaged        3  98001.0    Loan   
17      425   31    male  Highly Engaged        3  98502.0  Branch   
18      426   29  female         Engaged        3      NaN   Other   
...     ...  ...     ...             ...      ...      ...     ...   
2404  20945   61  female     Not Engaged        4  98028.0  Branch   
2408  21209   65  female  Highly Engaged        4  98662.0  Branch   
2414  21962   49    male     Not Engaged        3  98596.0  Branch   
2417  22163   93  female         Engaged        5  98103.0  Branch   
2420  23329   27  female     Not Engaged        4  98502.0  Branch   

      progressivism  harm  fair  ...          region  public_sector  \
3          0.583333  4.25  4.75  ...      W WA Metro  

In [None]:
### Predominant Region Per Cluster ### 

In [38]:
# Rows assigned to cluster 0, then their most frequent region.
# mode() returns a Series, so tied regions would all be listed.
cluster_0 = cluster_df_2.loc[cluster_df_2['cluster'] == 0]
result_0 = cluster_0.loc[:, 'region'].mode()
print(result_0)

0    W WA Metro
Name: region, dtype: object


In [39]:
# Rows assigned to cluster 1, then their most frequent region.
# mode() returns a Series, so tied regions would all be listed.
cluster_1 = cluster_df_2.loc[cluster_df_2['cluster'] == 1]
result_1 = cluster_1.loc[:, 'region'].mode()
print(result_1)

0    W WA Metro
Name: region, dtype: object


In [40]:
# Rows assigned to cluster 2, then their most frequent region.
# mode() returns a Series, so tied regions would all be listed.
cluster_2 = cluster_df_2.loc[cluster_df_2['cluster'] == 2]
result_2 = cluster_2.loc[:, 'region'].mode()
print(result_2)

0      Thurston
1    W WA Metro
Name: region, dtype: object


In [None]:
### Average Age Per Cluster ###

In [41]:
# Mean respondent age in cluster 0; the bare name on the last line displays it.
# result_0 is rebound here (it previously held the region mode).
# NOTE(review): the section brief also asks for average account age, which is
# not computed in this cell - confirm whether it should be added.
result_0 = cluster_0.loc[:, 'age'].mean()
result_0

45.41470588235294

In [42]:
# Mean respondent age in cluster 1; the bare name on the last line displays it.
# result_1 is rebound here (it previously held the region mode).
result_1 = cluster_1.loc[:, 'age'].mean()
result_1

52.61685214626391

In [43]:
# Mean respondent age in cluster 2; the bare name on the last line displays it.
# result_2 is rebound here (it previously held the region mode).
result_2 = cluster_2.loc[:, 'age'].mean()
result_2

55.9779792746114

In [None]:
### Most Common Focal Value Per Cluster ### 

In [44]:
# Most frequent 'main.focal.value' answer in cluster 0 (ties would all be listed).
result_0 = cluster_0.loc[:, 'main.focal.value'].mode()
result_0

0    Education
Name: main.focal.value, dtype: object

In [45]:
# Most frequent 'main.focal.value' answer in cluster 1 (ties would all be listed).
result_1 = cluster_1.loc[:, 'main.focal.value'].mode()
result_1

0    Environment
Name: main.focal.value, dtype: object

In [46]:
# Most frequent 'main.focal.value' answer in cluster 2 (ties would all be listed).
result_2 = cluster_2.loc[:, 'main.focal.value'].mode()
result_2

0    Health (i.e. cancer research)
Name: main.focal.value, dtype: object