## K Means and General Cluster Analysis to Discover Subgroups in Discussion about Mental Health 

## Necessary Import Statements 

In [None]:
import indicoio, json, re
from urlextract import URLExtract
import numpy as np
import matplotlib.pyplot as plt
import datetime
from empath import Empath
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from gensim.models.coherencemodel import CoherenceModel
import plotly.plotly.plotly
from plotly.offline import init_notebook_mode
import plotly.graph_objs as go
# Shared VADER analyser; its polarity_scores() is called when the comments are scored below.
analyser = SentimentIntensityAnalyzer()
# Empath lexicon instance (not referenced by the cells visible in this notebook).
lexicon = Empath()
# Fetch the NLTK 'stopwords' resource.
nltk.download('stopwords')
import plotly.offline as plotoff 
# Switch plotly to offline notebook mode; figures are rendered later via plotoff.iplot.
plotoff.init_notebook_mode()
# IPython autoreload extension in mode 2, so edited modules are reloaded before each cell runs.
%load_ext autoreload
%autoreload 2





## Data Cleaning and URL extraction 

In [2]:
# Load the comment dump. The code below reads a top-level 'comments' mapping
# of comment id -> record, where each record has a 'body' string and,
# optionally, a 'timestamp'.
with open('final_indico.json', 'r') as myfile:
    data2 = json.load(myfile)

extractor = URLExtract()

timestamp = []
# NOTE: `sentiment` and `political` are kept for compatibility but stay empty;
# the indicoio API calls that used to fill them are disabled.
sentiment = []
political = []

num = 0
for x in data2['comments']:
    num += 1
    comment = data2['comments'][x]

    # TEXT CLEANING
    input_str = comment['body'].lower()            # all letters become lowercase

    # Remove every URL in the comment (previously only the first match was
    # dropped). Longest first, so a URL that is a prefix of another one does
    # not leave a dangling fragment behind.
    for url in sorted(set(extractor.find_urls(input_str)), key=len, reverse=True):
        input_str = input_str.replace(url, '')

    input_str = re.sub(r'\d+', '', input_str)      # removes numbers
    input_str = re.sub(r'[^\w\s]', '', input_str)  # removes punctuation
    # Strip last: removing punctuation can expose leading/trailing whitespace,
    # and a comment reduced to whitespace must count as empty below.
    input_str = input_str.strip()

    # STORING THE TIMESTAMP OF EVERY NON-EMPTY COMMENT
    if 'timestamp' in comment and input_str:
        timestamp.append(comment['timestamp'])

    # The cleaned text replaces the raw body for all later cells.
    comment['body'] = input_str
    


## Subsetting the Comments based on the LSA Model on Mental Health 

In [3]:
# SUBSETTING COMMENTS
# A comment joins the subset when its cleaned body contains at least one of
# these substrings (plain substring match, not whole-word match).
_SUBSET_KEYWORDS = (
    "mental", "health", "shoot", "amend", "nra", "problem", "illness",
    "issue", "firearm", "mass", "shooting", "countries", "law",
)

subsetList = {
    comment_id
    for comment_id, record in data2["comments"].items()
    if any(keyword in record["body"] for keyword in _SUBSET_KEYWORDS)
}
print(len(subsetList))

15561


## Calculate the Sentiment and Political Affiliation 

In [4]:
# Parallel lists, one entry per successfully scored comment:
#   *ListPolAff    - the winning affiliation score of the comment
#   *Sentiment     - VADER compound score of the cleaned body
#   *comments      - the cleaned body itself
# "R" holds comments whose Conservative score is strictly greater than the
# Liberal score; everything else (ties included) goes to "D".
DListPolAff = list()
DListSentiment = list()
RListPolAff = list()
RListPolSentiment = list()
Rcomments = list()
Dcomments = list()
DCount = 0
RCount = 0
ErrorCount = 0
for x in subsetList:
    # Resolve every value that can fail BEFORE touching the result lists, so
    # a failure can never leave the parallel lists with different lengths.
    try:
        comment = data2["comments"][x]
        conservative = comment["pol_aff"]["Conservative"]
        liberal = comment["pol_aff"]["Liberal"]
        body = comment["body"]
        is_conservative = conservative > liberal
        compound = analyser.polarity_scores(body)["compound"]
    except (KeyError, TypeError):
        # Record without usable affiliation data (missing key or non-dict /
        # non-comparable value): count it and move on. Anything else is a
        # real bug and is allowed to surface instead of being swallowed.
        ErrorCount = ErrorCount + 1
        continue

    if is_conservative:
        RListPolAff.append(conservative)
        RListPolSentiment.append(compound)
        Rcomments.append(body)
        RCount = RCount + 1
    else:
        DListPolAff.append(liberal)
        DListSentiment.append(compound)
        Dcomments.append(body)
        DCount = DCount + 1

print(DCount)
print(RCount)
print(ErrorCount)

5908
9616
37


## Plot a subset of the Political Affiliation to Understand how Republicans and Democrats are Distributed Spatially 

In [31]:
# Two stacked scatter panels of the per-comment affiliation scores: the
# Republican group on top, the Democrat group below. Only x is supplied for
# each trace. The first 500 scores of each group are plotted (the previous
# slice [1:500] silently skipped element 0).
Republican = go.Scatter(
    x = RListPolAff[:500],
    mode = 'markers',
    name = 'Republican',
)

Democrat = go.Scatter(
    x = DListPolAff[:500],
    mode = 'markers',
    name = 'Democrat',
)

fig = plotly.tools.make_subplots(rows=2, cols=1)

fig.append_trace(Republican, 1, 1)
fig.append_trace(Democrat, 2, 1)

axis_title_font = dict(
    family='Courier New, monospace',
    size=18,
    color='#7f7f7f'
)

# Update the layout that make_subplots created instead of passing a fresh
# Layout next to the figure: a new Layout discards the 2-row grid, and a
# Figure is not a valid value for "data".
fig['layout'].update(title="Republican Users Sentiment versus Democrat Users Sentiment")
# Each panel plots its group's scores on x, so each title goes on the x axis
# of its own panel (row 1 -> xaxis1, row 2 -> xaxis2).
fig['layout']['xaxis1'].update(
    title='Republican Political Sentiment of the Comment',
    titlefont=axis_title_font,
)
fig['layout']['xaxis2'].update(
    title='Democrat Political Sentiment of the Comment',
    titlefont=axis_title_font,
)

plotoff.iplot(fig)

This is the format of your plot grid:
[ (1,1) x1,y1 ]
[ (2,1) x2,y2 ]



'temp-plot.html'

## Calculate cluster coherence in order to show significance of certain subgroups

In [6]:
# Adapted from https://plot.ly/scikit-learn/plot-kmeans-silhouette-analysis/

import matplotlib.pyplot as plt
import matplotlib.cm as cm
# KMeans and PCA are used below; import them here so this cell does not rely
# on a later cell having been executed first.
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_samples, silhouette_score

# NOTE(review): `df` is built in the K-means cell further down in this
# notebook, so on a fresh kernel that cell has to run before this one.
# Consider moving the DataFrame construction above this cell.

n_cluster = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

silhouette_avg = []

# The 2-D projection does not depend on the number of clusters: compute it once.
reduced_data = PCA(n_components=2).fit_transform(df)

for n_clusters in n_cluster:
    # Initialize the clusterer with n_clusters value and a random generator
    # seed of 10 for reproducibility.
    kmeans = KMeans(init='k-means++', n_clusters=n_clusters, n_init=10,
                    random_state=10)
    # fit_predict fits the model and returns the labels in a single pass.
    cluster_labels = kmeans.fit_predict(reduced_data)

    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed
    # clusters
    silhouette_avg.append(silhouette_score(reduced_data, cluster_labels))
    print(n_clusters)  # progress indicator


figure = go.Scatter(
    x = n_cluster,
    y = silhouette_avg,
    mode = 'lines+markers')

axis_title_font = dict(
    family='Courier New, monospace',
    size=18,
    color='#7f7f7f'
)

plotoff.iplot({
    "data": [figure],
    "layout": go.Layout(
        title="Number of Clusters vs. Silhouette Avg",
        xaxis=dict(title='Number of Clusters', titlefont=axis_title_font),
        yaxis=dict(title='Silhouette Score', titlefont=axis_title_font),
    )
})

0.3116864065995788


### K-means Clustering using PCA-Reduced Data in order to show Sub-Groups that arise within the Discussion, Maximizing the Silhouette

In [20]:
# K-means clustering, adapted from tutorial from https://plot.ly/scikit-learn/plot-kmeans-digits/
from sklearn.cluster import KMeans
from pandas import DataFrame
from sklearn import metrics
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
import matplotlib.pyplot as plt
import gc

# Drop the references to the large raw structures so they can be garbage
# collected. NOTE: this makes the earlier cells that read `data2` /
# `subsetList` non-rerunnable until the loading cell is executed again.
subsetList = 0

data2 = 0

np.random.seed(seed=1000)

sample_size = 1500

# Unused below, but kept because it advances the seeded RNG; removing it
# would change which comments get sampled.
choiceRand = np.random.rand(sample_size)

# Positions of comments whose VADER compound score lies outside the
# [-0.05, 0.05] band, i.e. comments that are not scored as neutral.
subset1 = [a for a, score in enumerate(RListPolSentiment) if abs(score) > 0.05]

subset2 = [b for b, score in enumerate(DListSentiment) if abs(score) > 0.05]

# np.random.choice samples WITH replacement by default, so the same comment
# can be drawn more than once.
randIndicesR = np.random.choice(subset1, sample_size)
RListPolSentimentRandomized = [RListPolSentiment[i] for i in randIndicesR]
RCommentsRandomized = [Rcomments[i] for i in randIndicesR]

randIndicesD = np.random.choice(subset2, sample_size)
DListPolSentimentRandomized = [DListSentiment[i] for i in randIndicesD]
# Fixed: this list used to be filled from DListSentiment (scores) instead of
# the comment texts.
DCommentsRandomized = [Dcomments[i] for i in randIndicesD]

# Fixed: the affiliation columns were re-sampled with a second
# np.random.choice, which detached them from the sentiment of the same
# comment. Each row now carries the sentiment AND affiliation of the same
# sampled Republican comment and the same sampled Democrat comment.
Data = {"RSent" : RListPolSentimentRandomized,
        "DSent": DListPolSentimentRandomized,
        "RPolAff": [RListPolAff[i] for i in randIndicesR],
        "DPolAff": [DListPolAff[i] for i in randIndicesD]}

df = DataFrame(Data, columns=['RSent','DSent', "RPolAff", "DPolAff"])

def matplotlib_to_plotly(cmap, pl_entries):
    """Sample a matplotlib-style colormap into a plotly colorscale.

    Parameters
    ----------
    cmap : callable
        Maps a float in [0, 1] to a sequence whose first three items are the
        red, green and blue channels as floats in [0, 1] (any further items,
        e.g. alpha, are ignored).
    pl_entries : int
        Number of evenly spaced samples to take; must be at least 2 so the
        scale spans both ends of [0, 1].

    Returns
    -------
    list
        ``[[position, 'rgb(r, g, b)'], ...]`` with positions running from
        exactly 0.0 to exactly 1.0 and integer channel values 0-255.

    Raises
    ------
    ValueError
        If ``pl_entries`` is smaller than 2.
    """
    if pl_entries < 2:
        raise ValueError("pl_entries must be at least 2")

    last = pl_entries - 1
    pl_colorscale = []

    for k in range(pl_entries):
        # k / last (rather than k * (1 / last)) guarantees the final
        # position is exactly 1.0.
        position = k / last
        # Plain ints keep the string form 'rgb(r, g, b)' regardless of how
        # the installed NumPy version prints its scalar types.
        red, green, blue = (int(channel * 255) for channel in cmap(position)[:3])
        pl_colorscale.append([position, 'rgb({}, {}, {})'.format(red, green, blue)])

    return pl_colorscale


# Project the four-column frame onto two principal components and cluster
# the projected points into four groups.
reduced_data = PCA(n_components=2).fit_transform(df)
kmeans = KMeans(init='k-means++', n_clusters=4,  n_init=10)
kmeans.fit(reduced_data)

# Step size of the mesh. Decrease to increase the quality of the VQ.
h = .02
gc.collect()
# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Obtain labels for each point in mesh. Use last trained model.
Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)

# Z has one row per y value and one column per x value, so the heatmap needs
# the full x coordinates (first row of xx) and the y coordinates (first
# column of yy). Previously both axes were fed x values truncated to the
# number of rows, which misplaced the background relative to the points.
back = go.Heatmap(x=xx[0],
                  y=yy[:, 0],
                  z=Z,
                  showscale=False,
                  colorscale=matplotlib_to_plotly(plt.cm.Paired, len(Z)))

markers = go.Scatter(x=reduced_data[:, 0], 
                     y=reduced_data[:, 1],
                     showlegend=False,
                     mode='markers', 
                     marker=dict(
                             size=3, color='black'))

# Plot the centroids as white markers
centroids = kmeans.cluster_centers_
center = go.Scatter(x=centroids[:, 0],
                    y=centroids[:, 1],
                    showlegend=False,
                    mode='markers', 
                    marker=dict(
                            size=10, color='white'))
data=[back, markers, center]

layout = go.Layout(title ='K-means clustering with Sentiment (PCA-reduced data)<br>'
                           'On Subsetted dataset relating to Mental Health',
                   xaxis=dict(ticks='', showticklabels=False,
                              zeroline=False, title='2-D Sentiment (-1 to 1)'),
                   yaxis=dict(ticks='', showticklabels=False,
                              zeroline=False))
fig = go.Figure(data=data, layout=layout)

# iplot renders inline and has no `auto_open` parameter (that belongs to
# plotoff.plot, which writes an HTML file); passing it raises a TypeError.
plotoff.iplot(fig)


'temp-plot.html'

### Get all of the Comments with their class Labels, to show Class Labeling 

In [51]:
print(np.unique(kmeans.labels_))

lab = kmeans.predict(reduced_data)


def _comments_by_cluster(indices, sentiments, affiliations, comments, n_groups=4):
    """Group the sampled comments by the cluster `kmeans` predicts for them.

    For every position in `indices`, the pair (sentiments[i], affiliations[i])
    is passed to `kmeans.predict` and `comments[i]` is appended to the list of
    the predicted label. Labels >= n_groups are ignored.

    Returns a list of `n_groups` lists of comments.
    """
    groups = [[] for _ in range(n_groups)]
    indices = list(indices)
    if not indices:
        return groups

    # One batched predict call instead of one call per comment.
    features = np.column_stack((
        [sentiments[i] for i in indices],
        [affiliations[i] for i in indices],
    ))
    for i, label in zip(indices, kmeans.predict(features)):
        if label < n_groups:
            # Fixed: store the comment at the SAMPLED index i. The previous
            # code used the loop counter, so the stored text did not belong
            # to the sentiment/affiliation pair that was classified.
            groups[int(label)].append(comments[i])
    return groups


# NOTE(review): `kmeans` was fitted on `reduced_data`, the 2-D PCA projection
# of the four-column frame, while the vectors classified here are raw
# (sentiment, affiliation) pairs. The labels therefore may not correspond to
# the clusters shown in the plot - confirm the intended feature space.

# Republican Comments

print("Repub")

cm1R, cm2R, cm3R, cm4R = _comments_by_cluster(
    randIndicesR, RListPolSentiment, RListPolAff, Rcomments)

# Democrat Comments

print("Democrat")

cm1D, cm2D, cm3D, cm4D = _comments_by_cluster(
    randIndicesD, DListSentiment, DListPolAff, Dcomments)

[0 1 2 3]
Repub
Democrat


### Picking out some of the comments based on their class labeling reveals indicators of Bias and Sentiment as seen on the K-Means Plot

In [None]:
# Show one hand-picked comment per cluster for each group. Each entry is
# (cluster list, position inside that list); the positions are hard-coded,
# so a cluster with fewer comments raises IndexError.
print("Comments Republican")
republican_picks = (
    (cm1R, 1),
    (cm2R, 4),
    (cm3R, 11),  # the original author also noted 5, 6, 10 as alternatives
    (cm4R, 8),
)
for cluster_comments, position in republican_picks:
    print(cluster_comments[position])
    print()

print("Comments Democrat")
democrat_picks = (
    (cm1D, 1),
    (cm2D, 1),
    (cm3D, 3),
    (cm4D, 4),
)
for order, (cluster_comments, position) in enumerate(democrat_picks):
    if order:
        # blank separator between comments, none after the last one
        print()
    print(cluster_comments[position])
