# Step 1: Data Importing

In [5]:
# Importing modules
import pandas as pd
import numpy as np
import os

cwd = os.getcwd()
# Read data into papers
data = pd.read_csv(cwd+"/TMPPoliticalSurveyFULL_ForDavid.csv")

# Print head
print(data.columns)

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xd9 in position 15: invalid continuation byte

In [30]:
data.shape

(2392, 30)

# Step 2: Data Cleaning 
Since the goal of this analysis is to perform topic modeling, let's focus only on the text data from each paper, and drop other metadata columns. Also, for the demonstration, we'll only look at 100 papers

In [2]:
# Insert an id index for future work
data["Survey ID"] = data.index + 1

# Remove the columns
str_data = data[["Survey ID", "Can you please explain your answer to the previous question -- During the pandemic, some states have begun to release people early from prison. Which group do you think has the best chance of success on the outside?",\
            "Can you please explain your answer choice from the previous question -- What effect, if any, has the 1994 crime bill (Violent Crime Control and Law Enforcement Act) had on your incarceration?",\
            "Can you please explain your answer choice from the previous question -- What effect, if any, has the First Step Act had on your incarceration?"]]

# Print out the first rows of papers
str_data.columns = ["Survey ID", "Group_release", "1994Bill_impact", "FSA_impact"]
str_data = str_data.dropna()

In [3]:
# Load the regular expression library and the nltk word library
import re
import nltk

# Create a function to remove nonsense words
words = set(nltk.corpus.words.words())
def clean_sent(sent):
    return " ".join(w for w in nltk.wordpunct_tokenize(sent) \
     if w.lower() in words or not w.isalpha())

strv_list = ["Group_release", "1994Bill_impact", "FSA_impact"]
for strv in strv_list:
    # Remove punctuation & Convert the titles to lowercase
    str_data[strv] = str_data[strv].map(lambda x: re.sub(r"(?<=\w)[^\s\w](,.?![^\s\w])", "", x)).map(lambda x: x.lower())
    # Remove nonsense words
    str_data[strv] = str_data[strv].apply(clean_sent)
    # Transform blank cells to NaN & Drop NaN
    str_data = str_data.replace(r'^\s*$', np.nan, regex=True).dropna()

# Step 3: Exploratory Analysis 
To verify whether the preprocessing, we’ll make a simple word cloud using the wordcloud package to get a visual representation of most common words. It is key to understanding the data and ensuring we are on the right track, and if any more preprocessing is necessary before training the model.

In [4]:
# Import the wordcloud library
from wordcloud import WordCloud

def make_wordcloud(strv_column, name):
    assert isinstance(name, str), "It should be the name of the variable you observed."
    
    # Join the different processed titles together.
    long_string = ','.join(list(strv_column.values))
    # Create a WordCloud object
    wordcloud = WordCloud(background_color="white", max_words=100000, contour_width=3, contour_color='steelblue')
    # Generate a word cloud
    wordcloud.generate(long_string)
        ## Visualize the word cloud
        #wordcloud.to_image()
    # Export the word cloud image
    wordcloud.to_file("{}.png".format(name))
    
    return

In [5]:
make_wordcloud(str_data["Group_release"], "Group_release")
make_wordcloud(str_data["1994Bill_impact"], "1994Bill_impact")
make_wordcloud(str_data["FSA_impact"], "FSA_impact")

# Step 4: Prepare text for LDA analysis 
Next, let’s work to transform the textual data in a format that will serve as an input for training LDA model. We start by tokenizing the text and removing stopwords. Next, we convert the tokenized object into a corpus and dictionary.

In [6]:
import gensim
from gensim.utils import simple_preprocess
import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords
import gensim.corpora as corpora

stop_words = stopwords.words("english")
stop_words.extend(["don", "people", "bill", "step", "act", "first", "u", "n", "na", "non", "violent"])

def sent_to_words(sentences):
    for sentence in sentences:
        # deacc=True removes punctuations
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) 
             if word not in stop_words] for doc in texts]

def corpus_n_id2word(dataset, strv_column):
    str_list = dataset[strv_column].values.tolist()
    words = list(sent_to_words(str_list))
    # remove stop words
    texts = remove_stopwords(words)
    
    # Create Dictionary
    id2word = corpora.Dictionary(words)

    # Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in texts]

    return corpus, id2word,texts

In [7]:
group_corpus, group_id2, group_texts = corpus_n_id2word(str_data, "Group_release")
B1994_corpus, B1994_id2, B1994_texts = corpus_n_id2word(str_data, "1994Bill_impact")
FSA_corpus, FSA_id2, FSA_texts = corpus_n_id2word(str_data, "FSA_impact")

# Step 5: LDA model tranining and Topic Organizing
To keep things simple, we'll keep all the parameters to default except for inputting the number of topics. For this tutorial, we will build a model with 10 topics where each topic is a combination of keywords, and each keyword contributes a certain weightage to the topic.

In [8]:
from pprint import pprint

# lambd for calculating relevance
lambd = 0.6

# build LDA model for the three questions
group_lda_model = gensim.models.LdaMulticore(corpus=group_corpus, id2word=group_id2, num_topics=5)
B1994_lda_model = gensim.models.LdaMulticore(corpus=B1994_corpus, id2word=B1994_id2, num_topics=3)
FSA_lda_model = gensim.models.LdaMulticore(corpus=FSA_corpus, id2word=FSA_id2, num_topics=3)

In [9]:
import pyLDAvis
import pyLDAvis.gensim

def topic_word_table(lda_model, corpus, id2, num_topics, lambd):
    # build LDAvis for further analysis
    LDAvis_prepared = pyLDAvis.gensim.prepare(topic_model=group_lda_model,
                                              corpus=group_corpus, dictionary=group_id2)
    # organize relevance table
    all_topics = {}
    lambd = lambd  # Adjust this accordingly
    for i in range(1, num_topics+1): #Adjust number of topics in final model
        topic = LDAvis_prepared.topic_info[LDAvis_prepared.topic_info.Category == 'Topic'+str(i)]
        topic['relevance'] = topic['loglift']*(1-lambd)+topic['logprob']*lambd
        all_topics['Topic '+str(i)] = topic.sort_values(by='relevance', ascending=False).Term[:10].values
        
    # dict to dataframe
    newtable = pd.DataFrame.from_dict(all_topics, orient='index')
    
    return newtable

In [10]:
# Q group
group_table = topic_word_table(group_lda_model, group_corpus, group_id2, 5, lambd)
#print(group_table)
    # export csv
group_table.to_csv("group_twt.csv")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


In [11]:
# Q B1994
B1994_table = topic_word_table(B1994_lda_model, B1994_corpus, B1994_id2, 3, lambd)
#print(B1994_table)
    # export csv
B1994_table.to_csv("B1994_twt.csv")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


In [12]:
# Q FSA
FSA_table = topic_word_table(FSA_lda_model, FSA_corpus, FSA_id2, 3, lambd)
#print(FSA_table)
    # export csv
FSA_table.to_csv("FSA_twt.csv")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


# Step 6: Finding the Dominat Topics

In [13]:
def format_topics_sentences(ldamodel, corpus, texts):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list            
        # print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    
    # Format
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1).reset_index()
    sent_topics_df.columns = ["Survey_ID", "Dominant_Topic", "Topic_Perc_Contrib", "Keywords", "Text"]
    
    return sent_topics_df

In [14]:
# Q group
group_topic_sents_keywords = format_topics_sentences(ldamodel=group_lda_model, corpus=group_corpus, texts=group_texts)
#print(group_topic_sents_keywords)
# stat aggregate
topic_stat = group_topic_sents_keywords[["Dominant_Topic", 
                                         "Topic_Perc_Contrib"]].groupby("Dominant_Topic").agg(["count", "mean"])
# to csv
topic_stat.to_csv("group_topic_contri.csv")

      Survey_ID  Dominant_Topic  Topic_Perc_Contrib  \
0             0             4.0              0.8612   
1             1             4.0              0.9455   
2             2             4.0              0.7295   
3             3             0.0              0.2000   
4             4             1.0              0.9086   
...         ...             ...                 ...   
1772       1772             0.0              0.7283   
1773       1773             3.0              0.5930   
1774       1774             0.0              0.2000   
1775       1775             0.0              0.2000   
1776       1776             0.0              0.9645   

                                               Keywords  \
0     chance, think, person, crime, prison, would, b...   
1     chance, think, person, crime, prison, would, b...   
2     chance, think, person, crime, prison, would, b...   
3     change, less, time, want, chance, one, crime, ...   
4     life, get, back, crime, person, know, 

In [16]:
# Q B1994
B1994_topic_sents_keywords = format_topics_sentences(ldamodel=B1994_lda_model, corpus=B1994_corpus, texts=B1994_texts)
# stat aggregate
topic_stat2 = B1994_topic_sents_keywords[["Dominant_Topic", 
                                          "Topic_Perc_Contrib"]].groupby("Dominant_Topic").agg(["count", "mean"])
# to csv
topic_stat2.to_csv("B1994_topic_contri.csv")

In [17]:
# Q FSA
FSA_topic_sents_keywords = format_topics_sentences(ldamodel=FSA_lda_model, corpus=FSA_corpus, texts=FSA_texts)
# stat aggregate
topic_stat3 = FSA_topic_sents_keywords[["Dominant_Topic", 
                                        "Topic_Perc_Contrib"]].groupby("Dominant_Topic").agg(["count", "mean"])
# to csv
topic_stat3.to_csv("FSA_topic_contri.csv")

# Step 7: Topic Clustering

In [28]:
from sklearn.manifold import TSNE
from bokeh.plotting import figure, output_file, show
from bokeh.models import Label
from bokeh.io import output_notebook


# Get topic weights
topic_weights = []
for i, row_list in enumerate(group_lda_model[group_corpus]):
    topic_weights.append([w for i, w in row_list[0]])

# Array of topic weights    
arr = pd.DataFrame(topic_weights).fillna(0).values

# Keep the well separated points (optional)
arr = arr[np.amax(arr, axis=1) > 0.35]

# Dominant topic number in each doc
topic_num = np.argmax(arr, axis=1)

# tSNE Dimension Reduction
tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca')
tsne_lda = tsne_model.fit_transform(arr)

# Plot the Topic Clusters using Bokeh
output_notebook()
n_topics = 4
mycolors = np.array([color for name, color in mcolors.TABLEAU_COLORS.items()])
plot = figure(title="t-SNE Clustering of {} LDA Topics".format(n_topics), 
              plot_width=900, plot_height=700)
plot.scatter(x=tsne_lda[:,0], y=tsne_lda[:,1], color=mycolors[topic_num])
show(plot)


TypeError: cannot unpack non-iterable int object

# Step 8: Visualizing Topic Interaction

In [26]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(group_lda_model, group_corpus, dictionary=group_lda_model.id2word)

In [21]:
pyLDAvis.save_html(vis, 'group_lda.html')

In [23]:
pyLDAvis.show(vis)

FileNotFoundError: [Errno 2] No such file or directory: 'https://raw.githubusercontent.com/bmabey/pyLDAvis/master/pyLDAvis/js/ldavis.v1.0.0.css'

In [31]:
pyLDAvis.display(vis)