# Step 1: Data Importing

In [4]:
# Importing modules
import pandas as pd
import numpy as np
import os

# Read data into papers
data = pd.read_csv("~/Google Drive/Research Assistant/Work With David Knight/Survey of the Incarcerated/TMPPoliticalSurveyFULL_ForDavid.csv", encoding = "ISO-8859-1")

# Print head
print(data.columns)

Index(['state', 'facility_type_jail', 'facility_type_prison',
       'how_often_discuss_politics', 'how_get_news', 'news_source',
       'ever_voted', 'direction_country_headed',
       'how_often_officials_acting_in_your_interest',
       'which_party_for_cj_reform', 'stance_on_assault_weapons_ban',
       'stance_on_marijuana_legalization',
       'stance_on_tightening_border_security', 'stance_on_raise_min_wage',
       'country_most_important_problem', 'race_affects_politics',
       'explain_race_affects_politics', 'should_incarcerated_vote',
       'incarceration_impacts_motivation_to_vote',
       'politics_changed_since_incarcerated',
       'explain_politics_changed_since_incarcerated',
       'cj_important_issue_eliminating_mandatory_mins',
       'cj_important_issue_reducing_racial_bias',
       'cj_important_issue_abolishing_death_penalty',
       'cj_important_issue_lowering_incarceration_rates_rural_communities',
       'cj_important_issue_improving_prison_conds',
       

# Step 2: Data Cleaning 
Since the goal of this analysis is to perform topic modeling, let's focus only on the text data from each paper, and drop other metadata columns. Also, for the demonstration, we'll only look at 100 papers

In [31]:
# Insert an id index for future work
data["Survey_ID"] = data.index + 1

# Remove the columns
str_data = data[["Survey_ID", "explain_politics_changed_since_incarcerated",\
                 "explain_race_affects_politics", 'identifies_as_black', 'identifies_as_white',
                 'identifies_as_native', 'identifies_as_asian', 
                 'identifies_as_hawaiian_or_pac_islander', 'identifies_as_other_race',
                 'identifies_as_not_sure_of_race', 'identifies_as_hispanic_or_latinx']]
# Print out the first rows of papers
str_data.columns = ["Survey_ID", "p_change", "r_effect", "black", "white", "native", "asian",
                   "hawaiian", "other_race", "unsure_race", "latinx"]
str_data = str_data.dropna()

In [3]:
# Load the regular expression library and the nltk word library
import re
import nltk

# Create a function to remove nonsense words
words = set(nltk.corpus.words.words())
def clean_sent(sent):
    return " ".join(w for w in nltk.wordpunct_tokenize(sent) \
     if w.lower() in words or not w.isalpha())

strv_list = ["p_change", "r_effect"]
for strv in strv_list:
    # Remove punctuation & Convert the titles to lowercase
    str_data[strv] = str_data[strv].map(lambda x: re.sub(r"(?<=\w)[^\s\w](,.?![^\s\w])", "", x)).map(lambda x: x.lower())
    # Remove nonsense words
    str_data[strv] = str_data[strv].apply(clean_sent)
    # Transform blank cells to NaN & Drop NaN
    str_data = str_data.replace(r'^\s*$', np.nan, regex=True).dropna()

# Step 3: Exploratory Analysis 
To verify whether the preprocessing, we’ll make a simple word cloud using the wordcloud package to get a visual representation of most common words. It is key to understanding the data and ensuring we are on the right track, and if any more preprocessing is necessary before training the model.

In [4]:
# Import the wordcloud library
from wordcloud import WordCloud

def make_wordcloud(strv_column, name):
    assert isinstance(name, str), "It should be the name of the variable you observed."
    
    # Join the different processed titles together.
    long_string = ','.join(list(strv_column.values))
    # Create a WordCloud object
    wordcloud = WordCloud(background_color="white", max_words=100000, contour_width=3, contour_color='steelblue')
    # Generate a word cloud
    wordcloud.generate(long_string)
        ## Visualize the word cloud
        #wordcloud.to_image()
    # Export the word cloud image
    wordcloud.to_file("{}.png".format(name))
    
    return

In [5]:
make_wordcloud(str_data["p_change"], "p_change")
make_wordcloud(str_data["r_effect"], "r_effect")

# Step 4: Prepare text for LDA analysis 
Next, let’s work to transform the textual data in a format that will serve as an input for training LDA model. We start by tokenizing the text and removing stopwords. Next, we convert the tokenized object into a corpus and dictionary.

In [6]:
import gensim
from gensim.utils import simple_preprocess
import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords
import gensim.corpora as corpora

stop_words = stopwords.words("english")
stop_words.extend(["don", "people", "bill", "step", "act", "first", "u", "n", "na", "non", "violent"])

def sent_to_words(sentences):
    for sentence in sentences:
        # deacc=True removes punctuations
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) 
             if word not in stop_words] for doc in texts]

def corpus_n_id2word(dataset, strv_column):
    str_list = dataset[strv_column].values.tolist()
    words = list(sent_to_words(str_list))
    # remove stop words
    texts = remove_stopwords(words)
    
    # Create Dictionary
    id2word = corpora.Dictionary(words)

    # Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in texts]

    return corpus, id2word,texts

In [7]:
pc_corpus, pc_id2, pc_texts = corpus_n_id2word(str_data, "p_change")
re_corpus, re_id2, re_texts = corpus_n_id2word(str_data, "r_effect")

# Step 5: LDA model tranining and Topic Organizing
To keep things simple, we'll keep all the parameters to default except for inputting the number of topics. For this tutorial, we will build a model with 10 topics where each topic is a combination of keywords, and each keyword contributes a certain weightage to the topic.

In [8]:
from pprint import pprint

# lambd for calculating relevance
lambd = 0.6

# build LDA model for the three questions
pc_lda_model = gensim.models.LdaMulticore(corpus=pc_corpus, id2word=pc_id2, num_topics=6)
re_lda_model7 = gensim.models.LdaMulticore(corpus=re_corpus, id2word=re_id2, num_topics=7)
re_lda_model6 = gensim.models.LdaMulticore(corpus=re_corpus, id2word=re_id2, num_topics=6)

In [9]:
import pyLDAvis
import pyLDAvis.gensim

def topic_word_table(lda_model, corpus, id2, num_topics, lambd):
    # build LDAvis for further analysis
    LDAvis_prepared = pyLDAvis.gensim.prepare(topic_model=lda_model, corpus=corpus, dictionary=id2)
    # organize relevance table
    all_topics = {}
    lambd = lambd  # Adjust this accordingly
    for i in range(1, num_topics+1): #Adjust number of topics in final model
        topic = LDAvis_prepared.topic_info[LDAvis_prepared.topic_info.Category == 'Topic'+str(i)]
        topic['relevance'] = topic['loglift']*(1-lambd)+topic['logprob']*lambd
        all_topics['Topic '+str(i)] = topic.sort_values(by='relevance', ascending=False).Term[:10].values
        
    # dict to dataframe
    newtable = pd.DataFrame.from_dict(all_topics, orient='index')
    
    return newtable

In [17]:
# Q political change
pc_table = topic_word_table(pc_lda_model, pc_corpus, pc_id2, 6, lambd)
#print(pc_table)
    # export csv
pc_table.to_csv("pc_twt.csv")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


In [18]:
# Q racial effect - 7 topics
pc_table = topic_word_table(re_lda_model7, re_corpus, re_id2, 7, lambd)
#print(pc_table)
    # export csv
pc_table.to_csv("re7_twt.csv")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


In [19]:
# Q racial effect - 6 topics
pc_table = topic_word_table(re_lda_model6, re_corpus, re_id2, 6, lambd)
#print(pc_table)
    # export csv
pc_table.to_csv("re6_twt.csv")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


# Step 6: Finding the Dominat Topics

In [13]:
def format_topics_sentences(ldamodel, corpus, texts):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list            
        # print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    
    # Format
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1).reset_index()
    sent_topics_df.columns = ["Survey_ID", "Dominant_Topic", "Topic_Perc_Contrib", "Keywords", "Text"]
    
    return sent_topics_df

In [24]:
# Q political change
pc_topic_sents_keywords = format_topics_sentences(ldamodel=pc_lda_model, corpus=pc_corpus, texts=pc_texts)
print(pc_topic_sents_keywords)
# stat aggregate
topic_stat = pc_topic_sents_keywords[["Dominant_Topic",
                                      "Topic_Perc_Contrib"]].groupby("Dominant_Topic").agg(["count", "mean"])
# to csv
topic_stat.to_csv("pc_topic_contri.csv")

      Survey_ID  Dominant_Topic  Topic_Perc_Contrib  \
0             0             0.0              0.1667   
1             1             2.0              0.9619   
2             2             3.0              0.7205   
3             3             0.0              0.5806   
4             4             0.0              0.5008   
...         ...             ...                 ...   
3299       3299             0.0              0.1667   
3300       3300             0.0              0.1667   
3301       3301             3.0              0.9148   
3302       3302             2.0              0.8312   
3303       3303             0.0              0.1667   

                                               Keywords  \
0     prison, politics, care, yes, political, system...   
1     political, never, get, politics, yes, prison, ...   
2     yes, never, country, better, see, time, system...   
3     prison, politics, care, yes, political, system...   
4     prison, politics, care, yes, political

In [15]:
# Q racial effect - 7 topics
re_topic_sents_keywords = format_topics_sentences(ldamodel=re_lda_model7, corpus=re_corpus, texts=re_texts)
# stat aggregate
topic_stat2 = re_topic_sents_keywords[["Dominant_Topic", 
                                       "Topic_Perc_Contrib"]].groupby("Dominant_Topic").agg(["count", "mean"])
# to csv
topic_stat2.to_csv("re7_topic_contri.csv")

In [16]:
# Q racial effect - 6 topics
re_topic_sents_keywords = format_topics_sentences(ldamodel=re_lda_model6, corpus=re_corpus, texts=re_texts)
# stat aggregate
topic_stat3 = re_topic_sents_keywords[["Dominant_Topic", 
                                       "Topic_Perc_Contrib"]].groupby("Dominant_Topic").agg(["count", "mean"])
# to csv
topic_stat3.to_csv("re6_topic_contri.csv")

In [35]:
re_topic = str_data.merge(re_topic_sents_keywords, on="Survey_ID")

Unnamed: 0,Survey_ID,p_change,r_effect,black,white,native,asian,hawaiian,other_race,unsure_race,latinx,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,1,[No],"[Not at all] I'm not racist, all of the human ...",False,True,False,False,False,False,False,False,2.0,0.5828,"race, sure, prison, black, white, racist, know...",[sure]
1,3,I feel that the parole board gives the some of...,I'm not sure about that,False,True,False,False,False,False,False,False,1.0,0.8318,"race, deal, racial, prison, great, political, ...","[understand, question, maybe, typo]"
2,4,Yes alittle bit,I am not really sure about that one,False,True,False,False,False,False,False,False,3.0,0.7466,"dont, white, race, racial, prison, like, see, ...","[try, vote, based, many, vote, white]"
3,6,Most politicians are full of shit.,"I do not understand this question, maybe there...",True,False,False,False,False,False,False,False,3.0,0.7207,"dont, white, race, racial, prison, like, see, ...","[dont, know]"
4,7,I never considered the prison population or pr...,I try to vote based on the issues but I am whi...,False,True,False,False,False,False,False,False,3.0,0.8798,"dont, white, race, racial, prison, like, see, ...","[racism, still, big, problem, today, outside]"


In [75]:
pc_topic = str_data.merge(pc_topic_sents_keywords, on="Survey_ID")
f = pc_topic[pc_topic["other_race"]==True]
a = f[["Dominant_Topic", "Topic_Perc_Contrib"]].groupby("Dominant_Topic").agg(["count", "mean"])
a.columns = a.columns.droplevel()
a["x"] = round(a["count"]/f.shape[0], 5)
a.to_csv("b.csv")

In [84]:
pc_topic[pc_topic["hawaiian"]==True].shape

(24, 15)

# Step 7: Topic Clustering

from sklearn.manifold import TSNE
from bokeh.plotting import figure, output_file, show
from bokeh.models import Label
from bokeh.io import output_notebook


# Get topic weights
topic_weights = []
for i, row_list in enumerate(group_lda_model[group_corpus]):
    topic_weights.append([w for i, w in row_list[0]])

# Array of topic weights    
arr = pd.DataFrame(topic_weights).fillna(0).values

# Keep the well separated points (optional)
arr = arr[np.amax(arr, axis=1) > 0.35]

# Dominant topic number in each doc
topic_num = np.argmax(arr, axis=1)

# tSNE Dimension Reduction
tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca')
tsne_lda = tsne_model.fit_transform(arr)

# Plot the Topic Clusters using Bokeh
output_notebook()
n_topics = 4
mycolors = np.array([color for name, color in mcolors.TABLEAU_COLORS.items()])
plot = figure(title="t-SNE Clustering of {} LDA Topics".format(n_topics), 
              plot_width=900, plot_height=700)
plot.scatter(x=tsne_lda[:,0], y=tsne_lda[:,1], color=mycolors[topic_num])
show(plot)


# Step 8: Visualizing Topic Interaction

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(group_lda_model, group_corpus, dictionary=group_lda_model.id2word)

pyLDAvis.save_html(vis, 'group_lda.html')