In [None]:
# General
import pandas as pd
import numpy as np
from dfply import *

# Formatting Text
import re
import string

# Text Analysis
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem import LancasterStemmer
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import spacy
import en_core_web_lg
from gensim.summarization.summarizer import summarize 
from gensim.summarization import keywords 
from spacy import displacy
import textacy

# Plotting
import matplotlib.pyplot as plt
from datetime import date
import networkx as nx

# Sentiment Analysis
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# WordCloud
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image
import urllib
import requests
import matplotlib.pyplot as plt

In [2]:
frame = pd.read_csv("GenConfTalks.csv")
frame.fillna("NA", inplace = True)

In [3]:
Date = []
for m, y in zip(frame.Month, frame.Year):
    Date.append(date(y, m, 1))
frame['Date'] = Date

grouped_ = frame.groupby(['Date']).count()
grouped_.insert(0, 'ID', range(100, 100 + len(grouped_)))
frame.set_index(['Date'], inplace=True)

frame = pd.merge(frame, grouped_[['ID']], right_on='Date', left_index=True)
frame.reset_index(inplace = True)

In [6]:
stop = list(set(stopwords.words("english"))) + list([x for x in string.punctuation])
printable = set(string.printable)

In [7]:
def tokening(text): 
    token = word_tokenize(text)
    token = [word.lower() for word in token if word not in stop]
    return token

In [8]:
frame['Sentence'] = frame['Talk'].apply(lambda x: sent_tokenize(x))
frame['Tokens'] = frame['Talk'].apply(lambda x: tokening(x))
frame.head()

Unnamed: 0,Date,Year,Month,Speaker,Role,Title,Talk,ID,Sentence,Tokens
0,2019-04-01,2019,4,Ulisses Soares,Of the Quorum of the Twelve Apostles,How Can I Understand?,"My dear brothers and sisters, what a great joy...",196,"[My dear brothers and sisters, what a great jo...","[my, dear, brothers, sisters, great, joy, toge..."
1,2019-04-01,2019,4,Becky Craven,Second Counselor in the Young Women General Pr...,Careful versus Casual,"I once saw a sign in a store window that said,...",196,[I once saw a sign in a store window that said...,"[i, saw, sign, store, window, said, happiness,..."
2,2019-04-01,2019,4,Brook P. Hales,Of the Seventy,Answers to Prayer,An important and comforting doctrine of the go...,196,[An important and comforting doctrine of the g...,"[an, important, comforting, doctrine, gospel, ..."
3,2019-04-01,2019,4,Dieter F. Uchtdorf,Of the Quorum of the Twelve Apostles,Missionary Work: Sharing What Is in Your Heart,Last month the Twelve were invited by our dear...,196,[Last month the Twelve were invited by our dea...,"[last, month, twelve, invited, dear, prophet, ..."
4,2019-04-01,2019,4,W. Christopher Waddell,Second Counselor in the Presiding Bishopric,Just as He Did,"Approximately 18 months ago, in the fall of 20...",196,"[Approximately 18 months ago, in the fall of 2...","[approximately, 18, months, ago, fall, 2017, 6..."


In [9]:
wn = nltk.WordNetLemmatizer()
ps = nltk.PorterStemmer()

def lemm_tokes(texts):
    lems = [wn.lemmatize(text) for text in texts]
    return lems

def stem_tokes(texts):
    lems = [ps.stem(text) for text in texts]
    return lems

In [10]:
frame['Lem_Tokens'] = frame['Tokens'].apply(lambda x: lemm_tokes(x))
frame['Stem_Tokens'] = frame['Tokens'].apply(lambda x: stem_tokes(x))

In [21]:
frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3839 entries, 0 to 3838
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Date         3839 non-null   object
 1   Year         3839 non-null   int64 
 2   Month        3839 non-null   int64 
 3   Speaker      3839 non-null   object
 4   Role         3839 non-null   object
 5   Title        3839 non-null   object
 6   Talk         3839 non-null   object
 7   ID           3839 non-null   int32 
 8   Sentence     3839 non-null   object
 9   Tokens       3839 non-null   object
 10  Lem_Tokens   3839 non-null   object
 11  Stem_Tokens  3839 non-null   object
 12  POS          3839 non-null   object
dtypes: int32(1), int64(2), object(10)
memory usage: 375.0+ KB


In [14]:
def sentiment_scores(sentence): 
  
    # Create a SentimentIntensityAnalyzer object. 
    analyzer = SentimentIntensityAnalyzer()
    # polarity_scores method of SentimentIntensityAnalyzer 
    # oject gives a sentiment dictionary. 
    # which contains pos, neg, neu, and compound scores. 
    sentiment_dict = analyzer.polarity_scores(sentence) 
    
    # decide sentiment as positive, negative and neutral 
    if sentiment_dict['compound'] >= 0.05 : 
        result = "Positive"
    elif sentiment_dict['compound'] <= - 0.05 : 
        result = "Negative" 
    else : 
        result = "Neutral"
        
    return sentiment_dict['compound']

def scoring(sents):
    scores = []
    for i in range(len(sents)):
        num = sentiment_scores(sents[i])
        scores.append(num)
    x = sum(scores)/len(scores)
    return x

In [24]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import LatentDirichletAllocation

# Initialise the count vectorizer with the English stop words
count_vectorizer = CountVectorizer(stop_words='english')

# Fit and transform the processed titles
count_data = count_vectorizer.fit_transform(frame['Talk'])

In [25]:
# Helper function
def print_topics(model, count_vectorizer, n_top_words):
    words = count_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        print(" ".join([words[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
        
# Tweak the two parameters below
number_topics = 7
number_words = 10
# Create and fit the LDA model
lda = LatentDirichletAllocation(n_components=number_topics, n_jobs=-1)
lda.fit(count_data)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=7, n_jobs=-1,
                          perp_tol=0.1, random_state=None,
                          topic_word_prior=None, total_samples=1000000.0,
                          verbose=0)

In [20]:
# Print the topics found by the LDA model
print("Topics found via LDA:")
print_topics(lda, count_vectorizer, number_words)

Topics found via LDA:

Topic #0:
church president priesthood presidency quorum members general work lord prophet

Topic #1:
god lord christ jesus church shall unto holy gospel joseph

Topic #2:
church lord welfare people work members family poor relief services

Topic #3:
love church family lord temple young christ women gospel children

Topic #4:
god lord jesus life christ father unto faith love shall

Topic #5:
priesthood said young home men time lord father man years

Topic #6:
children god life family lord love world good time things


In [22]:
def calling_group(text):
    x = "NA"
    if re.search('First Presidency', text) or re.search('President of the Church', text):
        x = 'First Presidency'
    elif re.search('Twelve', text):
        x = 'Quorum of the Twelve'
    elif re.search('Seventy', text):
        x = "Quorum of the Seventy"
    elif re.search('Primary', text):
        x = "Primary"
    elif re.search('Young Men', text):
        x = "Young Men"
    elif re.search('Young Women', text):
        x = "Young Women"
    elif re.search('Relief', text):
        x = "Relief Society"
    elif re.search('Sunday School', text):
        x = "Sunday School"
    elif re.search('Bishop', text):
        x = "Bishopric"
    return x

In [23]:
frame['GroupedRole'] = frame['Role'].apply(lambda x: calling_group(x))

**********************************************************************************************************
**********************************************************************************************************
**********************************************************************************************************
**********************************************************************************************************
**********************************************************************************************************
**********************************************************************************************************
**********************************************************************************************************
**********************************************************************************************************
**********************************************************************************************************
**********************************************************************************************************
**********************************************************************************************************
**********************************************************************************************************
**********************************************************************************************************
**********************************************************************************************************
**********************************************************************************************************
**********************************************************************************************************
**********************************************************************************************************
**********************************************************************************************************
**********************************************************************************************************
**********************************************************************************************************
**********************************************************************************************************
**********************************************************************************************************
**********************************************************************************************************
**********************************************************************************************************
**********************************************************************************************************
**********************************************************************************************************
**********************************************************************************************************
**********************************************************************************************************
**********************************************************************************************************
**********************************************************************************************************
**********************************************************************************************************
**********************************************************************************************************
**********************************************************************************************************
**********************************************************************************************************
**********************************************************************************************************
**********************************************************************************************************
**********************************************************************************************************
**********************************************************************************************************
**********************************************************************************************************
**********************************************************************************************************
**********************************************************************************************************
**********************************************************************************************************
**********************************************************************************************************
**********************************************************************************************************
**********************************************************************************************************
**********************************************************************************************************
**********************************************************************************************************


In [49]:
nlp = en_core_web_lg.load()

In [50]:
text = "".join(frame['Sentence'][6])
doc = nlp(text)

In [51]:
def summary(doc):
    summary = ''
    ent_list = []
    
    for e in doc.ents:
        if e.label_ == "PERSON":
            ent_list.append(str(e).lower().strip())
            
    ent_list = list( dict.fromkeys(ent_list) )
    
    for ent in ent_list:
        sentences = textacy.extract.semistructured_statements(doc, ent)
        for statement in sentences:
            subject, verb, fact = statement
            if len(fact) > 1:
                print(ent)
                print(" - Fact: ", fact)
    return summary

In [52]:
summary(doc)

''

def print_ents(doc):
    sentence_spans = list(doc.sents)
    displacy.render(sentence_spans, style="ent")
    
print_ents(doc)

In [65]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.preprocessing import normalize
from sklearn.decomposition import NMF

vectorizer = TfidfVectorizer(stop_words='english',analyzer='word', max_features=8000)
x_counts = vectorizer.fit_transform(frame["Talk"]);

transformer = TfidfTransformer(smooth_idf=False);
x_tfidf = transformer.fit_transform(x_counts);

xtfidf_norm = normalize(x_tfidf, norm='l1', axis=1)

In [66]:
num_topics = 9

#obtain a NMF model.
model = NMF(n_components=num_topics, init='nndsvd');
#fit the model
model.fit(xtfidf_norm)

def get_nmf_topics(model):
    
    #the word ids obtained need to be reverse-mapped to the words so we can print the topic names.
    feat_names = vectorizer.get_feature_names()
    
    word_dict = {};
    for i in range(num_topics):
        
        #for each topic, obtain the largest values, and add the words they map to into the dictionary.
        words_ids = model.components_[i].argsort()[:-20 - 1:-1]
        words = [feat_names[key] for key in words_ids]
        word_dict['Topic # ' + '{:02d}'.format(i+1)] = words;
    
    return pd.DataFrame(word_dict);

In [67]:
dframe = get_nmf_topics(model)

In [68]:
dframe

Unnamed: 0,Topic # 01,Topic # 02,Topic # 03,Topic # 04,Topic # 05,Topic # 06,Topic # 07,Topic # 08,Topic # 09
0,na,proposed,god,auditing,emeritus,rededicated,uplifted,auditing,priesthood
1,hunt,favor,lord,audit,statistical,statistical,constituted,budgets,relief
2,fiery,manifest,shall,department,branches,31,authorities,certified,society
3,tutoring,opposed,ye,audits,territories,issued,manifest,assets,women
4,griefs,sustain,christ,controlled,statistics,december,favor,policies,family
5,sufferings,counselor,unto,accounting,31,operation,proposed,expenditures,church
6,bags,seventies,jesus,expenditures,seventy,mexico,general,departments,young
7,furnace,release,book,departments,wards,status,officers,audits,home
8,sicknesses,vote,ghost,financial,december,dedicated,changes,funds,welfare
9,realities,seventy,father,funds,issued,fort,opposed,approved,missionary
