In [86]:
# --- Imports (standard library, then third-party) ---
import itertools

import nltk
import pandas as pd
from IPython.display import display
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer

# --- NLTK resources ---
# Lexicon for SentimentIntensityAnalyzer and tokenizer models for nltk.word_tokenize.
nltk.download('vader_lexicon')
nltk.download('punkt')
# Resources needed by stopwords.words(), WordNetLemmatizer.lemmatize() and
# nltk.pos_tag(), which this notebook also uses. Without them a fresh machine
# raises LookupError. Fetched quietly so the cell's printed log is unchanged.
# NOTE(review): newer NLTK releases may use different resource names
# (e.g. "punkt_tab") - confirm against the installed NLTK version.
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)

# --- Shared objects used by later cells ---
sid = SentimentIntensityAnalyzer()
wordnet_lemmatizer = WordNetLemmatizer()
stop_words = stopwords.words("english")

# --- Display options ---
# Show full cell contents and all rows when frames are rendered.
pd.set_option('display.max_colwidth', None)
# Use the fully qualified option name: the bare pattern "max_rows" can match
# more than one registered option, which makes pandas raise OptionError.
pd.set_option("display.max_rows", None)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [74]:
def reformat_data(df):
    """Turn every row of ``df`` into a flat list of fields parsed from its text form.

    Each row is rendered with ``str(row)`` and that text is split on the
    literal two-character sequence backslash + ``n``. The pieces are then
    rearranged into a record whose positions callers in this notebook label
    ``Name, Date, Num_comments, Num_retweets, Num_likes, Content``.

    Rows that split into fewer than five pieces, or whose fifth piece is
    exactly ``'Replying to '``, are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame iterated with ``iterrows()``.
        NOTE(review): the ``strip('Field1')`` call suggests a single text
        column named ``Field1`` is expected - confirm against the CSV files.

    Returns
    -------
    list of list
        One list per kept row. Missing trailing counts are ``None``.

    NOTE(review): parsing works on the printed form of the row, so it
    presumably relies on ``display.max_colwidth`` being ``None`` to avoid
    truncation - TODO confirm. Any additional column in ``df`` becomes part of
    ``str(row)`` and therefore part of the last piece.
    """
    list_df = []
    for i, row in df.iterrows():
        # str.strip(chars) removes any of the given *characters* from both ends
        # (it is not a prefix/suffix removal); the chain is meant to peel off the
        # "Name: <index>, dtype: object" footer, the "Field1" label and padding.
        row_list = str(row).strip(f'Name: {i}, dtype: object').strip('Field1').strip('\n').strip(' ').split('\\n')
        # Too few pieces to hold the leading fields plus any content: skip.
        if len(row_list) < 5:
            continue
        # Skip rows whose fifth piece marks the post as a reply.
        if row_list[4] == 'Replying to ':
            continue
        # Drop empty pieces produced by consecutive separators.
        while '' in row_list:
            row_list.remove('')
        del row_list[0]  # drop the display name (the handle is used instead)
        del row_list[1]  # drop the separator dot that followed the handle
        # Find where the trailing numeric counts begin: the first of the last
        # three pieces that is purely numeric. If none is numeric, `end` stays
        # -1 and the final piece is treated as a count anyway.
        end = -1
        # NOTE(review): this loop rebinds the row index `i`; harmless here
        # because `i` is not read again before the next outer iteration.
        for i in (-3, -2, -1):
            if (row_list[i].isnumeric()):
                end = i
                break
        # Everything between the first two pieces and the counts is post text,
        # re-joined with no separator.
        content_list = row_list[2:end]
        content = ''.join(i for i in content_list)
        # Layout: first two pieces, the counts found, None for each missing
        # count (end + 3 is 0, 1 or 2), then the content.
        done_list = row_list[:2]+row_list[end:]+[None]*(end+3)+[content]
        if len(done_list[1]) < 8:  # second field too short to include a year
            done_list[1] += ' 2020'
        # Pad short records up to six fields; insert(-2, ...) places each None
        # just before the last two elements.
        while len(done_list) < 6:
            done_list.insert(-2, None)
        list_df.append(done_list)
    return list_df

In [61]:
# Parse the raw frame into a list of per-post field lists.
# NOTE(review): in the visible part of this notebook `df` is only assigned in a
# later cell, so this line depends on out-of-order execution - confirm.
list_df = reformat_data(df)

In [62]:
# Wrap the parsed records in a DataFrame, one named column per field position.
main_df = pd.DataFrame(
    data=list_df,
    columns=["Name", "Date", "Num_comments", "Num_retweets", "Num_likes", "Content"],
)

In [63]:
def analyse(text):
    """Return the ``"compound"`` entry of ``sid.polarity_scores(text)``.

    ``sid`` is the module-level SentimentIntensityAnalyzer instance.
    """
    return sid.polarity_scores(text)["compound"]

In [64]:
# Compound VADER score for every post, in the same order as main_df's rows.
VADER_compound_score_list = [analyse(post_text) for post_text in main_df["Content"]]

In [65]:
# VADER is a sentiment-analysis tool; its compound score can be used to filter
# out possibly heretical / non-Christian posts.
main_df["VADER_score"] = VADER_compound_score_list

In [66]:
def get_adjectives_dict(df, stop_words):
    """Count adjectives across ``df["Content"]``, most frequent first.

    Each sentence is tokenized with ``nltk.word_tokenize``. Tokens found in
    ``stop_words`` (exact, case-sensitive match) are dropped, the rest are
    lemmatized with ``wordnet_lemmatizer`` and tagged with ``nltk.pos_tag``.
    Every token whose tag contains ``'JJ'`` is reduced to its alphabetic
    characters and counted.

    Parameters
    ----------
    df : mapping with a ``"Content"`` entry
        Iterable of sentence strings, e.g. a DataFrame column.
    stop_words : iterable of str
        Tokens to ignore before lemmatizing.

    Returns
    -------
    dict
        ``{adjective: count}`` in descending count order; ties keep the order
        in which the adjectives were first seen. Tokens with no alphabetic
        characters are never counted, so the empty string is not a key.
    """
    # Set membership is O(1); checking against a list scans it for every token.
    stop_word_set = set(stop_words)
    adjective_counts = {}
    for sentence in df["Content"]:
        lemmed_tokens = [
            wordnet_lemmatizer.lemmatize(token)
            for token in nltk.word_tokenize(sentence)
            if token not in stop_word_set
        ]
        for word, tag in nltk.pos_tag(lemmed_tokens):
            if 'JJ' not in tag:
                continue
            adj = ''.join(ch for ch in word if ch.isalpha())
            # Skip tokens that had no letters at all. The previous version
            # counted them under '' and then deleted that key unconditionally,
            # which raised KeyError whenever no such token occurred.
            if not adj:
                continue
            adjective_counts[adj] = adjective_counts.get(adj, 0) + 1
    # sorted() is stable, so equal counts stay in first-seen order.
    return dict(sorted(adjective_counts.items(), key=lambda item: item[1], reverse=True))

In [67]:
# Adjectives and their frequency of appearance in this corpus's posts
# containing the word "Jesus", most frequent first.
lemmed_adj_sorted_dict = get_adjectives_dict(main_df, stop_words)
lemmed_adj_sorted_dict

{'u': 123,
 'good': 31,
 'unto': 24,
 'give': 20,
 'many': 18,
 'new': 18,
 'eternal': 17,
 'great': 16,
 'dead': 15,
 'true': 14,
 'Good': 12,
 'able': 11,
 'hear': 11,
 'much': 10,
 'understanding': 10,
 'last': 10,
 'Christian': 10,
 'better': 10,
 'guard': 9,
 'live': 9,
 'perfect': 9,
 'separate': 9,
 'thy': 9,
 'present': 8,
 'right': 8,
 'free': 8,
 'beautiful': 8,
 'open': 8,
 'strong': 8,
 'possible': 8,
 'impossible': 8,
 'best': 8,
 'anxious': 7,
 'ready': 7,
 'white': 7,
 'happy': 7,
 'human': 7,
 'Happy': 7,
 'little': 7,
 'first': 7,
 'second': 7,
 'pray': 7,
 'faith': 7,
 'glorious': 7,
 'whole': 7,
 'precious': 7,
 'full': 6,
 'light': 6,
 'disciple': 6,
 'know': 6,
 'faithful': 6,
 'else': 5,
 'wonderful': 5,
 'old': 5,
 'high': 5,
 'bad': 5,
 'guilty': 5,
 'sin': 5,
 'different': 5,
 'flesh': 5,
 'gospel': 5,
 'touch': 5,
 'worthy': 5,
 'thou': 5,
 'belong': 4,
 'angel': 4,
 'request': 4,
 'enough': 4,
 'worship': 4,
 'glory': 4,
 'Most': 4,
 'real': 4,
 'lead': 4,
 '

In [69]:
# Take the first 20 entries of the frequency-ordered dict and tabulate them
# in a single count column labelled "BLESS".
top_20_adj_dict = {
    adjective: count
    for adjective, count in itertools.islice(lemmed_adj_sorted_dict.items(), 20)
}
top_20_adj_df = pd.DataFrame.from_dict(
    top_20_adj_dict,
    orient='index',
    columns=["BLESS"],
)

Unnamed: 0,BLESS
u,123
good,31
unto,24
give,20
many,18
new,18
eternal,17
great,16
dead,15
true,14


In [125]:
def get_lemmed_sorted_dict(df):
    """Run the full pipeline on a raw frame: parse, score, count adjectives.

    The frame is parsed with ``reformat_data``, every ``Content`` value is
    scored with ``analyse`` into a ``VADER_score`` column, and adjectives are
    counted with ``get_adjectives_dict`` using the module-level ``stop_words``.

    Returns
    -------
    tuple
        ``(adjective_counts, scored_frame)``: the dict returned by
        ``get_adjectives_dict`` and the parsed DataFrame including the
        ``VADER_score`` column.
    """
    field_names = ["Name", "Date", "Num_comments", "Num_retweets", "Num_likes", "Content"]
    scored_frame = pd.DataFrame(reformat_data(df), columns=field_names)
    scored_frame["VADER_score"] = [analyse(content) for content in scored_frame["Content"]]
    adjective_counts = get_adjectives_dict(scored_frame, stop_words)
    return adjective_counts, scored_frame

In [141]:
# Build one frame from every theme's CSV, tag each row with its theme, drop
# exact duplicate rows, then run the adjective pipeline on the result.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this folder exists - consider a configurable data directory.
file_path = "C:/Users/user/OneDrive - Singapore University of Technology and Design/Documents/SUTD/DH/Final Project/data/"
full_df_list = []
for THEME in theme_name_list:
    file_name = f"{THEME}_2018-2020.csv"
    df = pd.read_csv(file_path+file_name)
    # Assigning a scalar broadcasts it to every row.
    df["Theme"] = THEME
    full_df_list.append(df)
# keep="first" retains the earliest occurrence of each duplicated row.
full_df = pd.concat(full_df_list, axis=0).drop_duplicates(keep="first")
lemmed_sorted_dict, main_df = get_lemmed_sorted_dict(full_df)

In [120]:
# Keep only the adjectives counted at least 100 times, preserving the
# frequency order of lemmed_sorted_dict, and tabulate them in one column.
above_100_sorted_dict = {
    adjective: count
    for adjective, count in lemmed_sorted_dict.items()
    if count >= 100
}

above_100_df = pd.DataFrame.from_dict(
    above_100_sorted_dict,
    orient='index',
    columns=['Total occurances'],
)

Unnamed: 0,Total occurances
u,2364
good,549
give,346
great,311
new,286
many,279
free,268
eternal,235
true,219
forgive,218


In [121]:
# Write the >= 100 occurrences table to the current working directory.
# (Plain string literal: the previous f-string had no placeholders.)
above_100_df.to_csv('above_100_df.csv')

In [142]:
# Display 10 randomly chosen rows as a spot check of the parsed frame.
# NOTE(review): no random_state is passed, so every run shows different rows.
main_df.sample(10)
# NOTE(review): the numbers below look like row indices noted during manual
# inspection - TODO confirm what they refer to.
#7108
#7204
#10393
#7725

Unnamed: 0,Name,Date,Num_comments,Num_retweets,Num_likes,Content,VADER_score
11059,† Sean †,@Saved_Sean_,2,14,68\nTheme SAVE,"·Jan 29Jesus is EVERYTHING, you guys.If you don't have a relationship with your Saviour, Jesus Christ, you're missing out.Got Jesus? You got EVERYTHING you need.Don't got Jesus? You got NOTHING you need.",-0.3736
5109,@One_Of_His,"Feb 1, 2019",7,53,123\nTheme HOPE,"1 Thess 4:13-14 KJVBut I would not have you to be ignorant, brethren, concerning them which are asleep, that ye sorrow not, even as others which have no hope. For if we believe that Jesus died and rose again, even so them also which sleep in Jesus will God bring with him.",-0.743
2487,@ArefMashaliUK,Jun 28 2020,12,25,92\nTheme GRACE,"Holy Father, please soften my heart and toughen my resolve so that I can love as Jesus loved. Make my life redemptive, even to those who oppose, abhor, ridicule, and hate me. Use me, dear Father, to bring others to the grace of Jesus. In his name I pray. Amen.",0.885
8195,@peero007,Jan 29 2020,6,11,Show this thread\nTheme PEACE,"Now may the God of peace, that brought again from the dead our Lord Jesus, that great shepherd of the sheep, through the blood of the everlasting covenant, make us perfect in every good work to do His will, working in us what is well pleasing in His eyes, through Jesus Christ...1",0.9477
1683,@rabbicraig,Jan 31 2020,7,22\nTheme GOD_SOVEREIGN,,Its one thing to know the Sabbath n its another thing to know the Lord of the Sabbath... true sabbath rest is given ONLY by Jesus Christ and fully experienced in Jesus... #happysabbath,0.4215
8321,@SirMakhubo,Dec 1 2020,13,63\nTheme PEACE,,"00:00. Let's pray Congratulations, you've made it to month 12/12: December God,Thank You for guiding me to yet another month. Now may You bless my December 1-31 with peace, strength, wisdom, opportunities, spiritual, mental & financial expansion.In Jesus name. Amen",0.967
2267,@Burning___Bush,"Jun 30, 2018",2,27,67\nTheme GOD_SOVEREIGN,"Jesus is coming soon! And time is short.If you don't see God working in your life, you need to examine yourself as to whether you are saved and of the faith (2Cor 13:5).We must be honest with ourselves. Where the Spirit of God resides, things are happening.",0.8055
6795,Ozil's bae,· 2020,14,7,103\nTheme MERCY,"Nov 21A leaf is glorious while on the tree, but the moment it drops, it loses its glory. You will never drop from God's divine grace, mercy, favour and protection in Jesus’ name.",0.973
14841,@LorenaG_82,"Jan 31, 2018",1\nTheme PROVIDE,,,"Lawd Jesus, please give me the strength to stay sober during the train wreck that is about to ensue on national television #FixItJesus #SOTU",0.3818
9991,Beatlemaniac2015,@Beatlemaniac201,1,4,13\nTheme PROTECT,"·Jan 27, 2019God and Jesus we pray that you watch over the Men and Women who are out keeping the city safe. Please keep them safe and bring them home to their families. In Jesus Name Amen. #BackTheBlue #BlueFamilyPrayers",0.8555
