In [1]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
import nltk
# Fetch the NLTK data packages this notebook relies on: the stop-word lists,
# the 'punkt' tokenizer data, the averaged-perceptron POS tagger and WordNet.
# Packages that are already present are reported as up-to-date (see the cell output).
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
import numpy as np
import string
import matplotlib.pyplot as plt
from datetime import datetime
import re
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/qianyuyang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/qianyuyang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/qianyuyang/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/qianyuyang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Read the raw data (remember to change the path to your own directory) and
# tag the three text columns with an "_original" suffix.
hot = pd.read_csv('../data/Reddit_raw.csv').rename(
    columns=dict(
        title='title_original',
        content='content_original',
        comment='comment_original',
    )
)
hot.head()

Unnamed: 0,title_original,content_original,comment_original,time,accurate_time,label
0,Think Like a Climate Investor #1 [Series],Good investors (us) should always be modeling ...,Does this understanding of additionality accou...,1 day ago,"['Fri, May 05, 2023, 04:14:08 PM Central Europ...",
1,Scan the globe for the best stocks to invest i...,,,Promoted,,
2,What is the best certificate/masters out there...,I currently work in the sustainability finance...,"\nWhat’s your job now?\nI work in finance, par...",1 day ago,"['Fri, May 05, 2023, 11:37:52 AM Central Europ...",Discussion / Question
3,Congrats to those that passed the CESGA March ...,CESGA just announced the results for the CESGA...,"I have no idea how I passed it, but it was a g...",4 days ago,"['Tue, May 02, 2023, 04:45:13 PM Central Europ...",Careers
4,Solar and Wind PPA Prices Surge in North Ameri...,,,5 days ago,"['Mon, May 01, 2023, 06:44:27 PM Central Europ...",Markets


In [3]:
# Build the stop-word set: NLTK's English list + punctuation characters + a few
# greetings that carry no topical meaning.
# NOTE: the NLTK file id is lowercase ('english'). 'English' only resolves on
# case-insensitive filesystems and fails where file names are case-sensitive.
stop_words_2 = set(stopwords.words('english'))
ps = PorterStemmer()
wn = WordNetLemmatizer()
punctuation_set = set(string.punctuation)
customized_stopwords = {'hi', 'hello', 'hey'}
stop_words = stop_words_2 | punctuation_set | customized_stopwords
# Storing the stop-word list. Uncomment if you need it.
#pd.Series(list(stop_words)).to_csv('./stop_words.csv')

In [4]:
def get_wordnet_pos(treebank_tag):
    """Translate a part-of-speech tag into the WordNet POS constant used for lemmatizing.

    input: treebank_tag: a POS tag string (the second element of the tuples
           produced by pos_tagging), not the word itself.
    output: wordnet.ADJ, wordnet.VERB, wordnet.NOUN or wordnet.ADV depending on
            the first letter of the tag (J, V, N, R); any other tag falls back
            to wordnet.NOUN.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    # Unrecognized tags are treated as nouns.
    return wordnet.NOUN
    
def pos_tagging(sentence,lower = True):
    """Tokenize a sentence and attach a part-of-speech tag to every token.

    input: sentence: the text to tag, usually a str.
           lower: bool. Set it True to lowercase the sentence before tokenizing.
    output: a list. Each element is a (word, part_of_speech) tuple.
    """
    text = sentence.lower() if lower else sentence
    # tokenizing, then part-of-speech tagging
    return nltk.pos_tag(nltk.word_tokenize(text))

def replace_unknown_pos(tagged):
    """Standardize the tag of unrecognized words to the string 'unknown'.

    Some words may not be recognized by the tagger; their part of speech is
    then either '' (empty) or 'unknown'. Both spellings are rewritten to
    'unknown'.

    input: tagged: a list of (word, part_of_speech) pairs; it is modified in place.
    output: the same list object, for convenience.
    """
    for idx in range(len(tagged)):
        token, tag = tagged[idx]
        if tag in ('', 'unknown'):
            tagged[idx] = (token, 'unknown')
    return tagged


# an example. uncomment the next 3 lines if you want to see it.
#sentence = "This is a sample sentence."
#tagged_words = pos_tagging(sentence)
#tagged_words = replace_unknown_pos(tagged_words)

# Global list storing every word whose part of speech came back as 'unknown'.
# You can choose to add them back or not in later analysis.
unknowns = []

def wash(sentence : str,is_noun = True):
    """Tokenize, POS-tag, lemmatize and stop-word-filter one piece of text.

    input: sentence: the text to clean, usually a str. Missing values are
               passed through as pd.NA; any other non-string scalar is
               converted with str() before processing.
           is_noun: bool. If True only tokens whose tag starts with 'N' are
               kept; if False every token with a known tag is kept.
    output: a list of lemmatized tokens with stop words removed, or pd.NA
            when the input is missing.
    side effect: tokens tagged 'unknown' are prepended to the global `unknowns`.
    """
    global unknowns
    if pd.isna(sentence):
        return pd.NA
    if not isinstance(sentence, str):
        # A numeric cell would otherwise fail on .lower() inside pos_tagging.
        sentence = str(sentence)

    pos_tags = replace_unknown_pos(pos_tagging(sentence))

    # Newest unknown words go to the front of the global list.
    unknown_s = [word for word, pos in pos_tags if pos == 'unknown']
    unknowns = unknown_s + unknowns

    # Keep tokens with a known tag; in noun mode keep only the nouns.
    kept = [
        (word, pos) for word, pos in pos_tags
        if pos != 'unknown' and (not is_noun or pos.startswith('N'))
    ]
    lemmatized_sentence = [
        wn.lemmatize(word, pos=get_wordnet_pos(pos)) for word, pos in kept
    ]
    # Stop words are matched against the lemma, not the original token.
    return [w for w in lemmatized_sentence if w not in stop_words]
    
def convert_time(date_str : str):
    """Extract the first timestamp found in a scraped time string.

    input: date_str: text expected to contain a timestamp shaped like
           'Fri, May 05, 2023, 04:14:08 PM'; whatever follows the AM/PM marker
           (e.g. a time-zone name) is ignored.
    output: a naive datetime for the first match; pd.NA when the input is
            missing or equals the literal '[]'; the input itself, unchanged,
            when no timestamp pattern is found in it.
    """
    if pd.isna(date_str) or date_str == '[]':
        return pd.NA

    found = re.search(r'\w+, \w+ \d+, \d{4}, \d{2}:\d{2}:\d{2} [AP]M', date_str)
    if found is None:
        # No recognizable timestamp: hand the raw value back to the caller.
        return date_str
    return datetime.strptime(found.group(), '%a, %b %d, %Y, %I:%M:%S %p')


In [5]:
# Empty placeholder frame; convert_dataframe() below builds its own local `new_hot`.
new_hot = pd.DataFrame()
# Clear the global list of unknown words that wash() accumulates into.
unknowns = []

In [6]:
#lemmatizing all words
def convert_dataframe(hot, is_noun = True):
    """Build a frame of cleaned token lists plus parsed timestamps from the raw frame.

    input: hot: the raw DataFrame. It must contain an 'accurate_time' column;
               every column other than 'label' and 'accurate_time' is run
               through wash().
           is_noun: bool, forwarded to wash(). Also selects the column suffix:
               '_noun' when True, '_all' when False.
    output: a new DataFrame with one '<prefix><suffix>' column per washed
            column (prefix = the text before the first '_' in the original
            name), a 'datetime' column produced by convert_time(), and a
            'date' column holding 'YYYY-MM-DD' strings (pd.NA when no
            datetime could be parsed).
    """
    new_hot = pd.DataFrame()
    tag = '_noun' if is_noun else '_all'
    for c in hot.columns:
        if c not in ('label', 'accurate_time'):
            new_hot[c.split('_')[0] + tag] = hot[c].apply(wash, is_noun=is_noun)
        elif 'datetime' not in new_hot.columns:
            # Convert the timestamps once, at the position of the first excluded
            # column, instead of recomputing them for both 'label' and 'accurate_time'.
            new_hot['datetime'] = hot['accurate_time'].apply(convert_time)
    # convert_time() may hand back the raw string when nothing matched; only
    # real, non-missing datetimes can be formatted.
    new_hot['date'] = [
        t.strftime("%Y-%m-%d") if isinstance(t, datetime) and not pd.isna(t) else pd.NA
        for t in new_hot['datetime']
    ]
    return new_hot

# Run the conversion twice: nouns only, then every word with a known tag.
new_hot_noun = convert_dataframe(hot, is_noun=True)
new_hot_all = convert_dataframe(hot, is_noun=False)
# Put all three dataframes side by side, then keep only the first occurrence
# of any column name that appears more than once.
concatenated_df = pd.concat((hot, new_hot_noun, new_hot_all), axis='columns')
concatenated_df = concatenated_df.loc[:, ~concatenated_df.columns.duplicated()]

In [7]:
# Reset the order of columns: original text and metadata first, then the
# noun-only tokens, the all-word tokens and finally the parsed timestamps.
# Columns that are not listed here are left out of `reddit`.
reddit = concatenated_df.reindex(
    [
        'title_original', 'content_original', 'comment_original',
        'time', 'accurate_time', 'label',
        'title_noun', 'content_noun', 'comment_noun',
        'title_all', 'content_all', 'comment_all',
        'datetime', 'date',
    ],
    axis='columns',
)
reddit.head()
#storing the data. Uncomment if you need it.
#reddit.to_csv('../data//Reddit.csv')

Unnamed: 0,title_original,content_original,comment_original,time,accurate_time,label,title_noun,content_noun,comment_noun,title_all,content_all,comment_all,datetime,date
0,Think Like a Climate Investor #1 [Series],Good investors (us) should always be modeling ...,Does this understanding of additionality accou...,1 day ago,"['Fri, May 05, 2023, 04:14:08 PM Central Europ...",,"[climate, investor, series]","[investor, future, section, ira, incentive, hy...","[understanding, additionality, account, renewa...","[think, like, climate, investor, 1, series]","[good, investor, u, always, model, possible, f...","[understanding, additionality, account, renewa...",2023-05-05 16:14:08,2023-05-05
1,Scan the globe for the best stocks to invest i...,,,Promoted,,,"[globe, stock, globalanalyst, capital, risk]",,,"[scan, globe, best, stock, invest, globalanaly...",,,,
2,What is the best certificate/masters out there...,I currently work in the sustainability finance...,"\nWhat’s your job now?\nI work in finance, par...",1 day ago,"['Fri, May 05, 2023, 11:37:52 AM Central Europ...",Discussion / Question,"[certificate/masters, credential, finance, car...","[sustainability, finance, industry, cert/maste...","[job, work, finance, project, finance, develop...","[best, certificate/masters, give, credential, ...","[currently, work, sustainability, finance, ind...","[’, job, work, finance, particularly, project,...",2023-05-05 11:37:52,2023-05-05
3,Congrats to those that passed the CESGA March ...,CESGA just announced the results for the CESGA...,"I have no idea how I passed it, but it was a g...",4 days ago,"['Tue, May 02, 2023, 04:45:13 PM Central Europ...",Careers,"[congrats, cesga, march, exam]","[cesga, result, cesga, exam, march, congrats]","[idea, relief, heard, designation, curriculum,...","[congrats, pass, cesga, march, exam]","[cesga, announce, result, cesga, exam, march, ...","[idea, pass, great, relief, haha, ’, never, he...",2023-05-02 16:45:13,2023-05-02
4,Solar and Wind PPA Prices Surge in North Ameri...,,,5 days ago,"['Mon, May 01, 2023, 06:44:27 PM Central Europ...",Markets,"[ppa, price, america, europe, see, price, drop...",,,"[solar, wind, ppa, price, surge, north, americ...",,,2023-05-01 18:44:27,2023-05-01
