In [1]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
import nltk
#necessary tools
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
import numpy as np
import string
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pickle

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/qianyuyang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/qianyuyang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/qianyuyang/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/qianyuyang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
#opening file, please change directory
twitter = pd.read_csv('../data/twitter_raw.csv')

In [3]:
twitter.head()

Unnamed: 0.1,Unnamed: 0,time,content,search_key
0,0,2023-06-01T14:04:44.000Z,""" #EuropeanCommission Responds to ESA's Quest...","""sustainablefinance""AND""definition"""
1,1,2023-06-01T14:04:44.000Z,Sie werden keinen normalen Rucksack mehr trage...,"""sustainablefinance""AND""definition"""
2,2,2023-06-04T22:00:27.000Z,Definitions of key sustainable investment conc...,"""sustainablefinance""AND""definition"""
3,3,2023-06-12T20:00:21.000Z,link. \n\nReal estate groups are calling for c...,"""sustainablefinance""AND""definition"""
4,4,2023-06-01T10:30:52.000Z,Common definition of #greenwashing published...,"""sustainablefinance""AND""definition"""


In [4]:
#generating stopwords
stop_words_2 = set(stopwords.words('English'))
ps = PorterStemmer()
wn = WordNetLemmatizer()
punctuation_set = set(string.punctuation)
customized_stopwords = set(['hi','hello','hey'])
stop_words = stop_words_2 | punctuation_set | customized_stopwords
#storing stop words list. Uncomment the next line if you need it.
#pd.Series(list(stop_words)).to_csv('./stop_words.csv')

In [5]:
#input: treebank_tag: a word
#output: a woednet object, will be used later in lemmatizing
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN  #

In [6]:
#input: sentence: a sentence, usually an str; lower: bool. set it True if you wish to lower the sentence.
#output: a list. Each element inside is a tuple containing a word and its part of speech.

def pos_tagging(sentence,lower = True):
    if lower:
        tokens = nltk.word_tokenize(sentence.lower())  # tokenizing
    else:
        tokens = nltk.word_tokenize(sentence)
    tagged = nltk.pos_tag(tokens)  # part-of-speech tagging
    return tagged


#there could be some words that are not recognized. In this case their part-of-speech would be unknown or ''(empty).
#These tags need to be standardized
def replace_unknown_pos(tagged):
    for i, (word, pos) in enumerate(tagged):
        if pos == '' or pos == 'unknown':
            tagged[i] = (word, 'unknown')
    return tagged

# an example. uncomment the next 3 lines if you want to see it.
#sentence = "This is a sample sentence."
#tagged_words = pos_tagging(sentence)
#tagged_words = replace_unknown_pos(tagged_words)



In [7]:
unknowns = []
#input: sentence: a sentence, usually an str; noun_only: bool, set it True if you wish only work with nouns in the sencence.
#output: lemmatized and tokenized and filtered(if noun_only == True) sentences, stored in a list.
#the list unknowns is global, storing all unknown words. You can choose to add them back or not in later analysis.
def wash(sentence : str,noun_only = True):
    global unknowns
    if not pd.isna(sentence):
        pos_tags = pos_tagging(sentence)
        pos_tags = replace_unknown_pos(pos_tags)
        unknown_s =[word for word,pos in pos_tags if pos == 'unknown'] 
        unknowns = unknown_s + unknowns
        known_s = [(word,pos) for word,pos in pos_tags if not pos == 'unknown']

        #If we are working with nouns or not.
        if noun_only:
            noun = [(word,pos) for word,pos in known_s if pos.startswith('N')]
            lemmatized_sentence = [wn.lemmatize(w,pos = get_wordnet_pos(pos_tag)) for w,pos_tag in noun]
        else:
            lemmatized_sentence = [wn.lemmatize(w,pos = get_wordnet_pos(pos_tag)) for w,pos_tag in known_s]
        filtered_sentence =  [w for w in lemmatized_sentence if not w in stop_words]
        return filtered_sentence
    return pd.NA
    


In [8]:
twitter = twitter.reset_index().drop(columns = ['index'])
#drop index numbers and reset a new one.

In [9]:
#extracting tag
twitter['hash_tags'] = twitter['content'].str.extract(r'#(\w+)\s')
#Changeing time format
#at position 189 there is a date stored in a different format, need to change it.
twitter.time[189] = '2013-05-30'
twitter['datetime'] = pd.to_datetime(twitter['time'])
twitter['date'] = [i.strftime('%Y-%m-%d') for i in twitter.datetime]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  twitter.time[189] = '2013-05-30'


In [10]:
new_twitter = twitter

In [11]:
#tokenizing and lemmatizing sentences
new_twitter['content_all'] = [wash(i,False) for i in twitter['content']]
new_twitter['content_noun'] = [wash(i,noun_only = True) for i in twitter['content']]

In [12]:
new_twitter.head()
#our washed data

Unnamed: 0.1,Unnamed: 0,time,content,search_key,hash_tags,datetime,date,content_all,content_noun
0,0,2023-06-01T14:04:44.000Z,""" #EuropeanCommission Responds to ESA's Quest...","""sustainablefinance""AND""definition""",EuropeanCommission,2023-06-01 14:04:44+00:00,2023-06-01,"[``, europeancommission, respond, esa, 's, que...","[europeancommission, question, interpretation,..."
1,1,2023-06-01T14:04:44.000Z,Sie werden keinen normalen Rucksack mehr trage...,"""sustainablefinance""AND""definition""",,2023-06-01 14:04:44+00:00,2023-06-01,"[sie, werden, keinen, normalen, rucksack, mehr...","[sie, keinen, rucksack, mehr, tragen, liebe, d..."
2,2,2023-06-04T22:00:27.000Z,Definitions of key sustainable investment conc...,"""sustainablefinance""AND""definition""",sustainablefinance,2023-06-04 22:00:27+00:00,2023-06-04,"[definition, key, sustainable, investment, con...","[definition, investment, concept, report, fram..."
3,3,2023-06-12T20:00:21.000Z,link. \n\nReal estate groups are calling for c...,"""sustainablefinance""AND""definition""",ESG,2023-06-12 20:00:21+00:00,2023-06-12,"[link, real, estate, group, call, change, 'con...","[link, estate, group, change, sfdr, rule, defi..."
4,4,2023-06-01T10:30:52.000Z,Common definition of #greenwashing published...,"""sustainablefinance""AND""definition""",greenwashing,2023-06-01 10:30:52+00:00,2023-06-01,"[common, definition, greenwashing, publish, es...","[definition, esas, sustainablefinance, eutaxon..."


In [13]:
#uncomment the next line if you wish to store it.
#new_twitter.to_csv('../data/new_twitter.csv')