In [2]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re, nltk
from nltk.corpus import wordnet
import numpy as np
import pandas as pd

### Dictionaries for cleaning methods

The first dictionary includes contractions and their associated expansion.

In [3]:
Contraction_Dictionary1 = {
    "ain/t": "is not", "aren/t": "are not", "can/t": "can not", "can/t/ve": "can not have", "cause": "because", "could/ve": "could have",
    "couldn/t": "could not", "couldn/t/ve": "could not have", "didn/t": "did not", "doesn/t": "does not", "don/t": "do not", "hadn/t": "had not",
    "hadn/t/ve": "had not have", "hasn/t": "has not", "haven/t": "have not", "he/d": "he would", "he/d/ve": "he would have", "he/ll": "he will",
    "he/ll/ve": "he he will have", "he/s": "he is", "how/d": "how did", "how/d/y": "how do you", "how/ll": "how will", "how/s": "how is",
    "I/d": "I would", "I/d/ve": "I would have", "I/ll": "I will", "I/ll/ve": "I will have", "I/m": "I am", "I/ve": "I have", "i/d": "i would",
    "i/d/ve": "i would have", "i/ll": "i will", "i/ll/ve": "i will have", "i/m": "i am", "i/ve": "i have", "isn/t": "is not", "it/d": "it would",
    "it/d/ve": "it would have", "it/ll": "it will", "it/ll/ve": "it will have", "it/s": "it is", "let/s": "let us", "ma/am": "madam", "mayn/t": "may not",
    "might/ve": "might have", "mightn/t": "might not", "mightn/t/ve": "might not have", "must/ve": "must have", "mustn/t": "must not", "mustn/t/ve": "must not have",
    "needn/t": "need not", "needn/t/ve": "need not have", "o/clock": "of the clock", "oughtn/t": "ought not", "oughtn/t/ve": "ought not have", "shan/t": "shall not",
    "sha/n/t": "shall not", "shan/t/ve": "shall not have", "she/d": "she would", "she/d/ve": "she would have", "she/ll": "she will", "she/ll/ve": "she will have",
    "she/s": "she is", "should/ve": "should have", "shouldn/t": "should not", "shouldn/t/ve": "should not have", "so/ve": "so have", "so/s": "so as",
    "that/d": "that would", "that/d/ve": "that would have", "that/s": "that is", "there/d": "there would", "there/d/ve": "there would have",
    "there/s": "there is", "they/d": "they would", "they/d/ve": "they would have", "they/ll": "they will", "they/ll/ve": "they will have", "they/re": "they are",
    "they/ve": "they have", "to/ve": "to have", "wasn/t": "was not", "we/d": "we would", "we/d/ve": "we would have", "we/ll": "we will", "we/ll/ve": "we will have", 
    "we/re": "we are", "we/ve": "we have", "weren/t": "were not", "what/ll": "what will", "what/ll/ve": "what will have","what/re": "what are", "what/s": "what is", 
    "what/ve": "what have", "when/s": "when is", "when/ve": "when have", "where/d": "where did", "where/s": "where is", "where/ve": "where have",
    "who/ll": "who will", "who/ll/ve": "who will have", "who/s": "who is", "who/ve": "who have", "why/s": "why is", "why/ve": "why have", "will/ve": "will have", 
    "won/t": "will not","won/t/ve": "will not have", "would/ve": "would have", "wouldn/t": "would not", "wouldn/t/ve": "would not have", "y/all": "you all",
    "y/all/d": "you all would", "y/all/d/ve": "you all would have", "y/all/re": "you all are", "y/all/ve": "you all have", "you/d": "you would",
    "you/d/ve": "you would have", "you/ll": "you will", "you/ll/ve": "you will have", "you/re": "you are", "you/ve": "you have"
}


This list contains an edited list of stopwords, with all negation words (e.g. 'no', 'never', 'not') excluded.

In [4]:
stop_words =['i','me','my','myself','we','our','ours','ourselves','you','your','yours','yourself',
            'yourselves','he','him','his','himself','she','her','hers','herself','it','its','itself',
            'they','them','their','theirs','themselves','what','which','who','whom','this','that',
            'these','those','am','is','are','was','were','be','been','being','have','has','had',
            'having','do','does','did','doing','a','an','the','and','but','if','or','because','as',
            'until','while','of','at','by','for','with','about','against','between','into','through',
            'during','before','after','above','below','to','from','up','down','in','out','on','off',
            'over','under','again','further','then','once','here','there','when','where','why','how',
            'all','any','both','each','few','more','most','other','some','such',
            'only','own','same','so','than','too','very','can','will','just','should',
            'now','uses','use','using','used','one','also']

This second list contains the nltk.wordnet labelling convertion for verbs, adjectives, nouns and adverbs. The purpose of this list is to only lemmatize words that are POS (part-of-speech) tagged with these labels.

In [5]:
PosList =["JJ","JJR","JJS","NN","NNS","NNP","NNPS","RB",
          "RBR","RBS","VB","VBD","VBG","VBN","VBP","VBZ"]

This second dictionary uses the POS tag label as a key to refer to the root/lemma of a word. The purpose of this is to identify words with these POS tags and lemmatize them to their root lemma. E.g. 'running' --> 'run'

In [6]:
PosMapper = {
"JJ": wordnet.ADJ,
"JJR": wordnet.ADJ,
"JJS": wordnet.ADJ,
"NN": wordnet.NOUN,
"NNS": wordnet.NOUN,
"NNP": wordnet.NOUN,
"NNPS": wordnet.NOUN,
"RB": wordnet.ADV,
"RBR": wordnet.ADV,
"RBS": wordnet.ADV,
"VB": wordnet.VERB,
"VBD": wordnet.VERB,
"VBG": wordnet.VERB,
"VBN": wordnet.VERB,
"VBP": wordnet.VERB,
"VBZ": wordnet.VERB}

In [7]:
#Innitialize the lemmatizer
lemmatizer = WordNetLemmatizer()

### Text normalization/standardization method 

In [8]:
# This method normalizes the text into a coherent format for matching
def standardize_text(df, text_field):
    df[text_field] = df[text_field].str.lower() # Convert to lowercase
    df[text_field] = df[text_field].str.replace('http','') # removing urls is useful to make vocabulary small as possible
    df[text_field] = df[text_field].str.replace('com', '') # same as above.
    df[text_field] = df[text_field].str.replace(r"@\S+", " ")
    df[text_field] = df[text_field].str.replace(r"[^A-Za-z0-9()$,!?@\`\"\'\_\n]", " ")
    df[text_field] = df[text_field].str.replace(r"@", "at") #  replacing at sign for a word
    df[text_field] = df[text_field].str.replace(".", " ")
    df[text_field] = df[text_field].str.replace(",", " ")
    df[text_field] = df[text_field].str.replace("-", " ")
    df[text_field] = df[text_field].str.replace("(", " ")
    df[text_field] = df[text_field].str.replace(")", " ")
    df[text_field] = df[text_field].str.replace('"', " ")
    df[text_field] = df[text_field].str.replace("?", "")
    df[text_field] = df[text_field].str.replace("!", "")
    return df

### Contractions Expansion Prep
In the data, contraction words such as wouldn't are noted as 'wouldn`t' ` which is a different character to the normal apostrophe. Therefore each instance is changed to a '/' in order to match contractions to the contraction dictionary equivalent.

In [9]:
# This method strips the ` and changes is to / in order to match contractions.
def contractionPrep(df, text_field):
    df[text_field] = df[text_field].str.lstrip(' ')
    df[text_field] = df[text_field].str.replace("'", '/')
    return df

### Contraction Expansion Method 

In [10]:
# This method expands all contractions to their original format
def expand_contractions(text, contraction_mapping=Contraction_Dictionary1):
    
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), 
                                      flags=re.IGNORECASE|re.DOTALL)
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match)                       
        expanded_contraction = first_char+expanded_contraction[1:]
        return expanded_contraction
        
    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

### Tokenisation Method 

In [11]:
def tokenizer(x):
    listOfTokens = []
    for text in x:
        text = str(text)
        text = word_tokenize(text)
        listOfTokens.append(text)
    return listOfTokens

### Singleletter removal Method
After an initial inspection into word frequency, single letter words were very frequent and didnt seem to contribute much semantic meaning the tweets, so were therefore removed.

In [12]:
def singleLetterRemoval(list_object):
    listOfTokens = []
    for tweet in list_object:
        temp = []
        for word in tweet:
            if len(word) > 1:
                temp.append(word)
        listOfTokens.append(temp)
    return listOfTokens

### Number Removal Method
Similarly to single letters, numbers dont contribute much meaning to the polarity of a tweet and so therefore removed.

In [13]:
def numberRemoval(list_object):
    listOfTokens = []
    for tweet in list_object:
        temp = []
        for word in tweet:
            if not word.isnumeric():
                temp.append(word)
        listOfTokens.append(temp)
    return listOfTokens

### Stopword Removal Method
Stopwords are the most frequent words in the corpus and only create noise for the classifier so were therefore removed.

In [14]:
def stopwordRemoval(list_object):
    listOfTokens = []
    for tweet in list_object:
        temp = []
        for word in tweet:
            if not word in stop_words:
                temp.append(word)
        listOfTokens.append(temp)
    return listOfTokens

### Lemmatization Method
Calls the Pos list and dictionary to return certain words into their root lemma format.

In [15]:
def lemma(list_object):
    tags = []
    for words in list_object:
        posTupples = nltk.pos_tag(words)
        text = [lemmatizer.lemmatize(k[0], pos=PosMapper.get(k[1])) if k[1] in PosList else k[0] for k in posTupples]
        tags.append(text)
    return tags

### Append securities 

In [16]:
def appendSecurities(list_object):
    listOfSecurities = []
    for tweet in list_object:
        temp = []
        sentence = tweet.split()
        for word in sentence:
            if re.fullmatch(r'\$[A-Z]{2,3}', word):
                temp.append(word)
        listOfSecurities.append(temp)
    return listOfSecurities

### Remove securities from content column

In [17]:
def removeSecurities(list_object):
    sentence_without_dollar_signs = []
    for tweet in list_object:
        temp = []
        sentence = tweet.split()
        for word in sentence:
            if not re.fullmatch(r'\$[A-Z]{2,3}', word):
                temp.append(word)
        sentence_without_dollar_signs.append(' '.join(temp))
    return sentence_without_dollar_signs

### Read in dataset and clean it.
In order to compare the raw dataset to the cleaned version, two datasets are created. 

In [None]:
# Read in data to clean
data = pd.read_csv(r'..\NLP_Web_Scraping\data\raw\scrapedtweets.csv')
data.head(3)

### Now remove securities from the dataset and add them to a new column

Use remove securities function from earlier and apply to the content column in the dataset. Then output to a new column.

In [20]:
#securities added to one column

data['content'] = data['content'].astype('str')
data['securities'] = appendSecurities(data['content'])
data.head(3)

Unnamed: 0,user,message_id,sentiment,content,date,time,securities
0,babybounce,/babybounce/message/226382374,Bullish,$BA travel going green bullish $CCL $RCL $NCLH...,09/07/2020,12:21:03,"[$BA, $CCL, $RCL, $SPY]"
1,L1_Trading,/L1_Trading/message/226381562,Bullish,$SPY let’s go mooning today,09/07/2020,12:21:03,[$SPY]
2,Economist4401,/Economist4401/message/226381511,Bearish,$SPY $SPX $DJIA $DIA $QQQ Analysts on US stock...,09/07/2020,12:21:03,"[$SPY, $SPX, $DIA, $QQQ]"


In [21]:
#use the content column to remove any securities from the tweet and add it to a new column
data['tweet text'] = removeSecurities(data['content'])
data.head(3)

Unnamed: 0,user,message_id,sentiment,content,date,time,securities,tweet text
0,babybounce,/babybounce/message/226382374,Bullish,$BA travel going green bullish $CCL $RCL $NCLH...,09/07/2020,12:21:03,"[$BA, $CCL, $RCL, $SPY]",travel going green bullish $NCLH
1,L1_Trading,/L1_Trading/message/226381562,Bullish,$SPY let’s go mooning today,09/07/2020,12:21:03,[$SPY],let’s go mooning today
2,Economist4401,/Economist4401/message/226381511,Bearish,$SPY $SPX $DJIA $DIA $QQQ Analysts on US stock...,09/07/2020,12:21:03,"[$SPY, $SPX, $DIA, $QQQ]",$DJIA Analysts on US stock markets: 1. On Mond...


In [22]:
data['tokens'] = data['tweet text']
data.head(5)

Unnamed: 0,user,message_id,sentiment,content,date,time,securities,tweet text,tokens
0,babybounce,/babybounce/message/226382374,Bullish,$BA travel going green bullish $CCL $RCL $NCLH...,09/07/2020,12:21:03,"[$BA, $CCL, $RCL, $SPY]",travel going green bullish $NCLH,travel going green bullish $NCLH
1,L1_Trading,/L1_Trading/message/226381562,Bullish,$SPY let’s go mooning today,09/07/2020,12:21:03,[$SPY],let’s go mooning today,let’s go mooning today
2,Economist4401,/Economist4401/message/226381511,Bearish,$SPY $SPX $DJIA $DIA $QQQ Analysts on US stock...,09/07/2020,12:21:03,"[$SPY, $SPX, $DIA, $QQQ]",$DJIA Analysts on US stock markets: 1. On Mond...,$DJIA Analysts on US stock markets: 1. On Mond...
3,OkieOkie,/OkieOkie/message/226381256,Bearish,$SPY more China. China wants some of Australia...,09/07/2020,12:21:03,[$SPY],more China. China wants some of Australia lol🦘🦘🦘🦘,more China. China wants some of Australia lol🦘🦘🦘🦘
4,risksavage_inthemarket,/risksavage_inthemarket/message/226381105,Bullish,$GNLN $CGC $SPY $KERN $PM “What Does The Insti...,09/07/2020,12:21:03,"[$CGC, $SPY, $PM]",$GNLN $KERN “What Does The Institutional Owner...,$GNLN $KERN “What Does The Institutional Owner...


In [23]:
# Standardize Text

data = standardize_text(data,'tokens')

data.head(5)

Unnamed: 0,user,message_id,sentiment,content,date,time,securities,tweet text,tokens
0,babybounce,/babybounce/message/226382374,Bullish,$BA travel going green bullish $CCL $RCL $NCLH...,09/07/2020,12:21:03,"[$BA, $CCL, $RCL, $SPY]",travel going green bullish $NCLH,travel going green bullish $nclh
1,L1_Trading,/L1_Trading/message/226381562,Bullish,$SPY let’s go mooning today,09/07/2020,12:21:03,[$SPY],let’s go mooning today,let s go mooning today
2,Economist4401,/Economist4401/message/226381511,Bearish,$SPY $SPX $DJIA $DIA $QQQ Analysts on US stock...,09/07/2020,12:21:03,"[$SPY, $SPX, $DIA, $QQQ]",$DJIA Analysts on US stock markets: 1. On Mond...,$djia analysts on us stock markets 1 on mond...
3,OkieOkie,/OkieOkie/message/226381256,Bearish,$SPY more China. China wants some of Australia...,09/07/2020,12:21:03,[$SPY],more China. China wants some of Australia lol🦘🦘🦘🦘,more china china wants some of australia lol
4,risksavage_inthemarket,/risksavage_inthemarket/message/226381105,Bullish,$GNLN $CGC $SPY $KERN $PM “What Does The Insti...,09/07/2020,12:21:03,"[$CGC, $SPY, $PM]",$GNLN $KERN “What Does The Institutional Owner...,$gnln $kern what does the institutional owner...


### Expand contractions
Only the dataset that is being cleaned calls these methods.

In [24]:
# Get data ready for Contraction Expansion
data = contractionPrep(data,'tokens')

In [25]:
# Expand Contractions
cleanedData = [expand_contractions(str(tweet)) for tweet in data['tokens']]
data['tokens'] = cleanedData
data.head(5)

Unnamed: 0,user,message_id,sentiment,content,date,time,securities,tweet text,tokens
0,babybounce,/babybounce/message/226382374,Bullish,$BA travel going green bullish $CCL $RCL $NCLH...,09/07/2020,12:21:03,"[$BA, $CCL, $RCL, $SPY]",travel going green bullish $NCLH,travel going green bullish $nclh
1,L1_Trading,/L1_Trading/message/226381562,Bullish,$SPY let’s go mooning today,09/07/2020,12:21:03,[$SPY],let’s go mooning today,let s go mooning today
2,Economist4401,/Economist4401/message/226381511,Bearish,$SPY $SPX $DJIA $DIA $QQQ Analysts on US stock...,09/07/2020,12:21:03,"[$SPY, $SPX, $DIA, $QQQ]",$DJIA Analysts on US stock markets: 1. On Mond...,$djia analysts on us stock markets 1 on mond...
3,OkieOkie,/OkieOkie/message/226381256,Bearish,$SPY more China. China wants some of Australia...,09/07/2020,12:21:03,[$SPY],more China. China wants some of Australia lol🦘🦘🦘🦘,more china china wants some of australia lol
4,risksavage_inthemarket,/risksavage_inthemarket/message/226381105,Bullish,$GNLN $CGC $SPY $KERN $PM “What Does The Insti...,09/07/2020,12:21:03,"[$CGC, $SPY, $PM]",$GNLN $KERN “What Does The Institutional Owner...,$gnln $kern what does the institutional owner...


After inspecting the results there were still some square brackets remaining as part of some words so these needed to be removed.

In [26]:
#strip remaining / from data
data['tokens'] = data['tokens'].str.replace('/', '')

### Tokenize Data
This is applied to both datasets.

In [27]:
# Tokenize Data
tweets = data['tokens'].tolist()
tokenizedData = tokenizer(tweets)
data['tokens'] = tokenizedData


In [28]:
data.head(2)

Unnamed: 0,user,message_id,sentiment,content,date,time,securities,tweet text,tokens
0,babybounce,/babybounce/message/226382374,Bullish,$BA travel going green bullish $CCL $RCL $NCLH...,09/07/2020,12:21:03,"[$BA, $CCL, $RCL, $SPY]",travel going green bullish $NCLH,"[travel, going, green, bullish, $, nclh]"
1,L1_Trading,/L1_Trading/message/226381562,Bullish,$SPY let’s go mooning today,09/07/2020,12:21:03,[$SPY],let’s go mooning today,"[let, s, go, mooning, today]"


### Call rest of cleaning methods on the dataset that is being cleaned.

In [29]:
#single letter removal:

tweetData = data['tokens'].tolist()
slRemoved = singleLetterRemoval(tweetData)
data['tokens'] = slRemoved
data.head(4)

Unnamed: 0,user,message_id,sentiment,content,date,time,securities,tweet text,tokens
0,babybounce,/babybounce/message/226382374,Bullish,$BA travel going green bullish $CCL $RCL $NCLH...,09/07/2020,12:21:03,"[$BA, $CCL, $RCL, $SPY]",travel going green bullish $NCLH,"[travel, going, green, bullish, nclh]"
1,L1_Trading,/L1_Trading/message/226381562,Bullish,$SPY let’s go mooning today,09/07/2020,12:21:03,[$SPY],let’s go mooning today,"[let, go, mooning, today]"
2,Economist4401,/Economist4401/message/226381511,Bearish,$SPY $SPX $DJIA $DIA $QQQ Analysts on US stock...,09/07/2020,12:21:03,"[$SPY, $SPX, $DIA, $QQQ]",$DJIA Analysts on US stock markets: 1. On Mond...,"[djia, analysts, on, us, stock, markets, on, m..."
3,OkieOkie,/OkieOkie/message/226381256,Bearish,$SPY more China. China wants some of Australia...,09/07/2020,12:21:03,[$SPY],more China. China wants some of Australia lol🦘🦘🦘🦘,"[more, china, china, wants, some, of, australi..."


In [30]:
#number removal
# tweetData = data['tweet text'].tolist()
# nRemoved = numberRemoval(tweetData)
# data['tweet text'] = nRemoved

tweetData = data['tokens'].tolist()
nRemoved = numberRemoval(tweetData)
data['tokens'] = nRemoved
data.head(4)

Unnamed: 0,user,message_id,sentiment,content,date,time,securities,tweet text,tokens
0,babybounce,/babybounce/message/226382374,Bullish,$BA travel going green bullish $CCL $RCL $NCLH...,09/07/2020,12:21:03,"[$BA, $CCL, $RCL, $SPY]",travel going green bullish $NCLH,"[travel, going, green, bullish, nclh]"
1,L1_Trading,/L1_Trading/message/226381562,Bullish,$SPY let’s go mooning today,09/07/2020,12:21:03,[$SPY],let’s go mooning today,"[let, go, mooning, today]"
2,Economist4401,/Economist4401/message/226381511,Bearish,$SPY $SPX $DJIA $DIA $QQQ Analysts on US stock...,09/07/2020,12:21:03,"[$SPY, $SPX, $DIA, $QQQ]",$DJIA Analysts on US stock markets: 1. On Mond...,"[djia, analysts, on, us, stock, markets, on, m..."
3,OkieOkie,/OkieOkie/message/226381256,Bearish,$SPY more China. China wants some of Australia...,09/07/2020,12:21:03,[$SPY],more China. China wants some of Australia lol🦘🦘🦘🦘,"[more, china, china, wants, some, of, australi..."


In [31]:
import pandas as pd

data['num_of_tokens'] = data['tokens'].apply(lambda x: len(x))

data.head(10)

Unnamed: 0,user,message_id,sentiment,content,date,time,securities,tweet text,tokens,num_of_tokens
0,babybounce,/babybounce/message/226382374,Bullish,$BA travel going green bullish $CCL $RCL $NCLH...,09/07/2020,12:21:03,"[$BA, $CCL, $RCL, $SPY]",travel going green bullish $NCLH,"[travel, going, green, bullish, nclh]",5
1,L1_Trading,/L1_Trading/message/226381562,Bullish,$SPY let’s go mooning today,09/07/2020,12:21:03,[$SPY],let’s go mooning today,"[let, go, mooning, today]",4
2,Economist4401,/Economist4401/message/226381511,Bearish,$SPY $SPX $DJIA $DIA $QQQ Analysts on US stock...,09/07/2020,12:21:03,"[$SPY, $SPX, $DIA, $QQQ]",$DJIA Analysts on US stock markets: 1. On Mond...,"[djia, analysts, on, us, stock, markets, on, m...",59
3,OkieOkie,/OkieOkie/message/226381256,Bearish,$SPY more China. China wants some of Australia...,09/07/2020,12:21:03,[$SPY],more China. China wants some of Australia lol🦘🦘🦘🦘,"[more, china, china, wants, some, of, australi...",8
4,risksavage_inthemarket,/risksavage_inthemarket/message/226381105,Bullish,$GNLN $CGC $SPY $KERN $PM “What Does The Insti...,09/07/2020,12:21:03,"[$CGC, $SPY, $PM]",$GNLN $KERN “What Does The Institutional Owner...,"[gnln, kern, what, does, the, institutional, o...",82
5,HeyYouWhoMe,/HeyYouWhoMe/message/226381022,Bullish,$SPY up or down today,09/07/2020,12:21:03,[$SPY],up or down today,"[up, or, down, today]",4
6,KaroleinTriedToTrade,/KaroleinTriedToTrade/message/226380585,Bullish,$SPY I hope this goes up so high,09/07/2020,12:21:03,[$SPY],I hope this goes up so high,"[hope, this, goes, up, so, high]",6
7,DannETrader,/DannETrader/message/226380472,Bearish,$SPY yesterday was last day of FED repo’s. It ...,09/07/2020,12:21:03,[$SPY],yesterday was last day of FED repo’s. It isn’t...,"[yesterday, was, last, day, of, fed, repo, it,...",12
8,shoaibfatima,/shoaibfatima/message/226380359,Bullish,$SPY get in before the pump starts,09/07/2020,12:21:03,[$SPY],get in before the pump starts,"[get, in, before, the, pump, starts]",6
9,Burrrr_time,/Burrrr_time/message/226379857,Bullish,$SPY fed buying stock !!! Ath,09/07/2020,12:21:03,[$SPY],fed buying stock !!! Ath,"[fed, buying, stock, ath]",4


In [32]:
#stopword removal
tweetData = data['tokens'].tolist()
noiseRemoved = stopwordRemoval(tweetData)
data['tokens'] = noiseRemoved
data.head(4)

Unnamed: 0,user,message_id,sentiment,content,date,time,securities,tweet text,tokens,num_of_tokens
0,babybounce,/babybounce/message/226382374,Bullish,$BA travel going green bullish $CCL $RCL $NCLH...,09/07/2020,12:21:03,"[$BA, $CCL, $RCL, $SPY]",travel going green bullish $NCLH,"[travel, going, green, bullish, nclh]",5
1,L1_Trading,/L1_Trading/message/226381562,Bullish,$SPY let’s go mooning today,09/07/2020,12:21:03,[$SPY],let’s go mooning today,"[let, go, mooning, today]",4
2,Economist4401,/Economist4401/message/226381511,Bearish,$SPY $SPX $DJIA $DIA $QQQ Analysts on US stock...,09/07/2020,12:21:03,"[$SPY, $SPX, $DIA, $QQQ]",$DJIA Analysts on US stock markets: 1. On Mond...,"[djia, analysts, us, stock, markets, monday, b...",59
3,OkieOkie,/OkieOkie/message/226381256,Bearish,$SPY more China. China wants some of Australia...,09/07/2020,12:21:03,[$SPY],more China. China wants some of Australia lol🦘🦘🦘🦘,"[china, china, wants, australia, lol]",8


### Lemmatize the cleaned Dataset 

In [33]:
#Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

#Lemmatize Data
tweetData = data['tokens'].tolist()
lemmatizedData = lemma(tweetData)
data['tokens_in_transformed_text'] = lemmatizedData
data.head(5)

Unnamed: 0,user,message_id,sentiment,content,date,time,securities,tweet text,tokens,num_of_tokens,tokens_in_transformed_text
0,babybounce,/babybounce/message/226382374,Bullish,$BA travel going green bullish $CCL $RCL $NCLH...,09/07/2020,12:21:03,"[$BA, $CCL, $RCL, $SPY]",travel going green bullish $NCLH,"[travel, going, green, bullish, nclh]",5,"[travel, go, green, bullish, nclh]"
1,L1_Trading,/L1_Trading/message/226381562,Bullish,$SPY let’s go mooning today,09/07/2020,12:21:03,[$SPY],let’s go mooning today,"[let, go, mooning, today]",4,"[let, go, moon, today]"
2,Economist4401,/Economist4401/message/226381511,Bearish,$SPY $SPX $DJIA $DIA $QQQ Analysts on US stock...,09/07/2020,12:21:03,"[$SPY, $SPX, $DIA, $QQQ]",$DJIA Analysts on US stock markets: 1. On Mond...,"[djia, analysts, us, stock, markets, monday, b...",59,"[djia, analyst, us, stock, market, monday, bla..."
3,OkieOkie,/OkieOkie/message/226381256,Bearish,$SPY more China. China wants some of Australia...,09/07/2020,12:21:03,[$SPY],more China. China wants some of Australia lol🦘🦘🦘🦘,"[china, china, wants, australia, lol]",8,"[china, china, want, australia, lol]"
4,risksavage_inthemarket,/risksavage_inthemarket/message/226381105,Bullish,$GNLN $CGC $SPY $KERN $PM “What Does The Insti...,09/07/2020,12:21:03,"[$CGC, $SPY, $PM]",$GNLN $KERN “What Does The Institutional Owner...,"[gnln, kern, institutional, ownership, tell, u...",82,"[gnln, kern, institutional, ownership, tell, u..."


### See what the data looks like:

In [34]:
data.head(5) # cleanDataset

Unnamed: 0,user,message_id,sentiment,content,date,time,securities,tweet text,tokens,num_of_tokens,tokens_in_transformed_text
0,babybounce,/babybounce/message/226382374,Bullish,$BA travel going green bullish $CCL $RCL $NCLH...,09/07/2020,12:21:03,"[$BA, $CCL, $RCL, $SPY]",travel going green bullish $NCLH,"[travel, going, green, bullish, nclh]",5,"[travel, go, green, bullish, nclh]"
1,L1_Trading,/L1_Trading/message/226381562,Bullish,$SPY let’s go mooning today,09/07/2020,12:21:03,[$SPY],let’s go mooning today,"[let, go, mooning, today]",4,"[let, go, moon, today]"
2,Economist4401,/Economist4401/message/226381511,Bearish,$SPY $SPX $DJIA $DIA $QQQ Analysts on US stock...,09/07/2020,12:21:03,"[$SPY, $SPX, $DIA, $QQQ]",$DJIA Analysts on US stock markets: 1. On Mond...,"[djia, analysts, us, stock, markets, monday, b...",59,"[djia, analyst, us, stock, market, monday, bla..."
3,OkieOkie,/OkieOkie/message/226381256,Bearish,$SPY more China. China wants some of Australia...,09/07/2020,12:21:03,[$SPY],more China. China wants some of Australia lol🦘🦘🦘🦘,"[china, china, wants, australia, lol]",8,"[china, china, want, australia, lol]"
4,risksavage_inthemarket,/risksavage_inthemarket/message/226381105,Bullish,$GNLN $CGC $SPY $KERN $PM “What Does The Insti...,09/07/2020,12:21:03,"[$CGC, $SPY, $PM]",$GNLN $KERN “What Does The Institutional Owner...,"[gnln, kern, institutional, ownership, tell, u...",82,"[gnln, kern, institutional, ownership, tell, u..."


In [35]:
import pandas as pd

data['num_of_tokens_in_transformed_text'] = data['tokens_in_transformed_text'].apply(lambda x: len(x))

data.head(10)

Unnamed: 0,user,message_id,sentiment,content,date,time,securities,tweet text,tokens,num_of_tokens,tokens_in_transformed_text,num_of_tokens_in_transformed_text
0,babybounce,/babybounce/message/226382374,Bullish,$BA travel going green bullish $CCL $RCL $NCLH...,09/07/2020,12:21:03,"[$BA, $CCL, $RCL, $SPY]",travel going green bullish $NCLH,"[travel, going, green, bullish, nclh]",5,"[travel, go, green, bullish, nclh]",5
1,L1_Trading,/L1_Trading/message/226381562,Bullish,$SPY let’s go mooning today,09/07/2020,12:21:03,[$SPY],let’s go mooning today,"[let, go, mooning, today]",4,"[let, go, moon, today]",4
2,Economist4401,/Economist4401/message/226381511,Bearish,$SPY $SPX $DJIA $DIA $QQQ Analysts on US stock...,09/07/2020,12:21:03,"[$SPY, $SPX, $DIA, $QQQ]",$DJIA Analysts on US stock markets: 1. On Mond...,"[djia, analysts, us, stock, markets, monday, b...",59,"[djia, analyst, us, stock, market, monday, bla...",38
3,OkieOkie,/OkieOkie/message/226381256,Bearish,$SPY more China. China wants some of Australia...,09/07/2020,12:21:03,[$SPY],more China. China wants some of Australia lol🦘🦘🦘🦘,"[china, china, wants, australia, lol]",8,"[china, china, want, australia, lol]",5
4,risksavage_inthemarket,/risksavage_inthemarket/message/226381105,Bullish,$GNLN $CGC $SPY $KERN $PM “What Does The Insti...,09/07/2020,12:21:03,"[$CGC, $SPY, $PM]",$GNLN $KERN “What Does The Institutional Owner...,"[gnln, kern, institutional, ownership, tell, u...",82,"[gnln, kern, institutional, ownership, tell, u...",47
5,HeyYouWhoMe,/HeyYouWhoMe/message/226381022,Bullish,$SPY up or down today,09/07/2020,12:21:03,[$SPY],up or down today,[today],4,[today],1
6,KaroleinTriedToTrade,/KaroleinTriedToTrade/message/226380585,Bullish,$SPY I hope this goes up so high,09/07/2020,12:21:03,[$SPY],I hope this goes up so high,"[hope, goes, high]",6,"[hope, go, high]",3
7,DannETrader,/DannETrader/message/226380472,Bearish,$SPY yesterday was last day of FED repo’s. It ...,09/07/2020,12:21:03,[$SPY],yesterday was last day of FED repo’s. It isn’t...,"[yesterday, last, day, fed, repo, isn, fed, an...",12,"[yesterday, last, day, feed, repo, isn, feed, ...",8
8,shoaibfatima,/shoaibfatima/message/226380359,Bullish,$SPY get in before the pump starts,09/07/2020,12:21:03,[$SPY],get in before the pump starts,"[get, pump, starts]",6,"[get, pump, start]",3
9,Burrrr_time,/Burrrr_time/message/226379857,Bullish,$SPY fed buying stock !!! Ath,09/07/2020,12:21:03,[$SPY],fed buying stock !!! Ath,"[fed, buying, stock, ath]",4,"[fed, buying, stock, ath]",4


### Export dataframes to csv files: 

In [37]:
#Export to CSV
data.to_csv(r'..\NLP_Web_Scraping\notebooks\12-cleaning-raw-dataset\Datasets\cleanedData.csv', index=False)

FileNotFoundError: [Errno 2] No such file or directory: '..\\NLP_Web_Scraping\\notebooks\\12-cleaning-raw-dataset\\Datasets\\cleanedData.csv'