In [1]:
import pandas as pd
import numpy as np
import re

#import natural language processing toolkit libraries
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords as stopwords
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer

import emoji
from string import punctuation
from ast import literal_eval

In [2]:
# Read the labelled tweets from disk and preview the first row to check the columns
df = pd.read_csv('Test Data.csv')
df.head(1)

Unnamed: 0,Label,Unnamed: 1,user,date,text,favorite_count,hashtags
0,2,7683,_DPone_,Fri May 08 17:40:59 +0000 2020,#Markets #Stocks $SPX $SPY $ES_F $IXIC $DJI $QQQ,0,"[{'text': 'Markets', 'indices': [0, 8]}, {'tex..."


In [3]:
# Discard the columns that are not needed for cleaning
df = df.drop(columns=['Unnamed: 1', 'user', 'date'])

# Initial Cleanup

In [4]:
#remove blank tweets
df = df.dropna(subset=['text'])

#filter to only cashtags or money
#'$' is matched literally (regex=False) rather than through the non-raw '\$' escape,
#and .copy() gives the column assignment below an independent frame to write to
df = df[df['text'].str.contains('$', regex=False)].copy()

#clean up ampersands
#regex=True is explicit so that case=False is honoured whatever the pandas default is
df['text'] = df['text'].str.replace('&amp;', '&', case=False, regex=True)

#Remove retweets
#\b keeps tweets whose first word merely starts with "RT" (e.g. "RTX ...")
df = df[~df['text'].str.contains(r'^RT\b', regex=True)].copy()   # ~ (tilde) is INVERT - does the opposite

In [5]:
#Extract Emojis
def extract_emojis(abc):
    """Return the emoji found in ``abc``, concatenated in order of appearance.

    Parameters
    ----------
    abc : str
        Text to scan.

    Returns
    -------
    str
        The emoji from ``abc`` joined together, or '' when there are none.
    """
    # Newer emoji releases no longer provide UNICODE_EMOJI; emoji_list() is
    # used when available and returns whole emoji (including multi-codepoint
    # ones) rather than single characters.
    if hasattr(emoji, 'emoji_list'):
        return ''.join(found['emoji'] for found in emoji.emoji_list(abc))
    # Older releases: per-character lookup. UNICODE_EMOJI_ENGLISH is preferred
    # when present because UNICODE_EMOJI may be keyed by language there.
    table = getattr(emoji, 'UNICODE_EMOJI_ENGLISH', None) or emoji.UNICODE_EMOJI
    return ''.join(c for c in abc if c in table)

# One string of emoji per tweet
df['emoji'] = df['text'].map(extract_emojis)

In [6]:
#Extract Cashtags
cashtagsrx = re.compile(r'\$[A-Z]{2,}')
#findall gives a (possibly empty) list for every tweet, never NaN
a = [cashtagsrx.findall(str(i)) for i in df['text']]
df['Cashtags'] = a

#remove rows without cashtags
#an empty list is not NaN, so dropna() would keep these rows; test the list length instead
df = df[df['Cashtags'].map(len) > 0]

In [7]:
# Extract Hashtags and add to df

# Each 'hashtags' cell is text, so literal_eval turns it back into a list of
# dicts; only the 'text' value of each dict that has one is kept.
hashtag_list = [
    [entry['text'] for entry in literal_eval(raw) if 'text' in entry]
    for raw in df['hashtags']
]

df['hashtag_list'] = hashtag_list

In [8]:
#function to apply regex to column
def remove_pattern(input_txt, pattern):
    """Return ``input_txt`` with every match of ``pattern`` deleted.

    Parameters
    ----------
    input_txt : str
        Text to clean.
    pattern : str
        Regular expression whose matches are removed.

    Returns
    -------
    str
        ``input_txt`` with all matches replaced by the empty string.
    """
    # Substitute with the pattern itself. Rebuilding a regex from each matched
    # string let a short match strip the prefix of a longer one
    # ("$SPY $SPYG" left "G") and mis-escaped matches starting with a letter.
    return re.sub(pattern, '', input_txt)

# Remove text that isn't useful for classification

In [9]:
#cashtags
#Series.apply is used instead of np.vectorize, which raises on an empty frame
df['clean_tweet'] = df['text'].apply(remove_pattern, args=(r'\$[A-Z]{2,}',))

#urls
df['clean_tweet'] = df['clean_tweet'].replace(r'http\S+', '', regex=True).replace(r'www\S+', '', regex=True)

#lowercase tweet
df['clean_tweet'] = df['clean_tweet'].str.lower()

#remove @name
#raw string so \s reaches the regex engine instead of being an invalid string escape
df['clean_tweet'] = df['clean_tweet'].replace(r'@[^\s]+', '', regex=True)

#remove punctuation
#regex=True is required: where str.replace defaults to literal matching this would change nothing
df['clean_tweet'] = df['clean_tweet'].str.replace("[^a-zA-Z]", " ", regex=True)

#remove short words
df['clean_tweet'] = df['clean_tweet'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

# Classification Prep

In [10]:
#split each cleaned tweet string into a list of word tokens
df['clean_tweet'] = df['clean_tweet'].apply(word_tokenize)

In [11]:
#stopwords are words that aren't used in classification such as and, then, for
stopword = stopwords.words('english') #the English list from the NLTK stopwords corpus

#drop every stopword from a tokenised list
def remove_stopwords(tokenised_list):
    """Return the tokens of ``tokenised_list`` that are not in ``stopword``, in order."""
    kept = []
    for token in tokenised_list:
        if token in stopword:
            continue
        kept.append(token)
    return kept

#run the stopword filter over every token list
df['clean_tweet'] = df['clean_tweet'].apply(remove_stopwords)

In [12]:
#remove blank tweets after preprocessing (eg tweet is only stopwords)
#clean_tweet holds token lists at this point and an empty list is not NaN,
#so dropna() would keep it; filter on the list length instead
df = df[df['clean_tweet'].map(len) > 0]

In [13]:
#tag each token with its part of speech ahead of lemmatisation - eg 'learn = verb'
#the lemmatiser needs the word type because different types reduce to their
#root word in different ways
df['clean_tweet'] = df['clean_tweet'].apply(pos_tag)

In [14]:
#function to lemmatize sentence - removing 'ing' 'ly' etc from the end of the word

def lemmatize_sentence(tokens):
    """Lemmatize a sequence of ``(word, tag)`` pairs.

    Tags starting with 'NN' are lemmatized as nouns ('n'), tags starting with
    'VB' as verbs ('v'), and every other tag as adjectives ('a').

    Returns the list of lemmas in the same order as ``tokens``.
    """
    lemmatizer = WordNetLemmatizer()

    def wordnet_pos(tag):
        # Map the tag prefix onto the pos code passed to the lemmatizer
        if tag.startswith('NN'):
            return 'n'
        if tag.startswith('VB'):
            return 'v'
        return 'a'

    return [lemmatizer.lemmatize(word, wordnet_pos(tag)) for word, tag in tokens]

# Replace each list of (word, tag) pairs with its list of lemmas
df['clean_tweet'] = df['clean_tweet'].apply(lemmatize_sentence)

In [15]:
# Drop the raw columns that were only needed to build the derived ones
df = df.drop(columns=['text', 'favorite_count', 'hashtags'])

In [16]:
# Fold label 3 into label 0
# NOTE(review): the meaning of the label codes is not documented in this notebook - confirm
df['Label'] = df['Label'].replace({3: 0})

In [17]:
df.head() # check cleaned data

Unnamed: 0,Label,emoji,Cashtags,hashtag_list,clean_tweet
0,2,,"[$SPX, $SPY, $ES, $IXIC, $DJI, $QQQ]","[Markets, Stocks]","[market, stock]"
1,2,🇺🇸🇺🇸,"[$SPX, $SPY, $MSFT]",[stocks],"[earnings, growth, rate, among, large, stock, ..."
2,0,🇺🇸🇺🇸,"[$SPX, $SPY, $CCL, $WES]",[stocks],"[bad, perform, stock, carnival, corporation, w..."
3,2,👉,"[$SPY, $QQQ]","[Unemployment, Markets, stocks]","[unemployment, market, move, high, mean, stock]"
4,1,👉,"[$SPY, $SPX]","[recession2020, stocks]","[buyer, slowly, gain, momentum, recession, stock]"


In [18]:
# Write the cleaned frame to disk; index=False leaves the row index out of the file
df.to_csv('Tweets_clean.csv', index=False)

# Extra cleanup for VADER sentiment analysis

In [19]:
# Work on a copy: plain assignment would only alias df, so changing
# df_vader's columns would also change df
df_vader = df.copy()

In [20]:
#VADER needs a string not a list, so we recompile the list
def listCompile(row):
    """Join the items of ``row`` into a single space-separated string."""
    return ' '.join(row)

df_vader['clean_tweet'] = df_vader['clean_tweet'].apply(listCompile)
df_vader.to_csv('Tweets_clean_vader.csv', index=False)
df_vader.head() #kept as the last expression so the notebook actually displays the preview