In [1]:
import pandas as pd
import re
import numpy as np
import feather
import string
import os

dataset = input()
path = os.getcwd() + '/Datasets/'+dataset+'/'

banking


In [2]:
#df = pd.read_feather('Datasets/#fintech.csv')
df = pd.read_feather(path+dataset+'.ftr')
df.shape

(989984, 15)

In [3]:
if dataset == 'banking':
    df_old = pd.read_feather(path+dataset+'15-18.ftr')
    df = df.append(df_old, ignore_index = True)
    df_old = pd.read_feather(path+dataset+'13-14.ftr')
    df = df.append(df_old, ignore_index = True)
    df_old = pd.read_feather(path+dataset+'09-12.ftr')
    df = df.append(df_old, ignore_index = True)
df.shape

(2492606, 15)

# Data Cleaning

## Filter and Clean the Dataframe

In [4]:
#Next, mostly to get rid of bots and cryptocurrency warriors, 
#we are going to remove tweets that have the sum of likes and retweets less than 5:
df['sum_pubmetrics'] = df['public_metrics.retweet_count'] + df['public_metrics.like_count']
df = df[df['public_metrics.followers_count']>=100]
df.shape

(2286093, 16)

In [5]:
#Reset the index to start from 0 and drop the previous index columns
df = df.reset_index().drop(columns=['index'])
#df = df.reset_index().drop(columns=['index','Unnamed: 0'])

#Keep only the useful columns!
df = df.drop(columns = ['public_metrics.reply_count',
                        'public_metrics.quote_count',
                        'public_metrics.listed_count'])

In [6]:
# For banking create a dummy variable that shows whether the tweet contains the word/hashtag fintech
if dataset == 'banking':
    df['fintech'] = df.text.str.contains('fintech', case = False)

In [None]:
#df.to_feather(path+dataset+'_full.ftr')
#df.to_csv(path+dataset+'_full.csv')

### Clean the tweets

In [8]:
def clean_text_round1(text):
    """Initial cleaning I make to the dataset"""
    text = re.sub(r'@[A-Za-z0-9]+','',text) #Removes @mentions
    text = re.sub(r'#','',text) #Removes the '#' symbol
    text = re.sub(r'RT[\s]+', '', text) #Removing RT
    text = re.sub(r'&amp;','',text) #Removes &amp
    text = re.sub(r'https?:\/\/\S+','',text) #Remove the hyper link
    text = re.sub(r'\n','',text) #remove the new line symbol
    text = re.sub(r'\t','',text) #remove the tab symbol
    #Removes the punctuation:
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text) #remove words containing numbers
    text = re.sub(r"[^a-zA-Z0-9]+", " ", text) #Removes special characters except space
    text = re.sub(' +', ' ', text).strip() #Removes trailing white spaces
    text = text.lower() # lowers the text

    return text

In [9]:
df['text'] = df['text'].apply(clean_text_round1)

Second round will try to correct typos, but not perfectly. I can remove words such as aaaaaaaand, aaaaaand, finnnntteeeechhh. 

In [10]:
def clean_text_round2(text):
    pattern = re.compile(r"(.)\1{2,}")
    text = pattern.sub(r"\1\1", text)
    return text

In [11]:
df['text'] = df['text'].apply(clean_text_round2)

In [12]:
df.to_feather(path+dataset+'_clean.ftr')

In [13]:
df.to_csv(path+dataset+'_clean.csv')

In [14]:
df = pd.read_feather(path+dataset+'_clean.ftr')

## Organize the Data

### Corpus

For corpus I will group by years and make separate corpuses/columns for different years.

In [15]:
# Set the index to be the date the tweet was created
df = df.set_index('tweet_created_at')
# Group the tweets by year and merge the tweets by separating them by space:
df = df.text.groupby(pd.Grouper(freq='Y')).apply(' '.join)
# Rename the index to year ending in:
df = df.rename_axis('year_ending_in')
# Save it as a corpus:
#df.to_frame().reset_index().to_feather('Datasets/#'+dataset+'_corpus.ftr')
df.to_pickle(path+dataset+'_corpus.pkl')

### Document-Term Matrix

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

In [17]:
cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(df)

In [18]:
# Create a pandas dataframe: columns are the words appearing in the tweets,
# rows are showing the frequency of these words
data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
# Set the index to be dates:
data_dtm.index = df.index

print('The shape of the document-term matrix is:')
print(data_dtm.shape)

The shape of the document-term matrix is:
(4, 671255)


In [19]:
# Save the dtm:
data_dtm.transpose().to_pickle(path+dataset+'_dtm.pkl')