# Elon Musk Tweets

## Libraries

In [42]:
import glob
import os
import pandas as pd

## 1. Get the data from the /data repository, combine in one

#### Define path

In [63]:
path = r'../data/'
the_data_out = r'../outputs/'

#### Combine datasets

In [3]:
all_files = glob.glob(os.path.join(path, "*.csv"))     # advisable to use os.path.join as this makes concatenation OS independent

df_from_each_file = (pd.read_csv(f) for f in all_files)
concatenated_df   = pd.concat(df_from_each_file, ignore_index=False)

In [4]:
concatenated_df.head() 

Unnamed: 0.1,Unnamed: 0,id,conversation_id,created_at,date,timezone,place,tweet,language,hashtags,...,reply_to,retweet_date,translate,trans_src,trans_dest,time,mentions,replies_count,retweets_count,likes_count
0,0.0,15434727182,15434727182,1275676317000.0,2010-06-04 18:31:57,0,,"Please ignore prior tweets, as that was someon...",en,[],...,[],,,,,,,,,
0,0.0,152153637639028736,152151847614943233,1325111228000.0,2011-12-28 22:27:08,0,,@TheOnion So true :),en,[],...,[],,,,,,,,,
1,1.0,151809315026636800,151809315026636800,1325029135000.0,2011-12-27 23:38:55,0,,If you ever wanted to know the *real* truth ab...,en,[],...,[],,,,,,,,,
2,2.0,151338939389706242,151338939389706242,1324916990000.0,2011-12-26 16:29:50,0,,Walked around a neighborhood recently rebuilt ...,en,[],...,[],,,,,,,,,
3,3.0,151337237429239808,151337237429239808,1324916584000.0,2011-12-26 16:23:04,0,,"It was Xmas, so we brought presents for the ki...",en,[],...,[],,,,,,,,,


## 2. Remove columns that will not be used

We will be keeping date, timezone, **tweet**, hashtags, mentions, replies_count, retweets_count, likes_count. Only **tweet** variable is for nlp but others might be used for exploratory analysis. Missing values are replaced with zero and date will be separated into year, month, time column.

In [5]:
the_data_col = concatenated_df[['date', 'timezone', 'hashtags', 'mentions', 'replies_count', 'retweets_count', 'likes_count', 'tweet']]

In [6]:
the_data_col.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 34878 entries, 0 to 1027
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   date            34878 non-null  object 
 1   timezone        34878 non-null  int64  
 2   hashtags        34878 non-null  object 
 3   mentions        4143 non-null   object 
 4   replies_count   4143 non-null   float64
 5   retweets_count  4143 non-null   float64
 6   likes_count     4143 non-null   float64
 7   tweet           34878 non-null  object 
dtypes: float64(3), int64(1), object(4)
memory usage: 2.4+ MB


In [12]:
#the_data_col['date'] = pd.to_datetime(the_data_col['date'])
#the_data_col['date'] = the_data_col['date'].dt.strftime('%Y/%d/%m')
#the_data_col['time'] = the_data_col['date'].dt.strftime('%H:%M')

In [10]:
the_data = the_data_col.fillna(0).replace('[]',0)[['date', 'timezone', 'hashtags', 'mentions', 'replies_count', 'retweets_count', 'likes_count', 'tweet']]

In [11]:
the_data

Unnamed: 0,date,timezone,hashtags,mentions,replies_count,retweets_count,likes_count,tweet
0,2010/04/06,0,0,0,0.0,0.0,0.0,"Please ignore prior tweets, as that was someon..."
0,2011/28/12,0,0,0,0.0,0.0,0.0,@TheOnion So true :)
1,2011/27/12,0,0,0,0.0,0.0,0.0,If you ever wanted to know the *real* truth ab...
2,2011/26/12,0,0,0,0.0,0.0,0.0,Walked around a neighborhood recently rebuilt ...
3,2011/26/12,0,0,0,0.0,0.0,0.0,"It was Xmas, so we brought presents for the ki..."
...,...,...,...,...,...,...,...,...
1023,2022/03/01,400,0,0,25611.0,51383.0,473530.0,https://t.co/LA9hPzVlGx
1024,2022/02/01,400,0,0,22500.0,26951.0,320201.0,Let’s make the roaring 20’s happen!
1025,2022/02/01,400,0,0,5630.0,4459.0,66405.0,Great work by Tesla team worldwide!
1026,2022/01/01,400,0,0,1074.0,472.0,45704.0,@BLKMDL3 @Tesla 🔥


## 3. Cleaning the text data

**a) We have to keep the hashtags and the mentions alive, so they will be extracted in separate columns and then removed from the tweet column + as before, we have to fill in the missing values with zero**

In [13]:
import re

In [15]:
# moving mentions and hashtags to a separate columns
the_data["hashtags_own"] = the_data.tweet.str.findall(r'#.*?(?=\s|$)')
the_data["mentions_own"] = the_data.tweet.str.findall(r'@.*?(?=\s|$)')

In [24]:
#removing hashtag and mentions sign from the tweet text -> removes only the first one? 
the_data['tweet'] = the_data['tweet'].str.replace('[!@#$]','')

  the_data['tweet'] = the_data['tweet'].str.replace('[!@#$]','')


**b) Next, we focus on emojis.**


In [25]:
try:
    import cPickle as pickle
except ImportError: 
    import pickle
import re


with open('Emoji_Dict.p', 'rb') as fp:
    Emoji_Dict = pickle.load(fp)
Emoji_Dict = {v: k for k, v in Emoji_Dict.items()}

In [None]:
#to see the progress, install tqdm
#!pip install tqdm
#from tqdm.auto import tqdm

In [26]:
def convert_emojis_to_word(text):
    for emot in Emoji_Dict:
        text = re.sub(r'('+emot+')', " ".join(Emoji_Dict[emot].replace(",","").replace(":","").split()), text)
    return text

In [45]:
#type(test_df)
#ISSUE - object is a series
#the_data = the_data.pd.to_frame()
#the_data_df = the_data.to_frame()

<class 'pandas.core.frame.DataFrame'>


In [27]:
the_data["tweet_cleaned"] = the_data.apply(lambda x: convert_emojis_to_word(x["tweet"]), axis = 1)

KeyboardInterrupt: 

**c) To lower, split, stop-words using the pre_process_function together with clean_txt.**

In [35]:
import nltk
global dictionary
dictionary = nltk.corpus.words.words("en")
dictionary = [word.lower() for word in dictionary]

In [34]:
def clean_txt(txt_in):
    import re
    clean_str = re.sub("[^A-Za-z]+", " ", txt_in).strip().lower()
    return clean_str

def pre_process_text(tmp_f):
    tmp_f = clean_txt(tmp_f)
    tmp_f = [word_t.lower() for word_t in tmp_f.split(
        ) if word_t in dictionary]
    tmp_f = ' '.join(tmp_f)
    return tmp_f

In [60]:
the_data["tweet_cleaned"] = the_data.apply(lambda x: pre_process_text(x["tweet_cleaned"]), axis = 1)

KeyboardInterrupt: 

In [None]:
#stop words
def my_stop_words(var_in):
    from nltk.corpus import stopwords
    sw = stopwords.words('english')
    tmp = [word for word in var_in.split() if word not in sw]
    tmp = ' '.join(tmp)
    return tmp

In [None]:
the_data["tweet_cleaned"] = the_data.apply(lambda x: my_stop_words(x["tweet_cleaned"]), axis = 1)

**d) Save the cleaned data as a pickle.**

In [None]:
def write_pickle(path_in, file_name, var_in):
    import pickle
    pickle.dump(var_in, open(path_in + file_name, "wb"))

In [None]:
write_pickle(the_data_out, "data_cleaned.pkl", the_data)