In [1]:
import pandas as pd

#let pandas render up to 999 rows and 25 columns before truncating output
pd.set_option('display.max_rows', 999)
pd.set_option('display.max_columns', 25)

### Import the raw csv data from our Twitter scraper

In [2]:
#read the scraped tweets into one dataframe per account, in the order listed
(elon_musk_tweets,
 mark_cuban_tweets,
 jack_dorsey_tweets,
 vladimir_tenev_tweets,
 brian_armstrong_tweets,
 bespoke_crypto_tweets) = [
    pd.read_csv(csv_path)
    for csv_path in (
        'data/user-tweets-elonmusk.csv',
        'data/user-tweets-mcuban.csv',
        'data/user-tweets-jack.csv',
        'data/user-tweets-vladtenev.csv',
        'data/user-tweets-brian_armstrong.csv',
        'data/user-tweets-BespokeCrypto.csv',
    )
]

#### DataFrame of Elon Musk's tweets to show information in raw_csv

In [3]:
elon_musk_tweets.head()

Unnamed: 0,url,date,content,renderedContent,id,user,outlinks,tcooutlinks,replyCount,retweetCount,likeCount,quoteCount,conversationId,lang,source,sourceUrl,sourceLabel,media,retweetedTweet,quotedTweet,mentionedUsers,coordinates,place
0,https://twitter.com/elonmusk/status/1389270326...,2021-05-03 17:27:11+00:00,@MeetLuis @WARREZ420 @WholeMarsBlog Rawlinson ...,@MeetLuis @WARREZ420 @WholeMarsBlog Rawlinson ...,1389270326073253889,"{'username': 'elonmusk', 'displayname': 'Elon ...",[],[],368,219,2281,100,1389151359648223233,en,"<a href=""http://twitter.com/download/iphone"" r...",http://twitter.com/download/iphone,Twitter for iPhone,,,,"[{'username': 'MeetLuis', 'displayname': 'Luis...",,
1,https://twitter.com/elonmusk/status/1389126650...,2021-05-03 07:56:16+00:00,@spacex360 So great to see the happy faces!,@spacex360 So great to see the happy faces!,1389126650508480512,"{'username': 'elonmusk', 'displayname': 'Elon ...",[],[],774,943,29807,42,1388778987044982788,en,"<a href=""http://twitter.com/download/iphone"" r...",http://twitter.com/download/iphone,Twitter for iPhone,,,,"[{'username': 'spacex360', 'displayname': 'Eve...",,
2,https://twitter.com/elonmusk/status/1389102532...,2021-05-03 06:20:26+00:00,"@utsavtechie Prototypes are easy, production i...","@utsavtechie Prototypes are easy, production i...",1389102532706848768,"{'username': 'elonmusk', 'displayname': 'Elon ...",[],[],2791,8029,159645,648,1388795182582099968,en,"<a href=""http://twitter.com/download/iphone"" r...",http://twitter.com/download/iphone,Twitter for iPhone,,,,"[{'username': 'utsavtechie', 'displayname': 'U...",,
3,https://twitter.com/elonmusk/status/1388980879...,2021-05-02 22:17:02+00:00,@heydave7 @Tesla A remarkable junction in history,@heydave7 @Tesla A remarkable junction in history,1388980879175954433,"{'username': 'elonmusk', 'displayname': 'Elon ...",[],[],918,865,16742,71,1388610791885217797,en,"<a href=""http://twitter.com/download/iphone"" r...",http://twitter.com/download/iphone,Twitter for iPhone,,,,"[{'username': 'heydave7', 'displayname': 'Dave...",,
4,https://twitter.com/elonmusk/status/1388963849...,2021-05-02 21:09:21+00:00,@jaentwistle One of many reasons why we need l...,@jaentwistle One of many reasons why we need l...,1388963849043419140,"{'username': 'elonmusk', 'displayname': 'Elon ...",[],[],1152,1474,32663,78,1388962263030190084,en,"<a href=""http://twitter.com/download/iphone"" r...",http://twitter.com/download/iphone,Twitter for iPhone,,,,"[{'username': 'jaentwistle', 'displayname': 'J...",,


### Look at what information is in columns to determine what is needed

In [4]:
#want to look at all available columns and find out what each of them is for
elon_musk_tweets.columns

Index(['url', 'date', 'content', 'renderedContent', 'id', 'user', 'outlinks',
       'tcooutlinks', 'replyCount', 'retweetCount', 'likeCount', 'quoteCount',
       'conversationId', 'lang', 'source', 'sourceUrl', 'sourceLabel', 'media',
       'retweetedTweet', 'quotedTweet', 'mentionedUsers', 'coordinates',
       'place'],
      dtype='object')

In [5]:
#dataset has 10,000 rows, one for each tweet, and 23 columns
elon_musk_tweets.shape

(10000, 23)

In [6]:
#count missing vs. present values for the content, date, and id columns
tuple(
    elon_musk_tweets[column].isna().value_counts()
    for column in ('content', 'date', 'id')
)

(False    10000
 Name: content, dtype: int64,
 False    10000
 Name: date, dtype: int64,
 False    10000
 Name: id, dtype: int64)

In [7]:
#check the dtype pandas inferred for each column; note that 'date' comes in as object, not a datetime
elon_musk_tweets.dtypes

url                 object
date                object
content             object
renderedContent     object
id                   int64
user                object
outlinks            object
tcooutlinks         object
replyCount           int64
retweetCount         int64
likeCount            int64
quoteCount           int64
conversationId       int64
lang                object
source              object
sourceUrl           object
sourceLabel         object
media               object
retweetedTweet     float64
quotedTweet         object
mentionedUsers      object
coordinates        float64
place              float64
dtype: object

### Column Descriptions:
#### Name - Keeping/Removing - Description

**url** - removing - provides the url to the tweet  
**date** - keeping - provides the date and time for when the tweet was published  
**content** - keeping - the actual text of the tweet  
**renderedContent** - removing - seems to be a direct repeat of 'content'  
**id** - keeping - unique id for each tweet  
**user** - keeping - provides user information including name, username, id, description, verified status, and more  
**outlinks** - removing - indicates any links to outside websites  
**tcooutlinks** - removing - shortened version of links in 'outlinks'  
**replyCount** - keeping - number of replies a tweet received  
**retweetCount** - keeping - number of retweets a tweet received  
**likeCount** - keeping - number of likes a tweet received  
**quoteCount** - keeping - number of times a tweet was quoted  
**conversationId** - removing - unique id for the conversation  
**lang** - removing - the language the tweet was made in  
**source** - removing - shows the raw source for where the tweet came from  
**sourceUrl** - removing - cleaned url from the 'source' column  
**sourceLabel** - removing - cleaned device name for where the tweet came from  
**media** - removing - link to any embedded photos/videos in the tweet  
**retweetedTweet** - removing - link to the original tweet if this is a retweet  
**quotedTweet** - removing - link to the original tweet if this tweet is quoting another  
**mentionedUsers** - removing - twitter profile information for anyone tagged in the tweet  
**coordinates** - removing - show the gps coordinates of any linked location  
**place** - removing - shows the place of any linked location

### Many Columns have unneeded information --> Drop Them

In [8]:
#columns marked as 'removing' in the column descriptions
columns_to_drop = [
    'url',
    'renderedContent',
    'outlinks',
    'tcooutlinks',
    'conversationId',
    'lang',
    'source',
    'sourceUrl',
    'sourceLabel',
    'media',
    'retweetedTweet',
    'quotedTweet',
    'mentionedUsers',
    'coordinates',
    'place',
]

#strip the same set of columns from every influencer's dataframe
elon_musk_tweets = elon_musk_tweets.drop(columns_to_drop, axis='columns')
mark_cuban_tweets = mark_cuban_tweets.drop(columns_to_drop, axis='columns')
jack_dorsey_tweets = jack_dorsey_tweets.drop(columns_to_drop, axis='columns')
vladimir_tenev_tweets = vladimir_tenev_tweets.drop(columns_to_drop, axis='columns')
brian_armstrong_tweets = brian_armstrong_tweets.drop(columns_to_drop, axis='columns')
bespoke_crypto_tweets = bespoke_crypto_tweets.drop(columns_to_drop, axis='columns')

In [9]:
 elon_musk_tweets.shape

(10000, 8)

### 'Date' column is still messy to look at --> Add columns for more accurate date/time

In [10]:
def year_from_date(date_time):
    """Return the year of a 'YYYY-MM-DD ...' timestamp string as an int."""
    #the year is the text in front of the first dash
    year_text, _, _ = date_time.partition('-')
    return int(year_text)

def month_from_date(date_time):
    """Return the month of a 'YYYY-MM-DD ...' timestamp string as an int."""
    #the month is the second dash-separated field
    return int(date_time.split('-', 2)[1])

def day_from_date(date_time):
    """Return the day of month of a 'YYYY-MM-DD HH:MM:SS...' timestamp string as an int."""
    #keep only the calendar date in front of the first space, then take its third field
    calendar_date, _, _ = date_time.partition(' ')
    return int(calendar_date.split('-')[2])

In [11]:
def add_date_columns(tweets_df):
    """Return a new dataframe with integer 'year', 'month', and 'day' columns added.

    The values are parsed from the `date` column of the SAME dataframe that is
    passed in. Taking them from another dataframe's `date` column would line the
    values up by row index and attach unrelated dates to these tweets, which in
    turn would make any later filtering on 'year' wrong for this frame.
    """
    own_dates = tweets_df.date
    return tweets_df.assign(
        year=own_dates.apply(year_from_date),
        month=own_dates.apply(month_from_date),
        day=own_dates.apply(day_from_date),
    )

#Elon Musk
elon_musk_tweets = add_date_columns(elon_musk_tweets)

#Mark Cuban
mark_cuban_tweets = add_date_columns(mark_cuban_tweets)

#Jack Dorsey
jack_dorsey_tweets = add_date_columns(jack_dorsey_tweets)

#Vladimir Tenev
vladimir_tenev_tweets = add_date_columns(vladimir_tenev_tweets)

#Brian Armstrong
brian_armstrong_tweets = add_date_columns(brian_armstrong_tweets)

#Bespoke Crypto
bespoke_crypto_tweets = add_date_columns(bespoke_crypto_tweets)

### User column is really ugly --> Update it to just show user's display name

In [12]:
def get_display_name(user):
    """Extract the display name from the stringified user record.

    `user` is the text stored in the 'user' column, which looks like
    "{'username': 'elonmusk', 'displayname': 'Elon Musk', ...}".

    The quoted value that follows the 'displayname' key is located and decoded
    as a Python string literal, so names that contain commas or apostrophes are
    returned intact instead of being cut at the first ',' or "'".

    If no quoted 'displayname' value is found, the function falls back to
    purely positional parsing (second comma-separated field, fourth
    quote-separated piece), which raises IndexError on text that does not have
    that shape.
    """
    import ast
    import re

    #match 'displayname': followed by a complete single- or double-quoted literal
    found = re.search(
        r"""'displayname':\s*('(?:[^'\\]|\\.)*'|"(?:[^"\\]|\\.)*")""",
        user,
    )
    if found is None:
        return user.split(',')[1].split("'")[3]
    #literal_eval only ever sees the quoted string literal captured above
    return ast.literal_eval(found.group(1))

In [13]:
#replace each frame's raw 'user' text with just the display name

#Elon Musk
elon_musk_tweets = elon_musk_tweets.assign(
    user=lambda frame: frame.user.apply(get_display_name)
)

#Mark Cuban
mark_cuban_tweets = mark_cuban_tweets.assign(
    user=lambda frame: frame.user.apply(get_display_name)
)

#Jack Dorsey
jack_dorsey_tweets = jack_dorsey_tweets.assign(
    user=lambda frame: frame.user.apply(get_display_name)
)

#Vladimir Tenev
vladimir_tenev_tweets = vladimir_tenev_tweets.assign(
    user=lambda frame: frame.user.apply(get_display_name)
)

#Brian Armstrong
brian_armstrong_tweets = brian_armstrong_tweets.assign(
    user=lambda frame: frame.user.apply(get_display_name)
)

#Bespoke Crypto
bespoke_crypto_tweets = bespoke_crypto_tweets.assign(
    user=lambda frame: frame.user.apply(get_display_name)
)

### We are only interested in tweets dating back to 2018 --> Drop earlier dates

In [14]:
#keep only the rows whose 'year' value is 2018 or later

#Elon Musk
elon_musk_tweets = elon_musk_tweets.loc[elon_musk_tweets.get('year') >= 2018]

#Mark Cuban
mark_cuban_tweets = mark_cuban_tweets.loc[mark_cuban_tweets.get('year') >= 2018]

#Jack Dorsey
jack_dorsey_tweets = jack_dorsey_tweets.loc[jack_dorsey_tweets.get('year') >= 2018]

#Vladimir Tenev
vladimir_tenev_tweets = vladimir_tenev_tweets.loc[vladimir_tenev_tweets.get('year') >= 2018]

#Brian Armstrong
brian_armstrong_tweets = brian_armstrong_tweets.loc[brian_armstrong_tweets.get('year') >= 2018]

#Bespoke Crypto
bespoke_crypto_tweets = bespoke_crypto_tweets.loc[bespoke_crypto_tweets.get('year') >= 2018]

In [15]:
elon_musk_tweets.shape

(9502, 11)

In [16]:
mark_cuban_tweets.shape

(2635, 11)

In [17]:
jack_dorsey_tweets.shape

(3355, 11)

In [18]:
vladimir_tenev_tweets.shape

(239, 11)

In [19]:
brian_armstrong_tweets.shape

(185, 11)

In [20]:
bespoke_crypto_tweets.shape

(187, 11)

### DataFrame of Elon Musk's tweets to show cleaned data

In [21]:
elon_musk_tweets.head()

Unnamed: 0,date,content,id,user,replyCount,retweetCount,likeCount,quoteCount,year,month,day
0,2021-05-03 17:27:11+00:00,@MeetLuis @WARREZ420 @WholeMarsBlog Rawlinson ...,1389270326073253889,Elon Musk,368,219,2281,100,2021,5,3
1,2021-05-03 07:56:16+00:00,@spacex360 So great to see the happy faces!,1389126650508480512,Elon Musk,774,943,29807,42,2021,5,3
2,2021-05-03 06:20:26+00:00,"@utsavtechie Prototypes are easy, production i...",1389102532706848768,Elon Musk,2791,8029,159645,648,2021,5,3
3,2021-05-02 22:17:02+00:00,@heydave7 @Tesla A remarkable junction in history,1388980879175954433,Elon Musk,918,865,16742,71,2021,5,2
4,2021-05-02 21:09:21+00:00,@jaentwistle One of many reasons why we need l...,1388963849043419140,Elon Musk,1152,1474,32663,78,2021,5,2


### Combine all influencers' tweets into a common dataframe

In [22]:
#stack the per-influencer frames on top of each other.
#concat appends rows as-is, so every tweet is kept exactly once per source frame
#and rows stay in the order the frames are listed; an outer merge on all shared
#columns is a join, which can collapse or multiply identical rows and may reorder them.
#ignore_index=True gives the combined frame a fresh 0..n-1 index.
tweets = pd.concat(
    [
        elon_musk_tweets,
        mark_cuban_tweets,
        jack_dorsey_tweets,
        vladimir_tenev_tweets,
        brian_armstrong_tweets,
        bespoke_crypto_tweets,
    ],
    ignore_index=True,
)

In [23]:
#the combined frame should have exactly as many rows as the six source frames together
total_num_tweet = sum(
    frame.shape[0]
    for frame in (
        elon_musk_tweets,
        mark_cuban_tweets,
        jack_dorsey_tweets,
        vladimir_tenev_tweets,
        brian_armstrong_tweets,
        bespoke_crypto_tweets,
    )
)
total_num_tweet == tweets.shape[0]

True

In [24]:
tweets.head()

Unnamed: 0,date,content,id,user,replyCount,retweetCount,likeCount,quoteCount,year,month,day
0,2021-05-03 17:27:11+00:00,@MeetLuis @WARREZ420 @WholeMarsBlog Rawlinson ...,1389270326073253889,Elon Musk,368,219,2281,100,2021,5,3
1,2021-05-03 07:56:16+00:00,@spacex360 So great to see the happy faces!,1389126650508480512,Elon Musk,774,943,29807,42,2021,5,3
2,2021-05-03 06:20:26+00:00,"@utsavtechie Prototypes are easy, production i...",1389102532706848768,Elon Musk,2791,8029,159645,648,2021,5,3
3,2021-05-02 22:17:02+00:00,@heydave7 @Tesla A remarkable junction in history,1388980879175954433,Elon Musk,918,865,16742,71,2021,5,2
4,2021-05-02 21:09:21+00:00,@jaentwistle One of many reasons why we need l...,1388963849043419140,Elon Musk,1152,1474,32663,78,2021,5,2


In [25]:
tweets.shape

(16103, 11)

In [26]:
tweets.dtypes

date            object
content         object
id               int64
user            object
replyCount       int64
retweetCount     int64
likeCount        int64
quoteCount       int64
year             int64
month            int64
day              int64
dtype: object