In [1]:
# Import Data Preprocessing and Wrangling libraries
import re
from tqdm.notebook import tqdm
import pandas as pd 
import numpy as np
from datetime import datetime
import dateutil.parser

# Import NLP Libraries
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

# Import Visualization Libraries

import matplotlib.pyplot as plt
from nltk.corpus import stopwords

In [2]:
# Load the raw quote-tweet export and print its schema / null counts.
quotes_csv_path = 'quote_tweets.csv'
quotes = pd.read_csv(quotes_csv_path)
quotes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7147 entries, 0 to 7146
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   orig_tweet_created_at      7147 non-null   object 
 1   musk_tweet_id              7147 non-null   int64  
 2   musk_quote_tweet           7147 non-null   object 
 3   musk_quote_retweet_count   7147 non-null   int64  
 4   musk_quote_reply_count     7147 non-null   int64  
 5   musk_quote_like_count      7147 non-null   int64  
 6   musk_quote_quote_count     7147 non-null   int64  
 7   musk_quote_view_count      6653 non-null   float64
 8   musk_quote_bookmark_count  7147 non-null   int64  
 9   musk_quote_created_at      7147 non-null   object 
dtypes: float64(1), int64(6), object(3)
memory usage: 558.5+ KB


In [3]:
# Map the raw quote-tweet column names onto the shared tweet schema.
quote_column_names = {
    'orig_tweet_created_at': 'og_tweet_timestamp',
    'musk_tweet_id': 'tweet_id',
    'musk_quote_tweet': 'tweet_body',
    'musk_quote_retweet_count': 'retweet_count',
    'musk_quote_reply_count': 'reply_count',
    'musk_quote_like_count': 'like_count',
    'musk_quote_quote_count': 'quote_count',
    'musk_quote_view_count': 'view_count',
    'musk_quote_bookmark_count': 'bookmark_count',
    'musk_quote_created_at': 'tweet_timestamp',
}

# Columns to keep, in display order. 'og_tweet_timestamp' is deliberately
# absent: selecting this list is what drops it, so no separate drop() is needed.
new_order = ['tweet_id', 'tweet_timestamp', 'tweet_body', 'retweet_count', 'reply_count', 'like_count', 'quote_count', 'view_count', 'bookmark_count']

# Non-mutating chain instead of drop(..., inplace=True): the cell can be re-run
# without a KeyError, because rename() ignores labels that are already renamed
# and the column selection never looks for the dropped column.
quotes = quotes.rename(columns=quote_column_names)[new_order]
quotes.head()

Unnamed: 0,tweet_id,tweet_timestamp,tweet_body,retweet_count,reply_count,like_count,quote_count,view_count,bookmark_count
0,1655978502187778073,2023-05-09 16:50:16+00:00,Yup,3255,3747,39533,225,11392206.0,281
1,1655968899903418373,2023-05-09 16:12:06+00:00,Massive public manipulation,9811,2694,49528,534,14404853.0,1241
2,1647339741610926080,2023-04-15 20:42:55+00:00,💯,8257,4793,106036,376,18520248.0,366
3,1646228474628280326,2023-04-12 19:07:08+00:00,🤣🤣,10198,5076,108462,430,16690340.0,340
4,1640171198091866114,2023-03-27 01:57:41+00:00,Prescient,9193,5118,56272,572,25169601.0,3792


In [4]:
# Load the raw original-tweet export.
originals_csv_path = 'original_tweets.csv'
originals = pd.read_csv(originals_csv_path)

In [5]:
# Map the raw original-tweet column names onto the shared tweet schema.
originals_column_names = {
    'orig_tweet_id': 'tweet_id',
    'orig_tweet_created_at': 'tweet_timestamp',
    'orig_tweet_text': 'tweet_body',
    'orig_tweet_retweet_count': 'retweet_count',
    'orig_tweet_reply_count': 'reply_count',
    'orig_tweet_like_count': 'like_count',
    'orig_tweet_view_count': 'view_count',
    'orig_tweet_bookmark_count': 'bookmark_count',
}

# Non-mutating chain instead of drop(..., inplace=True). errors='ignore' lets
# the cell be re-run after the username column has already been removed;
# the original raised KeyError on the second run.
originals = (
    originals
    .rename(columns=originals_column_names)
    .drop(columns=['orig_tweet_username'], errors='ignore')
)

originals.head()

Unnamed: 0,tweet_id,tweet_timestamp,tweet_body,retweet_count,reply_count,like_count,view_count,bookmark_count
0,1668843112955891712,2023-06-14 04:49:38+00:00,Bold,49375,18263,334997,127858792.0,6344
1,1665784889008873476,2023-06-05 18:17:20+00:00,https://t.co/OT6bqzUK8P,10205,7183,58206,34809220.0,2905
2,1656748197308674048,2023-05-11 19:48:45+00:00,Excited to announce that I’ve hired a new CEO ...,30150,31364,361878,120525183.0,2900
3,999367582271422464,2018-05-23 19:12:53+00:00,Going to create a site where the public can ra...,41664,17931,235900,,1333
4,1619415871902056449,2023-01-28 19:23:25+00:00,"@atensnut Nonetheless, I apologize",414,1166,8046,1923932.0,59


In [6]:
# Stack the quote tweets on top of the original tweets and renumber the rows
# 0..n-1. Columns present in only one frame are filled with NaN in the other's
# rows (see quote_count in the info() output below).
tweet_frames = [quotes, originals]
all_tweets = pd.concat(tweet_frames, ignore_index=True)

all_tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7277 entries, 0 to 7276
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   tweet_id         7277 non-null   int64  
 1   tweet_timestamp  7277 non-null   object 
 2   tweet_body       7277 non-null   object 
 3   retweet_count    7277 non-null   int64  
 4   reply_count      7277 non-null   int64  
 5   like_count       7277 non-null   int64  
 6   quote_count      7147 non-null   float64
 7   view_count       6746 non-null   float64
 8   bookmark_count   7277 non-null   int64  
dtypes: float64(2), int64(5), object(2)
memory usage: 511.8+ KB


In [7]:
# Parse the timestamp strings, strip the timezone, and keep only the calendar
# date formatted as MM-DD-YYYY text.
parsed_timestamps = pd.to_datetime(all_tweets['tweet_timestamp'])
date_strings = parsed_timestamps.dt.tz_localize(None).dt.strftime('%m-%d-%Y')

# Append the new 'date' column and retire the full timestamp in one chain.
all_tweets = all_tweets.assign(date=date_strings).drop(columns=['tweet_timestamp'])
all_tweets.head()

Unnamed: 0,tweet_id,tweet_body,retweet_count,reply_count,like_count,quote_count,view_count,bookmark_count,date
0,1655978502187778073,Yup,3255,3747,39533,225.0,11392206.0,281,05-09-2023
1,1655968899903418373,Massive public manipulation,9811,2694,49528,534.0,14404853.0,1241,05-09-2023
2,1647339741610926080,💯,8257,4793,106036,376.0,18520248.0,366,04-15-2023
3,1646228474628280326,🤣🤣,10198,5076,108462,430.0,16690340.0,340,04-12-2023
4,1640171198091866114,Prescient,9193,5118,56272,572.0,25169601.0,3792,03-27-2023


In [8]:
import re

# Define cleaning function
def clean_tweet(text):
    """Normalize a raw tweet into lowercase, punctuation-free text.

    Steps, in order: lowercase; strip URLs, @mentions and #hashtags; remove the
    standalone retweet marker "rt"; delete every remaining character that is
    neither a word character nor whitespace (this also removes emoji); collapse
    runs of whitespace and trim the ends.

    Parameters
    ----------
    text : str or missing value
        Raw tweet body. ``None`` / ``NaN`` are accepted.

    Returns
    -------
    str
        The cleaned text. ``""`` for missing input, or when nothing survives
        cleaning (e.g. an emoji-only tweet).
    """
    if pd.isnull(text):
        return ""
    text = text.lower()  # lowercase
    text = re.sub(r'http\S+', '', text)  # remove URLs
    text = re.sub(r'@\w+', '', text)     # remove @mentions
    text = re.sub(r'#\w+', '', text)     # remove hashtags
    # Remove the retweet marker only when it is a whole word. The previous
    # pattern r'rt[\s]+' had no word boundary, so it also ate the end of any
    # word finishing in "rt" and fused it with the next one
    # ("start now" -> "stanow", "smart people" -> "smapeople").
    text = re.sub(r'\brt\b', ' ', text)
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation (and emoji)
    text = re.sub(r'\s+', ' ', text).strip()  # remove extra spaces
    return text

# Run the cleaner over every tweet body and store the result alongside it.
raw_bodies = all_tweets['tweet_body']
all_tweets['clean_text'] = raw_bodies.map(clean_tweet)

# Show the first rows to eyeball the new column.
all_tweets.head()

Unnamed: 0,tweet_id,tweet_body,retweet_count,reply_count,like_count,quote_count,view_count,bookmark_count,date,clean_text
0,1655978502187778073,Yup,3255,3747,39533,225.0,11392206.0,281,05-09-2023,yup
1,1655968899903418373,Massive public manipulation,9811,2694,49528,534.0,14404853.0,1241,05-09-2023,massive public manipulation
2,1647339741610926080,💯,8257,4793,106036,376.0,18520248.0,366,04-15-2023,
3,1646228474628280326,🤣🤣,10198,5076,108462,430.0,16690340.0,340,04-12-2023,
4,1640171198091866114,Prescient,9193,5118,56272,572.0,25169601.0,3792,03-27-2023,prescient


In [9]:
# Persist the combined, cleaned tweets without writing the row index.
output_csv_path = 'all_musk_tweets.csv'
all_tweets.to_csv(output_csv_path, index=False)