# Data Preparation

In [21]:
# import all libraries for notebook
import pandas as pd
from pandas import json_normalize
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
from textblob import TextBlob
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
#nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, ConfusionMatrixDisplay
import json
from json import loads, dumps
from pprint import pprint

In [11]:
# Import files
df_replies = pd.read_csv('../data/twitter/tweets_isReply.csv', dtype={'id': 'object'}, low_memory=False)
df_tweets = pd.read_csv('../data/twitter/tweets_isTweet.csv', dtype={'id': 'object'}, low_memory=False)

# Create and prepare Dataframes
# Original tweets
df_nonQuotedTweets = df_tweets[df_tweets['quoted_tweet'].isna()]
df_nonQuotedTweets = df_nonQuotedTweets.rename(columns={'id': 'tweet_id', 'text': 'tweet_text'})
# Quoted tweets
df_quotedTweets = df_tweets[['id', 'text', 'quoted_tweet']].dropna(subset=['quoted_tweet'])
df_quotedTweets = df_quotedTweets.rename(columns={'id': 'tweet_id', 'text': 'tweet_text'})
# Cards
df_tweetsWithCard = df_tweets[['id', 'card']]
df_tweetsWithCard = df_tweetsWithCard.dropna(subset=['card'])
df_tweetsWithCard = df_tweetsWithCard.rename(columns={'id': 'tweet_id'})

In [15]:
print("[Max][df_tweets] ", df_tweets['createdAt'].max(), "\n")
print("[Min][df_tweets] ", df_tweets['createdAt'].min(), "\n")
print("[Max][df_replies] ", df_replies['createdAt'].max(), "\n")
print("[Max][df_replies] ", df_replies['createdAt'].min(), "\n")

[Max][df_tweets]  Wed Sep 27 23:06:04 +0000 2023 

[Min][df_tweets]  Fri Apr 04 02:42:55 +0000 2025 

[Max][df_replies]  Wed Sep 27 23:32:38 +0000 2023 

[Max][df_replies]  Fri Apr 04 02:22:25 +0000 2025 



In [17]:
# Functions
# Removing unnecessary data
def data_processing(text):
    text = text.lower()
    text = re.sub(r"https\S+|www\S+httpss\S+", '', text, flags=re.MULTILINE) # Remove Url
    text = re.sub(r"\@w+|\#", '', text) # remove @ and #
    text = re.sub(r"[^\w\s]", '', text) # remove punctuation
    text_tokens = text.split()
    filtered_text = [w for w in text_tokens if not w in stop_words]
    return " ".join(filtered_text)

# Reduction of dimensionality by abstracting word to word stem
stemmer = PorterStemmer() 
def stemming(data):
    text = [stemmer.stem(word) for word in data]
    return data

def polarity(text):
    return TextBlob(text).sentiment.polarity

def sentiment(label):
    if label < 0:
        return "negative"
    elif label == 0:
        return "neutral"
    elif label > 0:
        return "positive"
    return None

In [18]:
# Prepare quoted tweets

# Normalize quoted_tweet column
df_quotedTweets_normalized = pd.json_normalize(
    df_quotedTweets['quoted_tweet'].apply(json.loads),
    sep='_'
)

# Link both dataframes by index
df_quotedTweets_normalized.index = df_quotedTweets.index

# Rename columns
df_quotedTweets_normalized.columns = ['quoted_tweet_' + col for col in df_quotedTweets_normalized.columns]

# Concatenate the two dataframes
df_final = pd.concat([
    df_quotedTweets[['tweet_id', 'tweet_text']],
    df_quotedTweets_normalized[['quoted_tweet_id', 'quoted_tweet_text']]
], axis=1)

df_final.head()

Unnamed: 0,tweet_id,tweet_text,quoted_tweet_id,quoted_tweet_text
2,1917225430702240067,This is a big deal,1917223651625099407,"Last week, Treasury went live with its first a..."
4,1917103264417649121,Whoa,1917011279757066291,🚨THE INVISIBLE PUPPET MASTERS: AI'S DISTURBING...
5,1917099777327829386,"Next week, Grok 3.5 early beta release to Supe...",1917011847623987257,🚨GROK 3 SENDS USAGE SOARING – 10X SPIKE IN DOW...
6,1917071819003334728,It is an existential crisis!,1917059115417014780,A friendly reminder to make more babies!\n\n🇯🇵...
9,1917040536378335721,Starlink is trying out a service plan commitme...,1917029886432317947,$0 for the Standard Kit with 12-month resident...


In [25]:
# Pre-processing
df_final.quoted_tweet_text = df_final['quoted_tweet_text'].apply(data_processing)
df_final.tweet_text = df_final['tweet_text'].apply(data_processing)

# Stemming
df_final['quoted_tweet_text'] = df_final['quoted_tweet_text'].apply(lambda x: stemming(x))

# Polarity
df_final['polarity'] = df_final['quoted_tweet_text'].apply(polarity)

# Sentiment 
df_final['sentiment'] = df_final['polarity'].apply(sentiment)

print(df_final.head(), "\n")
print(df_final.shape)

              tweet_id                                         tweet_text  \
2  1917225430702240067                                           big deal   
4  1917103264417649121                                               whoa   
5  1917099777327829386  next week grok 35 early beta release supergrok...   
6  1917071819003334728                                 existential crisis   
9  1917040536378335721  starlink trying service plan commitment exchan...   

       quoted_tweet_id                                  quoted_tweet_text  \
2  1917223651625099407  last week treasury went live first automated p...   
4  1917011279757066291  invisible puppet masters ais disturbing new ro...   
5  1917011847623987257  grok 3 sends usage soaring 10x spike downloads...   
6  1917059115417014780  friendly reminder make babies japans total fer...   
9  1917029886432317947  0 standard kit 12month residential service pla...   

   polarity sentiment  
2 -0.005519  negative  
4  0.066279  positive  
5 