In [4]:
import pandas as pd
import tweepy # For querying place_id and obtaining the coordinates
import re # For preprocessing the tweets

# For the implementation of BERT
from transformers import AutoTokenizer, AutoModelForSequenceClassification 
import torch

import scipy.stats as stats # For ANOVA
from wordcloud import WordCloud, STOPWORDS # For the creation of word clouds
import folium # For displaying map

In [60]:
# Initialise datasets
df_pemilu_2024 = pd.read_csv(r'C:\Users\berna\Downloads\Data Twitter Bernard\data pemilu 2024 id.csv',low_memory=False)

df_Anies_Baswedan = pd.read_csv(r'C:\Users\berna\Downloads\Data Twitter Bernard\data_Anies Baswedan lang_id.csv',low_memory=False)

df_G20 = pd.read_csv(r'C:\Users\berna\Downloads\Data Twitter Bernard\data_G20 lang_id.csv',low_memory=False)

df_Omnibus_law = pd.read_csv(r'C:\Users\berna\Downloads\Data Twitter Bernard\data_Omnibus law lang_id.csv',low_memory=False)

df_Pemindahan_ibukota = pd.read_csv(r'C:\Users\berna\Downloads\Data Twitter Bernard\data_Pemindahan ibukota lang_id.csv',low_memory=False)

df_RKUHP = pd.read_csv(r'C:\Users\berna\Downloads\Data Twitter Bernard\data_RKUHP lang_id.csv',low_memory=False)

df_Sambo = pd.read_csv(r'C:\Users\berna\Downloads\Data Twitter Bernard\data_Sambo lang_id.csv',low_memory=False)

In [61]:
# Small fixes to the datasets

df_pemilu_2024.drop(index=[0,1], inplace=True) # Drops the first two rows since the rows are duplicates of the column labels
df_pemilu_2024.drop(columns=['tweet'], inplace=True) # Drops the column 'tweet' which contain no values
df_pemilu_2024.rename(columns={'source':'tweet'}, inplace=True) # Renames the column 'source' to 'tweet' because 'source' contains the tweets
df_pemilu_2024.reset_index(level=None, drop=True, inplace=True, col_level=0, col_fill='') # Resets index to address the first two rows that were dropped


# These csv files do not include the appropriate column labels in the first row, the code below addresses that issue
cols = ['author id','created_at','geo','id','lang','like_count','quote_count','reply_count','retweet_count','tweet']

df_Anies_Baswedan = (df_Anies_Baswedan.T.reset_index().T.reset_index(drop=True).set_axis([i for i in cols],axis=1))
df_G20 = (df_G20.T.reset_index().T.reset_index(drop=True).set_axis([i for i in cols],axis=1))
df_Omnibus_law = (df_Omnibus_law.T.reset_index().T.reset_index(drop=True).set_axis([i for i in cols],axis=1))
df_Pemindahan_ibukota = (df_Pemindahan_ibukota.T.reset_index().T.reset_index(drop=True).set_axis([i for i in cols],axis=1))
df_RKUHP = (df_RKUHP.T.reset_index().T.reset_index(drop=True).set_axis([i for i in cols],axis=1))
df_Sambo = (df_Sambo.T.reset_index().T.reset_index(drop=True).set_axis([i for i in cols],axis=1))

In [62]:
# Eliminates hypertext links from the tweets

text_tweet_pemilu_2024 = " ".join(tweet for tweet in df_pemilu_2024['tweet'])
text_tweet_pemilu_2024 = re.sub(r'https?:\/\/.*[\r\n]*', '', text_tweet_pemilu_2024, flags=re.MULTILINE)

text_tweet_Anies_Baswedan = " ".join(tweet for tweet in df_Anies_Baswedan['tweet'])
text_tweet_Anies_Baswedan = re.sub(r'https?:\/\/.*[\r\n]*', '', text_tweet_Anies_Baswedan, flags=re.MULTILINE)

text_tweet_G20 = " ".join(tweet for tweet in df_G20['tweet'])
text_tweet_G20 = re.sub(r'https?:\/\/.*[\r\n]*', '', text_tweet_G20, flags=re.MULTILINE)

text_tweet_Omnibus_law = " ".join(tweet for tweet in df_Omnibus_law['tweet'])
text_tweet_Omnibus_law = re.sub(r'https?:\/\/.*[\r\n]*', '', text_tweet_Omnibus_law, flags=re.MULTILINE)

text_tweet_Pemindahan_ibukota = " ".join(tweet for tweet in df_Pemindahan_ibukota['tweet'])
text_tweet_Pemindahan_ibukota = re.sub(r'https?:\/\/.*[\r\n]*', '', text_tweet_Pemindahan_ibukota, flags=re.MULTILINE)

text_tweet_RKUHP = " ".join(tweet for tweet in df_RKUHP['tweet'])
text_tweet_RKUHP = re.sub(r'https?:\/\/.*[\r\n]*', '', text_tweet_RKUHP, flags=re.MULTILINE)

text_tweet_Sambo = " ".join(tweet for tweet in df_Sambo['tweet'])
text_tweet_Sambo = re.sub(r'https?:\/\/.*[\r\n]*', '', text_tweet_Sambo, flags=re.MULTILINE)

text_overall = text_tweet_pemilu_2024 + text_tweet_Anies_Baswedan + text_tweet_G20 + text_tweet_Omnibus_law + text_tweet_Pemindahan_ibukota + text_tweet_RKUHP + text_tweet_Sambo

Word Cloud Generation

In [63]:
# Initialise WordCloud object

stopwords = STOPWORDS
wc = WordCloud (
        background_color = 'white',
        stopwords = stopwords,
        height = 720,
        width = 480
)

# Generate word cloud

wc.generate(text_tweet_pemilu_2024)
wc.to_file('wc_pemilu_2024.png')

wc.generate(text_tweet_Anies_Baswedan)
wc.to_file('wc_Anies_Baswedan.png')

wc.generate(text_tweet_G20)
wc.to_file('wc_G20.png')

wc.generate(text_tweet_Omnibus_law)
wc.to_file('wc_Omnibus_law.png')

wc.generate(text_tweet_Pemindahan_ibukota)
wc.to_file('wc_Pemindahan_ibukota.png')

wc.generate(text_tweet_RKUHP)
wc.to_file('wc_RKUHP.png')

wc.generate(text_tweet_Sambo)
wc.to_file('wc_Sambo.png')

wc.generate(text_overall)
wc.to_file('wc_overall.png')

<wordcloud.wordcloud.WordCloud at 0x23fc1827070>

Sentiment Analysis

In [91]:
# Tokenization - Break down string into individual lexical units (words)
tokenizer = AutoTokenizer.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

# Load model
model = AutoModelForSequenceClassification.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

def sentiment_score(review):
    tokens = tokenizer.encode(review, return_tensors='pt')
    result = model(tokens)
    return int(torch.argmax(result.logits))+1 # Obtain the argmax and scale the integer with logits

In [98]:
# Apply the function to all the tweets in the csv file, add new column to contain the sentiment score, and save the updated csv file
# Note : The sentiment analysis has been performed on all the keywords. Simply use the updated csv files

df_Sambo['sentiment'] = df_Sambo['tweet'].apply(lambda x: sentiment_score(x[:512]))

df_Sambo.to_csv('data_Sambo with sentiment.csv', encoding='utf-8')

In [None]:
# Initialize new DataFrame object
df_sentiment_avg = pd.DataFrame(['pemilu 2024','Anies Baswedan','G20','Omnibus law','Pemindahan ibukota','RKUHP','Sambo'],columns=['opic'])

# Allows the iteration of the dataframes for each keyword/topic
dfs = [df_pemilu_2024,df_Anies_Baswedan,df_G20,df_Omnibus_law,df_Pemindahan_ibukota,df_RKUHP,df_Sambo]

new_column_sentiment_avg = []

# Calculate the average sentiment score for each keyword/topic
for df in dfs:
    sentiment_total = 0
    for i in df.index:
        sentiment_total += df['sentiment'][i]
    new_column_sentiment_avg.append(sentiment_total/len(df))
    
df_sentiment_avg['sentiment_avg'] = new_column_sentiment_avg


Tweets by Geographic Location

In [None]:
auth = tweepy.AppAuthHandler('M8Fv0HWuP3s3fFSqmnnFokclQ','vRZed45GpFlft8kurvaE3szC9avQySPKknmlaMfC4VnBvugNCP') # Input API key and API secret key to obtain authorization
api = tweepy.API(auth) # Initialize API with the authorization

In [None]:
temp = []

# Obtain the place name, latitude, and longitude from the place_id
for i in range(1,36):
    place_temp = api.geo_id(str(df_pemilu_2024['geo'].value_counts().index[i]))

    coords = place_temp.centroid

    data = {
        'place name': place_temp.full_name,
        'tweet count': df_pemilu_2024['geo'].value_counts()[i],
        'latitude': coords[1],
        'longitude': coords[0],
    }
    temp.append(data)

geoloc_pemilu_2024 = pd.DataFrame(temp)

In [65]:
# Save geolocation data as csv file

geoloc_Anies_Baswedan.to_csv('geoloc_Anies Baswedan.csv',encoding='utf-8')

df_geoloc_Anies_Baswedan = pd.read_csv('geoloc_Anies Baswedan.csv',low_memory=False)

df_geoloc_Anies_Baswedan

Unnamed: 0.1,Unnamed: 0,place name,tweet count,latitude,longitude
0,0,"Kota Jambi, Jambi",105,-1.621604,103.645906
1,1,"Pangkalan Baru, Indonesia",48,-2.369941,106.143882
2,2,"Gorontalo, Indonesia",16,0.673446,122.254388
3,3,"Bali, Indonesia",11,-8.455514,115.27046
4,4,"Banten, Indonesia",8,-6.437392,106.114829
5,5,"Padang Utara, Indonesia",7,-0.913014,100.357783
6,6,"Pondok Aren, Indonesia",6,-6.266571,106.708544
7,7,"West Papua, Indonesia",4,-0.23158,130.840537
8,8,"Pulo Gadung, Indonesia",4,-6.191189,106.895843
9,9,"South Sumatra, Indonesia",4,-3.321169,104.39114


In [66]:
# Create map
m_Anies_Baswedan = folium.Map(location=[0.133,116.783], zoom_start=4.5,tiles="Stamen Terrain")

# Insert markers
for i in range(len(df_geoloc_Anies_Baswedan)): 
    folium.Marker([df_geoloc_Anies_Baswedan['latitude'][i],df_geoloc_Anies_Baswedan['longitude'][i]],popup=df_geoloc_Anies_Baswedan['place name'][i],tooltip=str(df_geoloc_Anies_Baswedan['tweet count'][i])).add_to(m_Anies_Baswedan)

m_Anies_Baswedan

Hypothesis Testing

In [81]:
# Initialize the updated csv files
df_pemilu_2024 = pd.read_csv("data_pemilu_2024 with sentiment.csv")

df_Anies_Baswedan = pd.read_csv("data_Anies_Baswedan with sentiment.csv")

df_G20 = pd.read_csv("data_G20 with sentiment.csv")

df_Omnibus_law = pd.read_csv("data_Omnibus_law with sentiment.csv")

df_Pemindahan_ibukota = pd.read_csv("data_Pemindahan_ibukota with sentiment.csv")

df_RKUHP = pd.read_csv("data_RKUHP with sentiment.csv")

df_Sambo = pd.read_csv("data_Sambo with sentiment.csv")

In [90]:
# Performs ANOVA
stats.f_oneway(df_pemilu_2024, df_Anies_Baswedan, df_G20, df_Omnibus_law, df_Pemindahan_ibukota, df_RKUHP, df_Sambo)

F_onewayResult(statistic=1276.6626721469609, pvalue=0.0)