In [2]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt
import re, string, json, tweepy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer
from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from wordcloud import WordCloud

In [2]:
# Baca token untuk akses twitter API
with open("token.json")as tok:
  tokens = json.load(tok)

bearer_token = tokens['bearer_token']
api_key = tokens['api_key']
api_key_secret = tokens['api_key_secret']
access_token = tokens['access_token']
access_token_secret = tokens['access_token_secret']

In [3]:
# Autentikasi twitter API
api = tweepy.Client(bearer_token=bearer_token, wait_on_rate_limit=True)

# Kata kunci pencarian topik ini menggunakan hashtag agar lebih relevan
keyword = "#UkraineRussiaWar"
search_query = keyword + "-filter:retweets"

In [4]:
# Membuat request tweets sesuai kata kunci yang sudah ditentukan
response = tweepy.Paginator(
    api.search_recent_tweets,
    query=keyword,
    max_results=100
).flatten(limit=100)

# Menyimpan data tweets ke dalam array dan pandas dataframe dan menyimpan ke dalam file csv
tweets_array = []
for tweet in response:
    tweets_array.append(tweet.text.strip())

df_tweets = pd.DataFrame(tweets_array, columns = ['tweet'])

# Menampilkan dataframe
display(df_tweets.head())

Unnamed: 0,tweet
0,RT @BeachfrontBeavs: 🎉MINT IS LIVE🎉\n\nJoin th...
1,RT @Bill91957697: 4月19日，#UkraineRussiaWar \n#U...
2,What #USA and #NATO want in #Donbas?\n\n#Ukrai...
3,RT @ItaMilRadar: New morning of missions for #...
4,RT @ItaMilRadar: Also today mission over the B...


In [4]:
# Fungsi-fungsi yang diperlukan

## Case Folding
def case_folding(data):
    data = data.lower()
    data = re.sub('@[^\s]+','',data)
    data = ' '.join(re.sub("(#[A-Za-z0-9]+)|(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ",data).split())
    data = re.sub(r"\d+", "", data)
    data = re.sub(r"\n","",data)
    data = re.sub(r"\t","",data)
    data = data.translate(str.maketrans("","",string.punctuation))
    return data

## Remove Stopwords
def remove_stopwords(data):
    sw_indonesia = stopwords.words("indonesian")
    data  = [word for word in data if word not in sw_indonesia]
    data = ' '.join(data)
    return data

# Stemming
def words_stemming(data):
    ps = PorterStemmer()
    data = ps.stem(data)
    return data     

## Remove Slang
def remove_slang(data, slang_data):
    remove_slang = []
    for tweet in clean_text_tweets:
        word_list = []
        for word in word_tokenize(tweet):
            if slang_data.get(word):
                word = slang_data[word]
            word_list.append(word)
        remove_slang.append(' '.join(word_list))
    return remove_slang

## Distribusi Frekuensi Kata
def word_frequency(list):
    tokenizedData = []
    for sentence in list:
        t_kata = word_tokenize(sentence)
        tokenizedData += t_kata

    return FreqDist(tokenizedData)

In [8]:
# Menjalankan pembersihan data
clean_tweets = []
for i in range(len(df_tweets['tweet'])):
    data = case_folding(df_tweets['tweet'][i])
    data = word_tokenize(data)
    data = remove_stopwords(data)
    # data = words_stemming(data)
    clean_tweets.append(data)

In [11]:
df_tweets['clean_tweet'] = clean_tweets
df_tweets.head()

Unnamed: 0,tweet,clean_tweet
0,RT @BeachfrontBeavs: 🎉MINT IS LIVE🎉\n\nJoin th...,rt mint is live join the colony of beavers bui...
1,RT @Bill91957697: 4月19日，#UkraineRussiaWar \n#U...,rt
2,What #USA and #NATO want in #Donbas?\n\n#Ukrai...,what and want in
3,RT @ItaMilRadar: New morning of missions for #...,rt new morning of missions for assets
4,RT @ItaMilRadar: Also today mission over the B...,rt also today mission over the black sea for a...


In [12]:
df_tweets = df_tweets.drop_duplicates()

nan_value = float("NaN")
df_tweets.replace("", nan_value, inplace=True)
df_tweets.dropna(subset = ["clean_tweet"], inplace=True)
df_tweets = df_tweets.reset_index(drop=True)

display(df_tweets)

Unnamed: 0,tweet,clean_tweet
0,RT @BeachfrontBeavs: 🎉MINT IS LIVE🎉\n\nJoin th...,rt mint is live join the colony of beavers bui...
1,RT @Bill91957697: 4月19日，#UkraineRussiaWar \n#U...,rt
2,What #USA and #NATO want in #Donbas?\n\n#Ukrai...,what and want in
3,RT @ItaMilRadar: New morning of missions for #...,rt new morning of missions for assets
4,RT @ItaMilRadar: Also today mission over the B...,rt also today mission over the black sea for a...
...,...,...
64,RT @james_oaten: Putin's invasion of Ukraine o...,rt putin s invasion of ukraine opened a year o...
65,SBU already arrested her 😰😰😰🤮 tread 👇\n\n#Ukra...,sbu already arrested her tread
66,Putin incorporates diversity\n35% OF THE SALES...,putin incorporates diversity of the sales will...
67,Canada #inflation in March 2022 is now 6.70%. ...,canada in march is now bank of canada wants to...
