In [17]:
import pandas as pd
import numpy as np
import os
import glob

## Data path

In [2]:
comments_path = "data_comments_crawled/"
quotes_path = 'data_quotes_crawled/'

### Load files and append them into a list of DataFrames

In [3]:
csv_pat = os.path.join(comments_path, '*''*.csv')
csv_fil = glob.glob(csv_pat)

df_list = []
for f in csv_fil:
    try:
        df = pd.read_csv(f)
        df_list.append(df)
        print(f"loaded {os.path.basename(f)} (rows: {len(df)})")
    except Exception as e:
        print(f"Failed load {os.path.basename(f)}: {e}")

loaded 17 tuntutan rakyat dalam satu minggu ke depan, dan 8 tuntutan rakyat dalam satu tahun ke depan.csv (rows: 58)
loaded Anggota Kolektif 17+8 Indonesia Berbenah.csv (rows: 129)
loaded Kakak OJOL ini ternyata lebih cerdas, lebih waras daripada influencer yang suka demo dengan tuntutan 17+8.csv (rows: 68)
loaded ‘17+8 Tuntutan Rakyat’ have been released.csv (rows: 204)
loaded DPR RI menanggapi 17+8 Tuntutan Rakyat dengan mengeluarkan enam poin keputusan.csv (rows: 31)
loaded 17 tuntutan dalam 1 minggu + 8 tuntutan dalam 1 tahun.csv (rows: 10)
loaded DPR to hold meeting with factions today following nationwide protests over 17+8 demands.csv (rows: 71)
loaded Menteri Keuangan Purbaya Yudhi Sadewa meminta maaf soal komentarnya mengenai tuntutan 17+8.csv (rows: 83)
loaded 7+8 Tuntutan Rakyat dalam seminggu dan setahun (dengan deadline).csv (rows: 35)
loaded Masalah w sama tuntutan 17+8 cuma satu.csv (rows: 34)
loaded KONFERENSI PERS TUNTUTAN 17+8.csv (rows: 86)
loaded “Saya kira ini masu

In [5]:
csv_quotes = os.path.join(quotes_path, "*.csv")
quotes_file = glob.glob(csv_quotes)

quotes_list = []

for f in quotes_file:
    df = pd.read_csv(f)
    quotes_list.append(df)
    print(f"Loaded (Rows {len(df)}) \t: {os.path.basename(f)}")

Loaded (Rows 48) 	: 17 tuntutan rakyat dalam satu minggu ke depan, dan 8 tuntutan rakyat dalam satu tahun ke depan.csv
Loaded (Rows 132) 	: Anggota Kolektif 17+8 Indonesia Berbenah.csv
Loaded (Rows 6) 	: Kakak OJOL ini ternyata lebih cerdas, lebih waras daripada influencer yang suka demo dengan tuntutan 17+8.csv
Loaded (Rows 149) 	: ‘17+8 Tuntutan Rakyat’ have been released.csv
Loaded (Rows 18) 	: DPR RI menanggapi 17+8 Tuntutan Rakyat dengan mengeluarkan enam poin keputusan.csv
Loaded (Rows 40) 	: 17 tuntutan dalam 1 minggu + 8 tuntutan dalam 1 tahun.csv
Loaded (Rows 59) 	: DPR to hold meeting with factions today following nationwide protests over 17+8 demands.csv
Loaded (Rows 37) 	: Menteri Keuangan Purbaya Yudhi Sadewa meminta maaf soal komentarnya mengenai tuntutan 17+8.csv
Loaded (Rows 101) 	: 7+8 Tuntutan Rakyat dalam seminggu dan setahun (dengan deadline).csv
Loaded (Rows 7) 	: Masalah w sama tuntutan 17+8 cuma satu.csv
Loaded (Rows 154) 	: KONFERENSI PERS TUNTUTAN 17+8.csv
Load

### Merge all DataFrames from the list into a single DataFrame

In [4]:
combined_comments_df = pd.concat(df_list, ignore_index=True)

print(f"Total concat: {len(combined_comments_df)}")

combined_comments_path = "combined_comments.csv"
combined_comments_df.to_csv(combined_comments_path, index=False)

Total concat: 1319


In [6]:
combined_quotes_df = pd.concat(quotes_list, ignore_index=True)
print(f"Total concat: {len(combined_quotes_df)}")

combined_quotes_path = "combined_quotes.csv"
combined_quotes_df.to_csv(combined_quotes_path, index=False)

Total concat: 1033


## Load the combined data files into DataFrame

In [7]:
comments = pd.read_csv(combined_comments_path)
quotes = pd.read_csv(combined_quotes_path)

### Check for duplicate data

In [8]:
print(f"Duplicated comments {comments.duplicated().sum()}")
print(f"Duplicated quotes {quotes.duplicated().sum()}")

Duplicated comments 0
Duplicated quotes 0


In [9]:
comments.drop_duplicates(subset='text', keep='first', inplace=True)
print(comments.duplicated().sum())

comments

0


Unnamed: 0,user,handle,text,reply_count,retweet_count,like_count,datetime,time_text
0,"17 tuntutan rakyat dalam satu minggu ke depan,...",,"17 tuntutan rakyat dalam satu minggu ke depan,...",0,0,0,2025-09-01T04:50:30.000Z,"11:50 AM · Sep 1, 2025"
1,dr pemilu kemaren banyak kan bajer sm org2 beg...,3.4K,dr pemilu kemaren banyak kan bajer sm org2 beg...,0,0,0,2025-09-01T11:38:25.000Z,Sep 1
2,UU perampasan aset harus sudah ketok palu maks...,1K,UU perampasan aset harus sudah ketok palu maks...,0,0,0,2025-09-01T15:53:51.000Z,Sep 1
3,kalo gk terpenuhi apa yg bakal di lakuin?,2.1K,kalo gk terpenuhi apa yg bakal di lakuin?,0,0,0,2025-09-01T15:08:00.000Z,Sep 1
4,Jangan cuma pendemo yang dikasih tenggat waktu...,1K,Jangan cuma pendemo yang dikasih tenggat waktu...,0,0,0,2025-09-01T11:20:23.000Z,Sep 1
...,...,...,...,...,...,...,...,...
1314,Paling juga ditunda,249,Paling juga ditunda,0,0,0,2025-09-04T01:40:39.000Z,4 Sep
1315,SURUH MUNDUR. KEPALA DINASTI KOREA BESERTA KOR...,2,SURUH MUNDUR. KEPALA DINASTI KOREA BESERTA KOR...,0,0,0,2025-09-04T04:47:01.000Z,4 Sep
1316,jd konsen kita :\n1. Protes terus hingga polis...,,jd konsen kita : 1. Protes terus hingga polisi...,0,0,0,2025-09-04T19:36:01.000Z,5 Sep
1317,Kita lihat nanti keputusan nya apa\nKarena mer...,13,Kita lihat nanti keputusan nya apa Karena mere...,0,0,0,2025-09-04T03:01:20.000Z,4 Sep


### Remove the duplicate from DataFrame

In [10]:
quotes.drop_duplicates(subset='text', keep='first', inplace=True)
print(quotes.duplicated().sum())
quotes

0


Unnamed: 0,status_id,url,author_handle,author_name,text,reply_count,retweet_count,like_count,datetime,time_text
0,1962387238639239411,https://x.com/rarasayu1992/status/196238723863...,rarasayu1992,"17 tuntutan rakyat dalam satu minggu ke depan,...",Raras @rarasayu1992 · Sep 1 Quote BERSYUKUR AT...,0,0,0,2025-09-01T05:29:11.000Z,Sep 1
1,1962554564449890790,https://x.com/ququyaaaa/status/196255456444989...,ququyaaaa,Pak...mohon dibaca...,Pak...mohon dibaca...,0,0,0,2025-09-01T16:34:05.000Z,Sep 1
2,1962521263097909419,https://x.com/pereirautumn/status/196252126309...,pereirautumn,"Pak, kalau peduli bolehlah lirik ini demi kepe...","Pak, kalau peduli bolehlah lirik ini demi kepe...",0,0,0,2025-09-01T14:21:45.000Z,Sep 1
3,1962720215676182803,https://x.com/racikan_twt/status/1962720215676...,racikan_twt,"Tiap bapak ibu DPR dikritik dalihnya selalu; ""...","Tiap bapak ibu DPR dikritik dalihnya selalu; ""...",0,0,0,2025-09-02T03:32:19.000Z,Sep 2
4,1962813504324501605,https://x.com/tanpakdany/status/19628135043245...,tanpakdany,"Kayak maaf nih, ini tuh tuntutan yg sangat gen...","Kayak maaf nih, ini tuh tuntutan yg sangat gen...",0,0,0,2025-09-02T09:43:01.000Z,Sep 2
...,...,...,...,...,...,...,...,...,...,...
1028,1963414889122480497,https://x.com/titik3bintang/status/19634148891...,titik3bintang,tolong jangan pada bertingkah kalo rapat plz,tolong jangan pada bertingkah kalo rapat plz,0,0,0,2025-09-04T01:32:42.000Z,Sep 4
1029,1963453051660111990,https://x.com/MOJASEIRE/status/196345305166011...,MOJASEIRE,dmn tuch di hotel?,dmn tuch di hotel?,0,0,0,2025-09-04T04:04:21.000Z,Sep 4
1030,1963422152755224771,https://x.com/littlet0mat0/status/196342215275...,littlet0mat0,"Deadlinenya besok ya pak, bu. Yang bener. Kita...","Deadlinenya besok ya pak, bu. Yang bener. Kita...",0,0,0,2025-09-04T02:01:34.000Z,Sep 4
1031,1963429104042348684,https://x.com/AntoniGirsang/status/19634291040...,AntoniGirsang,Wahai Anggota,"Wahai Anggota @DPR_RI ,Ingat dan Bgmn Rakyat...",0,0,0,2025-09-04T02:29:11.000Z,Sep 4


## Merge quote and comments DataFrame

In [11]:
combine_quotes_comments = pd.concat([comments, quotes], ignore_index=True)

print(combine_quotes_comments.duplicated().sum())
combine_quotes_comments

0


Unnamed: 0,user,handle,text,reply_count,retweet_count,like_count,datetime,time_text,status_id,url,author_handle,author_name
0,"17 tuntutan rakyat dalam satu minggu ke depan,...",,"17 tuntutan rakyat dalam satu minggu ke depan,...",0,0,0,2025-09-01T04:50:30.000Z,"11:50 AM · Sep 1, 2025",,,,
1,dr pemilu kemaren banyak kan bajer sm org2 beg...,3.4K,dr pemilu kemaren banyak kan bajer sm org2 beg...,0,0,0,2025-09-01T11:38:25.000Z,Sep 1,,,,
2,UU perampasan aset harus sudah ketok palu maks...,1K,UU perampasan aset harus sudah ketok palu maks...,0,0,0,2025-09-01T15:53:51.000Z,Sep 1,,,,
3,kalo gk terpenuhi apa yg bakal di lakuin?,2.1K,kalo gk terpenuhi apa yg bakal di lakuin?,0,0,0,2025-09-01T15:08:00.000Z,Sep 1,,,,
4,Jangan cuma pendemo yang dikasih tenggat waktu...,1K,Jangan cuma pendemo yang dikasih tenggat waktu...,0,0,0,2025-09-01T11:20:23.000Z,Sep 1,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
2319,,,tolong jangan pada bertingkah kalo rapat plz,0,0,0,2025-09-04T01:32:42.000Z,Sep 4,1.963415e+18,https://x.com/titik3bintang/status/19634148891...,titik3bintang,tolong jangan pada bertingkah kalo rapat plz
2320,,,dmn tuch di hotel?,0,0,0,2025-09-04T04:04:21.000Z,Sep 4,1.963453e+18,https://x.com/MOJASEIRE/status/196345305166011...,MOJASEIRE,dmn tuch di hotel?
2321,,,"Deadlinenya besok ya pak, bu. Yang bener. Kita...",0,0,0,2025-09-04T02:01:34.000Z,Sep 4,1.963422e+18,https://x.com/littlet0mat0/status/196342215275...,littlet0mat0,"Deadlinenya besok ya pak, bu. Yang bener. Kita..."
2322,,,"Wahai Anggota @DPR_RI ,Ingat dan Bgmn Rakyat...",0,0,0,2025-09-04T02:29:11.000Z,Sep 4,1.963429e+18,https://x.com/AntoniGirsang/status/19634291040...,AntoniGirsang,Wahai Anggota


### Check for duplicate in merged DataFrame

In [12]:
# combine_quotes_comments = combine_quotes_comments[['text']]
combine_quotes_comments['text'].duplicated().sum()

np.int64(8)

### Remove the duplicate in merged DataFrame

In [13]:
combine_quotes_comments.drop_duplicates(subset='text', keep='first', inplace=True)
print(combine_quotes_comments.duplicated().sum())

0


### Remove null values from the merged DataFrame

In [14]:
combine_quotes_comments = combine_quotes_comments.dropna(subset=['text'])
combine_quotes_comments

Unnamed: 0,user,handle,text,reply_count,retweet_count,like_count,datetime,time_text,status_id,url,author_handle,author_name
0,"17 tuntutan rakyat dalam satu minggu ke depan,...",,"17 tuntutan rakyat dalam satu minggu ke depan,...",0,0,0,2025-09-01T04:50:30.000Z,"11:50 AM · Sep 1, 2025",,,,
1,dr pemilu kemaren banyak kan bajer sm org2 beg...,3.4K,dr pemilu kemaren banyak kan bajer sm org2 beg...,0,0,0,2025-09-01T11:38:25.000Z,Sep 1,,,,
2,UU perampasan aset harus sudah ketok palu maks...,1K,UU perampasan aset harus sudah ketok palu maks...,0,0,0,2025-09-01T15:53:51.000Z,Sep 1,,,,
3,kalo gk terpenuhi apa yg bakal di lakuin?,2.1K,kalo gk terpenuhi apa yg bakal di lakuin?,0,0,0,2025-09-01T15:08:00.000Z,Sep 1,,,,
4,Jangan cuma pendemo yang dikasih tenggat waktu...,1K,Jangan cuma pendemo yang dikasih tenggat waktu...,0,0,0,2025-09-01T11:20:23.000Z,Sep 1,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
2319,,,tolong jangan pada bertingkah kalo rapat plz,0,0,0,2025-09-04T01:32:42.000Z,Sep 4,1.963415e+18,https://x.com/titik3bintang/status/19634148891...,titik3bintang,tolong jangan pada bertingkah kalo rapat plz
2320,,,dmn tuch di hotel?,0,0,0,2025-09-04T04:04:21.000Z,Sep 4,1.963453e+18,https://x.com/MOJASEIRE/status/196345305166011...,MOJASEIRE,dmn tuch di hotel?
2321,,,"Deadlinenya besok ya pak, bu. Yang bener. Kita...",0,0,0,2025-09-04T02:01:34.000Z,Sep 4,1.963422e+18,https://x.com/littlet0mat0/status/196342215275...,littlet0mat0,"Deadlinenya besok ya pak, bu. Yang bener. Kita..."
2322,,,"Wahai Anggota @DPR_RI ,Ingat dan Bgmn Rakyat...",0,0,0,2025-09-04T02:29:11.000Z,Sep 4,1.963429e+18,https://x.com/AntoniGirsang/status/19634291040...,AntoniGirsang,Wahai Anggota


In [17]:
# combine_quotes_comments.duplicated().sum()
print(combine_quotes_comments['text'].isna().sum())

combine_quotes_comments = combine_quotes_comments[['text','datetime','time_text']]
combine_quotes_comments

0


Unnamed: 0,text,datetime,time_text
0,"17 tuntutan rakyat dalam satu minggu ke depan,...",2025-09-01T04:50:30.000Z,"11:50 AM · Sep 1, 2025"
1,dr pemilu kemaren banyak kan bajer sm org2 beg...,2025-09-01T11:38:25.000Z,Sep 1
2,UU perampasan aset harus sudah ketok palu maks...,2025-09-01T15:53:51.000Z,Sep 1
3,kalo gk terpenuhi apa yg bakal di lakuin?,2025-09-01T15:08:00.000Z,Sep 1
4,Jangan cuma pendemo yang dikasih tenggat waktu...,2025-09-01T11:20:23.000Z,Sep 1
...,...,...,...
2319,tolong jangan pada bertingkah kalo rapat plz,2025-09-04T01:32:42.000Z,Sep 4
2320,dmn tuch di hotel?,2025-09-04T04:04:21.000Z,Sep 4
2321,"Deadlinenya besok ya pak, bu. Yang bener. Kita...",2025-09-04T02:01:34.000Z,Sep 4
2322,"Wahai Anggota @DPR_RI ,Ingat dan Bgmn Rakyat...",2025-09-04T02:29:11.000Z,Sep 4


## Save the merged DataFrame as a CSV file

In [None]:
combine_quotes_comments.to_csv("final_data_combined_comments_quotes.csv", index=False)

# concatening comment_quote and post 

In [30]:
post = pd.read_csv("datasets/post_17plus8_X.csv")
comment_quote = pd.read_csv("datasets/final_data_combined_comments_quotes.csv")

print(post.info())
print(comment_quote.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 580 entries, 0 to 579
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   conversation_id_str      580 non-null    int64  
 1   created_at               580 non-null    object 
 2   favorite_count           580 non-null    int64  
 3   full_text                580 non-null    object 
 4   id_str                   580 non-null    int64  
 5   image_url                172 non-null    object 
 6   in_reply_to_screen_name  168 non-null    object 
 7   lang                     580 non-null    object 
 8   location                 0 non-null      float64
 9   quote_count              580 non-null    int64  
 10  reply_count              580 non-null    int64  
 11  retweet_count            580 non-null    int64  
 12  tweet_url                580 non-null    object 
 13  user_id_str              580 non-null    int64  
 14  username                 0

## Fixing datetime format

In [31]:
post['created_at'] = pd.to_datetime(post['created_at'])
print("POST INFO(): \n")
print(post.info())

POST INFO(): 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 580 entries, 0 to 579
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype              
---  ------                   --------------  -----              
 0   conversation_id_str      580 non-null    int64              
 1   created_at               580 non-null    datetime64[ns, UTC]
 2   favorite_count           580 non-null    int64              
 3   full_text                580 non-null    object             
 4   id_str                   580 non-null    int64              
 5   image_url                172 non-null    object             
 6   in_reply_to_screen_name  168 non-null    object             
 7   lang                     580 non-null    object             
 8   location                 0 non-null      float64            
 9   quote_count              580 non-null    int64              
 10  reply_count              580 non-null    int64              
 11  retweet_count    

  post['created_at'] = pd.to_datetime(post['created_at'])


In [32]:
comment_quote['created_at'] = pd.to_datetime(comment_quote['datetime'])
print("comment_quote INFO(): \n")
print(comment_quote.info())

comment_quote INFO(): 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2315 entries, 0 to 2314
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype              
---  ------      --------------  -----              
 0   text        2315 non-null   object             
 1   datetime    2315 non-null   object             
 2   time_text   2315 non-null   object             
 3   created_at  2315 non-null   datetime64[ns, UTC]
dtypes: datetime64[ns, UTC](1), object(3)
memory usage: 72.5+ KB
None


### Rename column for concatening

In [37]:
comment_quote.rename(columns={'text': 'full_text'}, inplace=True)
comment_quote

Unnamed: 0,full_text,datetime,time_text,created_at
0,"17 tuntutan rakyat dalam satu minggu ke depan,...",2025-09-01T04:50:30.000Z,"11:50 AM · Sep 1, 2025",2025-09-01 04:50:30+00:00
1,dr pemilu kemaren banyak kan bajer sm org2 beg...,2025-09-01T11:38:25.000Z,Sep 1,2025-09-01 11:38:25+00:00
2,UU perampasan aset harus sudah ketok palu maks...,2025-09-01T15:53:51.000Z,Sep 1,2025-09-01 15:53:51+00:00
3,kalo gk terpenuhi apa yg bakal di lakuin?,2025-09-01T15:08:00.000Z,Sep 1,2025-09-01 15:08:00+00:00
4,Jangan cuma pendemo yang dikasih tenggat waktu...,2025-09-01T11:20:23.000Z,Sep 1,2025-09-01 11:20:23+00:00
...,...,...,...,...
2310,tolong jangan pada bertingkah kalo rapat plz,2025-09-04T01:32:42.000Z,Sep 4,2025-09-04 01:32:42+00:00
2311,dmn tuch di hotel?,2025-09-04T04:04:21.000Z,Sep 4,2025-09-04 04:04:21+00:00
2312,"Deadlinenya besok ya pak, bu. Yang bener. Kita...",2025-09-04T02:01:34.000Z,Sep 4,2025-09-04 02:01:34+00:00
2313,"Wahai Anggota @DPR_RI ,Ingat dan Bgmn Rakyat...",2025-09-04T02:29:11.000Z,Sep 4,2025-09-04 02:29:11+00:00


In [41]:
comment_quote_ = comment_quote[['created_at', "full_text"]]
post_ = post[['created_at','full_text']]


## Merge the comment-quote and post DataFrame

In [44]:
post_comment_quote = pd.concat([comment_quote_,post_], ignore_index=True)
post_comment_quote

Unnamed: 0,created_at,full_text
0,2025-09-01 04:50:30+00:00,"17 tuntutan rakyat dalam satu minggu ke depan,..."
1,2025-09-01 11:38:25+00:00,dr pemilu kemaren banyak kan bajer sm org2 beg...
2,2025-09-01 15:53:51+00:00,UU perampasan aset harus sudah ketok palu maks...
3,2025-09-01 15:08:00+00:00,kalo gk terpenuhi apa yg bakal di lakuin?
4,2025-09-01 11:20:23+00:00,Jangan cuma pendemo yang dikasih tenggat waktu...
...,...,...
2890,2025-10-29 13:57:30+00:00,Salsa Erwina selain jadi Manajer Strategi Chao...
2891,2025-10-26 02:15:07+00:00,@Zavra077 @Ronnie_Rusli @prabowo Kalo menurutk...
2892,2025-10-27 08:07:15+00:00,Berita lama digoreng lagi. Coba cek BEM UI di ...
2893,2025-10-18 10:13:35+00:00,Katanya efisiensi anggaran sehingga dana untk ...


## Save the merged comment-quote and post DataFrame as a CSV file

In [46]:
post_comment_quote.to_csv('post_comment_quote_17plus8.csv', index=False)