In [1]:
import os
print(os.getcwd())

/home/jupyter-dowonkim/parler


In [2]:
import glob
import json
import pandas as pd

# set directory path 
ndjson_directory = '/home/jupyter-dowonkim/parler'

ndjson_files = glob.glob(f'{ndjson_directory}/*.ndjson')


In [3]:
# retrieve fields that I need from each ndjson file 
for file_path in ndjson_files:
    extracted_data = []

    with open(file_path, 'r') as file:
        for line in file:
            try:
                data = json.loads(line)

                # body 필드가 빈 문자열이면 건너뜁니다.
                if data.get('body', '') == '':
                    continue

                # urls 데이터에서 필요한 정보를 추출합니다.
                domain, modified, short = '', '', ''
                if data.get('urls') and len(data['urls']) > 0:
                    url_data = data['urls'][0]
                    domain = url_data.get('domain', '')
                    modified = url_data.get('metadata', {}).get('modified', '')
                    short = url_data.get('short', '')

                selected_data = {
                    'body': data.get('body', ''),
                    'createdAtformatted': data.get('createdAtformatted', ''),
                    'hashtags': data.get('hashtags', []),
                    'upvotes': data.get('upvotes', 0),
                    'username': data.get('username', ''),
                    'verified': data.get('verified', False),
                    'impressions': data.get('impressions', 0),
                    'reposts': data.get('reposts', 0),
                    'creator': data.get('creator', ''),
                    'followers': data.get('followers', 0),
                    'following': data.get('following', 0),
                    'domain': domain,
                    'modified': modified,
                    'short': short
                }


                extracted_data.append(selected_data)
            except json.JSONDecodeError:
                # there could be decoding errors and I just skip those files
                continue

    df = pd.DataFrame(extracted_data)

    csv_file_name = file_path.split('/')[-1].replace('.ndjson', '.csv')
    csv_file_path = f'/home/jupyter-dowonkim/parler/extracted_data/{csv_file_name}'

    df.to_csv(csv_file_path, index=False)

In [4]:
import pandas as pd
import glob

# set directory path 
csv_directory = '/home/jupyter-dowonkim/parler/extracted_data/'
csv_files = glob.glob(f'{csv_directory}/*.csv')

# combine all csv files into one combinded dataframe 
combined_df = pd.concat([pd.read_csv(f) for f in csv_files], ignore_index=True)

combined_csv_path = '/home/jupyter-dowonkim/parler/combined_data.csv'
combined_df.to_csv(combined_csv_path, index=False)


In [5]:
# drop this column since there is no entry
combined_df = combined_df.drop(columns=['modified'])

In [6]:
# filter only the cases relevant to #STC 
def contains_keywords(row):
    keywords = ["saveourchildren", "savethechildren", "save the children", "save our children", "savethebabies", "save the babies"]

    if any(keyword in str(row['body']) for keyword in keywords):
        return True

    if any(keyword in str(hashtag) for hashtag in row['hashtags'] for keyword in keywords):
        return True

    return False

# Filter dataframe 
filtered_df = combined_df[combined_df.apply(contains_keywords, axis=1)]


In [7]:
# save
filtered_df.to_csv('/home/jupyter-dowonkim/parler/filtered_data.csv', index=False)

In [8]:
filtered_df

Unnamed: 0,body,createdAtformatted,hashtags,upvotes,username,verified,impressions,reposts,creator,followers,following,domain,short
174,Anyone seen these 2 Pedo's recentlly?????\n\n#...,2020-12-10 22:57:54 UTC,"['wwg1wga', 'maga', 'saveourchildren', 'redpil...",10,Hmichaud635841514226,False,792,14,88f5c528012e4a75bdb8c40d4baef097,438,25,newspunch.com,https://par.pw/l/GrbHa
1069,"Brian Kemp, if ads were honest. 😉\n\n#Train05 ...",2020-11-24 17:40:51 UTC,"['train05', 'mickeysdepot', 'cd9tc', 'fatherof...",160,twoHollowFangs,False,17000,130,b3a00a13c263480b91ba55f64cdd8432,3800,2500,streamable.com,https://api.parler.com/l/y9yfK
3242,OFFICIAL RECALL PETITION: Recall Gavin Newsom ...,2021-01-02 04:52:27 UTC,"['bidencheated', 'election2020', 'electionfrau...",21,PhanHuy888888,False,2800,12,5c7fb4559fcb4d75b006f8267cbca003,1500,0,recallgavin2020.com,https://par.pw/l/mUDqH
4046,Want to save the children? Start by standing u...,2020-08-10 20:15:42 UTC,[],50,Lizpevytoe,False,2900,23,a250b18ee7f64e99b8fbfb75ab2dc570,3300,3900,,https://video.parler.com/P7/sV/P7sVgYP2zLEp.mp4
4338,#saveourchildren,2020-11-01 20:45:21 UTC,['saveourchildren'],38,Curls6,False,2100,25,3f0a054fe8974c369a9017a60193f4d0,10000,18000,theblaze.com,https://api.parler.com/l/qSsh7
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1707058,🙌\n#saveourchildren\n🧸 \n\n178 arrested in chi...,2020-11-17 23:54:13 UTC,['saveourchildren'],30,Deadp00l,False,1300,16,b27e4fb9ab854afb8ff5cf64b7b77ab6,14000,9900,fox13news.com,https://api.parler.com/l/sapUb
1707091,#savethechildren #Lulz #Anonymous #Canada #pol...,2020-10-01 13:08:17 UTC,"['savethechildren', 'lulz', 'anonymous', 'cana...",47,MadMaxCa,False,2200,37,bfba2b19743b4099a86a24ce89c78e1d,3600,4800,foxbusiness.com,https://api.parler.com/l/5hdpD
1711720,#covidhoax2020 #covidisover #saveourchildren #...,2020-11-02 18:18:59 UTC,"['covidhoax2020', 'covidisover', 'saveourchild...",0,Lifeforce369,False,0,0,34eb5109ae704fafabf80952619a5cb4,246,286,bitchute.com,https://api.parler.com/l/Ge0n2
1713860,down with the deep state\n\nMUST WATCH!! PROOF...,2020-08-22 02:00:57 UTC,"['parler', 'wwg1wga', 'trump', 'trump2020', 'p...",2,DrGaryAlan,False,0,0,146d5797e6544d2996424fd06ede3209,2000,177,videos.utahgunexchange.com,https://api.parler.com/l/kUzVK


In [9]:
# install the textblob package
!pip install textblob -q
# import the textblob package
from textblob import TextBlob

In [10]:
# filtered_df의 'body' 열에 있는 각 내용에 대해 sentiment analysis를 수행하고 결과를 새로운 열에 저장
filtered_df['naive_bayes'] = filtered_df['body'].apply(lambda text: TextBlob(text).sentiment_assessments)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['naive_bayes'] = filtered_df['body'].apply(lambda text: TextBlob(text).sentiment_assessments)


In [11]:
!pip install empath -q

In [12]:
from empath import Empath
lexicon = Empath()

In [13]:
empath_results = []

for text in filtered_df['body']:
    result = lexicon.analyze(text, normalize=True)
    empath_results.append(result)

# 결과를 filtered_df에 추가
filtered_df['empath'] = empath_results

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['empath'] = empath_results


In [14]:
## Too big
## Let's refine the columns first
# load data
import pandas as pd
filtered_df = pd.read_csv('/home/jupyter-dowonkim/parler/result.csv')

In [15]:
filtered_df.iloc[0]

body                  In the police body cam footage, his wife Ouley...
createdAtformatted                              2020-12-23 18:51:35 UTC
hashtags                                                    ['warnock']
upvotes                                                              13
username                                                TheProudPatriot
verified                                                          False
impressions                                                         999
reposts                                                              13
creator                                85d380abbcc94eb0a8d1fbaa3a9edd81
followers                                                          7300
following                                                           682
domain                                                    wabcradio.com
short                                    https://api.parler.com/l/60E2t
naive_bayes           Sentiment(polarity=0.45714285714285713, su

In [16]:
import ast

# 'empath' 열의 문자열을 사전으로 변환하고, 'negative_emotion'과 'positive_emotion' 키만 필터링
def filter_empath_keys(empath_str):
    # 문자열을 사전으로 변환
    empath_dict = ast.literal_eval(empath_str)
    # 'negative_emotion'과 'positive_emotion' 키만 필터링
    return {k: v for k, v in empath_dict.items() if k in ['negative_emotion', 'positive_emotion']}

filtered_empath = filtered_df['empath'].apply(filter_empath_keys)


In [17]:
#replace
filtered_df['empath'] = filtered_empath

In [18]:
# 'empath' 열에서 'positive_emotion'과 'negative_emotion' 값을 추출하고, 없으면 0을 할당하는 함수
def extract_empath_values(empath_dict, key):
    return empath_dict.get(key, 0)
    
# 새로운 열 생성
filtered_df['empath_positive'] = filtered_df['empath'].apply(lambda x: extract_empath_values(x, 'positive_emotion'))
filtered_df['empath_negative'] = filtered_df['empath'].apply(lambda x: extract_empath_values(x, 'negative_emotion'))

# remove empath column
filtered_df.drop('empath', axis=1, inplace=True)


In [19]:
filtered_df

Unnamed: 0,body,createdAtformatted,hashtags,upvotes,username,verified,impressions,reposts,creator,followers,following,domain,short,naive_bayes,empath_positive,empath_negative
0,"In the police body cam footage, his wife Ouley...",2020-12-23 18:51:35 UTC,['warnock'],13,TheProudPatriot,False,999,13,85d380abbcc94eb0a8d1fbaa3a9edd81,7300,682,wabcradio.com,https://api.parler.com/l/60E2t,"Sentiment(polarity=0.45714285714285713, subjec...",0.010870,0.000000
1,Anyone seen these 2 Pedo's recentlly?????\n\n#...,2020-12-10 22:57:54 UTC,"['wwg1wga', 'maga', 'saveourchildren', 'redpil...",10,Hmichaud635841514226,False,792,14,88f5c528012e4a75bdb8c40d4baef097,438,25,newspunch.com,https://par.pw/l/GrbHa,"Sentiment(polarity=0.0, subjectivity=0.0, asse...",0.000000,0.000000
2,A Palestinian woman on TV shows her explosive ...,2020-07-13 06:24:41 UTC,[],89,loveIsrael1,False,8400,80,663c124d46dd415fb3012e1134cf4311,48000,87000,politicsonline.net,https://par.pw/l/O9pEu,"Sentiment(polarity=0.05357142857142857, subjec...",0.000000,0.019608
3,Poor destitute women and children fleeing a wa...,2020-10-20 14:43:35 UTC,[],436,TommyRobinson,True,12000,210,b1ad1b19a70d43ffa9c21a1b650022a5,305000,240,,https://video.parler.com/WN/D6/WND68WY9kDWd_sm...,"Sentiment(polarity=-0.1708333333333333, subjec...",0.000000,0.062500
4,The data is clear: Our students are at incredi...,2020-07-29 14:49:53 UTC,[],39,DanForestNC,False,1900,15,0009ae87100c4a7cb71a1e314874b860,3300,7,image-cdn.parler.com,https://par.pw/l/QdJSX,"Sentiment(polarity=0.21666666666666665, subjec...",0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13423,‪I have never asked my followers to retweet. H...,2020-08-03 19:05:03 UTC,"['1', 'humantrafficking', 'endhumantrafficking...",1,Talessandra,False,0,0,ed6c64f39bc440919e7e2c0db66a170a,197,115,facebook.com,https://par.pw/l/6Ozdw,"Sentiment(polarity=0.10833333333333334, subjec...",0.000000,0.000000
13424,Democrats are lying sociopaths ally with Supre...,2019-07-04 18:58:06 UTC,[],0,Mikehatesevil,False,0,0,5790609987cd4ecc89fd992a4681ee04,9500,15000,i.imgur.com,https://par.pw/l/SCX3s,"Sentiment(polarity=-0.35, subjectivity=0.625, ...",0.000000,0.054545
13425,It's obvious that some conservative people car...,2020-07-15 00:50:57 UTC,[],8,Ramann2010,False,0,0,460637fd07824cc48d76dd892d649557,13000,16000,boomerspeaks.com,https://par.pw/l/urodD,"Sentiment(polarity=-0.01875, subjectivity=0.33...",0.024390,0.048780
13426,Maybe so but what reason does Joan Rivers have...,2020-12-09 01:20:10 UTC,[],0,Gatorcoastie,False,0,0,2fc0e045cdf944958f5141f59df24bd5,23000,45000,americasfreedomfighters.com,https://par.pw/l/blacC,"Sentiment(polarity=-0.25, subjectivity=0.53333...",0.035714,0.035714


In [20]:
import re

# 'naive_bayes' 열의 문자열에서 'polarity' 값을 추출하는 함수
def extract_polarity(sentiment_str):
    # 정규 표현식을 사용하여 polarity 값을 찾음
    match = re.search(r"polarity=(-?\d+\.\d+)", sentiment_str)
    if match:
        return float(match.group(1))
    else:
        return None

# 적용
filtered_df['naive_bayes_polarity'] = filtered_df['naive_bayes'].apply(extract_polarity)


In [21]:
# save
filtered_df.to_csv('/home/jupyter-dowonkim/parler/result2.csv', index=False)

In [22]:
filtered_df

Unnamed: 0,body,createdAtformatted,hashtags,upvotes,username,verified,impressions,reposts,creator,followers,following,domain,short,naive_bayes,empath_positive,empath_negative,naive_bayes_polarity
0,"In the police body cam footage, his wife Ouley...",2020-12-23 18:51:35 UTC,['warnock'],13,TheProudPatriot,False,999,13,85d380abbcc94eb0a8d1fbaa3a9edd81,7300,682,wabcradio.com,https://api.parler.com/l/60E2t,"Sentiment(polarity=0.45714285714285713, subjec...",0.010870,0.000000,0.457143
1,Anyone seen these 2 Pedo's recentlly?????\n\n#...,2020-12-10 22:57:54 UTC,"['wwg1wga', 'maga', 'saveourchildren', 'redpil...",10,Hmichaud635841514226,False,792,14,88f5c528012e4a75bdb8c40d4baef097,438,25,newspunch.com,https://par.pw/l/GrbHa,"Sentiment(polarity=0.0, subjectivity=0.0, asse...",0.000000,0.000000,0.000000
2,A Palestinian woman on TV shows her explosive ...,2020-07-13 06:24:41 UTC,[],89,loveIsrael1,False,8400,80,663c124d46dd415fb3012e1134cf4311,48000,87000,politicsonline.net,https://par.pw/l/O9pEu,"Sentiment(polarity=0.05357142857142857, subjec...",0.000000,0.019608,0.053571
3,Poor destitute women and children fleeing a wa...,2020-10-20 14:43:35 UTC,[],436,TommyRobinson,True,12000,210,b1ad1b19a70d43ffa9c21a1b650022a5,305000,240,,https://video.parler.com/WN/D6/WND68WY9kDWd_sm...,"Sentiment(polarity=-0.1708333333333333, subjec...",0.000000,0.062500,-0.170833
4,The data is clear: Our students are at incredi...,2020-07-29 14:49:53 UTC,[],39,DanForestNC,False,1900,15,0009ae87100c4a7cb71a1e314874b860,3300,7,image-cdn.parler.com,https://par.pw/l/QdJSX,"Sentiment(polarity=0.21666666666666665, subjec...",0.000000,0.000000,0.216667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13423,‪I have never asked my followers to retweet. H...,2020-08-03 19:05:03 UTC,"['1', 'humantrafficking', 'endhumantrafficking...",1,Talessandra,False,0,0,ed6c64f39bc440919e7e2c0db66a170a,197,115,facebook.com,https://par.pw/l/6Ozdw,"Sentiment(polarity=0.10833333333333334, subjec...",0.000000,0.000000,0.108333
13424,Democrats are lying sociopaths ally with Supre...,2019-07-04 18:58:06 UTC,[],0,Mikehatesevil,False,0,0,5790609987cd4ecc89fd992a4681ee04,9500,15000,i.imgur.com,https://par.pw/l/SCX3s,"Sentiment(polarity=-0.35, subjectivity=0.625, ...",0.000000,0.054545,-0.350000
13425,It's obvious that some conservative people car...,2020-07-15 00:50:57 UTC,[],8,Ramann2010,False,0,0,460637fd07824cc48d76dd892d649557,13000,16000,boomerspeaks.com,https://par.pw/l/urodD,"Sentiment(polarity=-0.01875, subjectivity=0.33...",0.024390,0.048780,-0.018750
13426,Maybe so but what reason does Joan Rivers have...,2020-12-09 01:20:10 UTC,[],0,Gatorcoastie,False,0,0,2fc0e045cdf944958f5141f59df24bd5,23000,45000,americasfreedomfighters.com,https://par.pw/l/blacC,"Sentiment(polarity=-0.25, subjectivity=0.53333...",0.035714,0.035714,-0.250000


In [23]:
!pip install transformers -q

In [24]:
# filtered_df의 각 body 행에 대한 감성 분석 수행
sentiment_scores = []

In [25]:
# realized that emojis cause errors and transformers only can deal with less than 512 tokens
import re

# 이모지를 제거하는 함수
def remove_emojis(text):
    # 이모지에 해당하는 유니코드 패턴
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

# 'body_no_emoji' 열 생성
filtered_df['body_no_emoji'] = filtered_df['body'].apply(remove_emojis)


In [26]:
from transformers import pipeline
classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", return_all_scores=True)

2023-12-11 18:54:41.107245: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-11 18:54:41.138675: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-11 18:54:41.138707: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-11 18:54:41.138731: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-11 18:54:41.146020: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: A

In [27]:
from transformers import PreTrainedTokenizerFast

# 토크나이저 초기화 (모델에 맞는 토크나이저 사용)
tokenizer = PreTrainedTokenizerFast.from_pretrained("j-hartmann/emotion-english-distilroberta-base")

# 텍스트를 512 토큰까지만 자르는 함수
def truncate_text(text):
    return tokenizer.decode(tokenizer.encode(text, max_length=512, truncation=True))


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RobertaTokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


In [28]:

# 'body_cut' 열 생성
filtered_df['body_cut'] = filtered_df['body_no_emoji'].apply(truncate_text)


In [29]:
filtered_df

Unnamed: 0,body,createdAtformatted,hashtags,upvotes,username,verified,impressions,reposts,creator,followers,following,domain,short,naive_bayes,empath_positive,empath_negative,naive_bayes_polarity,body_no_emoji,body_cut
0,"In the police body cam footage, his wife Ouley...",2020-12-23 18:51:35 UTC,['warnock'],13,TheProudPatriot,False,999,13,85d380abbcc94eb0a8d1fbaa3a9edd81,7300,682,wabcradio.com,https://api.parler.com/l/60E2t,"Sentiment(polarity=0.45714285714285713, subjec...",0.010870,0.000000,0.457143,"In the police body cam footage, his wife Ouley...","<s>In the police body cam footage, his wife Ou..."
1,Anyone seen these 2 Pedo's recentlly?????\n\n#...,2020-12-10 22:57:54 UTC,"['wwg1wga', 'maga', 'saveourchildren', 'redpil...",10,Hmichaud635841514226,False,792,14,88f5c528012e4a75bdb8c40d4baef097,438,25,newspunch.com,https://par.pw/l/GrbHa,"Sentiment(polarity=0.0, subjectivity=0.0, asse...",0.000000,0.000000,0.000000,Anyone seen these 2 Pedo's recentlly?????\n\n#...,<s>Anyone seen these 2 Pedo's recentlly?????\n...
2,A Palestinian woman on TV shows her explosive ...,2020-07-13 06:24:41 UTC,[],89,loveIsrael1,False,8400,80,663c124d46dd415fb3012e1134cf4311,48000,87000,politicsonline.net,https://par.pw/l/O9pEu,"Sentiment(polarity=0.05357142857142857, subjec...",0.000000,0.019608,0.053571,A Palestinian woman on TV shows her explosive ...,<s>A Palestinian woman on TV shows her explosi...
3,Poor destitute women and children fleeing a wa...,2020-10-20 14:43:35 UTC,[],436,TommyRobinson,True,12000,210,b1ad1b19a70d43ffa9c21a1b650022a5,305000,240,,https://video.parler.com/WN/D6/WND68WY9kDWd_sm...,"Sentiment(polarity=-0.1708333333333333, subjec...",0.000000,0.062500,-0.170833,Poor destitute women and children fleeing a wa...,<s>Poor destitute women and children fleeing a...
4,The data is clear: Our students are at incredi...,2020-07-29 14:49:53 UTC,[],39,DanForestNC,False,1900,15,0009ae87100c4a7cb71a1e314874b860,3300,7,image-cdn.parler.com,https://par.pw/l/QdJSX,"Sentiment(polarity=0.21666666666666665, subjec...",0.000000,0.000000,0.216667,The data is clear: Our students are at incredi...,<s>The data is clear: Our students are at incr...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13423,‪I have never asked my followers to retweet. H...,2020-08-03 19:05:03 UTC,"['1', 'humantrafficking', 'endhumantrafficking...",1,Talessandra,False,0,0,ed6c64f39bc440919e7e2c0db66a170a,197,115,facebook.com,https://par.pw/l/6Ozdw,"Sentiment(polarity=0.10833333333333334, subjec...",0.000000,0.000000,0.108333,‪I have never asked my followers to retweet. H...,<s>‪I have never asked my followers to retweet...
13424,Democrats are lying sociopaths ally with Supre...,2019-07-04 18:58:06 UTC,[],0,Mikehatesevil,False,0,0,5790609987cd4ecc89fd992a4681ee04,9500,15000,i.imgur.com,https://par.pw/l/SCX3s,"Sentiment(polarity=-0.35, subjectivity=0.625, ...",0.000000,0.054545,-0.350000,Democrats are lying sociopaths ally with Supre...,<s>Democrats are lying sociopaths ally with Su...
13425,It's obvious that some conservative people car...,2020-07-15 00:50:57 UTC,[],8,Ramann2010,False,0,0,460637fd07824cc48d76dd892d649557,13000,16000,boomerspeaks.com,https://par.pw/l/urodD,"Sentiment(polarity=-0.01875, subjectivity=0.33...",0.024390,0.048780,-0.018750,It's obvious that some conservative people car...,<s>It's obvious that some conservative people ...
13426,Maybe so but what reason does Joan Rivers have...,2020-12-09 01:20:10 UTC,[],0,Gatorcoastie,False,0,0,2fc0e045cdf944958f5141f59df24bd5,23000,45000,americasfreedomfighters.com,https://par.pw/l/blacC,"Sentiment(polarity=-0.25, subjectivity=0.53333...",0.035714,0.035714,-0.250000,Maybe so but what reason does Joan Rivers have...,<s>Maybe so but what reason does Joan Rivers h...


In [30]:
filtered_df['body_cut'][0]

'<s>In the police body cam footage, his wife Ouleye Ndoye, says #Warnock ran over her foot while the two were having an argument and he was trying to drive off with their two children in the back seat. The incident happened in March.\n\n“This man is running for the United States Senate and all he cares about right now is his reputation,” Ndoye is heard telling the officer in the video. \n\nShe also says, that her husband is “a great actor” and “phenomenal at putting on a really good show.” </s>'

In [31]:

# 각 텍스트에 대해 분류기를 실행하고 결과를 저장
for i, text in enumerate(filtered_df['body_cut']):
    try:
        # 분류기 실행
        result = classifier(text)
        
        # 각 레이블에 대한 점수를 데이터프레임에 저장
        for item in result[0]:
            filtered_df.at[i, item['label']] = item['score']
    except Exception as e:
        print(f"Error at index {i}: {e}")

In [32]:
filtered_df.to_csv('/home/jupyter-dowonkim/parler/result2.csv', index=False)

In [33]:
print(filtered_df.columns)

Index(['body', 'createdAtformatted', 'hashtags', 'upvotes', 'username',
       'verified', 'impressions', 'reposts', 'creator', 'followers',
       'following', 'domain', 'short', 'naive_bayes', 'empath_positive',
       'empath_negative', 'naive_bayes_polarity', 'body_no_emoji', 'body_cut',
       'anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise'],
      dtype='object')


In [34]:
# 추출할 컬럼 이름들
selected_columns = [
    'body', 'createdAtformatted', 'upvotes', 'username',
    'verified', 'impressions', 'reposts', 'creator', 'followers',
    'following', 'domain', 'empath_positive',
    'empath_negative', 'naive_bayes_polarity', 
    'anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise'
]

# 새로운 데이터프레임 생성
new_df = filtered_df[selected_columns].copy()

# 결과 확인
print(new_df.head())


                                                body       createdAtformatted  \
0  In the police body cam footage, his wife Ouley...  2020-12-23 18:51:35 UTC   
1  Anyone seen these 2 Pedo's recentlly?????\n\n#...  2020-12-10 22:57:54 UTC   
2  A Palestinian woman on TV shows her explosive ...  2020-07-13 06:24:41 UTC   
3  Poor destitute women and children fleeing a wa...  2020-10-20 14:43:35 UTC   
4  The data is clear: Our students are at incredi...  2020-07-29 14:49:53 UTC   

   upvotes              username  verified  impressions  reposts  \
0       13       TheProudPatriot     False          999       13   
1       10  Hmichaud635841514226     False          792       14   
2       89           loveIsrael1     False         8400       80   
3      436         TommyRobinson      True        12000      210   
4       39           DanForestNC     False         1900       15   

                            creator  followers  following  ...  \
0  85d380abbcc94eb0a8d1fbaa3a9edd81   

In [35]:
new_df.to_csv('/home/jupyter-dowonkim/parler/new_df.csv', index=False)