In [1]:
import pickle
import pandas as pd
from pandas import Series
from tqdm import tqdm
import seaborn as sns
import numpy as np

import matplotlib
%matplotlib inline
from matplotlib import pyplot as plt
from matplotlib import font_manager as fm
from matplotlib import rc


# Global plot styling: 150 dpi figures, Helvetica at 13 pt when available.
import os

matplotlib.rcParams['figure.dpi'] = 150

# This font file is a macOS system path. On machines where it does not exist,
# loading it by file name fails, so fall back to matplotlib's default font
# there instead of aborting the whole notebook on the first cell.
f_path = "/System/Library/Fonts/Helvetica.ttc"
if os.path.isfile(f_path):
    font_props = fm.FontProperties(fname=f_path)
else:
    font_props = fm.FontProperties()
font_name = font_props.get_name()
rc('font', family=font_name, size=13)

## Read classification results

In [15]:
# Load the pickled dataframe that carries the `is_belief` information.
# NOTE(review): unpickling can execute arbitrary code; only load this file
# from a trusted source.
ddo_pickle_path = '../dataset/03_Final_dataframe/df_ddo_with_isbelief_info(N232691).p'
df = pd.read_pickle(ddo_pickle_path)

# Row count of the raw frame.
df.shape[0]

232691

In [16]:
# Remove rows that are exact duplicates over all columns (the first occurrence
# is kept), then persist the de-duplicated frame.
# NOTE(review): the "N226847" in the file name is presumably the expected row
# count after de-duplication; it is hard-coded, not derived from the data.
df = df.drop_duplicates()
nodup_pickle_path = '../dataset/03_Final_dataframe/df_ddo_with_isbelief_info_nodup(N226847).p'
df.to_pickle(nodup_pickle_path)

In [18]:
# Headline statistics for the de-duplicated frame.
# nunique(dropna=False) counts a missing value as its own entry, which is the
# same number as len(Series.unique()).
n_rows = len(df)
n_users = df['username'].nunique(dropna=False)

print('data size:', n_rows)
print('num debates:', df['debate_key'].nunique(dropna=False))
print('num debate title:', df['debate_title'].nunique(dropna=False))
print('num users:', n_users)
# Mean number of rows per distinct username.
print('average participation:', n_rows / n_users)

data size: 226847
num debates: 78362
num debate title: 68900
num users: 44510
average participation: 5.096540103347563


In [19]:
# Preview the first five rows to check the column layout.
df.head(n=5)

Unnamed: 0,debate_key,debate_title,username,debate_date,position,is_belief
0,.-.-.-Ha-YOURE-GOING-DOWN-BEEM0R/1/,". . . Ha! YOU'RE GOING DOWN, BEEM0R!",Logical-Master,2009-02-02,Pro,0.0
1,.-Audis-are-junkers-except-to-rich-kids-with-l...,". Audis are junkers, except to rich kids with ...",Max.Wallace,2014-09-04,Pro,1.0
2,....-Former-Secretary-of-State-Madeleine-Albri...,"....""Former Secretary of State Madeleine Albri...",Lookingatissues,2017-01-30,Pro,1.0
3,...Words-can-t-hurt-me-any./1/,...Words can't hurt me any.,NonInDelicto,2007-12-19,Pro,1.0
4,.9-repeated-is-equal-to-1./1/,.9 repeated is equal to 1.,cowpie1998,2011-04-07,Pro,1.0


In [31]:
# Partition rows by the `is_belief` flag. Rows whose flag is neither 1 nor 0
# (e.g. missing) end up in neither subset.
belief_flag = df['is_belief']
df_true = df.loc[belief_flag.eq(1)]
df_false = df.loc[belief_flag.eq(0)]

# Distinct debate titles per subset, as fresh Series with a default 0..n-1 index.
true_beliefs, false_beliefs = (
    pd.Series(subset['debate_title'].unique()) for subset in (df_true, df_false)
)

In [32]:
# Row counts of the is_belief == 1 and is_belief == 0 subsets.
df_true.shape[0], df_false.shape[0]

(192307, 34497)

In [57]:
# Draw 25 distinct debate titles from each class for a manual spot-check:
# the first 25 entries come from `true_beliefs`, the last 25 from
# `false_beliefs`, and the combined Series is re-indexed 0..49.
#
# The draw is seeded so that Restart & Run All reproduces the same sample.
# One shared RandomState feeds both draws, so they are not positionally
# correlated. Any previously exported sample came from an unseeded draw and
# will not match this one.
sample_rng = np.random.RandomState(42)
df_sampled = pd.concat(
    [
        true_beliefs.sample(25, random_state=sample_rng),
        false_beliefs.sample(25, random_state=sample_rng),
    ],
    ignore_index=True,
)

In [59]:
# Labels follow the order of the sampled titles: 25 ones, then 25 zeros.
# The count must stay in sync with the sample size used when `df_sampled`
# was drawn.
gpt_labels = np.repeat([1.0, 0.0], 25)

# Wrap the sampled titles in a one-column frame and attach the integer labels.
df_sampled = pd.DataFrame(df_sampled, columns=['debate_title'])
df_sampled['GPT-4_Answer'] = gpt_labels.astype(int)
df_sampled

Unnamed: 0,debate_title,GPT-4_Answer
0,"The Qur'an is not the truth, Jesus is God",1
1,tom brady is better than peyton manning,1
2,Any government attempts to bail out homeowners...,1
3,Metal Detectors in Schools?,1
4,Government should get out of the marriage busi...,1
5,Should Jerusalem be a Jewish territory,1
6,The United States needs Universal Healthcare.,1
7,"Should we, as christians, Doubt the Bible some...",1
8,"Since good secure more happiness, we should li...",1
9,Beauty is only skin deep.,1


In [60]:
# Randomly permute all rows (frac=1 keeps every row) with a fixed seed, then
# discard the old index so the result is numbered 0..n-1 in shuffled order.
shuffled_df = (
    df_sampled
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [61]:
# Display the shuffled sample (titles with their labels) for a visual check.
shuffled_df

Unnamed: 0,debate_title,GPT-4_Answer
0,"In the United States, Minor Jail Time Ought to...",1
1,should gay marriage be legal?,0
2,Stand your ground.,0
3,Prove that the god according to the bible love...,0
4,Do or Do Not there is no try. Do you Agree?,1
5,Batman (pro) VS Deadpool (con),0
6,CBA Child Abuse,0
7,Design Your Prison,0
8,Eugenics,0
9,"Is rap/hip-hop music ""bad""?",1


In [63]:
# Write the shuffled, labelled sample to CSV without the index column.
gpt4_csv_path = '../dataset/03_belief_filtering_GPT4_result_essential_data/gpt-4_belief_classification.csv'
shuffled_df.to_csv(gpt4_csv_path, index=False)