In [2]:
# Keep the trailing slash: later cells build file paths by appending a file name to this prefix with `+`.
datapath = "./dataset/" #path to the dataset folder

In [3]:
import pandas as pd
import regex
from polyglot.text import Text, Word
from tqdm import tqdm

In [None]:
# Load the human-written texts; the generated texts are appended to this frame in a later cell.
# lineterminator='\n' is passed explicitly so that records are split on '\n' only.
multisocial = pd.read_csv(datapath + 'multisocial.csv.gz', lineterminator='\n')

In [None]:
models = ['vicuna-13b', 'aya-101', 'v5-Eagle-7B-HF', 'Mistral-7B-Instruct-v0.2', 'opt-iml-max-30b', 'gpt-3.5-turbo-0125', 'gemini']#, 'Llama-2-70b-chat-hf']
datasets = {}  # model name -> frame with that model's generated texts

for model in models:
  paraphrased = pd.read_csv(datapath + f'multisocial_{model}_paraphrased_{model}_paraphrased_{model}.csv.gz', lineterminator='\n')
  # Missing generations become empty strings so later string operations do not hit NaN.
  paraphrased = paraphrased.fillna("")
  # The human text is copied over by index alignment, so both frames must describe the same
  # rows; a length mismatch would otherwise silently produce NaN or misassigned texts.
  assert len(paraphrased) == len(multisocial), f'{model}: {len(paraphrased)} rows, expected {len(multisocial)}'
  paraphrased['text'] = multisocial['text']
  # NOTE(review): this copy is written to the working directory, not to `datapath` - confirm intended.
  paraphrased.to_csv(f'multisocial_3x_{model}.csv.gz')
  datasets[model] = paraphrased

In [None]:
#check for any duplicates in human data
# keep=False flags every member of a duplicate group, so an empty result means all human texts are unique.
multisocial[multisocial.text.duplicated(keep=False)]

Unnamed: 0,text,label,length,source,language,domain,topic,split


In [None]:
#remove whitespaces around texts
def clear_dataset(df):
  """Strip leading/trailing whitespace from every object-dtype column of ``df``.

  The frame is modified in place and also returned, so the call can be chained.
  """
  text_columns = df.select_dtypes(['object']).columns
  df[text_columns] = df[text_columns].apply(lambda column: column.str.strip())
  return df

#remove some unicode chars making problems in polyglot
#https://github.com/aboSamoor/polyglot/issues/71#issuecomment-707997790
def remove_bad_chars(text):
  """Return ``text`` with all control (Cc) and surrogate (Cs) code points deleted.

  The matched characters are removed, not replaced: Cc also covers newlines and
  tabs, so words separated only by such a character end up joined together.
  """
  return regex.sub(r"[\p{Cc}\p{Cs}]+", "", text)

#shorten generated texts
def shorten_text(row, index, column='generated', max_words=200):
  """Cut the text stored in ``row.<column>`` down to at most ``max_words`` tokens.

  Tokens are counted with polyglot (``Text(...).words``, language hinted by
  ``row.language``). Surplus tokens are removed from the end by chopping as many
  characters off the string as the token is long, then stripping whitespace.

  Args:
    row: object exposing ``language`` and the attribute named by ``column``
      (for example a row yielded by ``DataFrame.iterrows()``).
    index: unused; kept so existing ``shorten_text(row, index)`` calls still work.
    column: name of the attribute holding the text. Defaults to ``'generated'``.
    max_words: maximum number of tokens to keep. Defaults to 200.

  Returns:
    The stripped and possibly shortened text; ``''`` when the input is blank.
  """
  text = str(getattr(row, column)).strip()
  if text == '':
    return text
  words = Text(text, hint_language_code=row.language).words

  # Walk backwards over the surplus tokens by index instead of re-slicing the
  # word list for every removed token.
  for position in range(len(words) - 1, max(max_words, 0) - 1, -1):
    token_length = len(str(words[position]))
    if token_length:  # text[:-0] would wipe the whole text instead of removing nothing
      text = text[:-token_length].strip()
  return text

#unify dataset form
def unify_form(dataset, model):
  """Bring a frame of generated texts into the column layout of the human data.

  After whitespace clean-up via ``clear_dataset`` the frame gets: ``label`` set to
  ``model``, ``text`` copied from ``generated``, ``length`` recomputed from the
  text, and ``source`` prefixed with ``multisocial_``. The frame is changed in
  place and returned.
  """
  dataset = clear_dataset(dataset)
  dataset['label'] = model
  dataset['text'] = dataset['generated']

  # Word count: whitespace split everywhere, except non-empty Chinese texts,
  # which are tokenized with polyglot.
  word_counts = []
  for text, language in zip(dataset['text'], dataset['language']):
    if language == 'zh' and text != '':
      word_counts.append(len(Text(text, hint_language_code=language).words))
    else:
      word_counts.append(len(text.split()))
  dataset['length'] = word_counts

  dataset['source'] = [f'multisocial_{x}' for x in dataset['source']]
  return dataset

#uniqueness/repetitiveness - share of unique sentences in row.text
def unique_sentences(row):
  """Return the fraction of distinct sentences in ``row.text``.

  Sentences come from polyglot (``Text(...).sentences``, language hinted by
  ``row.language``). The result is ``unique / total``; it is 0 for an empty text
  and for a text in which no sentence is found.
  """
  if row.text == '':
    return 0
  sentences = Text(row.text, hint_language_code=row.language).sentences
  if len(sentences) == 0:  # nothing to count: avoid dividing by zero
    return 0
  return len(set(sentences)) / len(sentences)

#uniqueness/repetitiveness - share of unique words in row.text
def unique_words(row):
  """Return the fraction of distinct word tokens in ``row.text``.

  Tokens come from polyglot (``Text(...).words``, language hinted by
  ``row.language``). The result is ``unique / total``; it is 0 for an empty text
  and for a text in which no token is found.
  """
  if row.text == '':
    return 0
  words = Text(row.text, hint_language_code=row.language).words
  if len(words) == 0:  # nothing to count: avoid dividing by zero
    return 0
  return len(set(words)) / len(words)

In [None]:
%%time
stat = {}
for model, dataset in datasets.items():
  print(f'Processing {model}')

  dataset['generated'] = dataset['generated'].apply(lambda x: remove_bad_chars(x))
  dataset['generated'] = [shorten_text(row, index) for index, row in tqdm(dataset.iterrows(), total=len(dataset))]
  empty_generation = len(dataset[(dataset.generated == '') | (dataset.generated == 'nan') | dataset.generated.isna()])

  dataset = unify_form(dataset, model)

  shorts = len(dataset[dataset.length < 2])
  duplicates = len(dataset[dataset.text.duplicated(keep=False)])

  dataset['unique_sentences'] = [unique_sentences(row) for index, row in tqdm(dataset.iterrows())]
  dataset['unique_words'] = [unique_words(row) for index, row in tqdm(dataset.iterrows())]

  stat[model] = {'empty_generation': empty_generation, 'short_texts': shorts, 'duplicates': duplicates, 'wordcount_mean': dataset.length.mean(), 'wordcount_std': dataset.length.std(), 'unique_sentences_mean': dataset.unique_sentences.mean(), 'unique_sentences_std': dataset.unique_sentences.std(), 'unique_words_mean': dataset.unique_words.mean(), 'unique_words_std': dataset.unique_words.std()}
  multisocial = pd.concat([multisocial, dataset], ignore_index=True, copy=False)

Processing vicuna-13b


100%|███████████████████████████████████████████████████████████████████████████| 61592/61592 [00:07<00:00, 8292.73it/s]
61592it [00:06, 9445.37it/s]
61592it [00:07, 8061.28it/s]


Processing aya-101


100%|███████████████████████████████████████████████████████████████████████████| 61592/61592 [00:11<00:00, 5351.54it/s]
61592it [00:06, 9215.78it/s] 
61592it [00:07, 8379.50it/s]


Processing v5-Eagle-7B-HF


100%|███████████████████████████████████████████████████████████████████████████| 61592/61592 [00:08<00:00, 7439.65it/s]
61592it [00:06, 9066.17it/s]
61592it [00:08, 7574.23it/s]


Processing Mistral-7B-Instruct-v0.2


100%|███████████████████████████████████████████████████████████████████████████| 61592/61592 [00:08<00:00, 7500.32it/s]
61592it [00:06, 9174.98it/s]
61592it [00:07, 7932.89it/s]


Processing opt-iml-max-30b


100%|██████████████████████████████████████████████████████████████████████████| 61592/61592 [00:06<00:00, 10115.83it/s]
61592it [00:06, 9861.52it/s] 
61592it [00:06, 9744.23it/s] 


Processing gpt-3.5-turbo-0125


100%|███████████████████████████████████████████████████████████████████████████| 61592/61592 [00:07<00:00, 8587.34it/s]
61592it [00:06, 9103.77it/s]
61592it [00:07, 8028.57it/s]


Processing gemini


100%|█████████████████████████████████████████████████████████████████████████████| 61592/61592 [35:09<00:00, 29.19it/s]
61592it [00:09, 6563.67it/s]
61592it [00:14, 4350.81it/s]


CPU times: user 37min 27s, sys: 29.5 s, total: 37min 56s
Wall time: 37min 53s


In [None]:
# Sanity check: number of rows per label after the generated texts were appended.
multisocial.label.value_counts()

human                       61592
vicuna-13b                  61592
aya-101                     61592
v5-Eagle-7B-HF              61592
Mistral-7B-Instruct-v0.2    61592
opt-iml-max-30b             61592
gpt-3.5-turbo-0125          61592
gemini                      61592
Name: label, dtype: int64

In [None]:
#are instruction-based prompts removed? it's ok
# regex=False: the phrase is matched literally; as a regex the trailing '.' would match any character.
# The spelling 'assistent' is kept as-is - presumably it mirrors the prompt text; verify before changing it.
multisocial[multisocial.text.str.contains('You are a helpful assistent.', regex=False)].label.value_counts()

opt-iml-max-30b    37
aya-101            16
Name: label, dtype: int64

In [None]:
# Rows per label whose text contains 'assistent' (case-insensitive substring match).
multisocial[multisocial.text.str.lower().str.contains('assistent')].label.value_counts()

v5-Eagle-7B-HF              562
opt-iml-max-30b             434
gpt-3.5-turbo-0125          328
aya-101                     258
Mistral-7B-Instruct-v0.2    100
vicuna-13b                   50
gemini                       14
human                         2
Name: label, dtype: int64

In [None]:
# Rows per label whose text contains 'instruction' (case-insensitive substring match).
multisocial[multisocial.text.str.lower().str.contains('instruction')].label.value_counts()

gemini                      71
v5-Eagle-7B-HF              45
Mistral-7B-Instruct-v0.2    31
gpt-3.5-turbo-0125          15
vicuna-13b                  11
human                        2
opt-iml-max-30b              2
Name: label, dtype: int64

In [None]:
# Rows per label whose text contains 'task' (case-insensitive substring match, so longer words containing it count too).
multisocial[multisocial.text.str.lower().str.contains('task')].label.value_counts()

gemini                      176
v5-Eagle-7B-HF              146
Mistral-7B-Instruct-v0.2    116
vicuna-13b                   65
gpt-3.5-turbo-0125           54
human                        30
opt-iml-max-30b              30
aya-101                      29
Name: label, dtype: int64

In [None]:
# Rows per label whose text contains 'social media text' (case-insensitive substring match).
multisocial[multisocial.text.str.lower().str.contains('social media text')].label.value_counts()

gemini                      367
v5-Eagle-7B-HF              190
Mistral-7B-Instruct-v0.2     11
aya-101                       5
opt-iml-max-30b               5
vicuna-13b                    2
Name: label, dtype: int64

In [None]:
# Show floats with two decimals and a thousands separator; this display option stays active for all later cells.
pd.options.display.float_format = "{:,.2f}".format
# Transposed so that each row is one generator and each column one statistic.
pd.DataFrame(stat).T

Unnamed: 0,empty_generation,short_texts,duplicates,wordcount_mean,wordcount_std,unique_sentences_mean,unique_sentences_std,unique_words_mean,unique_words_std
vicuna-13b,194.0,550.0,408.0,17.46,14.7,1.0,0.06,0.91,0.12
aya-101,1558.0,1821.0,2108.0,11.86,12.92,0.97,0.16,0.9,0.19
v5-Eagle-7B-HF,15.0,287.0,28.0,22.13,17.17,1.0,0.03,0.88,0.11
Mistral-7B-Instruct-v0.2,14.0,110.0,85.0,18.76,14.21,1.0,0.02,0.89,0.11
opt-iml-max-30b,1126.0,2197.0,1939.0,8.76,8.64,0.98,0.14,0.92,0.17
gpt-3.5-turbo-0125,13.0,34.0,2965.0,20.73,20.76,1.0,0.02,0.91,0.11
gemini,16.0,126.0,36.0,71.08,54.01,0.99,0.05,0.73,0.16


In [None]:
# Length statistics over all rows whose label does not contain 'human', i.e. the generated texts.
multisocial[~multisocial.label.str.contains('human')].length.describe()

count   431,144.00
mean         24.39
std          31.58
min           0.00
25%           7.00
50%          13.00
75%          27.00
max         200.00
Name: length, dtype: float64

In [None]:
# Length statistics of the human rows; at this point they have not been shortened yet.
multisocial[multisocial.label.str.contains('human')].length.describe()

count   61,592.00
mean        14.75
std         34.17
min          3.00
25%          4.00
50%          6.00
75%         15.00
max      1,366.00
Name: length, dtype: float64

In [None]:
# Longest human text, in words (the same value as `max` in the describe() output of the previous cell).
multisocial[multisocial.label.str.contains('human')].length.max()

1366

In [None]:
#shorten human texts
def shorten_text(row, index, column='text', max_words=200):
  """Cut the text stored in ``row.<column>`` down to at most ``max_words`` tokens.

  This rebinds the name ``shorten_text`` used earlier in the notebook: from this
  cell on, a plain ``shorten_text(row, index)`` call reads ``row.text``.

  Tokens are counted with polyglot (``Text(...).words``, language hinted by
  ``row.language``). Surplus tokens are removed from the end by chopping as many
  characters off the string as the token is long, then stripping whitespace.

  Args:
    row: object exposing ``language`` and the attribute named by ``column``
      (for example a row yielded by ``DataFrame.iterrows()``).
    index: unused; kept so existing ``shorten_text(row, index)`` calls still work.
    column: name of the attribute holding the text. Defaults to ``'text'``.
    max_words: maximum number of tokens to keep. Defaults to 200.

  Returns:
    The stripped and possibly shortened text; ``''`` when the input is blank.
  """
  text = str(getattr(row, column)).strip()
  if text == '':
    return text
  words = Text(text, hint_language_code=row.language).words

  # Walk backwards over the surplus tokens by index instead of re-slicing the
  # word list for every removed token.
  for position in range(len(words) - 1, max(max_words, 0) - 1, -1):
    token_length = len(str(words[position]))
    if token_length:  # text[:-0] would wipe the whole text instead of removing nothing
      text = text[:-token_length].strip()
  return text

In [None]:
# Spot-check the shortening on one row; the positional index 46025 is hard-coded.
row = multisocial.iloc[46025]
human = str(row.text).strip()
# NOTE(review): the value of the next expression is discarded - it is not the last line of the cell, so nothing is displayed.
Text(human, hint_language_code=row.language).words
print(row.text)
print('*****************SHORTENED******************\n', shorten_text(row, 0))

*****************SHORTENED******************
 ⛱ *_SELF INCOME_*  ⛱ *_TEAM INCOME_*                🏖 *_AUTO INCOME_*  🏖 *_VIDEO WALL_*  ⛱ *_SURVEY wEALL*   ⛱ *_CLICK AND EARN_*  🏖 *_SHOPPING INCOME_*   🏖 *_OFFER WALL_*  ⛱ *_OPINION WALL,AND  ⛱MANY MORE!!!!!!!!_* 🏖 *_JAI COOLEBIZ_*🏖  *COOLEBIZ App*   🍁🍁🍁🍁🍁🍁🍁🍁🍁🍁  *_कुलेबिज़ 110% income & success देगा_* *_1 month मेहनत करो minimum 100 लोगो को जोडो daily 500+ earning आऐगा_*  *LEVEL INCOME 10 LEVEL तक* *_FIXED INCOME_*  *Self           100%* *1st  level.  30%* *2nd level. 20%* *3rd  level.  10%* *4th  level.  5%* *5th


In [None]:
#fix human texts

# One boolean mask selects the human rows, both for taking the working copy and for writing it back.
is_human = multisocial.label.str.contains('human')
dataset = multisocial[is_human].copy()

# Same treatment as the generated texts: drop problematic characters, strip, cap the word count.
dataset['text'] = dataset['text'].apply(lambda text: remove_bad_chars(text).strip())
dataset['text'] = [shorten_text(row, index) for index, row in tqdm(dataset.iterrows(), total=len(dataset))]

# Word count: polyglot tokens for non-empty Chinese texts, whitespace split for everything else.
dataset['length'] = [
  len(Text(text, hint_language_code=lang).words) if (lang == 'zh' and text != '') else len(text.split())
  for text, lang in zip(dataset.text, dataset.language)
]
dataset['unique_sentences'] = [unique_sentences(row) for index, row in tqdm(dataset.iterrows(), total=len(dataset))]
dataset['unique_words'] = [unique_words(row) for index, row in tqdm(dataset.iterrows(), total=len(dataset))]

# Write the cleaned rows back into the combined frame.
multisocial.loc[is_human, :] = dataset

100%|████████████████████████████████████████████████████████████████████████████| 61592/61592 [04:32<00:00, 226.20it/s]
100%|███████████████████████████████████████████████████████████████████████████| 61592/61592 [00:07<00:00, 8244.94it/s]
100%|███████████████████████████████████████████████████████████████████████████| 61592/61592 [00:07<00:00, 8626.94it/s]


In [None]:
# Statistics for the human rows (`dataset` holds the cleaned human subset from the previous cell).
is_empty = (dataset.text == '') | (dataset.text == 'nan') | dataset.text.isna()
empty_generation = int(is_empty.sum())
# NOTE(review): "short" means fewer than 3 words here, but fewer than 2 in the per-model loop.
shorts = int((dataset.length < 3).sum())
duplicates = int(dataset.text.duplicated(keep=False).sum())

stat['human'] = {
  'empty_generation': empty_generation,
  'short_texts': shorts,
  'duplicates': duplicates,
  'wordcount_mean': dataset.length.mean(),
  'wordcount_std': dataset.length.std(),
  'unique_sentences_mean': dataset.unique_sentences.mean(),
  'unique_sentences_std': dataset.unique_sentences.std(),
  'unique_words_mean': dataset.unique_words.mean(),
  'unique_words_std': dataset.unique_words.std(),
}

In [None]:
# Same statistics table as before, now including the 'human' row.
pd.DataFrame(stat).T

Unnamed: 0,empty_generation,short_texts,duplicates,wordcount_mean,wordcount_std,unique_sentences_mean,unique_sentences_std,unique_words_mean,unique_words_std
vicuna-13b,194.0,550.0,408.0,17.46,14.7,1.0,0.06,0.91,0.12
aya-101,1558.0,1821.0,2108.0,11.86,12.92,0.97,0.16,0.9,0.19
v5-Eagle-7B-HF,15.0,287.0,28.0,22.13,17.17,1.0,0.03,0.88,0.11
Mistral-7B-Instruct-v0.2,14.0,110.0,85.0,18.76,14.21,1.0,0.02,0.89,0.11
opt-iml-max-30b,1126.0,2197.0,1939.0,8.76,8.64,0.98,0.14,0.92,0.17
gpt-3.5-turbo-0125,13.0,34.0,2965.0,20.73,20.76,1.0,0.02,0.91,0.11
gemini,16.0,126.0,36.0,71.08,54.01,0.99,0.05,0.73,0.16
human,0.0,3591.0,27.0,12.83,19.21,1.0,0.01,0.9,0.14


In [None]:
# Export the statistics table as LaTeX; reset_index() turns the label into a regular 'index' column.
temp = pd.DataFrame(stat).T.reset_index()
# na_rep is given as a string; the former formatter for a 'text' column is gone because this table has no such column.
print(temp.to_latex(index=False, na_rep="0", escape=False, float_format="{:.2f}".format))

\begin{tabular}{lrrrrrrrrr}
\toprule
                   index &  empty_generation &  short_texts &  duplicates &  wordcount_mean &  wordcount_std &  unique_sentences_mean &  unique_sentences_std &  unique_words_mean &  unique_words_std \\
\midrule
              vicuna-13b &            194.00 &       550.00 &      408.00 &           17.46 &          14.70 &                   1.00 &                  0.06 &               0.91 &              0.12 \\
                 aya-101 &           1558.00 &      1821.00 &     2108.00 &           11.86 &          12.92 &                   0.97 &                  0.16 &               0.90 &              0.19 \\
          v5-Eagle-7B-HF &             15.00 &       287.00 &       28.00 &           22.13 &          17.17 &                   1.00 &                  0.03 &               0.88 &              0.11 \\
Mistral-7B-Instruct-v0.2 &             14.00 &       110.00 &       85.00 &           18.76 &          14.21 &                   1.00 &           

  print(temp.to_latex(index=False, na_rep=0, escape=False, formatters={"text": str.lower}, float_format="{:.2f}".format))


In [None]:
# Length statistics of the human rows after cleaning and shortening.
multisocial[multisocial.label.str.contains('human')].length.describe()

count   61,592.00
mean        12.83
std         19.21
min          1.00
25%          3.00
50%          6.00
75%         14.00
max        200.00
Name: length, dtype: float64

In [None]:
# Length statistics of the generated rows, shown again for comparison with the human rows.
multisocial[~multisocial.label.str.contains('human')].length.describe()

count   431,144.00
mean         24.39
std          31.58
min           0.00
25%           7.00
50%          13.00
75%          27.00
max         200.00
Name: length, dtype: float64

In [None]:
# Preview the merged frame and its columns.
multisocial.head()

Unnamed: 0.1,text,label,length,source,language,domain,topic,split,generated,unique_sentences,unique_words,Unnamed: 0
0,@demo_demo_nl @op1npo De 'wetenschappelijke' m...,human,15,twitter,nl,social_media,unknown,train,,1.0,0.85,
1,群众是真正的英雄，而我们自己则往往是幼稚可笑的，不了解这一点，就不能得到起码的知识。 ◦ 《...,human,56,telegram,zh,social_media,unknown,train,,1.0,0.86,
2,"Вот пример, шунич виноват в голе Дзюбы?",human,7,telegram,ru,social_media,unknown,train,,1.0,1.0,
3,no cp wont load,human,4,twitter,en,social_media,unknown,train,,1.0,1.0,
4,20 de julio,human,3,whatsapp,es,social_media,unknown,train,,1.0,1.0,


In [None]:
# 'Unnamed: 0' is empty for the human rows - presumably a saved index that came in with the per-model files.
# errors='ignore' keeps this cell re-runnable once the column is gone.
multisocial = multisocial.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Rows per label before any filtering.
multisocial.label.value_counts()

human                       61592
vicuna-13b                  61592
aya-101                     61592
v5-Eagle-7B-HF              61592
Mistral-7B-Instruct-v0.2    61592
opt-iml-max-30b             61592
gpt-3.5-turbo-0125          61592
gemini                      61592
Name: label, dtype: int64

In [None]:
#delete empty and too-short (less than 3 words) texts
multisocial = multisocial.drop(columns=['generated', 'unique_sentences', 'unique_words'])
# Empty strings and the literal 'nan' (a stringified missing value) both count as "no text".
multisocial.loc[multisocial.text.isin(['nan', '']), 'text'] = pd.NA
# Only a missing text removes a row; a bare dropna() would also drop rows that have a NaN
# in any unrelated column.
multisocial = multisocial.dropna(subset=['text'])
multisocial = multisocial[multisocial.length > 2]

In [None]:
# Rows left per label after removing empty and too-short texts.
multisocial.label.value_counts()

Mistral-7B-Instruct-v0.2    61346
gpt-3.5-turbo-0125          61256
gemini                      61236
v5-Eagle-7B-HF              61079
vicuna-13b                  60393
human                       58001
aya-101                     57933
opt-iml-max-30b             56003
Name: label, dtype: int64

In [None]:
#delete text duplicates
# Only the first occurrence of each text is kept, across all labels and languages. The human rows
# come first in the frame at this point, so a generated text equal to a human text is the one removed.
multisocial = multisocial.drop_duplicates(subset=['text'])

In [None]:
# Rows left per label after de-duplication.
multisocial.label.value_counts()

Mistral-7B-Instruct-v0.2    61305
v5-Eagle-7B-HF              61057
vicuna-13b                  60346
gemini                      59559
gpt-3.5-turbo-0125          58851
human                       57990
aya-101                     57566
opt-iml-max-30b             55423
Name: label, dtype: int64

In [None]:
# Rows per language in the filtered data.
multisocial.language.value_counts()

es    50781
en    50756
pt    45178
nl    40605
de    30848
ro    24536
ru    24080
pl    23711
ar    23585
ca    21612
hr    20606
hu    20508
bg    20378
et    19913
cs    17267
zh    11404
uk    10388
el     7464
ga     3153
sl     3072
sk     2033
gd      219
Name: language, dtype: int64

In [None]:
# Language distribution within each label.
multisocial.groupby(['label'])['language'].value_counts()

label                     language
Mistral-7B-Instruct-v0.2  en          6497
                          es          6495
                          pt          5769
                          nl          5232
                          de          3965
                                      ... 
vicuna-13b                el           968
                          ga           457
                          sl           389
                          sk           260
                          gd            32
Name: language, Length: 176, dtype: int64

In [None]:
# Rows per train/test split.
multisocial.split.value_counts()

train    331209
test     140888
Name: split, dtype: int64

In [None]:
# Language distribution within each split.
multisocial.groupby(['split'])['language'].value_counts()

split  language
test   en          11730
       pt          11725
       es          11700
       nl           9631
       de           9603
       ro           9393
       ru           8016
       ar           7989
       ca           7471
       pl           6991
       cs           6127
       hr           6039
       hu           5968
       et           5419
       bg           4893
       el           3650
       zh           3480
       ga           3153
       sl           3072
       uk           2586
       sk           2033
       gd            219
train  es          39081
       en          39026
       pt          33453
       nl          30974
       de          21245
       pl          16720
       ru          16064
       ar          15596
       bg          15485
       ro          15143
       hr          14567
       hu          14540
       et          14494
       ca          14141
       cs          11140
       zh           7924
       uk           7802
       el

In [None]:
# Keep the original label (human or generator name) in `multi_label`; `label` becomes binary.
multisocial['multi_label'] = multisocial['label'].copy()
is_machine = ~multisocial.multi_label.str.contains('human')
multisocial['label'] = is_machine.astype('int64')  # 0 = human, 1 = machine-generated

# Shuffle all rows with a fixed seed, renumber them, and store the final dataset.
multisocial = multisocial.sample(frac=1.0, random_state=0).reset_index(drop=True)
multisocial.to_csv(datapath + f'multisocial_v1.csv.gz', index=False)

In [None]:
# Preview the final, shuffled dataset (binary `label`, original label in `multi_label`).
multisocial.head()

Unnamed: 0,text,label,length,source,language,domain,topic,split,multi_label
0,"Hola, ¿qué tal? - ¿Qué tal?",1,6,multisocial_discord,es,social_media,unknown,train,aya-101
1,The breathtaking picture shared by @sashagrey ...,1,10,multisocial_twitter,en,social_media,unknown,train,v5-Eagle-7B-HF
2,Πρέπει να πέσετε στην σ,1,5,multisocial_telegram,el,social_media,unknown,train,opt-iml-max-30b
3,"Дійсно, християнство передбачає дотримання пев...",1,53,multisocial_telegram,uk,social_media,unknown,train,gemini
4,I Your patience is still being put to the test...,1,25,multisocial_telegram,cs,social_media,unknown,train,Mistral-7B-Instruct-v0.2


In [None]:
# Final length statistics: human rows first, then all generated rows.
print(multisocial[multisocial.multi_label.str.contains('human')].length.describe())
print(multisocial[~multisocial.multi_label.str.contains('human')].length.describe())

count   57,990.00
mean        13.50
std         19.53
min          3.00
25%          4.00
50%          7.00
75%         15.00
max        200.00
Name: length, dtype: float64
count   414,107.00
mean         25.10
std          31.88
min           3.00
25%           8.00
50%          14.00
75%          27.00
max         200.00
Name: length, dtype: float64
