<a href="https://colab.research.google.com/github/AARichburg/Human-AI_Authorship_Analysis/blob/main/Prepare_CoAuthor_data_from_raw_to_plain_text.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This Colab notebook contains code to download and process the CoAuthor dataset hosted at https://coauthor.stanford.edu/, which was used in our paper *Automatic Authorship Analysis in Human-AI Collaborative Writing*. Portions of the code are adapted from code released by the original CoAuthor authors, which can also be found at the above link.

In [1]:
# Fetch the CoAuthor v1.0 archive, unpack it quietly, then delete the zip
# (only the extracted files are read by the cells below).
!wget https://cs.stanford.edu/~minalee/zip/chi2022-coauthor-v1.0.zip
!unzip -q chi2022-coauthor-v1.0.zip
!rm chi2022-coauthor-v1.0.zip

--2024-03-07 16:13:16--  https://cs.stanford.edu/~minalee/zip/chi2022-coauthor-v1.0.zip
Resolving cs.stanford.edu (cs.stanford.edu)... 171.64.64.64
Connecting to cs.stanford.edu (cs.stanford.edu)|171.64.64.64|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 49956179 (48M) [application/zip]
Saving to: ‘chi2022-coauthor-v1.0.zip’


2024-03-07 16:13:21 (9.08 MB/s) - ‘chi2022-coauthor-v1.0.zip’ saved [49956179/49956179]



In [2]:
import os
import json

def find_writing_sessions(dataset_dir):
    """Return the paths of all writing-session logs (``*.jsonl``) in a directory.

    Args:
        dataset_dir: Directory to scan (not recursive).

    Returns:
        A list of ``os.path.join(dataset_dir, name)`` paths, sorted by file
        name so the processing order (and the row order of anything built
        from it) is identical on every run and every filesystem.
    """
    # os.listdir() yields entries in arbitrary order, so sort explicitly.
    # Match the '.jsonl' extension rather than the bare 'jsonl' suffix so
    # unrelated names that merely end in those letters are not picked up.
    return [
        os.path.join(dataset_dir, name)
        for name in sorted(os.listdir(dataset_dir))
        if name.endswith('.jsonl')
    ]


def read_writing_session(path):
    """Load every logged event of one writing session.

    Args:
        path: Path to a JSON-lines file (one JSON object per line).

    Returns:
        A list holding the decoded JSON value of each non-blank line, in
        file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    events = []
    # Decode explicitly as UTF-8 instead of relying on the platform default,
    # which is not UTF-8 everywhere and would garble or reject non-ASCII text.
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing empty line at EOF), which
            # json.loads would otherwise reject.
            if line.strip():
                events.append(json.loads(line))
    return events

# Directory expected to be produced by the unzip step above, and the list of
# session log paths (*jsonl files) found directly inside it.
dataset_dir = './coauthor-v1.0'
paths = find_writing_sessions(dataset_dir)

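The metadata files used in the next section are also available from the CoAuthor site linked above. The next cell reads them from `/content/` (the default Colab working area) under their original file names, so upload them there, or edit the paths in the cell, before proceeding.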

In [3]:
import pandas as pd
def sess2auth(zip_list):
  """Map each session id to the author (worker) id that produced it.

  Args:
    zip_list: Iterable of ``(author, session)`` string pairs.

  Returns:
    dict mapping the whitespace-stripped session id to the whitespace-stripped
    author id. When a session id occurs more than once, the first pair wins.
  """
  out_dict = {}
  for author, session in zip_list:
    session_key = session.strip()
    # Test membership with the *stripped* key, i.e. the same key that is
    # stored; checking the raw value would let a padded duplicate such as
    # ' abc ' overwrite the author already recorded for 'abc'.
    if session_key not in out_dict:
      out_dict[session_key] = author.strip()
  return out_dict

# Names of the metadata columns holding the session and worker identifiers.
session_id = 'session_id'
worker_id = 'worker_id'

# Metadata sheets for the argumentative and the creative writing sessions.
df_a = pd.read_csv('/content/CoAuthor - Metadata & Survey - Metadata (argumentative).csv')
df_c = pd.read_csv('/content/CoAuthor - Metadata & Survey - Metadata (creative).csv')

# (worker_id, session_id) pairs, one per metadata row.
df_a_list = list(zip(df_a[worker_id], df_a[session_id]))
df_c_list = list(zip(df_c[worker_id], df_c[session_id]))

# All session ids, creative rows first, then argumentative rows.
_, total_list = zip(*(df_c_list + df_a_list))

# Lookup table: session id -> worker id.
sess_auth_dict = sess2auth(df_a_list + df_c_list)

In [4]:
def apply_ops(doc, mask, ops, source):
    """Apply one event's text-delta operations to a document and its author mask.

    ``mask`` runs parallel to ``doc``: one marker character per document
    character. Operations are consumed left to right against the not yet
    visited part of the input document:

    * ``{'retain': n}`` copies the next ``n`` characters (and markers) unchanged.
    * ``{'insert': s}`` appends the string ``s``; every inserted character is
      marked ``'A'`` when ``source == 'api'`` and ``'U'`` otherwise. Insertions
      whose payload is a dict (e.g. an image) are reported and skipped.
    * ``{'delete': n}`` drops the next ``n`` unvisited characters; if nothing
      is left unvisited, the last ``n`` characters already emitted are removed
      instead.
    * Anything else is reported and ignored.

    Args:
        doc: Current document text.
        mask: Marker string parallel to ``doc``.
        ops: List of operation dicts as described above.
        source: Event source; ``'api'`` selects the ``'A'`` marker.

    Returns:
        ``(final_doc, final_mask)`` after all operations; whatever part of the
        input was never visited is appended unchanged.
    """
    remaining_doc = doc
    remaining_mask = mask
    new_doc = ''
    new_mask = ''
    mask_char = 'A' if source == 'api' else 'U'  # API vs. user

    for op in ops:
        # Handle retain operation
        if 'retain' in op:
            num_char = op['retain']
            new_doc += remaining_doc[:num_char]
            new_mask += remaining_mask[:num_char]
            remaining_doc = remaining_doc[num_char:]
            remaining_mask = remaining_mask[num_char:]

        # Handle insert operation
        elif 'insert' in op:
            insert_doc = op['insert']
            if isinstance(insert_doc, dict):
                # Embedded objects carry no text; report and skip them.
                if 'image' in insert_doc:
                    print('Skipping invalid object insertion (image)')
                else:
                    print('Ignore invalid insertions:', op)
            else:
                new_doc += insert_doc
                new_mask += mask_char * len(insert_doc)

        # Handle delete operation
        elif 'delete' in op:
            num_char = op['delete']
            if remaining_doc:
                remaining_doc = remaining_doc[num_char:]
                remaining_mask = remaining_mask[num_char:]
            elif num_char != 0:
                # Guard against a zero-length delete: text[:-0] is text[:0],
                # which would erase everything emitted so far.
                new_doc = new_doc[:-num_char]
                new_mask = new_mask[:-num_char]

        else:
            # Ignore other operations
            print('Ignore other operations:', op)

    return new_doc + remaining_doc, new_mask + remaining_mask

In [5]:
def get_text_and_mask(events, event_id, remove_prompt=True):
    """Replay the first ``event_id`` events of a session into text and mask.

    The starting text is the stripped ``currentDoc`` of the first event, with
    every character marked ``'P'``. Each event whose ``textDelta`` contains
    ``'ops'`` is then applied through ``apply_ops``.

    Args:
        events: List of event dicts of one writing session.
        event_id: Number of leading events to replay.
        remove_prompt: When true, cut everything up to and including the last
            ``'P'`` marker. If no ``'P'`` marker is left, a diagnostic is
            printed and the text is returned uncut.

    Returns:
        ``(text, mask)``, two strings of equal length.
    """
    prompt = events[0]['currentDoc'].strip()
    text, mask = prompt, 'P' * len(prompt)  # Prompt

    for event in events[:event_id]:
        delta = event['textDelta']
        if 'ops' in delta:
            text, mask = apply_ops(text, mask, delta['ops'], event['eventSource'])

    if not remove_prompt:
        return text, mask

    if 'P' in mask:
        # Keep only what follows the last prompt character.
        cut = mask.rindex('P') + 1
        return text[cut:], mask[cut:]

    print('=' * 80)
    print('Could not find the prompt in the final text')
    print('-' * 80)
    print('Prompt:', prompt)
    print('-' * 80)
    print('Final text:', text)
    return text, mask

In [6]:
import nltk
# Recent NLTK releases load the Punkt sentence/word tokenizer models from the
# 'punkt_tab' resource instead of the pickled 'punkt' package, so fetch both:
# with only 'punkt' installed, sent_tokenize/word_tokenize raise LookupError on
# a fresh kernel running a current NLTK.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [7]:
import collections
from nltk.tokenize import sent_tokenize

def identify_author(mask):
    """Name the author category of a mask string.

    Args:
        mask: Marker string made of ``'P'``, ``'U'`` and ``'A'`` characters.

    Returns:
        ``'prompt'`` if any ``'P'`` is present (this takes precedence),
        otherwise ``'user_and_api'``, ``'user'`` or ``'api'`` depending on
        which of ``'U'`` / ``'A'`` occur.

    Raises:
        RuntimeError: If the mask contains none of the three markers.
    """
    if 'P' in mask:
        return 'prompt'

    # (has user characters, has API characters) -> label
    labels = {
        (True, True): 'user_and_api',
        (True, False): 'user',
        (False, True): 'api',
    }
    presence = ('U' in mask, 'A' in mask)
    if presence not in labels:
        raise RuntimeError(f'Could not identify author for this mask: {mask}')
    return labels[presence]

def classify_sentences_by_author(text, mask):
    """Split ``text`` into sentences and group them by author category.

    Args:
        text: Document text.
        mask: Marker string parallel to ``text`` (one marker per character).

    Returns:
        ``collections.defaultdict(list)`` keyed by the label returned from
        ``identify_author``. Each entry is a dict with ``sentence_id`` (position
        in the tokenizer output), ``sentence_mask``, ``sentence_author`` and
        ``sentence_text``.
    """
    sentences_by_author = collections.defaultdict(list)
    # Position just past the previously located sentence. Searching from here
    # (instead of from the start of the text) makes a sentence that occurs
    # more than once pick up the mask of *its own* occurrence rather than the
    # mask of the first identical sentence.
    search_from = 0
    for sentence_id, sentence in enumerate(sent_tokenize(text.strip())):
        index = text.find(sentence, search_from)
        if index == -1:
            # Fall back to a whole-text search before giving up.
            index = text.find(sentence)
        if index == -1:
            print(f'Could not find sentence in text: {sentence}')
            continue
        search_from = max(search_from, index + len(sentence))
        sentence_mask = mask[index:index + len(sentence)]
        author = identify_author(sentence_mask)
        sentences_by_author[author].append({
            'sentence_id': sentence_id,
            'sentence_mask': sentence_mask,
            'sentence_author': author,
            'sentence_text': sentence,
        })
    return sentences_by_author

In [8]:
def _prompt_key(opening_words):
  """Join the opening words of a prompt into its whitespace-free lookup key."""
  return ''.join(opening_words.split())

# One single-entry dict per prompt: key = the prompt's first three
# whitespace-separated words with the whitespace removed, value = a short label.
# Trailing comments quote how the prompt continues.
prompt_shapeshifter = {_prompt_key('A woman has'): 'shapeshifter'}  # "... been dating guy after guy, but it never seems to work out."
prompt_reincarnation = {_prompt_key('When you die,'): 'reincarnation'}  # "... you appear in a cinema with a number of other people who look like you."
prompt_mana = {_prompt_key('Humans once wielded'): 'mana'}  # "... formidable magical power."
prompt_obama = {_prompt_key("You're Barack Obama."): 'obama'}  # "4 years into your retirement, you awake to find a letter ..."
prompt_pig = {_prompt_key('Once upon a'): 'pig'}  # "... time there was an old mother pig who had one hundred little pigs ..."
prompt_mattdamon = {_prompt_key('An alien has'): 'mattdamon'}  # "... kidnapped Matt Damon, ..."
# NOTE(review): this label is spelled 'sideffect' (single 'e'), unlike the
# variable name; it is written to the output as-is. Confirm before changing.
prompt_sideeffect = {_prompt_key("When you're 28,"): 'sideffect'}  # "... science discovers a drug that stops all effects of aging, ..."
prompt_bee = {_prompt_key('Your entire life,'): 'bee'}  # "... you've been told you're deathly allergic to bees."
prompt_dad = {_prompt_key('All of the'): 'dad'}  # '... "#1 Dad" mugs in the world change to show the actual ranking of Dads suddenly.'

prompt_isolation = {_prompt_key('Following World War'): 'isolation'}
prompt_screen = {_prompt_key('How Worried Should'): 'screen'}  # "... We Be About Screen Time During the Pandemic?"
prompt_dating = {_prompt_key('How Do You'): 'dating'}  # "... Think Technology Affects Dating?"
prompt_pads = {_prompt_key('Should Schools Provide'): 'pads'}  # "... Free Pads and Tampons?"
prompt_school = {_prompt_key('What Are the'): 'school'}  # "... Most Important Things Students Should Learn in School?"
prompt_stereotype = {_prompt_key('What Stereotypical Characters'): 'stereotype'}  # "... Make You Cringe?"
prompt_audiobook = {_prompt_key('Is Listening to'): 'audiobook'}  # "... a Book Just as Good as Reading It?"
prompt_athletes = {_prompt_key('Should College Athletes'): 'athletes'}  # "... Be Paid?"
prompt_extremesports = {_prompt_key('Is It Selfish'): 'extremesports'}  # "... to Pursue Risky Sports Like Extreme Mountain Climbing?"
prompt_animal = {_prompt_key('Is It Wrong'): 'animal'}  # "... to Focus on Animal Welfare When Humans Are Suffering?"
prompt_news = {_prompt_key('Are We Being'): 'news'}  # "... Bad Citizens If We Don't Keep Up With the News?"

# Combined lookup table: prompt key -> prompt label.
prompt_dict = {
  **prompt_isolation,
  **prompt_dad,
  **prompt_shapeshifter,
  **prompt_reincarnation,
  **prompt_mana,
  **prompt_obama,
  **prompt_pig,
  **prompt_mattdamon,
  **prompt_sideeffect,
  **prompt_bee,
  **prompt_screen,
  **prompt_dating,
  **prompt_pads,
  **prompt_school,
  **prompt_stereotype,
  **prompt_audiobook,
  **prompt_athletes,
  **prompt_extremesports,
  **prompt_animal,
  **prompt_news,
}
################################################################################
from nltk.tokenize import word_tokenize
def get_text_stats(a_dict):
  """Word-tokenize the grouped sentences and sort them into three buckets.

  Args:
    a_dict: Mapping from author label to a list of dicts that each carry a
      ``'sentence_text'`` string.

  Returns:
    ``(u_txt, a_txt, ua_txt)``: lists of token lists for the ``'user'`` key,
    the ``'api'`` key, and every other key respectively.
  """
  u_txt, a_txt, ua_txt = [], [], []
  buckets = {'user': u_txt, 'api': a_txt}
  for author, entries in a_dict.items():
    # NOTE(review): any key other than 'user'/'api' lands in the mixed bucket,
    # which includes 'prompt' as well as 'user_and_api' -- confirm intended.
    target = buckets.get(author, ua_txt)
    for entry in entries:
      target.append(word_tokenize(entry['sentence_text']))
  return u_txt, a_txt, ua_txt

def deal_with_zero_div(num,dem):
  """Divide ``num`` by ``dem`` without raising on a zero denominator.

  Returns:
    ``float(num) / dem`` when ``dem`` is non-zero; otherwise ``0`` if ``num``
    is also zero and the string ``'na'`` if it is not.
  """
  if dem != 0:
    return float(num) / dem
  return 'na' if num != 0 else 0

def compute_stats(list_of_tok_sent):
  """Count sentences, tokens and distinct tokens.

  Args:
    list_of_tok_sent: List of tokenized sentences (each a sequence of tokens).

  Returns:
    ``(num_of_sent, num_of_tokens, num_of_types)`` where types are the
    distinct tokens across all sentences.
  """
  all_tokens = [tok for tokenized in list_of_tok_sent for tok in tokenized]
  return len(list_of_tok_sent), len(all_tokens), len(set(all_tokens))

def PLACEHOLDER(a_list):
  """Flatten one session's per-author statistics into a single tuple.

  Args:
    a_list: ``(session_id, user_lines, api_lines, user_api_lines)`` where the
      last three are lists of tokenized sentences.

  Returns:
    ``session_id`` followed by the sentence, token and type counts from
    ``compute_stats`` for the user, API and user+API lines, in that order
    (ten values in total).
  """
  session_id, user_lines, api_lines, user_api_lines = a_list
  counts = []
  for lines in (user_lines, api_lines, user_api_lines):
    counts.extend(compute_stats(lines))
  return (session_id, *counts)

import pandas as pd
def session_dataframe(sentences_by_author,session_id,author_id,granularity='session',prompt_dict=prompt_dict):
  """Build the output rows for one writing session.

  Args:
    sentences_by_author: Mapping from author label (``'prompt'``, ``'user'``,
      ``'api'``, ``'user_and_api'``) to a list of sentence dicts carrying
      ``'sentence_id'``, ``'sentence_author'`` and ``'sentence_text'``.
    session_id: Value written to the ``session_id`` column of every row.
    author_id: Value written to the ``author_id`` column of every row.
    granularity: ``'segment'`` for one row per sentence, ``'session'`` for one
      row per author type with its sentences joined by single spaces.
    prompt_dict: Lookup from prompt key (first three whitespace-separated
      words of the first prompt sentence, joined without spaces) to label.

  Returns:
    A DataFrame covering the ``'user'``, ``'api'`` and ``'user_and_api'``
    author types. An author type without sentences contributes one row whose
    text is ``'NO_DATA'``. ``prompt_id`` is ``'misc'`` when there is no prompt
    sentence or its key is unknown.

  Raises:
    ValueError: If ``granularity`` is neither ``'session'`` nor ``'segment'``.
  """
  # Fail fast: an unknown value would otherwise leave the per-author frame
  # undefined (or silently reuse the one from the previous iteration).
  if granularity not in ('session', 'segment'):
    raise ValueError(f"granularity must be 'session' or 'segment', got {granularity!r}")

  # .get() keeps this working for plain dicts and avoids inserting empty
  # entries into a defaultdict.
  prompt_info = sentences_by_author.get('prompt', [])
  if len(prompt_info) > 0:
    prompt_first = ''.join(prompt_info[0]['sentence_text'].split()[:3])
    prompt_label = prompt_dict.get(prompt_first, 'misc')
  else:
    prompt_label = 'misc'

  keys_to_keep = ['sentence_id', 'sentence_author', 'sentence_text']
  frames = []
  for author_type in ['user','api','user_and_api']:
    sentence_group = sentences_by_author.get(author_type, [])
    if len(sentence_group) == 0:
      rows = [{'sentence_author': author_type,'sentence_text': 'NO_DATA'}]
    else:
      rows = [{key: sentence[key] for key in keys_to_keep} for sentence in sentence_group]
    if granularity == 'session':
      # Collapse all sentences of this author type into one text block.
      text_block = ' '.join(row['sentence_text'] for row in rows)
      rows = [{'sentence_author':author_type,'sentence_text':text_block}]
    frames.append(pd.DataFrame(rows))

  # Concatenate once instead of growing a frame inside the loop.
  dataframe_out = pd.concat(frames, ignore_index=True)
  dataframe_out['session_id'] = session_id
  dataframe_out['author_id'] = author_id
  dataframe_out['prompt_id'] = prompt_label
  return dataframe_out

import json
import os
def write_json(target_path, target_file, data):
    """Write ``data`` as pretty-printed JSON to ``target_path/target_file``.

    Args:
        target_path: Output directory; created (with parents) if missing. An
            empty string means the current working directory.
        target_file: File name inside ``target_path``.
        data: JSON-serializable object.

    Raises:
        OSError: If the directory cannot be created or the file cannot be
            opened for writing.
        TypeError: If ``data`` is not JSON-serializable.
    """
    # exist_ok=True avoids the check-then-create race; skip the call for an
    # empty path because os.makedirs('') raises.
    if target_path:
        os.makedirs(target_path, exist_ok=True)
    # ensure_ascii=False emits raw non-ASCII characters, so the file must be
    # opened as UTF-8 explicitly rather than with the platform default.
    with open(os.path.join(target_path, target_file), 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

This section collates the data and saves CSV files at the session and segment levels.

In [9]:
import pandas as pd
# Not used by the export below; kept so the names stay defined for any later cell.
auth_dict = {}
_, arg_idx = zip(*df_a_list)
_, crt_idx = zip(*df_c_list)
data_dir="CoAuthor_data"

# Replay and classify every session once; both granularities are built from
# the same classified sentences, so the logs do not need to be parsed twice.
classified_sessions = []
for path in paths:
  # Session id = file name up to the first '.', independent of how many
  # directory levels (or which path separator) precede it. A separate name is
  # used so the `session_id` column-name variable is not overwritten.
  session_name = os.path.basename(path).split('.')[0].strip()
  author = sess_auth_dict.get(session_name,'missing_info')
  events = read_writing_session(path)
  text, mask = get_text_and_mask(events, len(events), remove_prompt=False)
  sentences_by_author = classify_sentences_by_author(text, mask)
  classified_sessions.append((session_name, author, sentences_by_author))

for granularity in ['session','segment']:
  # Start from a fresh list for each granularity: accumulating into one shared
  # frame would leak every session-level row into the segment-level CSV.
  frames = [
    session_dataframe(sentences_by_author, session_name, author, granularity=granularity)
    for session_name, author, sentences_by_author in classified_sessions
  ]
  dataframe_to_save = pd.concat(frames) if frames else pd.DataFrame()
  dataframe_to_save.to_csv('CoAuthor_Data_'+granularity+'.csv', index=False)

Skipping invalid object insertion (image)
Skipping invalid object insertion (image)
