In [1]:
import pandas as pd
import numpy as np
import plotly as plt
import re

In [2]:
def snake_case(s):
    """Convert a dashed, spaced or CamelCase string to lower snake_case.

    Dashes are turned into spaces, a space is inserted in front of every run
    of capital letters and in front of every capitalised word, and the
    resulting whitespace-separated pieces are joined with underscores and
    lower-cased.
    """
    spaced = s.replace('-', ' ')
    # Break before each run of capitals first ...
    spaced = re.sub('([A-Z]+)', r' \1', spaced)
    # ... then before each capitalised word, which splits e.g. "HTTPRequest".
    spaced = re.sub('([A-Z][a-z]+)', r' \1', spaced)
    words = spaced.split()
    return '_'.join(words).lower()

In [3]:
# Open the responses workbook; its sheets are parsed one by one in a later cell.
q_raw_results = pd.ExcelFile('results/results.xlsx')

In [4]:
# Load the questionnaire table (one row per question, including which source
# generated it) and display it.
q_file = pd.read_excel('results/questionnaires.xlsx')
q_file

Unnamed: 0,question_id,questionnaire_id,keyword,question,generated_by
0,778,1,Brexit,Do you agree with the Brexit result?,human
1,620,1,Exams,Are you in favor of exams starting at a specif...,gpt3
2,845,1,Premier league,Who do you think will win the Premier League t...,human
3,87,1,Womens football,Do you think that women's football is a sport ...,gpt3
4,178,1,Kanye West,What is your opinion on Kanye West?,gpt3
...,...,...,...,...,...
295,163,10,Ghosts,Do you believe in ghosts?,gpt3
296,68,10,Exams,What is your opinion on exams?,gpt3
297,684,10,Influencers,Is influencer a real job?,human
298,737,10,Batgirl,Did you ever see a Batgirl comic?,human


In [5]:
# Questions asked on every form that describe the respondent rather than a
# questionnaire item; their answers are collected separately.
META_QUESTIONS = (
    'Have you heard of GPT-3 before?',
    'Please rate your understanding of how language models, such as GPT-3 work?',
)


def _normalize_response(value):
    """Map a raw form answer onto the labels used for comparison.

    'Computer' becomes 'gpt3', every other string is lower-cased (so 'Human'
    becomes 'human'), and non-string values are returned unchanged.
    """
    if not isinstance(value, str):
        return value
    if value == 'Computer':
        return 'gpt3'
    return value.lower()


num_respondants = 0
answer_frames = []
meta_frames = []

# Using the workbook as a context manager closes it even when a sheet fails to
# parse. It is closed once this cell finishes, so re-open it before re-running.
with q_raw_results:
    for sheet_name in q_raw_results.sheet_names:
        sheet_match = re.search(r'form_responses_(\d+)', snake_case(sheet_name))
        if sheet_match is None:
            raise ValueError(
                f'Cannot derive a questionnaire id from sheet name {sheet_name!r}'
            )
        q_id = int(sheet_match.group(1))

        df = q_raw_results.parse(sheet_name)
        df.columns = df.columns.str.strip()
        df = df.drop(['Timestamp', 'Score'], axis=1)
        # Transpose so each remaining column holds one respondent's answers and
        # the question text becomes an ordinary column.
        df = df.T
        df.index.name = 'question'
        df = df.reset_index()

        respondant_columns = [column for column in df.columns if column != 'question']

        # Collect one frame per respondent and concatenate once per sheet,
        # instead of growing a DataFrame inside the loop.
        respondant_frames = []
        for response_value in respondant_columns:
            num_respondants += 1  # respondent ids are global across all sheets
            respondant_frames.append(pd.DataFrame({
                'response': df[response_value].map(_normalize_response),
                'question': df['question'],
                'respondant_id': num_respondants,
                'questionnaire_id': q_id,
            }))

        if not respondant_frames:
            continue
        response_df = pd.concat(respondant_frames, ignore_index=True)
        if response_df.empty:
            continue

        # Split meta questions from questionnaire answers with one boolean mask.
        is_meta = response_df['question'].isin(META_QUESTIONS)
        q_meta_df = response_df[is_meta]
        response_df = response_df[~is_meta].astype(
            {'respondant_id': 'int32', 'questionnaire_id': 'int32'}
        )

        if not q_meta_df.empty:
            meta_frames.append(q_meta_df)
        if not response_df.empty:
            answer_frames.append(response_df)

# pd.concat raises on an empty list, so fall back to an empty frame.
q_meta_responses = pd.concat(meta_frames, ignore_index=True) if meta_frames else pd.DataFrame()
q_full_results = pd.concat(answer_frames, ignore_index=True) if answer_frames else pd.DataFrame()
q_full_results

Unnamed: 0,response,question,respondant_id,questionnaire_id
0,human,Do you agree with the Brexit result?,1,1
1,human,Are you in favor of exams starting at a specif...,1,1
2,human,Who do you think will win the Premier League t...,1,1
3,gpt3,Do you think that women's football is a sport ...,1,1
4,gpt3,What is your opinion on Kanye West?,1,1
...,...,...,...,...
595,gpt3,Do you believe in ghosts?,20,10
596,gpt3,What is your opinion on exams?,20,10
597,human,Is influencer a real job?,20,10
598,gpt3,Did you ever see a Batgirl comic?,20,10


In [6]:
# Display the answers to the two GPT-3 familiarity questions that the previous
# cell separated from the questionnaire answers.
q_meta_responses

Unnamed: 0,response,question,respondant_id,questionnaire_id
0,no,Have you heard of GPT-3 before?,1,1
1,1,Please rate your understanding of how language...,1,1
2,no,Have you heard of GPT-3 before?,2,1
3,1,Please rate your understanding of how language...,2,1
4,no,Have you heard of GPT-3 before?,3,2
5,1,Please rate your understanding of how language...,3,2
6,no,Have you heard of GPT-3 before?,4,2
7,1,Please rate your understanding of how language...,4,2
8,yes,Have you heard of GPT-3 before?,5,3
9,4,Please rate your understanding of how language...,5,3


In [7]:
# Attach each response to its questionnaire entry; rows are matched on the
# question text within a questionnaire.
q_merged = q_file.merge(q_full_results, on=['question', 'questionnaire_id']).reset_index(drop=True)

# Sanity check: an outer merge with an indicator exposes failures on BOTH sides.
# `left_only` rows are questions nobody answered; `right_only` rows are
# responses whose question text matches nothing in q_file, which the inner
# merge above would silently drop. An empty result means every row matched.
failed_merges = q_file.merge(
    q_full_results,
    how='outer',
    on=['question', 'questionnaire_id'],
    indicator=True,
).reset_index(drop=True)
failed_merges[failed_merges.isna().any(axis=1)]

Unnamed: 0,question_id,questionnaire_id,keyword,question,generated_by,response,respondant_id


In [8]:
# A response is correct when the respondent's guess equals the true author.
# The vectorised comparison replaces a row-wise apply and also works when
# q_merged has no rows.
q_merged['correct'] = q_merged['generated_by'] == q_merged['response']
q_merged

Unnamed: 0,question_id,questionnaire_id,keyword,question,generated_by,response,respondant_id,correct
0,778,1,Brexit,Do you agree with the Brexit result?,human,human,1,True
1,778,1,Brexit,Do you agree with the Brexit result?,human,human,2,True
2,620,1,Exams,Are you in favor of exams starting at a specif...,gpt3,human,1,False
3,620,1,Exams,Are you in favor of exams starting at a specif...,gpt3,gpt3,2,True
4,845,1,Premier league,Who do you think will win the Premier League t...,human,human,1,True
...,...,...,...,...,...,...,...,...
595,684,10,Influencers,Is influencer a real job?,human,human,20,True
596,737,10,Batgirl,Did you ever see a Batgirl comic?,human,gpt3,19,False
597,737,10,Batgirl,Did you ever see a Batgirl comic?,human,gpt3,20,False
598,243,10,Dating apps,Do you think dating apps are a good way to mee...,gpt3,human,19,False


In [9]:
# Score each respondent as the number of questions they classified correctly.
# The boolean `correct` flag is summed over ALL rows rather than over the
# pre-filtered correct rows, so a respondent with zero correct answers stays
# in the table (score 0) and is included in the statistics below.
respondant_scores = (
    q_merged[['respondant_id', 'correct']]
    .groupby('respondant_id')
    .sum()
    .rename(columns={'correct': 'score'})
)
# Reduce over the 1-D score column so min/max return plain scalars.
scores = respondant_scores['score']

print('#########################################################')
print(respondant_scores)
print('#########################################################')
print(f'min score: {int(scores.min())}')
print(f'max score: {int(scores.max())}')
print(f'average score: {np.average(scores)}')
for percentile in (99, 95, 90, 75, 60, 50, 25, 10):
    print(f'{percentile}%: {np.percentile(scores, percentile)}')
print('#########################################################')

#########################################################
               score
respondant_id       
1                 20
2                 14
3                 15
4                 19
5                 14
6                 14
7                 13
8                 15
9                 15
10                15
11                19
12                17
13                15
14                16
15                13
16                11
17                21
18                17
19                16
20                19
#########################################################
min score: 11
max score: 21
average score: 15.9
99%: 20.81
95%: 20.05
90%: 19.1
75%: 17.5
60%: 16.0
50%: 15.0
25%: 14.0
10%: 13.0
#########################################################


In [10]:
# Mean respondent score per questionnaire: first count the correct answers of
# each (respondent, questionnaire) pair, then average those counts within each
# questionnaire. The built-in grouped sum replaces a Python-level `sum` inside
# `groupby.apply`.
questionnaire_average_scores = (
    q_merged
    .groupby(['respondant_id', 'questionnaire_id'])['correct']
    .sum()
    .groupby('questionnaire_id')
    .mean()
    .rename(None)  # drop the 'correct' name so the printed Series is unnamed
)

print('#########################################################')
print(questionnaire_average_scores)
print('#########################################################')

#########################################################
questionnaire_id
1     17.0
2     17.0
3     14.0
4     13.0
5     15.0
6     16.5
7     16.0
8     12.0
9     19.0
10    17.5
dtype: float64
#########################################################


In [11]:
def _count_rows(generated_by, response):
    """Count rows of q_merged with the given true author and respondent guess."""
    matches = (q_merged['generated_by'] == generated_by) & (q_merged['response'] == response)
    return int(matches.sum())


def _percentage(count, total):
    """Return count as a percentage of total, or NaN when total is zero."""
    return (count / total) * 100 if total else float('nan')


# Denominators: number of responses given to questions from each author.
total_gpt3 = int((q_merged['generated_by'] == 'gpt3').sum())
total_human = int((q_merged['generated_by'] == 'human').sum())

num_identified_gpt3 = _count_rows('gpt3', 'gpt3')
print(f'Correctly identified as gpt3: {num_identified_gpt3} - {_percentage(num_identified_gpt3, total_gpt3)}%')

# Human-written questions that respondents attributed to gpt3.
num_misidentified_gpt3 = _count_rows('human', 'gpt3')
print(f'Incorrectly identified as gpt3: {num_misidentified_gpt3} - {_percentage(num_misidentified_gpt3, total_human)}%')

num_identified_human = _count_rows('human', 'human')
print(f'Correctly identified as human: {num_identified_human} - {_percentage(num_identified_human, total_human)}%')

# gpt3-generated questions that respondents attributed to a human.
num_misidentified_human = _count_rows('gpt3', 'human')
print(f'Incorrectly identified as human: {num_misidentified_human} - {_percentage(num_misidentified_human, total_gpt3)}%')

Correctly identified as gpt3: 146 - 48.66666666666667%
Incorrectly idenfitied as gpt3: 128 - 42.66666666666667%
Correctly identified as human: 172 - 57.333333333333336%
Incorrectly identified as human: 154 - 51.33333333333333%
