# Data Exploration

In [76]:
import pandas as pd
import matplotlib.pyplot as plt

### Cognitive ability test questions

In [77]:
# cognitive ability test questions
test_items = pd.read_csv('data/test_items.csv')
print(test_items.columns)
print(test_items.shape)

display(test_items)

Index(['Unnamed: 0', 'ID', 'text', 'option_1', 'option_2', 'option_3',
       'option_4', 'option_correct'],
      dtype='object')
(28, 8)


Unnamed: 0.1,Unnamed: 0,ID,text,option_1,option_2,option_3,option_4,option_correct
0,q178,178,Which is bigger?,The earth,The sun,,,2
1,q255,255,STALE is to STEAL as 89475 is to...,89457,98547,89754,89547,4
2,q1201,1201,"What is next in this series? 1, 4, 10, 19, 31, _",36,48,46,Don't know / don't care,3
3,q14835,14835,"If you turn a left-handed glove inside out, it...",On my left hand,On my right hand,,,2
4,q8672,8672,In the line 'Wherefore art thou Romeo?' what d...,Why,Where,How,Who cares / wtf?,1
5,q438128,438128,"In a lake, there is a patch of lily pads. Ever...",24 days,25 days,47 days,48 days,3
6,q438126,438126,lf it takes 5 machines 5 minutes to make 5 wid...,5 minutes,10 minutes,50 minutes,100 minutes,1
7,q438117,438117,A bat and a ball cost $1.10.\nThe bat costs on...,$0.10,$1.00,$0.05,$0.15,3
8,q339289,339289,Sixteen hours are to one day as twenty days ar...,True,False,Who took the time to do this?,,1
9,q273897,273897,__________ taking __________ kids to the bathr...,"They're, their, there","There, they're, their","They're, there, their","Their, they're, there",1


### Question Data

In [78]:
question_data = pd.read_csv('data/question_data.csv', sep=';')
question_data.set_index('Unnamed: 0')
print(question_data.shape)
print(question_data.columns)
display(question_data)

(2620, 10)
Index(['Unnamed: 0', 'text', 'option_1', 'option_2', 'option_3', 'option_4',
       'N', 'Type', 'Order', 'Keywords'],
      dtype='object')


Unnamed: 0.1,Unnamed: 0,text,option_1,option_2,option_3,option_4,N,Type,Order,Keywords
0,q2,Breast implants?,more cool than pathetic,more pathetic than cool,,,24839,N,,sex/intimacy; preference; opinion
1,q11,How does the idea of being slapped hard in the...,Horrified,Aroused,Nostalgic,Indifferent,28860,N,,sex/intimacy
2,q12,Divide your age by 2. Have you had sex with a...,Yes,No,,,22496,O,,sex/intimacy
3,q13,Is a girl who's slept with 100 guys a bad person?,Yes,No,,,32581,O,,sex/intimacy
4,q14,Is a guy who's slept with 100 girls a bad person?,Yes,No,,,31127,O,,sex/intimacy
...,...,...,...,...,...,...,...,...,...,...
2615,lf_max_age,Max age of match,,,,,66365,,,
2616,lf_for,Looking for match,,,,,66365,,,
2617,lf_location,Location of match,,,,,66365,,,
2618,lf_min_age,Min age of match,,,,,66365,,,


### Data

In [79]:
data = pd.read_parquet('data/parsed_data_public.parquet', engine='fastparquet')
print(data.shape)
display(data)
# TODO: Frage: Was sagt Unnamed: 0 hier?

(68371, 2626)


Unnamed: 0.1,Unnamed: 0,q2,q11,q12,q13,q14,q16,q17,q18,q20,...,q86615,q86699,q363047,CA,gender_orientation,gender,race,gender2,gender2_num,CA_items
0,1,,Horrified,,,,,No,,,...,,,,0.763080,Hetero_female,Woman,White,Woman,0.0,4
1,2,,,,,,,,,,...,,,,,Hetero_male,Man,,Man,1.0,0
2,3,,,,No,No,,No,,,...,,,,0.661309,Hetero_female,Woman,,Woman,0.0,7
3,4,,,,,,,,,,...,,,,,Hetero_female,Woman,White,Woman,0.0,0
4,5,,,,,,,,,,...,,,,0.875424,Bisexual_female,Woman,,Woman,0.0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68366,68367,,,,,,,,,,...,,,,,Hetero_male,Man,White,Man,1.0,0
68367,68368,,,,,,,,,,...,,,,,Hetero_female,Woman,,Woman,0.0,0
68368,68369,,,,,,,,,,...,,,,,Hetero_male,Man,,Man,1.0,0
68369,68370,,,,,,,,,,...,,,,,Hetero_female,Woman,White,Woman,0.0,0


## Exploration

List of keywords

In [92]:

keys = set(question_data.Keywords)
print(len(keys))
print(keys)


62
{'politics; descriptive; preference', 'sex/intimacy; preference', 'opinion; cognitive', 'opinion', 'politics; cognitive', nan, 'sex/intimacy; preference; descriptive', 'descriptive; technology', 'politics; sex/intimacy; preference', 'politics; opinion; cognitive', 'sex/intimacy; religion/superstition; preference', 'sex/intimacy; BDSM', 'politics; religion/superstition', 'politics; opinion; sex/intimacy', 'sex/intimacy; religion/superstition', 'politics; preference; opinion; sex/intimacy', 'descriptive', 'politics; descriptive', 'descriptive; preference', 'religion/superstition; opinion', 'preference; descriptive; technology', 'preference; descriptive; opinion', 'preference; descriptive; politics', 'politics; opinion', 'politics; preference; opinion', 'preference; technology', 'descriptive; opinion', 'sex/intimacy; preference; opinion', 'religion/superstition', 'religion/superstition; opinion; cognitive', 'sex/intimacy; descriptive; BDSM', 'religion/superstition; preference', 'descri

Add column with number of answers per question

In [85]:
# per question count number of times the question is answered
n_answers_per_question = data.notnull().sum(axis=0)[1:]
question_data = question_data.join(n_answers_per_question.to_frame('n_answers'))

In [86]:
# find political questions
political_questions = question_data[question_data.Keywords.str.contains('politics', na=False)]
print(f'number of questions involving politcs: {political_questions.shape[0]}')
display(political_questions)


number of questions involving politcs: 270


Unnamed: 0_level_0,text,option_1,option_2,option_3,option_4,N,Type,Order,Keywords,n_answers
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
q71,Is interracial marriage a bad idea?,Yes,No,,,32492,O,,politics,32492.0
q166,"Politically, which way do you lean?",To the right (republican in the US),To the left (democrat in the US),To the middle / I don't know,,11714,O,"1, 3, 2",politics,11714.0
q167,Do you recycle?,Almost always,Sometimes,Rarely or not at all,,24586,O,,politics,24586.0
q168,The life of one of your fellow citizens is mor...,True,False,,,22138,O,,politics,22138.0
q169,Should the death penalty be abolished?,Yes,No,,,16884,O,,politics,16884.0
...,...,...,...,...,...,...,...,...,...,...
q91207,"Homosexuality is illegal in many countries, wi...",YES- homosexuality should be against the law.,NO- homosexuality should be legal.,I'm Not Sure,,174,O,"2, 3, 1",politics; religion/superstition,174.0
q140080,Is abortion okay if a woman is raped?,Yes,No,I can't decide,,306,O,"2, 3, 1",politics; religion/superstition,306.0
q179268,Are you either vegetarian or vegan?,Yes,No,,,54202,O,,politics; descriptive,54202.0
q212813,Which best describes your political beliefs?,Liberal / Left-wing,Centrist,Conservative / Right-wing,Other,45107,M,[4],politics; descriptive,45107.0


In [89]:
# sorted political questions
sorted_p_questions = political_questions.sort_values(by=['n_answers'], ascending=False)
display(sorted_p_questions.head())

Unnamed: 0_level_0,text,option_1,option_2,option_3,option_4,N,Type,Order,Keywords,n_answers
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
q34113,How do you feel about government-subsidized fo...,No problem,"It's okay, if it is not abused",Okay for short amounts of time,Never - Get a job,31769,O,,politics,68371.0
q179268,Are you either vegetarian or vegan?,Yes,No,,,54202,O,,politics; descriptive,54202.0
q403,Do you enjoy discussing politics?,Yes,No,,,52369,O,,politics; preference; descriptive,52369.0
q175,Should burning your country's flag be illegal?,Yes,No,,,45720,O,,politics,45720.0
q212813,Which best describes your political beliefs?,Liberal / Left-wing,Centrist,Conservative / Right-wing,Other,45107,M,[4],politics; descriptive,45107.0
