In [1]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split

In [2]:
# Directory (relative to the working directory) that the notebook reads its
# input files from and writes the train/test parquet files to.
root_path = 'data/'

## Load Datasets

### Question Data

In [3]:
# ID of the question asking for the respondent's political beliefs.
political_belief = 'q212813'

# Read the semicolon-separated question catalogue and index it by the
# question number held in its first column ('Unnamed: 0').
question_data = (
    pd.read_csv(os.path.join(root_path, 'question_data.csv'), sep=';')
    .set_index('Unnamed: 0')
)

print(f'shape {question_data.shape}')
print(question_data.columns)
display(question_data.head())
# Show the catalogue entry of the political-belief question.
display(question_data.loc[political_belief])

shape (2620, 9)
Index(['text', 'option_1', 'option_2', 'option_3', 'option_4', 'N', 'Type',
       'Order', 'Keywords'],
      dtype='object')


Unnamed: 0_level_0,text,option_1,option_2,option_3,option_4,N,Type,Order,Keywords
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
q2,Breast implants?,more cool than pathetic,more pathetic than cool,,,24839,N,,sex/intimacy; preference; opinion
q11,How does the idea of being slapped hard in the...,Horrified,Aroused,Nostalgic,Indifferent,28860,N,,sex/intimacy
q12,Divide your age by 2. Have you had sex with a...,Yes,No,,,22496,O,,sex/intimacy
q13,Is a girl who's slept with 100 guys a bad person?,Yes,No,,,32581,O,,sex/intimacy
q14,Is a guy who's slept with 100 girls a bad person?,Yes,No,,,31127,O,,sex/intimacy


text        Which best describes your political beliefs?
option_1                             Liberal / Left-wing
option_2                                        Centrist
option_3                       Conservative / Right-wing
option_4                                           Other
N                                                  45107
Type                                                   M
Order                                                [4]
Keywords                           politics; descriptive
Name: q212813, dtype: object

### Response Data

In [4]:
# Load the parsed answer table from the data directory and preview it.
parsed_data_path = os.path.join(root_path, 'parsed_data_public.parquet')
data = pd.read_parquet(parsed_data_path)
print(f'shape {data.shape}')
display(data.head())

shape (68371, 2626)


Unnamed: 0.1,Unnamed: 0,q2,q11,q12,q13,q14,q16,q17,q18,q20,...,q86615,q86699,q363047,CA,gender_orientation,gender,race,gender2,gender2_num,CA_items
0,1,,Horrified,,,,,No,,,...,,,,0.76308,Hetero_female,Woman,White,Woman,0.0,4
1,2,,,,,,,,,,...,,,,,Hetero_male,Man,,Man,1.0,0
2,3,,,,No,No,,No,,,...,,,,0.661309,Hetero_female,Woman,,Woman,0.0,7
3,4,,,,,,,,,,...,,,,,Hetero_female,Woman,White,Woman,0.0,0
4,5,,,,,,,,,,...,,,,0.875424,Bisexual_female,Woman,,Woman,0.0,3


In [5]:
## Descriptive questions
# Exact equality: only questions whose Keywords field is exactly 'descriptive'
# are selected. Entries with several keywords (the political-belief question is
# tagged 'politics; descriptive') are not matched by this filter.
descriptive_questions_df = question_data[question_data.Keywords == 'descriptive']
print(f'number of questions with keyword descriptive: {descriptive_questions_df.shape[0]}')
questions_all_descriptive = descriptive_questions_df.index.to_list()
descriptive_set = set(questions_all_descriptive)

## All questions except descriptive and political belief
# Question columns are named 'q<digits>'. Match that pattern explicitly instead
# of testing for the letter 'q' anywhere in the name: a substring test would
# also classify any non-question column that merely contains a 'q' as a
# question (and get it dropped later), and fails on non-string column labels.
all_questions_answered = {
    column
    for column in data.columns
    if isinstance(column, str) and column.startswith('q') and column[1:].isdigit()
}
# Columns to remove: every question that is neither descriptive nor the
# political-belief question.
not_descriptive = all_questions_answered - descriptive_set - {political_belief}

number of questions with keyword descriptive: 829


In [6]:
## Drop non-descriptive questions from data set
# Only drop labels that are still present: `data` is overwritten here, so
# re-running this cell would otherwise ask DataFrame.drop for columns that are
# already gone, which raises a KeyError by default. A list is passed via
# `columns=` rather than a set via `axis=1`.
columns_to_drop = [column for column in data.columns if column in not_descriptive]
data = data.drop(columns=columns_to_drop)
## Extract answered political belief
# Keep only the rows that have a (non-missing) political-belief answer.
data = data[data[political_belief].notna()]

In [7]:
# Print the dimensions of the descriptive-question columns, then preview the
# filtered table (the last expression is rendered as the cell output).
descriptive_answers = data.loc[:, questions_all_descriptive]
print(descriptive_answers.shape)
data.head()

(45107, 829)


Unnamed: 0.1,Unnamed: 0,q49,q50,q60,q61,q63,q67,q68,q69,q76,...,q86397,q86462,q363047,CA,gender_orientation,gender,race,gender2,gender2_num,CA_items
2,3,Carefree,,Warm-hearted,,,,,,,...,,,,0.661309,Hetero_female,Woman,,Woman,0.0,7
4,5,Intense,,,,,,,,,...,,,,0.875424,Bisexual_female,Woman,,Woman,0.0,3
5,6,,,,,,,,,,...,,,,1.515351,Hetero_male,Man,White,Man,1.0,7
6,7,Intense,,,,,,,,Yes,...,,,,0.875424,,Other,White,,,3
7,8,,,,,,,,,,...,,,,-1.586541,Hetero_male,Man,Hispanic / Latin,Man,1.0,1


## Train-test split and save the resulting DataFrames

In [8]:
# Shuffle the rows and hold out 20% as the test set; the fixed random_state
# makes the split reproducible.
df_train, df_test = train_test_split(
    data,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

# Write both splits as parquet files into the data directory.
for file_name, split_df in (('train.parquet', df_train), ('test.parquet', df_test)):
    split_df.to_parquet(os.path.join(root_path, file_name))