In [182]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split

In [183]:
# Directory (relative to the notebook) that holds every input and output file.
root_path = "data/"

## Load Datasets

### Question Data

In [184]:
# Read the question catalogue; the unnamed first CSV column (question number)
# becomes the index and is labelled 'q_id'.
questions = (
    pd.read_csv(os.path.join(root_path, 'question_data.csv'), sep=';')
    .set_index('Unnamed: 0')
    .rename_axis('q_id')
)

# Questions whose 'Keywords' value is exactly 'descriptive' get a new ID 'dq<n>',
# numbered in the order they appear in the file.
question_to_id = {
    original_id: f'dq{position}'
    for position, original_id in enumerate(
        questions.loc[questions['Keywords'] == 'descriptive'].index
    )
}
# From here on `descriptive_questions` holds the new 'dq…' IDs, not the original ones.
descriptive_questions = list(question_to_id.values())
questions = questions.rename(index=question_to_id)
display(questions.loc[descriptive_questions].head())


Unnamed: 0_level_0,text,option_1,option_2,option_3,option_4,N,Type,Order,Keywords
q_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
dq0,Which word describes you better?,Carefree,Intense,,,49827,N,,descriptive
dq1,Have you ever seen a therapist?,Yes,No,,,9507,O,,descriptive
dq2,Which describes you better?,Warm-hearted,Cool-headed,,,21205,O,,descriptive
dq3,Are you a better conversationalist or listener?,Conversationalist,Listener,,,3174,O,,descriptive
dq4,Which word describes you better?,Private,Social,,,23765,O,,descriptive


In [185]:
# Fill the 'Order' column of the descriptive questions with the list of their answer options.
# NOTE(review): the guard below only skips cells that already hold a list, so any
# other existing value (not just a missing one) is replaced — confirm this is intended.
for q_id, question in questions.loc[descriptive_questions].iterrows():
    holds_list = type(question['Order']) is list
    lacks_options = str(question['option_1']) == 'nan'
    if holds_list or lacks_options:
        continue
    # option_1 and option_2 are always taken; option_3 / option_4 only when they are strings.
    answer_order = [question['option_1'], question['option_2']]
    answer_order.extend(
        question[column]
        for column in ('option_3', 'option_4')
        if isinstance(question[column], str)
    )
    questions.at[q_id, 'Order'] = answer_order
display(questions.loc[descriptive_questions].head())

Unnamed: 0_level_0,text,option_1,option_2,option_3,option_4,N,Type,Order,Keywords
q_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
dq0,Which word describes you better?,Carefree,Intense,,,49827,N,"[Carefree, Intense]",descriptive
dq1,Have you ever seen a therapist?,Yes,No,,,9507,O,"[Yes, No]",descriptive
dq2,Which describes you better?,Warm-hearted,Cool-headed,,,21205,O,"[Warm-hearted, Cool-headed]",descriptive
dq3,Are you a better conversationalist or listener?,Conversationalist,Listener,,,3174,O,"[Conversationalist, Listener]",descriptive
dq4,Which word describes you better?,Private,Social,,,23765,O,"[Private, Social]",descriptive


In [186]:
# Persist the re-indexed question table, using the same ';' separator as the input file.
questions.to_csv(
    path_or_buf=os.path.join(root_path, 'questions_preprocessed.csv'),
    sep=';',
)

### User Answer Data

In [187]:
# Load the parsed public answer table.
data = pd.read_parquet(os.path.join(root_path, 'parsed_data_public.parquet'))
# Raw column ID of the political-belief question. Kept as a variable so the ID is
# written in exactly one place; note that after the rename below this ID is no
# longer a column name in `data`.
political_belief = 'q212813'
# Give the political-belief question, the user identifier and every descriptive
# question a readable column name ('political_belief', 'user_id', 'dq<n>').
data = data.rename(
    columns={
        political_belief: 'political_belief',
        'Unnamed: 0': 'user_id',
        **question_to_id,
    }
)
print('shape', data.shape)
display(data.head())

shape (68371, 2626)


Unnamed: 0,user_id,q2,q11,q12,q13,q14,q16,q17,q18,q20,...,q86615,q86699,dq822,CA,gender_orientation,gender,race,gender2,gender2_num,CA_items
0,1,,Horrified,,,,,No,,,...,,,,0.76308,Hetero_female,Woman,White,Woman,0.0,4
1,2,,,,,,,,,,...,,,,,Hetero_male,Man,,Man,1.0,0
2,3,,,,No,No,,No,,,...,,,,0.661309,Hetero_female,Woman,,Woman,0.0,7
3,4,,,,,,,,,,...,,,,,Hetero_female,Woman,White,Woman,0.0,0
4,5,,,,,,,,,,...,,,,0.875424,Bisexual_female,Woman,,Woman,0.0,3


In [188]:
# Columns whose name still starts with 'q' are the questions that were not renamed
# earlier (descriptive ones are 'dq…' and the target is 'political_belief').
not_descriptive = set(filter(lambda name: name.startswith('q'), data.columns))
# Drop those question columns, then keep only rows with an answered political belief.
data = (
    data
    .drop(columns=list(not_descriptive))
    .dropna(subset=['political_belief'])
)

In [189]:
# Sanity check: shape of the descriptive-question columns only, then a preview of the full frame.
print(data.loc[:, list(question_to_id.values())].shape)
data.head()

(45107, 829)


Unnamed: 0,user_id,dq0,dq1,dq2,dq3,dq4,dq5,dq6,dq7,dq828,...,dq809,dq810,dq822,CA,gender_orientation,gender,race,gender2,gender2_num,CA_items
2,3,Carefree,,Warm-hearted,,,,,,,...,,,,0.661309,Hetero_female,Woman,,Woman,0.0,7
4,5,Intense,,,,,,,,,...,,,,0.875424,Bisexual_female,Woman,,Woman,0.0,3
5,6,,,,,,,,,,...,,,,1.515351,Hetero_male,Man,White,Man,1.0,7
6,7,Intense,,,,,,,,Yes,...,,,,0.875424,,Other,White,,,3
7,8,,,,,,,,,,...,,,,-1.586541,Hetero_male,Man,Hispanic / Latin,Man,1.0,1


## Train/Test Split and Saving the DataFrames

In [190]:
# Shuffle the rows and hold out 10 % of them for testing; the fixed random_state
# makes the split reproducible.
df_train, df_test = train_test_split(
    data,
    test_size=0.1,
    random_state=42,
    shuffle=True,
)
# Write both partitions as parquet files into the data directory.
for file_name, partition in {'train.parquet': df_train, 'test.parquet': df_test}.items():
    partition.to_parquet(os.path.join(root_path, file_name))