In [None]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split

In [None]:
# Directory that every file read or written by this notebook is joined onto.
# It is a relative path, so it resolves against the kernel's working directory.
root_path = 'data/'

## Load Datasets

### Question Data

In [None]:
# Read the ';'-separated question table. The column pandas labels
# 'Unnamed: 0' (the question number) becomes the index and is named 'q_id'.
questions_path = os.path.join(root_path, 'question_data.csv')
questions = pd.read_csv(questions_path, sep=';').set_index('Unnamed: 0')
questions.index.name = 'q_id'

# Every question whose 'Keywords' value is exactly 'descriptive' gets a short
# sequential id ('dq0', 'dq1', ...) following its position in the table.
is_descriptive = questions['Keywords'] == 'descriptive'
question_to_id = {}
for position, original_id in enumerate(questions.index[is_descriptive].tolist()):
    question_to_id[original_id] = 'dq' + str(position)

# From here on `descriptive_questions` holds the new ids, not the original ones.
descriptive_questions = list(question_to_id.values())
questions = questions.rename(index=question_to_id)
display(questions.loc[descriptive_questions].head())


In [None]:
# Add missing entries in the 'Order' column: a descriptive question gets the
# list of its answer options (option_1 .. option_4, in that order) as 'Order'.
#
# NOTE(review): `questions` comes straight from a CSV file, so on a first run
# 'Order' holds text or NaN and never a list. Every descriptive question with
# an option_1 therefore has its 'Order' replaced, including questions that
# already carried a textual order. The list check only keeps a re-run of this
# cell from redoing the work. Confirm that overwriting is intended.
option_columns = ['option_1', 'option_2', 'option_3', 'option_4']

# A list can only be stored in a single cell of an object column; without this
# cast the `.at` write below fails if 'Order' was parsed as an all-NaN float
# column.
questions['Order'] = questions['Order'].astype(object)

for index, row in questions.loc[descriptive_questions].iterrows():
    # Skip rows already converted to a list and questions without any options.
    # pd.isna also catches None, which the former str(...) != 'nan' test did not.
    if isinstance(row['Order'], list) or pd.isna(row['option_1']):
        continue
    # Keep only the options that are present so no NaN ends up inside the list
    # (option_2 used to be appended even when it was missing).
    questions.at[index, 'Order'] = [
        row[column] for column in option_columns if pd.notna(row[column])
    ]
display(questions.loc[descriptive_questions].head())

In [None]:
# Write the preprocessed question table (index included) below `root_path`,
# using ';' as the separator just like the file it was read from.
preprocessed_path = os.path.join(root_path, 'questions_preprocessed.csv')
questions.to_csv(preprocessed_path, sep=';')

### Data

In [None]:
# Load the parquet table and give its columns readable names:
#   - the column with the id stored in `political_belief` -> 'political_belief'
#   - 'Unnamed: 0'                                          -> 'user_id'
#   - every descriptive question id                         -> its 'dq<N>' id
political_belief = 'q212813'
column_names = {political_belief: 'political_belief', 'Unnamed: 0': 'user_id'}
# Added last, so on a key clash the entry from `question_to_id` wins.
column_names.update(question_to_id)

data = pd.read_parquet(os.path.join(root_path, 'parsed_data_public.parquet'))
data = data.rename(columns=column_names)
print('shape', data.shape)
display(data.head())

In [None]:
# Columns whose name still starts with 'q' are the questions that were neither
# renamed to a 'dq<N>' id nor to 'political_belief'.
not_descriptive = set(filter(lambda name: name.startswith('q'), data.columns))

# Remove those question columns, then keep only the rows in which
# 'political_belief' has a value.
data = (
    data
    .drop(columns=list(not_descriptive))
    .dropna(subset=['political_belief'])
)

In [None]:
# Print the shape of the descriptive-question columns only, then preview the
# whole frame (the bare last expression is what the cell displays).
descriptive_columns = list(question_to_id.values())
print(data.loc[:, descriptive_columns].shape)
data.head()

## Train-Test Split and Saving the DataFrames

In [None]:
# Shuffled 90/10 split with a fixed seed so the partition is reproducible.
df_train, df_test = train_test_split(data, test_size=0.1, shuffle=True, random_state=42)

# Store both partitions as parquet files below `root_path`.
for file_name, split_frame in (('train.parquet', df_train), ('test.parquet', df_test)):
    split_frame.to_parquet(os.path.join(root_path, file_name))