In [1]:
import sys

sys.path.insert(0, '../utils/')
import numpy as np
import pandas as pd
from utils_clustering import back_from_dummies, prettify_feature_names, tag_immigration

# Preparation

Here we perform the initial steps to prepare the data for unsupervised machine-learning exploration, with the goal of clustering people into groups according to their opinions on political questions.

In [2]:
# Read every survey wave into a dict keyed by the wave number (as a string).
waves = ['1', '2', '3', '4', '5', '6']
df = {}
for wave in waves:
    wave_frame = pd.read_csv(f'../../../data/processed/data_online_political_w{wave}.csv')
    # Index the rows by participant ID so the same person can be matched across waves;
    # the 'id' column itself is kept in the frame.
    wave_frame.index = wave_frame['id']
    df[wave] = wave_frame

## Check question

First, we remove the participants who answered the *check questions* incorrectly. For simplicity we exclude those people completely (meaning for all waves), even if they answered correctly in a later wave.

The two *check questions* are `w2_q24x5` and `w1_q27x5`, meaning that check questions were only asked in waves $1$ and $2$.

In [3]:
# Evaluate the check question on the not-yet-filtered wave-1 and wave-2 frames.
# Anything other than an explicit 1 (including NaN) counts as incorrect.
check_question_correct = {}
for wave in ['1', '2']:
    check_question_correct[wave] = df[wave]['check_question'] == 1

correct_w1 = check_question_correct['1']
correct_w2 = check_question_correct['2']

# IDs (the frames are indexed by participant ID) of everyone who failed at
# least one of the two check questions.
failed_ids = set(correct_w1[~correct_w1].index) | set(correct_w2[~correct_w2].index)

# Exclude those participants from *every* wave, not only from the wave in which
# they failed, so that a person is either kept in all waves or dropped entirely.
for wave in waves:
    df[wave] = df[wave].loc[~df[wave].index.isin(failed_ids), :]

# The helper column has served its purpose; it is only dropped for the two
# waves on which it was evaluated above.
for wave in ['1', '2']:
    df[wave] = df[wave].drop('check_question', axis='columns')

In [4]:
# Sum the wrong answers of both waves; a participant who failed in both waves
# contributes twice to this total.
n_incorrect = np.sum(~correct_w1) + np.sum(~correct_w2)
message = (
    'The number of people who answered one of the check questions incorrectly '
    f'is {n_incorrect}.'
)
print(message)

The number of people who answered one of the check questions incorrectly is 1207.


To be precise, participants who answered both check questions incorrectly are counted twice here, so the number might be slightly inflated.

## Remove no shows

If people participated in only a single wave, we cannot study how their opinions change over time, so we remove them right away.

In [5]:
# Keep only the rows that are not flagged as having participated in a single wave.
for wave in waves:
    wave_frame = df[wave]
    took_part_more_than_once = wave_frame['participated_only_once'] != 1
    df[wave] = wave_frame.loc[took_part_more_than_once, :]

## Opinion questions *only*

In [6]:
# NOTE: get_more_questions is disabled. The whole definition is wrapped in a
# triple-quoted string literal, so running this cell defines nothing and only
# evaluates (and echoes) the string. Remove the surrounding quotes to re-enable it.
'''
def get_more_questions(df):
    """add questions like self-placement or whom one would vote for, thus also opinions, 
    but not that explicit, including other options to respond"""
    markers_ = ['TRUST: THE', #e.g. government, police, EU 
           'PTV', 
           'LIKE-DISLIKE',
           'ASSESSMENT OF POLIT. SITUATION',
           'ASSESSMENT - COMPETENCE', 
           'ASSESSMENT - CHARISMA', 
           'PREFERRED COALITION',
           #'DAYS PER WEEK', #e.g. social media, papers - tends to be the most important ones, but groups are unstable
           'ACTIVITY SOCIAL NETWORKS',
           'ASSESSMENT GOOD/BAD JOB',
           'INFORMATION ON POLITICAL EVENTS',
           'PARTY ASSOCIATED WITH',
           'POLITICAL PARTICIPATION',
           'MOST COMPETENT PARTY',
           'IMPORTANT ISSUE',
           'TOO MANY STATE BENEFITS',
           'INTEREST IN NEWS',
           'GOVERNMENT FORMATION',
           'CAMPAIGN: ATTACKED OPPONENTS THE MOST']
    new_questions = [question for question in df.columns if any(marker in question for marker in markers_)]
    for question in new_questions:
        df = df.rename(columns={question: f'new: {question}'})
    return df
'''

'\ndef get_more_questions(df):\n    """add questions like self-placement or whom one would vote for, thus also opinions, \n    but not that explicit, including other options to respond"""\n    markers_ = [\'TRUST: THE\', #e.g. government, police, EU \n           \'PTV\', \n           \'LIKE-DISLIKE\',\n           \'ASSESSMENT OF POLIT. SITUATION\',\n           \'ASSESSMENT - COMPETENCE\', \n           \'ASSESSMENT - CHARISMA\', \n           \'PREFERRED COALITION\',\n           #\'DAYS PER WEEK\', #e.g. social media, papers - tends to be the most important ones, but groups are unstable\n           \'ACTIVITY SOCIAL NETWORKS\',\n           \'ASSESSMENT GOOD/BAD JOB\',\n           \'INFORMATION ON POLITICAL EVENTS\',\n           \'PARTY ASSOCIATED WITH\',\n           \'POLITICAL PARTICIPATION\',\n           \'MOST COMPETENT PARTY\',\n           \'IMPORTANT ISSUE\',\n           \'TOO MANY STATE BENEFITS\',\n           \'INTEREST IN NEWS\',\n           \'GOVERNMENT FORMATION\',\n           

In [7]:
# Per-wave containers, each keyed by the wave number (as a string):
# opinion-question columns, prospective party choice, and a left-right measure.
df_opinion = {}
party_choice = {}
left_right = {}
for wave in waves:
    # Keep only the columns whose name contains 'OPINION: ', then run them
    # through the project helpers imported from utils_clustering.
    df_opinion[wave] = df[wave].filter(like='OPINION: ', axis=1)
    df_opinion[wave] = prettify_feature_names(df_opinion[wave])
    df_opinion[wave] = tag_immigration(df_opinion[wave])
    # Columns whose name contains 'PARTY CHOICE: PROSPECTIVE'; for waves 1-4 this
    # value is recomputed and post-processed further below.
    party_choice[wave] = df[wave].filter(like='PARTY CHOICE: PROSPECTIVE', axis=1)
    # Continuous scale of left-right self-placement. For waves 2-4 there is no such
    # question (or most values are NaN), so we use an alternative question instead:
    # PREFERRED COALITION: OEVP-FPOE
    if wave in {'2', '3', '4'}:
        left_right[wave] = df[wave].filter(like='PREFERRED COALITION: OEVP-FPOE', axis=1)
        # Assigning a one-element list of names requires that exactly one column matched.
        left_right[wave].columns = ['Preferred coalition OEVP-FPOE']
    else:
        left_right[wave] = df[wave].filter(like='LEFT-RIGHT SELF-PLACEMENT -w', axis=1)
        # Assigning a one-element list of names requires that exactly one column matched.
        left_right[wave].columns = ['Left-right self-placement']
        
    if wave in {'1', '2', '3', '4'}:
        # NOTE(review): this filter repeats the assignment already made above for every wave.
        party_choice[wave] = df[wave].filter(like='PARTY CHOICE: PROSPECTIVE', axis=1)
        # Presumably collapses the dummy-encoded columns into a single column - TODO confirm
        # against back_from_dummies in utils_clustering.
        party_choice[wave] = back_from_dummies(party_choice[wave])
        party_choice[wave].columns = ['PARTY CHOICE: PROSPECTIVE']
        party_choice[wave]['PARTY CHOICE: PROSPECTIVE'] = pd.to_numeric(party_choice[wave]['PARTY CHOICE: PROSPECTIVE'])
        # Map the numeric answer codes to readable labels (modifies the frame in place).
        party_choice[wave].replace({1: 'SPOE', 2: "List Sebastian Kurz: OEVP",
                                                  3: 'FPOE', 4: 'The Greens', 5: 'NEOS',
                                                  6: 'Team Stronach', 11: 'other party',
                                                  12: "will vote invalid", 88: "don't know", 99: "refused"}, inplace=True)