In [1]:
import sys
import os
sys.path.append(os.path.abspath(".."))

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import re
from IPython.display import display, HTML
sns.set(context='paper', style='whitegrid', color_codes=True, font_scale=1.8)
colorcycle = [(0.498, 0.788, 0.498),
              (0.745, 0.682, 0.831),
              (0.992, 0.753, 0.525),
              (0.220, 0.424, 0.690),
              (0.749, 0.357, 0.090),
              (1.000, 1.000, 0.600),
              (0.941, 0.008, 0.498),
              (0.400, 0.400, 0.400)]
sns.set_palette(colorcycle)
mpl.rcParams['figure.max_open_warning'] = 65
mpl.rcParams['figure.figsize'] = [12, 7]
mpl.rcParams['text.usetex'] = True

from speclib import misc, plotting, loaders

%matplotlib inline

In [2]:
df = pd.read_json('../../allan_data/RGender_.json')
userAlias = loaders.Useralias()
df.index = df.index.map(lambda s: userAlias.lookup(s))
q = misc.QuestionCompleter(df)
f = misc.QuestionFilterer(df)

df.head()

Unnamed: 0,alcohol_binge10__answer,alcohol_binge10__answer_type,alcohol_binge10__condition,alcohol_binge10__question,alcohol_binge10__response,alcohol_binge5__answer,alcohol_binge5__answer_type,alcohol_binge5__condition,alcohol_binge5__question,alcohol_binge5__response,...,worries_partner__answer,worries_partner__answer_type,worries_partner__condition,worries_partner__question,worries_partner__response,worries_zieblings__answer,worries_zieblings__answer_type,worries_zieblings__condition,worries_zieblings__question,worries_zieblings__response
0010fedde8e61cad5b049da6df8b5a,1.0,radio,alcohol_debut != 'Jeg har aldrig drukket alkohol',Drukket mere end 10 genstande på en dag/aften,1 gange,2.0,radio,alcohol_debut != 'Jeg har aldrig drukket alkohol',Drukket mere end 5 genstande på en dag/aften,2 gange,...,,radio,False,Kæreste/ægtefælle,Har ingen,3.0,radio,False,Søskende,Sjældent
002ee26a0c38a1f77b7cdeab8046cd,1.0,radio,alcohol_debut != 'Jeg har aldrig drukket alkohol',Drukket mere end 10 genstande på en dag/aften,1 gange,0.0,radio,alcohol_debut != 'Jeg har aldrig drukket alkohol',Drukket mere end 5 genstande på en dag/aften,0 gange,...,2.0,radio,False,Kæreste/ægtefælle,Af og til,,radio,False,Søskende,Har ingen
0037c93a19a7e90580086b462ba11b,2.0,radio,alcohol_debut != 'Jeg har aldrig drukket alkohol',Drukket mere end 10 genstande på en dag/aften,2 gange,2.0,radio,alcohol_debut != 'Jeg har aldrig drukket alkohol',Drukket mere end 5 genstande på en dag/aften,2 gange,...,,radio,False,Kæreste/ægtefælle,Har ingen,2.0,radio,False,Søskende,Af og til
005b1e0298bde6b726156aefb8d88c,1.0,radio,alcohol_debut != 'Jeg har aldrig drukket alkohol',Drukket mere end 10 genstande på en dag/aften,1 gange,1.0,radio,alcohol_debut != 'Jeg har aldrig drukket alkohol',Drukket mere end 5 genstande på en dag/aften,1 gange,...,,radio,False,Kæreste/ægtefælle,Har ingen,,radio,False,Søskende,Har ingen
00ad591e46eb6f3d8c474d9de9e219,0.0,radio,alcohol_debut != 'Jeg har aldrig drukket alkohol',Drukket mere end 10 genstande på en dag/aften,0 gange,1.0,radio,alcohol_debut != 'Jeg har aldrig drukket alkohol',Drukket mere end 5 genstande på en dag/aften,1 gange,...,2.0,radio,False,Kæreste/ægtefælle,Af og til,3.0,radio,False,Søskende,Sjældent


## The types of questions

In [3]:
df.filter(like='answer_type').iloc[0].value_counts()

radio           266
number           16
multi_number      2
number;radio      1
scale             1
Name: 0010fedde8e61cad5b049da6df8b5a, dtype: int64

In [4]:
misc.questionSummary(df, q.alcohol_debut, samplesize=10)

Unnamed: 0,response_index,answer_index,count
0,14,0.0,34
1,15,1.0,906


Unnamed: 0,alcohol_debut__answer,alcohol_debut__response
bb3f9dfc64509a9dee49ec4c7e670d,1.0,15
85f2ae5a83dfeb08f17108c6b9d8b2,1.0,15
dd2c2d79f8e0a38073fd286ec6cf0c,1.0,14
725188ac89900e25dffce6454379e5,1.0,15
675f9031919f7cf29cc486c6413582,1.0,14
5fe575cba6e239cf733cd8bd84e32d,1.0,13
0b7ba47fa2c9b9f0120d25c0656d14,1.0,14
7005914c2b3a1945ba461a5053efdf,1.0,16
c57e0f05b263ee8827fc6798662fed,1.0,13
dcead5dae37182784d4c9603a718b0,1.0,14


In [5]:
f.alcohol_debut.head()

Unnamed: 0,alcohol_debut__answer,alcohol_debut__answer_type,alcohol_debut__condition,alcohol_debut__question,alcohol_debut__response
0010fedde8e61cad5b049da6df8b5a,1.0,number;radio,False,Hvor gammel var du første gang du drak mindst ...,15
002ee26a0c38a1f77b7cdeab8046cd,1.0,number;radio,False,Hvor gammel var du første gang du drak mindst ...,14
0037c93a19a7e90580086b462ba11b,1.0,number;radio,False,Hvor gammel var du første gang du drak mindst ...,15
005b1e0298bde6b726156aefb8d88c,1.0,number;radio,False,Hvor gammel var du første gang du drak mindst ...,14
00ad591e46eb6f3d8c474d9de9e219,1.0,number;radio,False,Hvor gammel var du første gang du drak mindst ...,16


In [6]:
df.alcohol_debut__answer.value_counts()

1.0    906
0.0     34
Name: alcohol_debut__answer, dtype: int64

In [7]:
print(*f.alcohol_debut.columns, sep='\n')

alcohol_debut__answer
alcohol_debut__answer_type
alcohol_debut__condition
alcohol_debut__question
alcohol_debut__response


In [8]:
cq = f.alcohol_drunk

In [9]:
cq.groupby(cq.alcohol_drunk__answer)[q.alcohol_drunk__response].value_counts()

alcohol_drunk__answer  alcohol_drunk__response
0.0                    0 gange                    135
1.0                    1 gange                    124
2.0                    2 gange                    155
3.0                    3 gange                    120
4.0                    4-5 gange                  187
5.0                    6-9 gange                  136
6.0                    10+ gange                   49
Name: alcohol_drunk__response, dtype: int64

In [10]:
radio_questions = df.iloc[0].filter(like='answer_type')
radio_questions = radio_questions[radio_questions == 'radio'].index
radio_questions = radio_questions.str.replace('__answer_type', '').tolist()

In [14]:
def response_compare(df, q):
    cq = df.filter(like=q)
    res = cq.groupby(cq[q + '__answer'])[q + '__response'].value_counts()
    res.index.names = [el.split('__')[-1] for el in res.index.names]
    res.name = res.name.split('__')[0]
    return res

In [15]:
summary_list = list()
for cq in radio_questions:
    res = pd.DataFrame(response_compare(df, cq))
    qstr = df.iloc[0].filter(like=cq + '__question')
    summary_list.append((qstr, res))

In [16]:
print(summary_list[0][0].values[0])
summary_list[0][1]

Drukket mere end 10 genstande på en dag/aften


Unnamed: 0_level_0,Unnamed: 1_level_0,alcohol_binge10
answer,response,Unnamed: 2_level_1
0.0,0 gange,348
1.0,1 gange,190
2.0,2 gange,151
3.0,3 gange,89
4.0,4-5 gange,74
5.0,6+ gange,54


In [17]:
misc.questionSummary(df, q.function_duties)

Unnamed: 0,response_index,answer_index,count
0,Slet ikke,0.0,322
1,Lidt,1.0,341
2,Noget,2.0,188
3,Meget,3.0,70
4,Virkelig meget/kan ikke,4.0,12


In [None]:
fails = list()
for qc in (el for el in dir(f) if ('__' not in el and el != 'gender')):
    try:
        misc.questionSummary(df, qc)
    except:
        fails.append(qc)

In [19]:
fails

[]

In [20]:
qstr, res = summary_list[0]

In [68]:
def check_question_scales(res, debug=False):
    resp, ans = res.index.levels
    resp = pd.Series(resp)
    ans_extract = ans.str.extractall(r'(\d+)').astype(float).mean(level=0)
    try:
        ans_extract = ans_extract[0]
    except:
        if debug:
            print(ans_extract)
        return False
    if debug:
        print(resp, ans, ans_extract)
        
    if res.shape != ans_extract.shape:
        return False
    is_sorted = (resp.argsort() == ans_extract.argsort()).all() 
    return is_sorted

In [69]:
check_question_scales(summary_list[2][1], debug=True)

0    0.0
1    1.0
2    2.0
3    3.0
4    4.0
5    5.0
6    6.0
Name: answer, dtype: float64 Index(['0 gange', '1 gange', '10+ gange', '2 gange', '3 gange', '4-5 gange',
       '6-9 gange'],
      dtype='object', name='response') 0     0.0
1     1.0
2    10.0
3     2.0
4     3.0
5     4.5
6     7.5
Name: 0, dtype: float64


False