In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pandas_profiling import ProfileReport
from pprint import pp
import re
import json

In [None]:
# The separator is the regular expression \t (backslash + "t"), which only
# pandas' Python parsing engine supports. Naming the engine explicitly keeps the
# parsing behaviour exactly as before while avoiding the ParserWarning that
# pandas emits when it has to fall back from the default C engine on its own.
data = pd.read_csv('../data/data.csv', sep=r'\t', engine='python')

In [None]:
# Dimensions of the loaded table as (number of rows, number of columns).
data.shape

In [None]:
# Preview the first rows of the loaded table to eyeball column names and values.
data.head()

In [None]:
# profile = ProfileReport(data, title="Pandas Profiling Report", explorative=True)
# profile.to_file("your_report.html")

In [None]:
# Names of all columns that contain at least one missing value.
has_nan = data.isna().any()
has_nan[has_nan].index.tolist()

In [None]:
# Replace missing entries in the two free-text columns with the placeholder "unknown".
free_text_columns = ['country', 'major']
data[free_text_columns] = data[free_text_columns].fillna(value='unknown')

In [None]:
# Print every column name as one plain Python list (not truncated like the Index repr).
print(list(data.columns))

### Make Dictionary from Codebook and save to json

In [None]:
# Read the whole codebook into memory, then break it into individual lines.
with open('../data/codebook.txt', 'r') as codebook_file:
    codebook = codebook_file.read()
lines = codebook.split('\n')

In [None]:
def parse_codebook(lines):
    """Sort the lines of the codebook into lookup dictionaries.

    Each line is tested against every pattern independently, so a single line
    may contribute to more than one section. Lines that match a pattern but do
    not contain the expected separator are skipped instead of raising.

    Parameters
    ----------
    lines : iterable of str
        Codebook text, one entry per line.

    Returns
    -------
    dict
        Dictionary with the sections 'questions', 'answers', 'TIPI_types',
        'TIPI_responses', 'VCL', 'other_questions' and 'other_answers'.
        All sections map a name/code string to its text, except
        'other_answers', which maps a name to a {code: label} dictionary.
    """
    q_dict = {'questions': {}, 'answers': {}, 'TIPI_types': {},
              'TIPI_responses': {}, 'VCL': {}, 'other_questions': {}, 'other_answers': {}}

    # Raw strings: '\d' and '\s' inside a normal string literal are invalid
    # escape sequences and trigger a warning on recent Python versions.
    question_re = re.compile(r'^Q\d{1,2}\s')
    answer_re = re.compile(r'^\d = .*[aA]ppl')
    tipi_type_re = re.compile(r'^TIPI\d{1,2}\s')
    tipi_response_re = re.compile(r'^\d = .*[aA]gree')
    vcl_re = re.compile(r'^VCL')
    other_re = re.compile(r'\s".*",\s\d')

    for line in lines:
        # get questions: "<Qn><TAB><question text>"
        if question_re.search(line):
            name, tab, text = line.partition('\t')
            if tab:
                q_dict['questions'][name] = text.strip()

        # get answers: "<digit> = <label containing 'appl'>"
        if answer_re.search(line):
            code, _, label = line.partition(' = ')
            q_dict['answers'][code] = label.strip()

        # get TIPI (Ten Item Personality Inventory) types: "<TIPIn><TAB><text>"
        if tipi_type_re.search(line):
            name, tab, text = line.partition('\t')
            if tab:
                q_dict['TIPI_types'][name] = text.strip()

        # get the TIPI responses: "<digit> = <label containing 'agree'>"
        if tipi_response_re.search(line):
            code, _, label = line.partition(' = ')
            q_dict['TIPI_responses'][code] = label.strip()

        # get VCL (Vocabulary) codes; the word is the second tab-separated field
        if vcl_re.search(line):
            fields = line.split('\t')
            if len(fields) > 1:
                q_dict['VCL'][fields[0]] = fields[1].strip()

        # get other questions: '<name><TAB>..."<question>", 1=<label>, 2=<label>, ...'
        if other_re.search(line):
            name = line.split('\t')[0]
            quoted = line.split('"')
            q_dict['other_questions'][name] = quoted[1].strip()

            # get other answers: everything after the closing quote, split on ', ';
            # the first piece is what sits between the quote and the first ', '
            responses = quoted[2].split(', ')[1:]
            # partition keeps labels intact even if they contain '=' themselves
            q_dict['other_answers'][name] = {
                code: label.strip()
                for code, eq, label in (resp.partition('=') for resp in responses)
                if eq
            }

    return q_dict


q_dict = parse_codebook(lines)

            
    

In [None]:
# Persist the parsed codebook dictionary as JSON.
with open('../data/codebook_dict.json', 'w') as f:
    f.write(json.dumps(q_dict))

DASS results

Calculate DASS score and categories according to:
https://www.psytoolkit.org/survey-library/depression-anxiety-stress-dass.html

The severity of depression, anxiety and stress is categorized as:

    0 - Normal
    1 - Mild
    2 - Moderate
    3 - Severe
    4 - Extremely severe



In [None]:
# Work on a copy so the loaded `data` frame is not modified by the scoring below.
df = data.copy()

# Question numbers that belong to each DASS sub-scale.
DASS_keys = {'Depression': [3, 5, 10, 13, 16, 17, 21, 24, 26, 31, 34, 37, 38, 42],
             'Anxiety': [2, 4, 7, 9, 15, 19, 20, 23, 25, 28, 30, 36, 40, 41],
             'Stress': [1, 6, 8, 11, 12, 14, 18, 22, 27, 29, 32, 33, 35, 39]}

# Contiguous half-open [low, high) score ranges for the categories 0-3;
# category 4 covers every score from the last upper bound upwards.
DASS_bins = {'Depression': [(0, 10), (10, 14), (14, 21), (21, 28)],
             'Anxiety': [(0, 8), (8, 10), (10, 15), (15, 20)],
             'Stress': [(0, 15), (15, 19), (19, 26), (26, 34)]}


for name, keys in DASS_keys.items():
    item_answers = df.filter(regex='Q(%s)A' % '|'.join(map(str, keys)))
    # Guard against silently scoring the wrong (or no) columns.
    assert item_answers.shape[1] == len(keys), (
        '%s: expected %d answer columns, found %d'
        % (name, len(keys), item_answers.shape[1]))

    # Subtract one to match definition of DASS score in source
    df[name] = (item_answers - 1).sum(axis=1)

    # Build the bin edges without touching DASS_bins: the lower bound of every
    # listed range, the last listed upper bound, and +inf so the top category
    # is open-ended regardless of the largest score in the data.
    ranges = DASS_bins[name]
    edges = [low for low, _ in ranges] + [ranges[-1][1], np.inf]

    # right=False makes the bins left-closed; labels=False returns the integer
    # bin number. Scores outside all bins become NaN instead of being mistaken
    # for the highest category.
    df[name + '_cat'] = pd.cut(df[name], bins=edges, right=False, labels=False)

dass = df[list(DASS_keys)]
dass_cat = df[[k + '_cat' for k in DASS_keys]]



In [None]:
# Calculation of the big five personality attributes according to:
# http://gosling.psy.utexas.edu/wp-content/uploads/2014/09/JRP-03-tipi.pdf

# Add personality types to data
personality_types = ['Extraversion', 'Agreeableness', 'Conscientiousness', 'EmotionalStability', 'Openness']

# Reverse-score items 2, 4, 6, 8 and 10. TIPI answers run from 1 to 7, so the
# reversed value is 8 - d (7 <-> 1, 6 <-> 2, ...); 7 - d would shift every
# reversed item down by one point.
tipi = df.filter(regex=r'TIPI\d+').copy()
tipi_inv = tipi.filter(regex='TIPI(2|4|6|8|10)').apply(lambda d: 8 - d)
tipi[tipi.columns.intersection(tipi_inv.columns)] = tipi_inv

# Calculate scores: trait number k (1-5) is the mean of items k and k + 5
for idx, pt in enumerate( personality_types ):
    df[pt] = tipi[['TIPI{}'.format(idx + 1), 'TIPI{}'.format(6 + idx)]].mean(axis=1)

personalities = df[personality_types]

In [None]:
# Sanity check: total number of missing cells over all columns (expected to be 0).
nan_per_column = df.isna().sum()
nan_per_column.sum()

In [None]:
# Summary statistics with one row per column, shaded by magnitude.
CMAP = 'Oranges'
summary_stats = df.describe().transpose()
summary_stats.style.background_gradient(cmap=CMAP)

In [None]:
# Display the working frame: the copied input columns plus the derived DASS
# score/category columns and the personality score columns added above.
df

In [None]:
# Save the labeled frame as CSV. to_csv is called with its defaults, so the row
# index is written too (NOTE(review): confirm readers of this file expect that
# leading index column).
df.to_csv('../data/data_labeled.csv')