In [None]:
import pandas as pd

In [None]:
!pip install pyreadstat

## Util

In [None]:
import numpy as np
import os

# Directory layout used by every cell below: raw survey files are read from
# `in_dir`, and the per-survey cleaned CSVs are written to `step1_dir`.
basedir = '../../data/eurobarometer/'
in_dir = f"{basedir}raw_data/"
step1_dir = f"{basedir}step1/"

# Create the step-1 output directory up front; exist_ok makes re-running harmless.
os.makedirs(step1_dir, exist_ok=True)

def open_survey(survey):
    """Load one raw survey file into a DataFrame.

    Parameters
    ----------
    survey : str
        File name without extension; the file read is ``<in_dir><survey>.sav``.

    Returns
    -------
    pandas.DataFrame
        Whatever ``pd.read_spss`` returns for that file.
    """
    return pd.read_spss(f'{in_dir}{survey}.sav')

def rename(df, frm, to, ls, key, prefix='v'):
    """Turn a run of "mentioned / not mentioned" columns into 0/1 flag columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the source columns ``<prefix><frm>`` .. ``<prefix><to>``.
    frm, to : int
        First and last (inclusive) column number of the run.
    ls : sequence of str
        Labels for the new columns, paired positionally with the source columns.
    key : str
        Prefix of the new column names (``<key>_<label>``).
    prefix : str, default 'v'
        Prefix of the source column names.

    Returns
    -------
    pandas.DataFrame
        Integer frame: 'mentioned' -> 1; 'not mentioned', 'no second language',
        'no third language' and missing values -> 0 (matching is done on the
        lower-cased text). Any other remaining text makes ``astype(int)`` fail.
    """
    source_cols = [f'{prefix}{number}' for number in range(frm, to + 1)]
    target_cols = [f'{key}_{label}' for label in ls]

    # Normalise case first so the replacements below match regardless of casing.
    lowered = df[source_cols].apply(lambda column: column.str.lower())
    flagged = lowered.replace('mentioned', 1)
    flagged = flagged.replace({'not mentioned': 0, 'no second language': 0, 'no third language': 0})
    flags = flagged.fillna(0).astype(int)

    # zip() stops at the shorter list, so surplus labels (or columns) are left unpaired.
    column_map = dict(zip(source_cols, target_cols))
    return flags.rename(columns=column_map)

def piv(df, nq, prefix="L2", prefix2="v"):
    """One-hot encode a single answer column into ``<prefix>_<Answer>`` columns.

    The answers in column ``<prefix2><nq>`` are capitalised, converted to
    strings, prefixed with ``<prefix>_`` and pivoted so that each distinct
    label becomes its own 0/1 integer column, indexed like ``df``.

    Unlike the previous version, the input frame is left untouched: the
    capitalised answers are kept in a local Series instead of being written
    back into ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the answer column.
    nq : int or str
        Column number/suffix; the column read is ``f'{prefix2}{nq}'``.
    prefix : str, default "L2"
        Prefix of the generated column names.
    prefix2 : str, default "v"
        Prefix of the source column name.

    Returns
    -------
    pandas.DataFrame
        One integer column per distinct label, 1 where the row has that label.
    """
    key = f'{prefix2}{nq}'
    # Work on a derived Series so the caller's DataFrame is not modified.
    answers = df[key].str.capitalize()
    # The text 'None' is relabelled 'DK'. Missing values that astype(str) has
    # already turned into the text 'nan' pass through fillna unchanged and end
    # up as '<prefix>_nan'.
    d = prefix + '_' + answers.astype(str).to_frame().replace({'None': 'DK'}).fillna('DK')
    d['cnt'] = 1
    # Pivot on the label column: one ('cnt', label) column per distinct label.
    d = d.pivot(columns=key).fillna(0).astype(int)
    # Drop the outer 'cnt' level so the columns are just the labels.
    d.columns = d.columns.droplevel()
    return d

def concat_and_save(nat, L1, L2, survey):
    """Join the per-respondent pieces side by side, tag them with the year, and write a CSV.

    Note: besides its arguments this function reads two names from the
    enclosing (notebook-global) scope: ``year`` and ``step1_dir``. Both must
    be set before it is called.

    Parameters
    ----------
    nat, L1, L2 : pandas.Series / pandas.DataFrame
        Pieces concatenated column-wise, in this order.
    survey : str
        Used as the output file name: ``<step1_dir><survey>.csv``.

    Returns
    -------
    pandas.DataFrame
        The combined frame (without any 'L2_nan' column, with a 'Year' column).
    """
    combined = pd.concat([nat, L1, L2], axis=1)
    # 'L2_nan' is dropped when present; errors='ignore' keeps this a no-op otherwise.
    combined = combined.drop(columns='L2_nan', errors='ignore')
    combined['Year'] = year
    out_path = f'{step1_dir}{survey}.csv'
    combined.to_csv(out_path, index=False)
    return combined

def get_langs(filename):
    """Read ``<in_dir><filename>`` and return its content split on newline characters.

    The split is a plain ``'\\n'`` split, so a file ending in a newline yields
    a trailing empty string in the returned list.
    """
    path = f'{in_dir}{filename}'
    with open(path) as handle:
        contents = handle.read()
    return contents.split('\n')

## Step 1 - Clean up the survey DataFrames

In [None]:
# Survey file EB771_2012_ZA5597_v3-0-0; `year` is read by concat_and_save.
survey = 'EB771_2012_ZA5597_v3-0-0'
year = 2012
df = open_survey(survey)
ls = list(map(lambda x: x.title(), get_langs('languages_771.txt')))
L1 = rename(df, 1, 39, ls, "L1", prefix='d48a_')

# Start from an all-zero integer frame with one L2_* column per listed language.
L2 = pd.DataFrame(0, index=L1.index, columns=[f"L2_{l}" for l in ls])
for nq in ['b', 'c', 'd']:
    d = piv(df, nq, prefix2='d48')
    # `L2 + d` would turn every column that is missing from either side into
    # NaN (and keep it NaN on later iterations); fill_value=0 treats a missing
    # column as "no mentions" so the counts accumulate correctly.
    L2 = L2.add(d, fill_value=0)
# Alignment may upcast to float; the values are whole counts.
L2 = L2.astype(int)

nat = df['isocntry']
res = concat_and_save(nat, L1, L2, survey)
res.groupby('isocntry').sum()

In [None]:
# Survey file EB634_2005_ZA4411_v1-1-0; `year` is read by concat_and_save.
survey = 'EB634_2005_ZA4411_v1-1-0'
year = 2005
df = open_survey(survey)
ls = get_langs('languages_634.txt')

L1 = rename(df, 442, 476, ls, "L1")
L2 = rename(df, 480, 514, ls, "L2")
for nq in range(477, 480):
    d = piv(df, nq)
    # `L2 + d` would turn every column that is missing from either side into
    # NaN (and keep it NaN on later iterations); fill_value=0 treats a missing
    # column as "no mentions" so the counts accumulate correctly.
    L2 = L2.add(d, fill_value=0)
# Alignment may upcast to float; the values are whole counts.
L2 = L2.astype(int)

# A Series only has an index axis, so the new name is passed without axis=1
# (recent pandas versions reject axis=1 on Series.rename).
nat = df['v7'].rename('isocntry')
res = concat_and_save(nat, L1, L2, survey)
res.groupby('isocntry').sum()

In [None]:
# Survey file EB643_2005_ZA4415_v1-0-1; `year` is read by concat_and_save.
survey = 'EB643_2005_ZA4415_v1-0-1'
year = 2005
df = open_survey(survey)
ls = get_langs('languages_643.txt')

L1 = rename(df, 264, 303, ls, "L1")
# Start from an all-zero integer frame with one L2_* column per listed language.
L2 = pd.DataFrame(0, index=L1.index, columns=[f"L2_{l}" for l in ls])
for nq in range(304, 307):
    d = piv(df, nq)
    # `L2 + d` would turn every column that is missing from either side into
    # NaN (and keep it NaN on later iterations); fill_value=0 treats a missing
    # column as "no mentions" so the counts accumulate correctly.
    L2 = L2.add(d, fill_value=0)
# Alignment may upcast to float; the values are whole counts.
L2 = L2.astype(int)

# A Series only has an index axis, so the new name is passed without axis=1
# (recent pandas versions reject axis=1 on Series.rename).
nat = df['v7'].rename('isocntry')
res = concat_and_save(nat, L1, L2, survey)
res.groupby('isocntry').sum()

In [None]:
# Survey file EB551_2001_ZA3507_v1-0-1; `year` is read by concat_and_save.
survey = 'EB551_2001_ZA3507_v1-0-1'
year = 2001
df = open_survey(survey)
ls = get_langs('languages_551.txt')

# L1_* columns come from pivoting the single column v38,
# L2_* columns from the flag columns v39..v53.
L1 = piv(df, nq=38, prefix="L1")
L2 = rename(df, frm=39, to=53, ls=ls, key='L2')

nat = df['isocntry']
res = concat_and_save(nat=nat, L1=L1, L2=L2, survey=survey)
res.groupby('isocntry').sum()

In [None]:
# Survey file EB520_1999_ZA3204_v1-0-1; `year` is read by concat_and_save.
survey = 'EB520_1999_ZA3204_v1-0-1'
year = 1999
df = open_survey(survey)
ls = get_langs('languages_520.txt')

# L1_* columns come from pivoting the single column v38,
# L2_* columns from the flag columns v49..v63.
L1 = piv(df, nq=38, prefix="L1")
L2 = rename(df, frm=49, to=63, ls=ls, key='L2')

nat = df['isocntry']
res = concat_and_save(nat=nat, L1=L1, L2=L2, survey=survey)
res.groupby('isocntry').sum()

In [None]:
# Survey file EB440_1995_ZA2689_v1-0-1; `year` is read by concat_and_save.
survey = 'EB440_1995_ZA2689_v1-0-1'
year = 1995
df = open_survey(survey)
ls = get_langs('languages_440.txt')
nat = df['isocntry']

# Flag columns v264..v280 become the L1_* columns, v281..v297 the L2_* columns.
L1 = rename(df, frm=264, to=280, ls=ls, key='L1')
L2 = rename(df, frm=281, to=297, ls=ls, key='L2')

res = concat_and_save(nat=nat, L1=L1, L2=L2, survey=survey)
res.groupby('isocntry').sum()

In [None]:
# Survey file EB340_1990_ZA1960_v1-0-1; `year` is read by concat_and_save.
survey = 'EB340_1990_ZA1960_v1-0-1'
year = 1990
df = open_survey(survey)
ls = get_langs('languages_340.txt')
nat = df['isocntry']

# Flag columns v206..v217 become the L1_* columns, v194..v205 the L2_* columns.
L1 = rename(df, frm=206, to=217, ls=ls, key='L1')
L2 = rename(df, frm=194, to=205, ls=ls, key='L2')

res = concat_and_save(nat=nat, L1=L1, L2=L2, survey=survey)
res.groupby('isocntry').sum()

In [None]:
# Survey file EB28_1987_ZA1713_v1-1-0; `year` is read by concat_and_save.
survey = 'EB28_1987_ZA1713_v1-1-0'
year = 1987
df = open_survey(survey)
ls = get_langs('languages_28.txt')
nat = df['isocntry']

# Flag columns v97..v106 become the L1_* columns, v87..v96 the L2_* columns.
L1 = rename(df, frm=97, to=106, ls=ls, key='L1')
L2 = rename(df, frm=87, to=96, ls=ls, key='L2')

res = concat_and_save(nat=nat, L1=L1, L2=L2, survey=survey)
res.groupby('isocntry').sum()

In [None]:
# Survey file EB500_1998_ZA3085_v1-1-0; `year` is read by concat_and_save.
survey = 'EB500_1998_ZA3085_v1-1-0'
year = 1998
df = open_survey(survey)
ls = get_langs('languages_500.txt')

# L1_* columns come from pivoting the single column v38,
# L2_* columns from the flag columns v39..v55.
L1 = piv(df, nq=38, prefix='L1')
L2 = rename(df, frm=39, to=55, ls=ls, key='L2')

nat = df['isocntry']
res = concat_and_save(nat=nat, L1=L1, L2=L2, survey=survey)
res.groupby('isocntry').sum()

## Eval

In [None]:
def combine(df, col1, col2):
    """Fold column ``col2`` into ``col1`` and drop ``col2``.

    The two columns are added with missing values treated as 0, so a row that
    has a value in only one of the columns keeps that value. (A plain ``+``
    would give NaN for such rows, which wipes out the data when the two
    columns come from different surveys and never overlap.) Rows where both
    columns are missing stay missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both columns; it is not modified.
    col1 : str
        Column that receives the sum and is kept.
    col2 : str
        Column that is added into ``col1`` and then removed.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``col1`` updated and ``col2`` dropped.

    Raises
    ------
    KeyError
        If either column is absent from ``df``.
    """
    merged = df[col1].add(df[col2], fill_value=0)
    # Work on a copy so the caller's frame is left as it was.
    out = df.copy()
    out[col1] = merged
    return out.drop(col2, axis=1)

In [None]:
import glob

step2_dir = basedir + "step2/"
os.makedirs(step2_dir, exist_ok=True)

# glob returns paths in arbitrary order; sorting makes the row order of the
# concatenated frame (and therefore its index) reproducible between runs.
dfs = [pd.read_csv(filepath) for filepath in sorted(glob.glob(step1_dir + "*"))]

eval_df = pd.concat(dfs, ignore_index=True)
# Keep only the columns whose column-wise sum is non-zero.
eval_df = eval_df.loc[:, eval_df.sum() != 0]

# (kept column, column folded into it) pairs: spelling variants of the same label.
repll = [('L2_DK', 'L2_DK '), ('L2_DK', 'L2_None'), ('L1_DK', 'L1_nan'), ('L1_Other', 'L1_Other (specify)'),
        ('L1_Portuguese', 'L1_Portug'), ('L2_Portuguese', 'L2_Portug'), ('L1_Luxembourgish', 'L1_Luxembrgsh')]
for common, other in repll:
    eval_df = combine(eval_df, common, other)

eval_df = eval_df.rename({'isocntry': 'ISO_Country'}, axis=1)

eval_df.to_csv(step2_dir + 'eval_df_wide.csv')
eval_df.groupby(['Year', 'ISO_Country']).sum()

In [None]:
import re

# Move the 'L1'/'L2' prefix to the end of the name: 'L1_English' -> 'English_L1'.
# The pattern is anchored and requires digits after the 'L' so that running
# this cell again leaves already-converted names alone; the unanchored form
# turned e.g. 'Luxembourgish_L1' back into 'L1_Luxembourgish' on a second run.
eval_df.columns = [re.sub(r'^(L\d+)(_)(.*)$', r'\3\2\1', x) for x in eval_df.columns]

# One stub per label; sorted so the column order of the long table is
# deterministic (iteration order of a set of strings can differ between runs).
ls = sorted({n.split('_')[0] for n in eval_df.columns if n.endswith('_L1') or n.endswith('_L2')})

long = eval_df.reset_index().rename({'index': 'id'}, axis=1)
# Reshape '<label>_<suffix>' columns into one row per (id, Proficiency).
long = pd.wide_to_long(long, stubnames=ls, i='id', j='Proficiency', sep='_', suffix='.*').fillna(0)
long.to_csv(step2_dir + 'eval_df_long.csv')
long.groupby(['Year', 'ISO_Country', 'Proficiency']).sum()