Internet use and religion in Europe
-----------------------------------------

This notebook presents a quick-and-dirty analysis of the association between Internet use and religion in Europe, using data from the European Social Survey (http://www.europeansocialsurvey.org).

Copyright 2015 Allen Downey

MIT License: http://opensource.org/licenses/MIT

In [38]:
from __future__ import print_function, division

import numpy as np
import pandas as pd

import thinkstats2
import thinkplot

import statsmodels.formula.api as smf

%matplotlib inline

The following function selects the columns I need.

In [75]:
def read_cycle(filename):
    df = pd.read_stata(filename, convert_categoricals=False)

    if 'hinctnta' not in df.columns:
        df['hinctnta'] = df.hinctnt
        
    cols = ['cntry', 'tvtot', 'tvpol', 'rdtot', 'rdpol', 'nwsptot', 'nwsppol', 'netuse', 
            'rlgblg', 'rlgdgr', 'eduyrs', 'hinctnta', 'yrbrn', 'eisced', 'pspwght', 'pweight']
    df = df[cols]
    return df

Read data from Cycle 1.  TODO: investigate the difference between hinctnt and hinctnta; is there a recode that reconciles them?

In [40]:
df1 = read_cycle('ESS1e06_4.dta')
df1.head()

Unnamed: 0,cntry,tvtot,tvpol,rdtot,rdpol,nwsptot,nwsppol,netuse,rlgblg,rlgdgr,eduyrs,hinctnta,yrbrn,eisced,pspwght,pweight
0,AT,1,1,1,1,1,1,5,1,8,11,77,1949,0,0.940933,0.271488
1,AT,3,2,4,1,2,2,6,2,5,14,2,1953,0,0.470466,0.271488
2,AT,7,3,0,66,0,66,0,1,7,9,77,1940,0,1.392155,0.271488
3,AT,1,1,1,1,2,2,4,1,7,18,9,1959,0,1.382163,0.271488
4,AT,0,66,1,1,0,66,7,1,10,15,9,1962,0,1.437766,0.271488


Read data from Cycle 2.

In [41]:
df2 = read_cycle('ESS2e03_4.dta')
df2.head()

Unnamed: 0,cntry,tvtot,tvpol,rdtot,rdpol,nwsptot,nwsppol,netuse,rlgblg,rlgdgr,eduyrs,hinctnta,yrbrn,eisced,pspwght,pweight
0,AT,3,2,0,66,4,2,7,1,7,12,8,1971,0,0.682185,0.302006
1,AT,7,2,3,1,5,2,0,1,7,8,4,1925,0,0.565038,0.302006
2,AT,6,2,1,0,1,0,6,2,4,13,6,1977,0,0.341133,0.302006
3,AT,3,1,2,1,2,1,7,1,5,8,9,1989,0,0.80405,0.302006
4,AT,2,1,1,1,1,1,4,2,1,11,88,1988,0,1.125251,0.302006


Read data from Cycle 3.

In [42]:
df3 = read_cycle('ESS3e03_5.dta')
df3.head()

Unnamed: 0,cntry,tvtot,tvpol,rdtot,rdpol,nwsptot,nwsppol,netuse,rlgblg,rlgdgr,eduyrs,hinctnta,yrbrn,eisced,pspwght,pweight
0,AT,5,1,7,2,0,66,6,1,3,11,7,1980,0,0.949578,0.289116
1,AT,3,1,2,1,2,1,7,1,9,20,5,1974,0,1.41218,0.289116
2,AT,1,1,7,2,2,1,7,1,6,16,77,1954,0,0.723276,0.289116
3,AT,4,1,7,2,3,2,6,1,5,12,88,1967,0,0.625744,0.289116
4,AT,6,2,6,2,3,1,0,1,7,11,88,1971,0,0.417162,0.289116


Read data from Cycle 4.

In [43]:
df4 = read_cycle('ESS4e04_3.dta')
df4.head()

Unnamed: 0,cntry,tvtot,tvpol,rdtot,rdpol,nwsptot,nwsppol,netuse,rlgblg,rlgdgr,eduyrs,hinctnta,yrbrn,eisced,pspwght,pweight
0,BE,7,1,0,66,0,66,0,2,1,18,4,1972,6,0.823223,0.503773
1,BE,7,2,7,3,0,66,0,2,0,15,7,1982,6,0.79861,0.503773
2,BE,2,2,3,3,3,3,7,1,6,18,10,1940,7,0.77802,0.503773
3,BE,7,2,2,2,2,2,0,1,6,15,7,1931,6,0.777735,0.503773
4,BE,0,66,3,0,1,1,7,2,0,13,7,1982,4,0.96096,0.503773


Read data from Cycle 5.

In [44]:
df5 = read_cycle('ESS5e03_2.dta')
df5.head()

Unnamed: 0,cntry,tvtot,tvpol,rdtot,rdpol,nwsptot,nwsppol,netuse,rlgblg,rlgdgr,eduyrs,hinctnta,yrbrn,eisced,pspwght,pweight
0,BE,5,1,1,0,1,1,2,1,5,15,88,1988,4,0.792865,0.528619
1,BE,4,3,2,2,1,2,6,2,7,15,5,1967,6,0.871107,0.528619
2,BE,4,0,0,66,0,66,7,2,0,13,1,1991,3,0.799453,0.528619
3,BE,2,2,7,2,0,66,7,1,5,15,10,1987,6,0.81603,0.528619
4,BE,7,3,7,4,0,66,0,2,5,15,6,1952,6,0.764902,0.528619


In [86]:
def clean_cycle(df):
    df.tvtot.replace([77, 88, 99], np.nan, inplace=True)
    df.rdtot.replace([77, 88, 99], np.nan, inplace=True)
    df.nwsptot.replace([77, 88, 99], np.nan, inplace=True)
    df.netuse.replace([77, 88, 99], np.nan, inplace=True)
    df.tvpol.replace([66, 77, 88, 99], np.nan, inplace=True)
    df.rdpol.replace([66, 77, 88, 99], np.nan, inplace=True)
    df.nwsppol.replace([66, 77, 88, 99], np.nan, inplace=True)
    df.eduyrs.replace([77, 88, 99], np.nan, inplace=True)
    df.rlgblg.replace([7, 8, 9], np.nan, inplace=True)
    df.rlgdgr.replace([77, 88, 99], np.nan, inplace=True)
    df.hinctnta.replace([77, 88, 99], np.nan, inplace=True)
    df.yrbrn.replace([7777, 8888, 9999], np.nan, inplace=True)
    
    df['hasrelig'] = (df.rlgblg==1).astype(int)
    df.loc[df.rlgblg.isnull(), 'hasrelig'] = np.nan
    
    df['yrbrn60'] = df.yrbrn - 1960

In [87]:
cycles = [df1, df2, df3, df4, df5]
for cycle in cycles:
    clean_cycle(cycle)

In [88]:
def resample(df):
    res = {}
    grouped = df.groupby('cntry')
    for name, group in grouped:
        sample = group.sample(len(group), weights=group.pspwght, replace=True)
        sample.index = range(len(group))
        res[name] = sample
    return res

# each cycle_map is a map from country code to DataFrame
cycle_maps = [resample(cycle) for cycle in cycles]
for cycle_map in cycle_maps:
    print(len(cycle_map), 'countries')

22 countries
25 countries
23 countries
29 countries
27 countries


TODO: Find out which questons were omitted in which countries during which cycles.

In [89]:
def replace_var_with_rank(name, df, old, new):
    series = df[old] + np.random.uniform(-0.25, 0.25, len(df))
    if len(series.dropna()) < 10:
        df[new] = np.random.random(len(df))
        return
    
    cdf = thinkstats2.Cdf(series)
    df[new] = cdf.Probs(series)
    df.loc[df[old].isnull(), new] = np.nan
    
def replace_with_ranks(cycle_map):
    for name, group in cycle_map.items():
        replace_var_with_rank(name, group, 'hinctnta', 'hincrank')
        replace_var_with_rank(name, group, 'eduyrs', 'edurank')
        
for cycle_map in cycle_maps:
    replace_with_ranks(cycle_map)

In [96]:
def fill_var(df, old, new):
    null = df[df[old].isnull()]
    if len(null) == 0:
        return

    print(len(df[old].dropna()))
    fill = df[old].dropna().sample(len(null), replace=True)
    fill.index = null.index
    df[new] = df[old].fillna(fill)
    
def fill_all_vars(df):
    for old in ['hasrelig', 'yrbrn60', 'edurank', 'hincrank',
           'tvtot', 'rdtot', 'nwsptot', 'netuse']:
        new = old + '_f'
        print(old)
        fill_var(df, old, new)
        #print(old, sum(df[old].isnull()), sum(df[new].isnull()))

In [97]:
def fill_vars_by_country(cycle_map):
    for name, group in cycle_map.items():
        print(name)
        fill_all_vars(group)
        
for cycle_map in cycle_maps:
    fill_vars_by_country(cycle_map)

BE
hasrelig
1894
yrbrn60
1817
edurank
1863
hincrank
1471
tvtot
1891
rdtot
1898
nwsptot
1894
netuse
1897
FR
hasrelig
1500
yrbrn60
edurank
1474
hincrank
tvtot
rdtot
1500
nwsptot
netuse
0


ValueError: a must be greater than 0

In [49]:
def concat_groups(cycle_map):
    return pd.concat(cycle_map.values(), ignore_index=True)

dfs = [concat_groups(cycle_map) for cycle_map in cycle_maps]

for df in dfs:
    print(len(df))

42359
47537
43000
56752
52458


Concatenate the cycles.

In [50]:
df = pd.concat(dfs, ignore_index=True)
print(df.shape)
df.head()

(242106, 18)


Unnamed: 0,cntry,tvtot,tvpol,rdtot,rdpol,nwsptot,nwsppol,netuse,rlgblg,rlgdgr,eduyrs,hinctnta,yrbrn,eisced,pspwght,pweight,hincrank,edurank
0,BE,1,0,1,0.0,0,,1,2,7,9,,1960,4,0.792153,0.44784,,0.304604
1,BE,2,2,0,,2,1.0,4,1,8,25,,1962,7,0.804951,0.44784,,0.998394
2,BE,7,3,1,0.0,2,1.0,7,1,3,15,,1940,6,0.67132,0.44784,,0.806745
3,BE,5,2,7,2.0,0,,0,1,6,10,10.0,1940,2,1.256671,0.44784,0.959789,0.388651
4,BE,2,1,3,0.0,2,1.0,7,1,7,13,,1975,4,0.680608,0.44784,,0.708244


TV watching time on average weekday

In [51]:
df.tvtot.value_counts().sort_index()

0     8329
1    12948
2    33728
3    33649
4    40578
5    30858
6    29257
7    52121
Name: tvtot, dtype: int64

Radio listening, total time on average weekday.

In [52]:
df.rdtot.value_counts().sort_index()

0    58718
1    37546
2    38382
3    18915
4    15975
5    10741
6     9761
7    50845
Name: rdtot, dtype: int64

Newspaper reading, total time on average weekday.

In [53]:
df.nwsptot.value_counts().sort_index()

0    70316
1    75311
2    64339
3    18082
4     7066
5     2760
6     1331
7     1977
Name: nwsptot, dtype: int64

Personal use of Internet, email, www

In [54]:
df.netuse.value_counts().sort_index()

0    72058
1    37586
2     4838
3     3791
4     8304
5     9756
6    28778
7    70066
Name: netuse, dtype: int64

Belong to a particular religion or denomination

In [55]:
df.rlgblg.value_counts().sort_index()

1    152140
2     86021
Name: rlgblg, dtype: int64

How religious

In [56]:
df.rlgdgr.value_counts().sort_index()

0     30515
1     13736
2     16811
3     19515
4     15944
5     41880
6     24064
7     28006
8     24187
9     10834
10    14289
Name: rlgdgr, dtype: int64

Total household net income, all sources

In [57]:
df.hincrank.describe()

count    191125.000000
mean          0.500547
std           0.288699
min           0.000001
25%           0.250585
50%           0.500728
75%           0.750609
max           1.000000
Name: hincrank, dtype: float64

Year born

In [58]:
df.yrbrn.describe()

count    240983.000000
mean       1961.289141
std          18.689520
min        1885.000000
25%        1947.000000
50%        1962.000000
75%        1977.000000
max        1996.000000
Name: yrbrn, dtype: float64

Shifted to mean near 0

In [59]:
df.yrbrn60.describe()

count    240983.000000
mean          1.289141
std          18.689520
min         -75.000000
25%         -13.000000
50%           2.000000
75%          17.000000
max          36.000000
Name: yrbrn60, dtype: float64

Number of years of education

In [60]:
df.edurank.describe()

count    239086.000000
mean          0.500264
std           0.288676
min           0.000332
25%           0.250259
50%           0.500276
75%           0.750257
max           1.000000
Name: edurank, dtype: float64

Country codes

In [61]:
df.cntry.value_counts().sort_index()

AT     6918
BE     8939
BG     6064
CH     9310
CY     3293
CZ     8790
DE    14487
DK     7684
EE     6960
ES     9729
FI     9991
FR     9096
GB    11117
GR     9759
HR     3133
HU     7806
IE    10472
IL     7283
IS      579
IT     1207
LT     1677
LU     3187
LV     1980
NL     9741
NO     8643
PL     8917
PT    10302
RO     2146
RU     7544
SE     9201
SI     7126
SK     6944
TR     4272
UA     7809
Name: cntry, dtype: int64

Make a binary dependent variable

Run the model

In [63]:
def run_model(df, formula):
    model = smf.logit(formula, data=df)    
    results = model.fit(disp=False)
    return results

Here's the model with all control variables and all media variables:

In [64]:
formula = ('hasrelig ~ yrbrn60 + edurank + hincrank +'
           'tvtot + rdtot + nwsptot + netuse')
res = run_model(df, formula)
res.summary()

0,1,2,3
Dep. Variable:,hasrelig,No. Observations:,179146.0
Model:,Logit,Df Residuals:,179138.0
Method:,MLE,Df Model:,7.0
Date:,"Fri, 30 Oct 2015",Pseudo R-squ.:,0.02808
Time:,16:29:23,Log-Likelihood:,-114930.0
converged:,True,LL-Null:,-118250.0
,,LLR p-value:,0.0

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
Intercept,1.0036,0.019,52.848,0.000,0.966 1.041
yrbrn60,-0.0074,0.000,-22.938,0.000,-0.008 -0.007
edurank,-0.0245,0.020,-1.247,0.212,-0.063 0.014
hincrank,0.1416,0.019,7.641,0.000,0.105 0.178
tvtot,-0.0107,0.003,-4.230,0.000,-0.016 -0.006
rdtot,-0.0122,0.002,-6.444,0.000,-0.016 -0.009
nwsptot,-0.0422,0.004,-10.460,0.000,-0.050 -0.034
netuse,-0.1094,0.002,-55.421,0.000,-0.113 -0.106


Filling all NaNs

In [67]:
formula = ('hasrelig_f ~ yrbrn60_f + edurank_f + hincrank_f +'
           'tvtot_f + rdtot_f + nwsptot_f + netuse_f')
res = run_model(df, formula)
res.summary()

0,1,2,3
Dep. Variable:,hasrelig_f,No. Observations:,242106.0
Model:,Logit,Df Residuals:,242098.0
Method:,MLE,Df Model:,7.0
Date:,"Fri, 30 Oct 2015",Pseudo R-squ.:,0.02552
Time:,16:29:30,Log-Likelihood:,-154300.0
converged:,True,LL-Null:,-158340.0
,,LLR p-value:,0.0

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
Intercept,1.0577,0.016,64.699,0.000,1.026 1.090
yrbrn60_f,-0.0081,0.000,-29.894,0.000,-0.009 -0.008
edurank_f,-0.0580,0.017,-3.501,0.000,-0.091 -0.026
hincrank_f,0.0947,0.016,6.092,0.000,0.064 0.125
tvtot_f,-0.0112,0.002,-5.133,0.000,-0.015 -0.007
rdtot_f,-0.0150,0.002,-9.115,0.000,-0.018 -0.012
nwsptot_f,-0.0397,0.004,-11.323,0.000,-0.047 -0.033
netuse_f,-0.0964,0.002,-57.512,0.000,-0.100 -0.093


In [68]:
def extract_res(res, var='netuse_f'):
    param = res.params[var]
    pvalue = res.pvalues[var]
    stars = '**' if pvalue < 0.01 else '*' if pvalue < 0.05 else ''
    return res.nobs, param, stars

extract_res(res)

(242106, -0.096352395424895404, '**')

Group by country:

In [69]:
grouped = df.groupby('cntry')

Run a sample country

In [70]:
gb = grouped.get_group('DK')
run_model(gb, formula).summary()

0,1,2,3
Dep. Variable:,hasrelig_f,No. Observations:,7684.0
Model:,Logit,Df Residuals:,7676.0
Method:,MLE,Df Model:,7.0
Date:,"Fri, 30 Oct 2015",Pseudo R-squ.:,0.0357
Time:,16:29:30,Log-Likelihood:,-5023.8
converged:,True,LL-Null:,-5209.8
,,LLR p-value:,2.448e-76

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
Intercept,0.5306,0.102,5.203,0.000,0.331 0.731
yrbrn60_f,-0.0200,0.002,-12.112,0.000,-0.023 -0.017
edurank_f,-0.2391,0.091,-2.642,0.008,-0.416 -0.062
hincrank_f,0.3309,0.087,3.789,0.000,0.160 0.502
tvtot_f,-0.0336,0.013,-2.600,0.009,-0.059 -0.008
rdtot_f,0.0064,0.009,0.728,0.467,-0.011 0.024
nwsptot_f,0.0684,0.023,2.929,0.003,0.023 0.114
netuse_f,-0.0370,0.011,-3.439,0.001,-0.058 -0.016


Run all countries

In [71]:
for name, group in grouped:
    try:
        res = run_model(group, formula)
        nobs, param, stars = extract_res(res)
        arrow = '<--' if stars and param > 0 else ''
        print(name, len(group), nobs, '%0.3g'%param, stars, arrow, sep='\t')
    except:
        print(name, len(group), ' ', 'NA', sep='\t')

AT	6918	6918	-0.052	**	
BE	8939	8939	-0.0236	**	
BG	6064	6064	0.019		
CH	9310	9310	-0.0568	**	
CY	3293	3293	-0.0376		
CZ	8790	8790	-0.0375	**	
DE	14487	14487	-0.0166	*	
DK	7684	7684	-0.037	**	
EE	6960	6960	-0.057	**	
ES	9729	9729	-0.0757	**	
FI	9991	9991	-0.0484	**	
FR	9096	9096	0.0134		
GB	11117	11117	-0.018	*	
GR	9759	9759	-0.0458	**	
HR	3133	3133	-0.0109		
HU	7806	7806	-0.00126		
IE	10472	10472	-0.0535	**	
IL	7283	7283	0.0741	**	<--
IS	579	579	0.0545		
IT	1207	1207	-0.0261		
LT	1677	1677	0.018		
LU	3187	3187	-0.0639	**	
LV	1980	1980	-0.0117		
NL	9741	9741	-0.0405	**	
NO	8643	8643	-0.0277	**	
PL	8917	8917	-0.111	**	
PT	10302	10302	-0.0936	**	
RO	2146	2146	0.0353		
RU	7544	7544	0.0417	**	<--
SE	9201	9201	-0.0244	*	
SI	7126	7126	-0.026	*	
SK	6944	6944	-0.0592	**	
TR	4272	4272	-0.0492		
UA	7809	7809	-0.0584	**	
