In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import matplotlib as mpl
from matplotlib import cm
from sklearn.linear_model import LinearRegression
import numpy as np
import statsmodels.api as sm
from sklearn.datasets import make_blobs
import seaborn as sns
plt.style.use('seaborn')

In [2]:
df = pd.read_stata('Staat vd Stad.dta')
pd.set_option('display.max_columns', None)

One or more strings in the dta file could not be decoded using utf-8, and
so the fallback encoding of latin-1 is being used.  This can happen when a file
has been incorrectly encoded by Stata or some other software. You should verify
the string values returned are correct.


In [3]:
df['V4']=df['V4'].map({'rarely ever or never': '2-low', 'at least once a week': '0-intensive', 'once a month':'1-moderate', 'twice a month': '1-moderate', 'less than once a month': '2-low' })
df=df.drop(df[df['V4']=="don't know"].index)
df=df.drop(df[df['V4']=="not filled in"].index)
df['V4'].value_counts(normalize=True)

2-low          0.426266
0-intensive    0.287025
1-moderate     0.286709
Name: V4, dtype: float64

In [4]:
df['V8'].replace('not filled in', 'no', inplace=True)
df['V8'] = pd.get_dummies(df['V8'])
# df['V8'].value_counts()

In [5]:
df['V5'] = df['V5'].map({'fairly': 2, 'a lot': 3, 'a little': 1, 'not': 0, 'not filled in': 0})

In [6]:
df=df.drop(df[df['V12']=="not filled in"].index)
df['V12'] = pd.get_dummies(df['V12'])

In [7]:
df=df.drop(df[df['V16']=="niet ingevuld"].index)
df['V16'] = df['V16'].map({'nee': 0, 'weet niet': 0, 'ja, misschien': 1, 'ja, zeker': 2})

In [8]:
famdum = pd.get_dummies(df['hhcom'])

In [9]:
df['children'] = famdum['couple with child(ren) at home'] + famdum['one parent with child(ren) at home'] + famdum['two adults, with child(ren) at home'] + famdum['three or more adults, with child(ren) at home']

In [10]:
df['couple'] = famdum['couple without children (at home)'] + famdum['couple with child(ren) at home']

In [11]:
df['houseshare'] = famdum['two adults, without children (at home)'] + famdum['two adults, with child(ren) at home'] + famdum['three or more adults, with child(ren) at home'] + famdum['three or more adults, without children at home']

In [12]:
df['V76a'].value_counts()

The Netherlands                                                                     2267
Europe (including former Sovjet-Republics)                                           150
Morocco                                                                              139
Surinam                                                                              138
Turkey                                                                               132
remaining Asia, namely                                                                83
unknown (no answer)                                                                   75
US, Canada, Australia, New Zealand, remaining Oceania, Japan, Indonesia, The Dut      65
remaining Middle- and South Amerika, namely                                           48
The Antillies, Aruba                                                                  42
remaining Africa, namely                                                              35
Ghana                

In [13]:
df=df.drop(df[df['V76a']=="unknown (no answer)"].index)
df['V76a'] = df['V76a'].map({'The Netherlands': 'Dutch Native', 'Europe (including former Sovjet-Republics)': 'Europe', 'Ghana': 'Africa', 'remaining Africa, namely': 'Africa', 'Surinam': 'Central and South America', 'The Antillies, Aruba': 'Central and South America', 'remaining Asia, namely':'Asia', 'US, Canada, Australia, New Zealand, remaining Oceania, Japan, Indonesia, The Dut':'Others', 'remaining Middle- and South Amerika, namely':'Central and South America'})

In [14]:
countrydum = pd.get_dummies(df['V76a'])
df['V76a'].value_counts(normalize=True)

Dutch Native                 0.794881
Central and South America    0.079944
Europe                       0.052595
Asia                         0.029102
Others                       0.022791
Africa                       0.020687
Name: V76a, dtype: float64

In [15]:
df.dropna(subset=['V4'], inplace=True)
df_X = pd.DataFrame()
df_X[['Trust in Neighbours', 'Neighbourhood Safety', 'House Owner', 'Plans to move']]=df[['V5', 'V8', 'V12', 'V16']]
df_X[['Africa', 'Asia', 'Central and South America', 'Dutch Native', 'Europe', 'Others']] = countrydum
df_X[['children', 'couple', 'houseshare']] = df[['children', 'couple', 'houseshare']]
df_X.shape

(3069, 13)

In [20]:
Y = df['V4']
X = sm.add_constant(df_X, prepend = False)
X['Trust in Neighbours'].value_counts(normalize=True)

2    0.561095
3    0.259042
1    0.104920
0    0.074943
Name: Trust in Neighbours, dtype: float64

In [17]:
model=sm.MNLogit(Y, X.astype(float), missing='raise')
model_fit = model.fit()

Optimization terminated successfully.
         Current function value: 1.011183
         Iterations 5


In [18]:
print(model_fit.summary())

                          MNLogit Regression Results                          
Dep. Variable:                     V4   No. Observations:                 3069
Model:                        MNLogit   Df Residuals:                     3041
Method:                           MLE   Df Model:                           26
Date:                Sat, 19 Sep 2020   Pseudo R-squ.:                 0.06400
Time:                        14:42:10   Log-Likelihood:                -3103.3
converged:                       True   LL-Null:                       -3315.5
Covariance Type:            nonrobust   LLR p-value:                 1.286e-73
            V4=1-moderate       coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------
Trust in Neighbours          -0.2254      0.074     -3.067      0.002      -0.369      -0.081
Neighbourhood Safety          0.2074      0.109      1.908      0.056      -0.006     