In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import matplotlib as mpl
from matplotlib import cm
from sklearn.linear_model import LinearRegression
import numpy as np
import statsmodels.api as sm
from sklearn.datasets import make_blobs
plt.style.use('seaborn')

In [19]:
df = pd.read_stata('WoON2015small.dta')

One or more strings in the dta file could not be decoded using utf-8, and
so the fallback encoding of latin-1 is being used.  This can happen when a file
has been incorrectly encoded by Stata or some other software. You should verify
the string values returned are correct.


In [24]:
df=df[['Verh', 'LftOP', 'VltOplOP', 'etniop3', 'WOZwaarde', 'huko', 'g4_5']]
df.dropna(inplace=True)
df.head(30)

Unnamed: 0,Verh,LftOP,VltOplOP,etniop3,WOZwaarde,huko,g4_5
0,niet verhuisgeneigd,20,"HAVO, VWO, MBO",Native Dutch,189000,Rental,Amsterdam
1,verhuisgeneigd,21,"HAVO, VWO, MBO",Native Dutch,783000,Rental,Utrecht
2,verhuisgeneigd,27,"HBO, Universiteit",Native Dutch,206000,Rental,Utrecht
3,niet verhuisgeneigd,57,lbo,Western,324000,Owner-Occupied,Utrecht
4,verhuisgeneigd,70,"MAVO, MULO, VMBO",Native Dutch,215000,Owner-Occupied,Utrecht
5,niet verhuisgeneigd,50,"MAVO, MULO, VMBO",Native Dutch,424000,Owner-Occupied,Utrecht
6,niet verhuisgeneigd,60,"HAVO, VWO, MBO",Native Dutch,169000,Rental,Utrecht
8,verhuisgeneigd,38,"HBO, Universiteit",Native Dutch,201000,Owner-Occupied,Utrecht
9,verhuisgeneigd,19,"HAVO, VWO, MBO",Native Dutch,199000,Rental,Utrecht
10,verhuisgeneigd,28,"HBO, Universiteit",Native Dutch,238000,Rental,Utrecht


In [4]:
df['VltOplOP'].value_counts()

HBO, Universiteit    3129
HAVO, VWO, MBO       2052
Lager onderwijs       633
lbo                   588
MAVO, MULO, VMBO      560
Anders                130
Name: VltOplOP, dtype: int64

In [5]:
df.describe()

Unnamed: 0,LftOP,WOZwaarde
count,7092.0,7092.0
mean,47.280034,196368.1
std,17.882843,139134.9
min,18.0,12000.0
25%,32.0,117000.0
50%,46.0,163000.0
75%,61.0,229000.0
max,98.0,2495500.0


In [6]:
df=df[(df['Verh']!='huisv gevonden')]
df=df[(df['Verh']!='gedwongen verhuizing')]
df['Verh'].value_counts()

niet verhuisgeneigd     3660
verhuisgeneigd          3235
gedwongen verhuizing       0
huisv gevonden             0
Name: Verh, dtype: int64

In [49]:
df['huko'] = pd.get_dummies(df['huko'])
dummies_etni = pd.get_dummies(df['etniop3'], drop_first=True)
df=df.drop(df[df['VltOplOP']=='Anders'].index)
df.replace('Lager onderwijs', value=0, inplace=True)
df.replace('lbo', value=1, inplace=True)
df.replace('MAVO, MULO, VMBO', value=1, inplace=True)
df.replace('HAVO, VWO, MBO', value=2, inplace=True)
df.replace('HBO, Universiteit', value=3, inplace=True)
df['VltOplOP'].value_counts()

3         3129
2         2052
1         1148
0          633
Anders       0
Name: VltOplOP, dtype: int64

In [50]:
dummies_city = pd.get_dummies(df['g4_5'], drop_first=True)
dummies_city

Unnamed: 0,s-Gravenhage,Rotterdam,Utrecht
0,0,0,0
1,0,0,1
2,0,0,1
3,0,0,1
4,0,0,1
...,...,...,...
7784,1,0,0
7785,1,0,0
7786,1,0,0
7787,1,0,0


In [51]:
df_X = df[['LftOP', 'WOZwaarde', 'huko', 'VltOplOP']]
df_X[['Non-Western', 'Western']]=dummies_etni[['Non-Western', 'Western']]
df_X[['s-Gravenhage', 'Rotterdam', 'Utrecht']]=dummies_city[['s-Gravenhage', 'Rotterdam', 'Utrecht']]
df_X

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Unnamed: 0,LftOP,WOZwaarde,huko,VltOplOP,Non-Western,Western,s-Gravenhage,Rotterdam,Utrecht
0,20,189000,0,2,0,0,0,0,0
1,21,783000,0,2,0,0,0,0,1
2,27,206000,0,3,0,0,0,0,1
3,57,324000,1,1,0,1,0,0,1
4,70,215000,1,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...
7784,42,66000,0,1,1,0,1,0,0
7785,79,106000,0,3,0,0,1,0,0
7786,72,90000,0,2,0,0,1,0,0
7787,22,81000,0,2,0,0,1,0,0


In [52]:
X = df_X # Baseline = Owner, wants to move, Amsterdam
Y = pd.get_dummies(df['Verh'])
X = sm.add_constant(X.astype(float)) # adding a constant
Y

Unnamed: 0,verhuisgeneigd,huisv gevonden,gedwongen verhuizing,niet verhuisgeneigd
0,0,0,0,1
1,1,0,0,0
2,1,0,0,0
3,0,0,0,1
4,1,0,0,0
...,...,...,...,...
7784,1,0,0,0
7785,1,0,0,0
7786,0,0,0,1
7787,1,0,0,0


In [53]:
model = sm.Logit(Y['verhuisgeneigd'].astype(float), X.astype(float)).fit()
predictions = model.predict(X) 
print_model = model.summary()
print(print_model)

Optimization terminated successfully.
         Current function value: 0.606744
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:         verhuisgeneigd   No. Observations:                 6962
Model:                          Logit   Df Residuals:                     6952
Method:                           MLE   Df Model:                            9
Date:                Wed, 09 Sep 2020   Pseudo R-squ.:                  0.1202
Time:                        15:56:14   Log-Likelihood:                -4224.2
converged:                       True   LL-Null:                       -4801.2
Covariance Type:            nonrobust   LLR p-value:                1.015e-242
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
const            1.8976      0.139     13.680      0.000       1.626       2.170
LftOP           -0.0422

SyntaxError: invalid syntax (<ipython-input-10-823a9a2b31ac>, line 1)