## This notebook contains the code demonstrating how the matching is performed in the thesis. First, a setting is created that resembles the GRM in grmpy, i.e. the data are simulated accordingly.

### In the beginning, a couple of packages need to be installed...

In [9]:
#install pymatch and due to a bug pandas needs to be downgraded...
!pip install pymatch
!pip install pandas==0.23.4
#import packages
import numpy
import math
import numpy.random as nprand
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib
import seaborn as sns
import scipy
from scipy import stats
from numpy.random import rand
from numpy.random import seed
from numpy.random import randn
from matplotlib import pyplot as plt
from pymatch.Matcher import Matcher



### Now the dataset is generated

In [10]:
def cov_var(num):
    """Simulate one data set and split it by treatment status.

    Draws an observable ``x``, two latent abilities (``theta_cog``,
    ``theta_soc``), four noisy measurements ``M1``-``M4``, the unobservables
    ``u1``, ``u0``, ``uc`` and a cost shifter ``x3``; builds the potential
    outcomes ``Y1``/``Y0``, the cost and the treatment decision ``D``
    (``D = 1`` when ``expected_benefit > 0``).

    The random draws are taken from numpy's global random state in a fixed
    order, so seeding it beforehand makes the result reproducible.

    Parameters
    ----------
    num : int
        Number of simulated agents.

    Returns
    -------
    tuple
        ``(treatment_t, control_t, ATE, TT, TUT)``: the rows with ``D == 1``,
        the rows with ``D == 0``, and the sample means of ``Y1 - Y0`` over all
        agents, over the treated and over the untreated.
    """
    # correlation between x, theta_cog and theta_soc assumed to be zero in the first place
    correlation1 = 0
    correlation2 = 0
    correlation12 = 0
    cov = [[1, correlation1, correlation2],
           [correlation1, 1, correlation12],
           [correlation2, correlation12, 1]]
    draws = nprand.multivariate_normal([0, 0, 0], cov, num)
    df = pd.DataFrame(data=draws, columns=['x', 'theta_cog', 'theta_soc'])
    df['constant'] = 1
    # The matching code lists 'index' among the columns to exclude, so keep it.
    df['index'] = df.index

    # creation of measurements: M_j = jota_cog_j*theta_cog + jota_soc_j*theta_soc + delta_j*x + eps_j
    jota_cog = [1, 1, 1, 1]
    # jota_soc = [0.9, 0.9, 0.9, 0.9]
    jota_soc = [1, 1, 1, 1]
    delta = [0, 0, 0, 0]
    # Four separate draws, in this order, keep the random stream unchanged.
    epsilon = numpy.column_stack([randn(num) for _ in range(4)])
    # Column slices of shape (num, 1) broadcast against the length-4 loadings.
    measurement = (draws[:, [1]] * jota_cog
                   + draws[:, [2]] * jota_soc
                   + draws[:, [0]] * delta
                   + epsilon)
    for position, name in enumerate(['M1', 'M2', 'M3', 'M4']):
        df[name] = measurement[:, position]

    # creation of unobservables; correlation with uc assumed to be 0
    rho_1 = 0
    rho_2 = 0
    cov_unobs = [[1, 0, rho_1], [0, 1, rho_2], [rho_1, rho_2, 1]]
    unobs = nprand.multivariate_normal([0, 0, 0], cov_unobs, num)
    for position, name in enumerate(['u1', 'u0', 'uc']):
        df[name] = unobs[:, position]

    # creation of outcomes
    df['Y1'] = 1*df['constant'] + 0*df['x'] + 2*df['theta_cog'] + 0.1*df['theta_soc'] + df['u1']
    df['Y0'] = 0*df['constant'] + 0*df['x'] + 1*df['theta_cog'] + 0.1*df['theta_soc'] + df['u0']
    df['x3'] = randn(num)

    # model structure can be chosen freely
    # df['cost'] = 1.1*df['constant'] + -0.39*df['x3'] + 0.52*df['theta_soc'] + -0.58*df['theta_cog'] + df['uc']
    df['cost'] = 0*df['constant'] + 1*df['x3'] - 0.5*df['theta_soc'] + 0.5*df['theta_cog'] + df['uc']
    # df['expected_benefit'] = (df['cost'])
    # The decision uses the outcomes net of u1/u0, minus the full cost.
    df['expected_benefit'] = ((df['Y1'] - df['u1']) - (df['Y0'] - df['u0'])) - df['cost']

    # treatment indicator, its complement, and the observed outcome
    df['D'] = 0
    df.loc[df['expected_benefit'] > 0, 'D'] = 1
    df['D_1'] = 1 - df['D']
    df['Y'] = df['Y1']
    df.loc[df['D'] == 0, 'Y'] = df['Y0']

    # define effects
    benefit = df['Y1'] - df['Y0']
    is_treated = df['D'] == 1
    TT = numpy.mean(benefit[is_treated])
    ATE = numpy.mean(benefit)
    TUT = numpy.mean(benefit[~is_treated])

    # build data sets (copies, so later edits cannot touch the full frame)
    treatment_t = df[is_treated].copy()
    control_t = df[~is_treated].copy()
    return treatment_t, control_t, ATE, TT, TUT

### Now the matching needs to be performed

In [11]:
#Matching in order to identify the ATE
# Information sets, expressed as the columns handed to Matcher(exclude=...).
# Every column that is NOT listed stays available to the matcher, so each
# list differs only in which of theta_cog, theta_soc, M1 and x3 it leaves out.
_leading_columns = ['level_0', 'u0', 'index', 'constant']
_all_measurements = ['M1', 'M2', 'M3', 'M4']
_trailing_columns = ['u1', 'u2', 'uc', 'Y1', 'Y0', 'cost', 'experinced', 'Y',
                     'pred_values', 'expected_benefit', 'x']

# keeps theta_cog, theta_soc and x3
relevant = _leading_columns + _all_measurements + _trailing_columns + ['D_1']
# keeps theta_cog and theta_soc (x3 is excluded as well)
minimal = _leading_columns + _all_measurements + _trailing_columns + ['x3', 'D_1']
# keeps theta_cog and x3
only_theta_cog = _leading_columns + ['theta_soc'] + _all_measurements + _trailing_columns + ['D_1']
# keeps M1 and x3
only_M1 = _leading_columns + ['theta_soc', 'theta_cog'] + _all_measurements[1:] + _trailing_columns + ['D_1']
# keeps x3 only
only_x3 = _leading_columns + ['theta_soc', 'theta_cog'] + _all_measurements + _trailing_columns + ['D_1']
def _matched_outcome_gap(test_group, control_group, exclude, nmodels=20):
    """Run the pymatch pipeline once and return the matched difference in mean Y.

    The steps and their order are those of the original cell: fit and predict
    the scores, plot them, match 1:1 at random within a 0.0001 threshold,
    record frequencies, assign weights and draw the covariate comparison.
    The return value is mean(Y | D == 1) - mean(Y | D == 0) in the matched data.
    """
    matcher = Matcher(test_group, control_group, yvar='D', exclude=exclude)
    # predict scores
    matcher.fit_scores(balance=True, nmodels=nmodels)
    matcher.predict_scores()
    matcher.plot_scores()
    # perform matching
    matcher.match(method="random", nmatches=1, threshold=0.0001)
    matcher.record_frequency()
    matcher.assign_weight_vector()
    matched = matcher.matched_data
    # Kept from the original cell; the returned table itself is not needed here.
    matcher.compare_continuous(return_table=True)
    return numpy.mean(matched[matched.D == 1]["Y"]) - numpy.mean(matched[matched.D == 0]["Y"])


def matching_all(num, Buchstabe):
    """Simulate a data set and return the matching bias for the ATE.

    Two matchings are run on the same simulated sample: one with the treated
    as the test group (compared with the true TT) and one with the groups
    swapped (compared with the true TUT). The two biases are combined with
    the sample shares of treated and untreated agents.

    Both matchings now use ``Buchstabe`` as the exclude list. Previously the
    second one always used a hard-coded list identical to ``relevant``, so the
    two halves could rely on different information sets.

    NOTE: a later cell defines another ``matching_all`` with a different
    signature; once that cell has run, it replaces this function.

    Parameters
    ----------
    num : int
        Number of simulated agents passed to ``cov_var``.
    Buchstabe : list of str
        Columns to exclude from the matcher (e.g. ``relevant``, ``minimal``).

    Returns
    -------
    float
        ``share_treated * bias_TT + share_untreated * bias_TUT``.
    """
    # simulate data set: (treated, untreated, ATE, TT, TUT)
    data = cov_var(num)
    treated = data[0]
    untreated = data[1]
    # bias for TT: matched outcome gap minus the true TT of this sample
    bias_TT = _matched_outcome_gap(treated, untreated, Buchstabe) - data[3]
    # bias for TUT: groups swapped, same information set, minus the true TUT
    bias_TUT = _matched_outcome_gap(untreated, treated, Buchstabe) - data[4]
    # ATE
    return (len(treated) / num) * bias_TT + (len(untreated) / num) * bias_TUT

In [12]:
#Matching in order to identify TT
def matching_all(num, rho_1, rho_2, Buchstabe):
    """Simulate a data set and return the matching bias for the TT.

    NOTE: this definition reuses the name of the ATE version defined in an
    earlier cell and replaces it once this cell has run.

    Parameters
    ----------
    num : int
        Number of simulated agents passed to ``cov_var``.
    rho_1, rho_2, Buchstabe
        Accepted so that existing calls keep working, but currently unused.
        The ``cov_var`` defined in this notebook takes only ``num`` and fixes
        the correlations of the unobservables at zero; forwarding these
        arguments, as the previous version did, raised a TypeError.
        NOTE(review): presumably written against an older ``cov_var`` that
        accepted them; confirm whether that variant should be restored.

    Returns
    -------
    float
        Difference in mean ``Y`` between ``D == 1`` and ``D == 0`` in the
        matched data, minus the true TT of the simulated sample.
    """
    # (treated, untreated, ATE, TT, TUT)
    data = cov_var(num)
    treated = data[0]
    untreated = data[1]
    excluded_columns = ['level_0', 'u0', 'index', 'constant', 'M1', 'M2', 'M3', 'M4',
                        'u1', 'u2', 'uc', 'Y1', 'Y0', 'cost', 'experinced', 'Y',
                        'pred_values', 'expected_benefit', 'x', 'D_1']
    m = Matcher(treated, untreated, yvar='D', exclude=excluded_columns)
    # predict scores
    m.fit_scores(balance=True, nmodels=100)
    m.predict_scores()
    m.plot_scores()
    # perform matching
    m.match(method="random", nmatches=1, threshold=0.0001)
    m.record_frequency()
    m.assign_weight_vector()
    matched = m.matched_data
    # Kept from the original cell; the returned table itself is not needed here.
    m.compare_continuous(return_table=True)
    matched_gap = numpy.mean(matched[matched.D == 1]["Y"]) - numpy.mean(matched[matched.D == 0]["Y"])
    return matched_gap - data[3]

### Last but not least, the code for the OLS is displayed

In [None]:
# OLS specifications: every formula regresses Y on D and x, and they differ
# in which of x3, theta_cog, theta_soc and M1 enter as additional regressors.
# x3, theta_cog and theta_soc
V_1 = "Y ~ D + x + x3+ theta_cog+ theta_soc"
# theta_cog and theta_soc, without x3
W_1 = "Y ~ D + x + theta_cog+ theta_soc"
# x3 and theta_cog
X_1 = "Y ~ D + x + x3 + theta_cog"
# x3 and the single measurement M1
Y_1 = "Y ~ D + x + x3 + M1"
# x3 only
Z_1 = "Y ~ D + x + x3"
def cov_var(num, Buchstabe=None):
    """Simulate one data set and return the OLS bias for the ATE.

    The simulated data are the same as in the earlier data-generating
    ``cov_var``; instead of returning the split samples, this version fits
    an OLS regression and returns its second coefficient minus the true ATE.

    NOTE: this definition reuses the name ``cov_var`` and replaces the earlier
    function once this cell has run. Code that expects the earlier return
    value (a tuple of samples and effects) will no longer work afterwards.

    Parameters
    ----------
    num : int
        Number of simulated agents.
    Buchstabe : str, optional
        OLS formula such as ``V_1`` ... ``Z_1``. When omitted, a notebook-level
        variable named ``Buchstabe`` is used, as the original body did.

    Returns
    -------
    float
        Estimated coefficient at position 1 of the fitted parameters (the
        coefficient on ``D`` for formulas of the form ``'Y ~ D + ...'``)
        minus the sample mean of ``Y1 - Y0``.

    Raises
    ------
    NameError
        If no formula is passed and no global ``Buchstabe`` exists.
    """
    if Buchstabe is None:
        # Backward compatibility: the original read a notebook-level variable.
        if 'Buchstabe' not in globals():
            raise NameError("name 'Buchstabe' is not defined; "
                            "pass the OLS formula as the second argument")
        Buchstabe = globals()['Buchstabe']

    # correlation between x, theta_cog and theta_soc assumed to be zero in the first place
    correlation1 = 0
    correlation2 = 0
    correlation12 = 0
    cov = [[1, correlation1, correlation2],
           [correlation1, 1, correlation12],
           [correlation2, correlation12, 1]]
    draws = nprand.multivariate_normal([0, 0, 0], cov, num)
    df = pd.DataFrame(data=draws, columns=['x', 'theta_cog', 'theta_soc'])
    df['constant'] = 1
    df['index'] = df.index

    # creation of measurements: M_j = jota_cog_j*theta_cog + jota_soc_j*theta_soc + delta_j*x + eps_j
    jota_cog = [1, 1, 1, 1]
    # jota_soc = [0.9, 0.9, 0.9, 0.9]
    jota_soc = [1, 1, 1, 1]
    delta = [0, 0, 0, 0]
    # Four separate draws, in this order, keep the random stream unchanged.
    epsilon = numpy.column_stack([randn(num) for _ in range(4)])
    # Column slices of shape (num, 1) broadcast against the length-4 loadings.
    measurement = (draws[:, [1]] * jota_cog
                   + draws[:, [2]] * jota_soc
                   + draws[:, [0]] * delta
                   + epsilon)
    for position, name in enumerate(['M1', 'M2', 'M3', 'M4']):
        df[name] = measurement[:, position]

    # creation of unobservables; correlation with uc assumed to be 0
    rho_1 = 0
    rho_2 = 0
    cov_unobs = [[1, 0, rho_1], [0, 1, rho_2], [rho_1, rho_2, 1]]
    unobs = nprand.multivariate_normal([0, 0, 0], cov_unobs, num)
    for position, name in enumerate(['u1', 'u0', 'uc']):
        df[name] = unobs[:, position]

    # creation of outcomes
    df['Y1'] = 1*df['constant'] + 0*df['x'] + 2*df['theta_cog'] + 0.1*df['theta_soc'] + df['u1']
    df['Y0'] = 0*df['constant'] + 0*df['x'] + 1*df['theta_cog'] + 0.1*df['theta_soc'] + df['u0']
    df['x3'] = randn(num)

    # model structure can be chosen freely
    # df['cost'] = 1.1*df['constant'] + -0.39*df['x3'] + 0.52*df['theta_soc'] + -0.58*df['theta_cog'] + df['uc']
    df['cost'] = 0*df['constant'] + 1*df['x3'] - 0.5*df['theta_soc'] + 0.5*df['theta_cog'] + df['uc']
    # choose which cost function should be calculated
    # df['expected_benefit'] = (df['cost'])
    df['expected_benefit'] = ((df['Y1'] - df['u1']) - (df['Y0'] - df['u0'])) - df['cost']

    # treatment indicator, its complement, and the observed outcome
    df['D'] = 0
    df.loc[df['expected_benefit'] > 0, 'D'] = 1
    df['D_1'] = 1 - df['D']
    df['Y'] = df['Y1']
    df.loc[df['D'] == 0, 'Y'] = df['Y0']

    # true average treatment effect in this sample
    ATE = numpy.mean(df['Y1'] - df['Y0'])

    # perform OLS
    results_OLS = smf.ols(Buchstabe, data=df).fit()
    # Explicitly positional: for 'Y ~ D + ...' position 0 is the intercept
    # and position 1 the coefficient on D.
    c = results_OLS.params.iloc[1]
    # calculate bias
    return c - ATE