In [1]:
import pymc3 as pm
import pandas as pd
import numpy as np
import theano.tensor as T
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pickle
from theano import shared
import theano

In [5]:
# Load the semicolon-separated learning set.
df = pd.read_csv("../data/dmc2001_learn.txt", sep=";")

# Build the preprocessed frame in one chain: missing values in numeric columns
# are filled with that column's mean, then the string codes in 'WO' are
# replaced by integers so the column can be treated numerically.
df_learn_mean = df.fillna(df.mean(numeric_only=True)).assign(
    WO=lambda frame: frame['WO'].replace({'W': 1, 'O': 0, 'F': 2})
)


In [7]:
# Pairwise correlations of the preprocessed learning data; only the column for
# the AKTIV target is used below, sorted from most positive to most negative.
corr_matrix = df_learn_mean.corr()
corr_with_target = corr_matrix['AKTIV'].sort_values(ascending=False)

# Keep every column whose absolute correlation with AKTIV exceeds the cut-off.
threathold = 0.05
is_relevant = corr_with_target.abs() > threathold
relevant_features = list(corr_with_target.index[is_relevant])

# The target correlates perfectly with itself, so take it out of the predictors.
relevant_features.remove('AKTIV')
relevant_features

['ID',
 'jahrstart',
 'Bonitaet',
 'Bebautyp',
 'PKW_GW',
 'PHARM2',
 'Famgr',
 'PKW_Di',
 'PHARM6',
 'Altersgr',
 'AntDt']

In [9]:
# Hold out 20% of the *preprocessed* learning data (imputed, 'WO' encoded) for
# validation. Splitting the raw `df` would reintroduce NaNs and the string
# 'WO' column. The fixed seed keeps the split reproducible across re-runs.
train, valid = train_test_split(df_learn_mean, test_size=0.2, random_state=42)

# Load the unlabelled class set and give it the same preprocessing as the
# learning set: mean-impute numeric columns, then encode 'WO' as integers.
df_test_data = pd.read_csv("../data/dmc2001_class.txt", sep=";")
df_test_data = df_test_data.fillna(df_test_data.mean(numeric_only=True))
# Encode the class set's OWN 'WO' column. Assigning from the learning frame
# aligns on the row index, which would copy unrelated customers' values into
# the overlapping rows and leave every remaining row as NaN.
df_test_data['WO'] = df_test_data['WO'].replace({'W': 1, 'O': 0, 'F': 2})

# Response file for the class set, read with the same separator.
df_test_labels = pd.read_csv("../data/dmc2001_resp.txt", sep=";")
df_test_data.shape

(18128, 34)

In [3]:
# Split the preprocessed frame rather than the raw `df`, so the model sees
# imputed values and the integer-encoded 'WO' column. The fixed seed makes the
# train/validation partition reproducible.
train, valid = train_test_split(df_learn_mean, test_size=0.2, random_state=42)
print(f"train {train.shape}, valid {valid.shape}, test {df_test_data.shape} ")

train (8000, 35), valid (2000, 35), test (18128, 34) 


In [8]:
def create_shared_data(data):
    """Wrap every column of ``data`` in its own Theano shared variable.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns should be exposed to a Theano graph.

    Returns
    -------
    dict
        Maps each column label to ``theano.shared`` of that column's
        ``.values`` array, in the frame's column order.
    """
    return {name: theano.shared(data[name].values) for name in data.columns}

In [12]:
# One dict of shared variables per data set. The AKTIV target is removed from
# the labelled splits so only predictor columns are wrapped; the class set is
# passed through whole.
shared_train_data = create_shared_data(train.drop(columns='AKTIV'))
shared_valid_data = create_shared_data(valid.drop(columns='AKTIV'))
shared_test_data = create_shared_data(df_test_data)

In [None]:
with pm.Model() as model:
    # One standard-normal coefficient per selected feature.
    priors = {feature: pm.Normal(feature, mu=0, sd=1) for feature in relevant_features}

    # Build the linear predictor on the *shared* training columns. Because the
    # graph reads from shared variables, their contents can be replaced after
    # sampling to obtain predictions for other data sets.
    linear_predictor = sum(
        priors[feature] * shared_train_data[feature] for feature in relevant_features
    )

    # Expected value using logistic function
    mu = pm.math.sigmoid(linear_predictor)

    # Likelihood
    AKTIV = pm.Bernoulli('AKTIV', p=mu, observed=train['AKTIV'])

    # Sample
    trace = pm.sample(2000, tune=1000)


def _load_into_model_inputs(frame):
    """Point the model's shared predictor variables at the rows of ``frame``.

    Only the columns in ``relevant_features`` are part of the model graph, so
    only those are replaced. Values are cast to each shared variable's dtype
    because ``set_value`` rejects arrays it cannot store losslessly (e.g. a
    mean-imputed float column going into an integer shared variable); the cast
    truncates any fractional part in that case.
    """
    for feature in relevant_features:
        holder = shared_train_data[feature]
        holder.set_value(frame[feature].values.astype(holder.dtype))


# Switch to the validation split and generate posterior predictive samples.
_load_into_model_inputs(valid)

with model:
    ppc_valid = pm.sample_posterior_predictive(trace, samples=500)

# Switch to the class (test) set and generate posterior predictive samples.
_load_into_model_inputs(df_test_data)

with model:
    ppc_test = pm.sample_posterior_predictive(trace, samples=500)