In [24]:
import pymc3 as pm
import pandas as pd
import numpy as np
import theano.tensor as T
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pickle
from theano import shared
import theano
from utils import cost_matrix

In [4]:
df = pd.read_csv("../data/dmc2001_learn.txt", sep=";")
df = df.fillna(df.mean(numeric_only=True))
# Preprocessing: let's assume all your columns are numerical for simplicity
df['WO'] = df['WO'].replace({'W': 1, 'O': 0, 'F': 2})
train, valid = train_test_split(df, test_size=0.2)

In [11]:
df_test_data = pd.read_csv("../data/dmc2001_class.txt", sep=";")
df_test_data = df_test_data.fillna(df_test_data.mean(numeric_only=True))
# Preprocessing: let's assume all your columns are numerical for simplicity
df_test_data['WO'] = df['WO'].replace({'W': 1, 'O': 0, 'F': 2})
df_test_labels = pd.read_csv("../data/dmc2001_resp.txt", sep=";")
df_test_data.shape

(18128, 34)

In [16]:
train, valid = train_test_split(df, test_size=0.2)

In [18]:
print(f"train {train.shape}, valid {valid.shape}, test {df_test_data.shape} ")

train (8000, 35), valid (2000, 35), test (18128, 34) 


In [12]:
def create_shared_data(data):
    shared_data = {}
    for column in data.columns:
        shared_data[column] = theano.shared(data[column].values)
    return shared_data

In [19]:
shared_train_data = create_shared_data(train.drop('AKTIV', axis=1))
shared_valid_data = create_shared_data(valid.drop('AKTIV', axis=1))
shared_test_data = create_shared_data(test.drop('AKTIV', axis=1))

In [20]:
with pm.Model() as model:
    # Priors for each feature
    priors = {col: pm.Normal(col, mu=0, sd=1) for col in train.columns if col != "AKTIV"}

    # Expected value using logistic function
    mu = pm.math.sigmoid(sum([priors[col] * shared_train_data[col] for col in train.columns if col != "AKTIV"]))

    # Likelihood
    AKTIV = pm.Bernoulli('AKTIV', p=mu, observed=train['AKTIV'])

    # Sample
    trace = pm.sample(2000, tune=1000)

# Switch to test data and generate posterior predictive samples
for column in valid.columns:
    if column in shared_valid_data:
        shared_train_data[column].set_value(valid[column].values)

with model:
    ppc_valid = pm.sample_posterior_predictive(trace, samples=500)

for column in test.columns:
    if column in shared_test_data:
        shared_train_data[column].set_value(test[column].values)

with model:
    ppc_test = pm.sample_posterior_predictive(trace, samples=500)

  trace = pm.sample(2000, tune=1000)
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [PHARM6, PHARM5, PHARM4, PHARM3, PHARM2, PHARM1, Abogr, Typ9, Typ8, Typ7, Typ6, Typ5, Typ4, Typ3, Typ2, Typ1, PKW_Gel, PKW_GW, PKW_KB, PKW_Lei, PKW_Di, AnzGew, AnzHH, AntDt, Altersgr, Famgr, Bonitaet, Strtyp, Bebautyp, Kaufkraft, Regiotyp, WO, jahrstart, ID]


Sampling 4 chains for 1_000 tune and 2_000 draw iterations (4_000 + 8_000 draws total) took 862 seconds.
The acceptance probability does not match the target. It is 0.9689151270052351, but should be close to 0.8. Try to increase the number of tuning steps.
The acceptance probability does not match the target. It is 0.9470364251074301, but should be close to 0.8. Try to increase the number of tuning steps.
The acceptance probability does not match the target. It is 0.9424812934751857, but should be close to 0.8. Try to increase the number of tuning steps.
The acceptance probability does not match the target. It is 0.9573953291573671, but should be close to 0.8. Try to increase the number of tuning steps.




In [22]:
prediction =  np.mean(ppc_valid["AKTIV"], axis=0)
pred = [1 if x > 0.5 else 0 for x in prediction] 
lables_valid = valid["AKTIV"].to_list()
correct_predictions = sum(p == l for p, l in zip(pred, lables_valid))
accuracy = correct_predictions / len(lables_valid) * 100
accuracy

61.550000000000004

In [23]:
prediction =  np.mean(ppc_test["AKTIV"], axis=0)
pred = [1 if x > 0.5 else 0 for x in prediction] 
lables_test = test["AKTIV"].to_list()
correct_predictions = sum(p == l for p, l in zip(pred, lables_test))
accuracy = correct_predictions / len(lables_test) * 100
accuracy

61.6

In [25]:
cost_matrix(pred, lables_test)

true_positive 720 false_positive 496 true_negative 512 false_negative 272


(549272, 720, 496, 512, 272)

In [29]:
lables_test = test["AKTIV"].to_list()
len(lables_test)

2000