# Patsy
#### PyData Berlin
#### Canada Day, 2017
#### @maxhumber

In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import random

%matplotlib inline

In [None]:
def create_data(n=1000):
    """Simulate a toy data set for a logistic regression.

    Two standard-normal predictors (``beer``, ``warm``) and one binary
    predictor (``family``) are drawn, combined linearly, pushed through the
    inverse-logit function and used as success probabilities for a
    Bernoulli outcome (``canada``).

    Draws come from NumPy's global random state, so call
    ``np.random.seed(...)`` beforehand for reproducible output.

    Parameters
    ----------
    n : int, default 1000
        Number of observations to simulate.

    Returns
    -------
    tuple of numpy.ndarray
        ``(canada, beer, warm, family)``, each of length ``n``. ``canada``
        holds 0/1 outcomes and ``family`` holds the strings 'No' / 'Yes'.
    """
    beer = np.random.normal(loc=0, scale=1, size=n)
    warm = np.random.normal(loc=0, scale=1, size=n)
    family = np.random.randint(2, size=n)
    # linear combination
    z = 1 + 2*beer + -3*warm + 0.5*family
    # inv-logit function, applied to the whole array at once
    pr = 1 / (1 + np.exp(-z))
    canada = np.random.binomial(1, p=pr, size=n)
    # fake family into factor
    family = np.where(family == 0, 'No', 'Yes')
    return canada, beer, warm, family

In [None]:
# Fix NumPy's global seed so the simulated sample is the same on every run.
np.random.seed(42)
canada, beer, warm, family = create_data()

# Collect the four simulated series into one frame, one column per series.
df = pd.DataFrame(dict(canada=canada, beer=beer, warm=warm, family=family))

In [None]:
# Preview the first ten rows of the simulated data.
df.head(10)

In [None]:
# Write the frame to 'canada.csv' (without the index column); the modelling
# cells below read the data back from this file.
df.to_csv('canada.csv', index=False)

# Python Logistic Regression

In [None]:
# 0 - load modules

import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split

# 1 - load data

df = pd.read_csv('canada.csv')

# X - dummy-fy

df_dummy = pd.get_dummies(df, drop_first=True)

# X - create design matrix
# Newer pandas versions return boolean dummy columns; cast to float so the
# design matrix handed to statsmodels is purely numeric.

X = df_dummy[['beer', 'warm', 'family_Yes']].astype(float)
y = df_dummy['canada']

# X - add intercept
# .assign() builds a new frame instead of writing a column into a slice of
# df_dummy, which would trigger pandas' SettingWithCopyWarning.

X = X.assign(Intercept=1.0)

# 2 - test/train split

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=0)

# 3 - model

mod = sm.Logit(y_train, X_train)

# X - fit model ??

result = mod.fit()

# 4 - peek

result.summary()

# Patsy Logistic Regression

In [None]:
# 0 - load modules
# Everything this cell needs is imported here, so it does not depend on
# names that happen to be left over from an earlier cell.

from patsy import dmatrices, build_design_matrices
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix, roc_auc_score)
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.utils import resample

# 1 - load data

df = pd.read_csv('canada.csv')

# X - build design matrix with patsy

y, X = dmatrices('canada ~ beer + warm + family', df, return_type='dataframe')
y = np.ravel(y)

# 2 - test/train split

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=0)

# 3 - model
# The patsy design matrix already carries an Intercept column, so
# scikit-learn is told not to add a second one.

model = LogisticRegression(fit_intercept=False)
model.fit(X_train, y_train)

# 4A - peek

predicted = model.predict(X_test)
probs = model.predict_proba(X_test)

print(accuracy_score(y_test, predicted))
print(roc_auc_score(y_test, probs[:, 1]))

# 4B - peek
# Cross-validate the same specification as `model` (no extra intercept).

scores = cross_val_score(
    LogisticRegression(fit_intercept=False), X, y, scoring='accuracy', cv=10)
print(scores)
print(scores.mean())

# 4C - peek
# Reuses the test-set predictions computed in 4A.

expected = y_test

print(classification_report(expected, predicted))
print(confusion_matrix(expected, predicted))

# Bonus

In [None]:
# Inputs for the separation plot: the held-out labels and the second column
# of the fitted model's predict_proba output on the test set.
y_true = y_test
y_pred = model.predict_proba(X_test)[:, 1]

def separation_plot(y_true, y_pred):
    """Draw a separation plot for binary outcomes.

    Observations are sorted by predicted value and drawn left to right as
    unit-height bars (blue where ``y_true`` is 0, red otherwise), with the
    predicted values overlaid as a black line and a triangle marker placed
    at x = sum of the predictions.

    Parameters
    ----------
    y_true : array-like
        Observed outcomes; cast to int64 before colouring.
    y_pred : array-like
        Predicted values, one per observation in ``y_true``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # prepare data
    sp = (
        pd.DataFrame({'y_true': y_true, 'y_pred': y_pred})
        .sort_values('y_pred')
        .reset_index(drop=True)
    )
    # One x slot per observation. A dense rank would give tied predictions
    # the same slot and draw their bars on top of each other.
    sp['order'] = np.arange(1, len(sp) + 1)
    sp['height'] = 1
    sp['y_true'] = sp['y_true'].astype(np.int64)
    sp['color'] = ['b' if i == 0 else 'r' for i in sp['y_true']]
    # plot data
    # Size this figure only; writing to plt.rcParams would change the
    # default size of every later figure in the session.
    plt.figure(figsize=(12, 4))
    plt.bar(sp['order'], sp['height'], color=sp['color'],
        alpha=0.75, width=1.01, antialiased=True)
    plt.plot(sp['order'], sp['y_pred'], c='black')
    # NOTE(review): the marker sits at sum(y_pred) counted from the left;
    # confirm this is the intended position for the expected-events marker.
    plt.scatter(sp['y_pred'].sum(), 0.01, c='black', s=100, marker="^")
    plt.xticks([])
    plt.yticks([0, 0.5, 1])
    plt.ylabel('Predicted Value')
    plt.show()

In [None]:
# Held-out labels and the second predict_proba column on the test set.
y_true = y_test
y_pred = model.predict_proba(X_test)[:, 1]

# Draw the separation plot for the test-set predictions.
separation_plot(y_true, y_pred)

# Patsy Continued

In [None]:
design_info = X.design_info

def patsy_predict(design_info, model, new_data={}):
    """Score one new observation with a fitted classifier.

    ``new_data`` (a mapping of column name to scalar value) is turned into
    a one-row DataFrame, printed, converted to a design matrix with
    ``design_info``, and passed to ``model.predict_proba``. The value
    returned is the second-column probability for that single row.
    """
    row = pd.DataFrame(new_data, index=[0])
    print(row)
    # Exactly one design matrix comes back for the single design_info given.
    [design_row] = build_design_matrices([design_info], row)
    second_column = model.predict_proba(design_row)[:, 1]
    return second_column[0]

In [None]:
# Score a single hypothetical observation with family == 'Yes'.
patsy_predict(design_info, model, {'beer': 1.5, 'warm': -0.5, 'family': 'Yes'})

In [None]:
# Score a single hypothetical observation with family == 'No'.
patsy_predict(design_info, model, {'beer': -0.9, 'warm': 0.3, 'family': 'No'})

# Patsy Extended

In [None]:
import patsy

def easy_scatter(formula, data={}):
    """Scatter the first two design-matrix columns of a patsy formula.

    ' - 1' is appended to ``formula`` before the matrices are built. The
    first column of the resulting predictor frame goes on the x axis, the
    second on the y axis, and points are coloured by the flattened
    response. Returns whatever ``plt.scatter`` returns.
    """
    response, predictors = patsy.dmatrices(
        formula + ' - 1', data, return_type='dataframe')
    x_name = predictors.columns[0]
    y_name = predictors.columns[1]
    return plt.scatter(
        predictors[x_name], predictors[y_name], c=np.ravel(response), alpha=0.5)

easy_scatter('canada ~ beer + warm', data=df)

# PyMC3

In [None]:
import pymc3 as pm

# NOTE: this rebinds `model`; the scikit-learn estimator that used this name
# in earlier cells is no longer reachable through it after this cell runs.
model = pm.Model()

with model:
    pm.glm.GLM.from_formula(
        'canada ~ beer + warm + family',
        data=df, family=pm.glm.families.Binomial())
    start = pm.find_MAP()  # Use Maximum A Posteriori optimization as initial value for MCMC
    step = pm.NUTS(scaling=start)  # Instantiate MCMC sampling algorithm
    # Draw 2000 posterior samples using the NUTS step; the fixed seed makes
    # a re-run reproduce the same trace.
    trace = pm.sample(2000, step, progressbar=True, random_seed=42)

# traceplot creates its own figure, so the size is passed to it directly
# (a separate plt.figure() call would only leave an empty figure behind).
# The first 100 draws are dropped from the plot.
pm.traceplot(trace[100:], figsize=(7, 7))
plt.tight_layout();

# Later PyMC3 releases expose df_summary as pm.summary; use whichever exists.
getattr(pm, 'df_summary', pm.summary)(trace)