In [None]:
%load_ext autoreload
%autoreload
import numpy as np
import pandas as pd
# import scipy as sp
# from sklearn import preprocessing, feature_extraction, feature_selection, model_selection, metrics
# import xgboost as xgb
# import matplotlib.pyplot as plt

In [None]:
# set_project_dir is a project-local helper from the utils module (not shown here).
# NOTE(review): presumably it switches the working directory to the project folder so
# that the relative 'data/in/...' paths used below resolve — confirm in utils.
from utils import set_project_dir
set_project_dir('project_3')

In [None]:
# Load the train and test sets; the paths are relative to the current working directory.
train = pd.read_csv('data/in/train.csv')
test = pd.read_csv('data/in/test.csv')
# Name of the target (class) column used throughout the notebook.
y_column = 'Survived'

# bayesian statistics
posterior probability:
\begin{equation}P(y|x_i) = \frac{P(x_i|y)P(y)}{P(x_i)}\end{equation}
- class $y$, features $x_i$
- $P(y)$ class prior probability
- $P(x_i)$ predictor prior probability
- $P(x_i|y)$ likelihood

## categorical features
### basics
example:

- $x_i$ = Pclass
- frequency table with $n_{kl}$ for $x_i=k$ (rows) and $y=l$ (columns):

In [None]:
# Count table n_kl: one row per Pclass value, one column per class outcome;
# margins=True appends the 'All' row/column holding the totals.
pd.crosstab(train['Pclass'], train[y_column], margins=True)

- $N = \sum_{kl} n_{kl}$
- $N_k = \sum_l n_{kl}$
- $N_l = \sum_k n_{kl}$
- likelihood $P(x_i=k | y=l) = n_{kl}/N_l$
- class prior $P(y=l) = N_l/N$
- predictor prior $P(x_i=k) = N_k/N$

In [None]:
# normalize='columns' divides every column by its column total, so class column l holds
# the likelihood P(x_i=k | y=l); the 'All' margin column holds the predictor prior P(x_i=k).
crosstab = pd.crosstab(train['Pclass'], train[y_column], normalize='columns', margins=True)
predictor_prior = crosstab['All']
likelihood = crosstab[[0, 1]]

# normalize='index' divides every row by its row total; the 'All' margin row then holds
# the class prior P(y=l).
crosstab = pd.crosstab(train['Pclass'], train[y_column], normalize='index', margins=True)
class_prior = crosstab.loc['All']

- unnormalized posterior: $\tilde p_{kl} := P(x_i|y) P(y)$:

In [None]:
# class_prior is a Series indexed by class label, so the product is aligned on the
# columns of likelihood: class column l is scaled by P(y=l).
p = likelihood * class_prior
p

- either use predictor prior $P(x_i)$ for normalization:

In [None]:
# Divide every row k of the unnormalized posterior by the predictor prior P(x_i=k).
posterior = p.div(predictor_prior, axis=0)
posterior

- or normalize by hand: $\tilde p_{kl} / \sum_l \tilde p_{kl}$:

In [None]:
# The row sums of the unnormalized posterior are the normalization constants.
norm = p.sum(axis=1)
posterior = p.div(norm, axis=0)
posterior

#### predict

In [None]:
# posterior is indexed by Pclass; the join looks up the posterior row of each passenger.
predict = train[['Pclass', 'Survived']]
predict = predict.join(posterior, on='Pclass')
# Predicted class = label (0 or 1) of the column with the larger posterior probability.
predict['Survived_pred'] = predict[[0,1]].idxmax(axis=1)
predict.head()

#### validate

In [None]:
# Confusion matrix: rows = actual class, columns = predicted class.
# margins expects a boolean (the string 'all' only worked because it is truthy);
# True appends the 'All' row/column with the totals.
pred_counts = pd.crosstab(predict['Survived'], predict['Survived_pred'], margins=True)
pred_counts

In [None]:
# pred_counts: rows are the actual classes, columns the predicted classes.
# accuracy = correct predictions / all predictions
accuracy = (pred_counts.loc[1,1] + pred_counts.loc[0,0]) / pred_counts.loc['All', 'All']
# precision = true positives / all predicted positives
precision = pred_counts.loc[1,1] / pred_counts.loc['All',1]
# recall = true positives / all actual positives
recall = pred_counts.loc[1,1] / pred_counts.loc[1,'All']
accuracy, precision, recall

### multiple features

In [None]:
x_columns = ['Pclass', 'Embarked']
# One entry per feature, keyed by the feature's column name.
predictor_prior = dict()
likelihood = dict()

# Per feature: the column-normalized crosstab holds the likelihood P(x_i | y) in the
# class columns and the predictor prior P(x_i) in the 'All' margin column.
for x_i in x_columns:
    crosstab = pd.crosstab(train[x_i], train[y_column], normalize='columns', margins=True)
    predictor_prior[x_i] = crosstab['All']
    likelihood[x_i] = crosstab[[0, 1]]

# class prior does not depend on features, so is the same for all
# (it is a single Series, so no per-feature dict is created for it)
crosstab = pd.crosstab(train[x_columns[0]], train[y_column], normalize='index', margins=True)
class_prior = crosstab.loc['All']

$\tilde p_{kl} = \left[\prod_i P(x_i|y) \right] P(y)$:

In [None]:
def df_crossjoin(df1, df2, **kwargs):
    """Return the Cartesian product of the rows of ``df1`` and ``df2``.

    Both frames are merged on a temporary constant key column, so every row of
    ``df1`` is paired with every row of ``df2``. Column labels that occur in both
    frames are disambiguated with the suffixes ``''`` (``df1``) and ``'_2'``
    (``df2``). The result is indexed by the product of the two input indexes.

    The inputs are left untouched: the key column is added to copies, so nothing
    is written to the caller's frames, even if the merge raises.

    Parameters
    ----------
    df1, df2 : pd.DataFrame
        Frames whose rows are combined.
    **kwargs
        Forwarded to ``pd.merge``.

    Returns
    -------
    pd.DataFrame
        One row per (df1 row, df2 row) pair, with a two-level ``MultiIndex``.
    """
    key = '_tmpkey'
    # assign returns new frames; df1/df2 themselves never get the helper column
    left = df1.assign(**{key: 1})
    right = df2.assign(**{key: 1})

    res = pd.merge(left, right, on=key, suffixes=('', '_2'), **kwargs).drop(key, axis=1)
    res.index = pd.MultiIndex.from_product((df1.index, df2.index))

    return res

# Pair every likelihood row of the first feature with every row of the second feature.
temp = df_crossjoin(likelihood[x_columns[0]], likelihood[x_columns[1]])
# After the merge the class columns are addressed by the string labels '0'/'1' (first
# feature) and '0_2'/'1_2' (second feature); multiply the two likelihoods per class.
temp['0'] = temp['0'] * temp['0_2']
temp['1'] = temp['1'] * temp['1_2']
# Drop the second-feature columns, turn the labels back into ints so they line up with
# the index of class_prior, and multiply by the class prior P(y).
p = temp.drop(['0_2', '1_2'], axis=1).rename(lambda x: int(x), axis=1) * class_prior
p

normalize each row by hand:

In [None]:
# The row sums over the class columns are the per-combination normalization constants.
norm = p.sum(axis=1)
posterior = p.div(norm, axis=0)
posterior

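normalization with the product of the predictor priors does not work: the features are only assumed to be independent given the class, so in general $P(x_1, x_2) \neq P(x_1) P(x_2)$ and the rows below do not sum to 1: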

In [None]:
# Cross join the two predictor priors and multiply them per feature-value combination.
temp = df_crossjoin(pd.DataFrame(predictor_prior[x_columns[0]]), pd.DataFrame(predictor_prior[x_columns[1]]))
temp['All'] = temp['All'] * temp['All_2']
# Copy the product into the second column so there is one divisor column per class.
temp['All_2'] = temp['All']
# Relabel the columns with the class labels so they align with the columns of p.
temp.rename({'All': 0, 'All_2': 1}, axis=1, inplace=True)
p / temp

#### predict

In [None]:
# posterior is indexed by the combination of the feature values in x_columns; the join
# looks up the posterior row of each passenger.
predict = train[x_columns + [y_column]]
predict = predict.join(posterior, on=x_columns)
# NOTE(review): rows whose feature combination has no posterior row (e.g. a missing
# feature value) presumably end up with NaN in both class columns — confirm how idxmax
# should treat them.
predict['Survived_pred'] = predict[[0,1]].idxmax(axis=1)
predict.head()

#### validate

In [None]:
# Confusion matrix: rows = actual class, columns = predicted class.
# margins expects a boolean (the string 'all' only worked because it is truthy);
# True appends the 'All' row/column with the totals.
pred_counts = pd.crosstab(predict['Survived'], predict['Survived_pred'], margins=True)
pred_counts

In [None]:
# pred_counts: rows are the actual classes, columns the predicted classes.
# accuracy = correct predictions / all predictions
accuracy = (pred_counts.loc[1,1] + pred_counts.loc[0,0]) / pred_counts.loc['All', 'All']
# precision = true positives / all predicted positives
precision = pred_counts.loc[1,1] / pred_counts.loc['All',1]
# recall = true positives / all actual positives
recall = pred_counts.loc[1,1] / pred_counts.loc[1,'All']
accuracy, precision, recall

## continuous features

### basics
two (three) possibilities:
1. use a normal distribution
    - best, when $P(x_i|y)$ is normal distributed
2. discretize continuous features
    - if distribution not normal
3. use a kernel density estimator
    - best, but introduces KDE bandwidth as new hyperparameter

### gaussian
apparently, the likelihoods are not normally distributed, but we continue anyway:

In [None]:
# Continuous feature examined in this section.
x_column = 'Age'
# One histogram of the feature per class outcome (y=0 first, then y=1).
train[train[y_column]==0].hist(x_column)
train[train[y_column]==1].hist(x_column)

- compute mean and std of $x_i$ for each class outcome $y=l$
(clean outliers before, because mean is very sensitive to them):

In [None]:
# Per class label: mean and standard deviation of the feature over the rows of that class.
stats_per_class = {}
for label in [0, 1]:
    values = train.loc[train[y_column] == label, x_column]
    stats_per_class[label] = [values.mean(), values.std()]

# One column per class, rows 'mean' and 'std' (in that order).
params = pd.DataFrame(stats_per_class, index=['mean', 'std'])
params

#### predict
- likelihood $P(x_i|y=l) \sim N(\mu^i_l, \sigma^i_l)$ is continuous and Gaussian distributed
- compute $P(x_i=k|y=l)$ for every observation k, finally multiply by class prior
- normalize rows individually

In [None]:
def normal(x, mean, std):
    """Evaluate the density of the normal distribution N(mean, std) at x.

    x may be a scalar or an array-like that supports arithmetic (e.g. an ndarray
    or a pandas Series); the result has the same shape as x.
    """
    coefficient = 1/(np.sqrt(2*np.pi)*std)
    exponent = -(x-mean)**2/(2*std**2)
    return coefficient * np.exp(exponent)

def predict_gaussian(series, params, class_prior):
    """Posterior class probabilities for one continuous feature with a Gaussian likelihood.

    The function has its own name (instead of ``predict``) because this cell binds
    ``predict`` to the resulting DataFrame; sharing the name made the cell fail on
    a second run ('DataFrame' object is not callable).

    Parameters
    ----------
    series : pd.Series
        Observed feature values, one per observation.
    params : pd.DataFrame
        One column per class; the column values are passed positionally to
        ``normal`` as (mean, std).
    class_prior : pd.Series
        Class prior, indexed by the same class labels as ``params.columns``.

    Returns
    -------
    pd.DataFrame
        One column per class, indexed like ``series``; each row is divided by
        its row sum.
    """
    pred = pd.DataFrame(1, columns=params.columns, index=series.index)
    for col in params.columns:
        # likelihood of every observation under the Gaussian of class `col`
        pred[col] = normal(series, *list(params[col]))
    # aligned on the columns: class column l is scaled by P(y=l)
    pred *= class_prior

    norm = pred.sum(axis=1)
    return pred.div(norm, axis=0)

# Class prior P(y): 'All' margin row of the row-normalized crosstab.
# NOTE(review): the crosstab is built on x_column, so rows with a missing x_column
# value presumably do not contribute to this prior — confirm that is intended.
crosstab = pd.crosstab(train[x_column], train[y_column], normalize='index', margins=True)
class_prior = crosstab.loc['All']
# The result keeps the name `predict` because the following cells read it.
predict = predict_gaussian(train[x_column], params, class_prior)
predict[y_column] = train[y_column]
# Predicted class = label (0 or 1) of the column with the larger posterior probability.
predict['Survived_pred'] = predict[[0,1]].idxmax(axis=1)
predict.head()

#### validate

In [None]:
# Confusion matrix: rows = actual class, columns = predicted class.
# margins expects a boolean (the string 'all' only worked because it is truthy);
# True appends the 'All' row/column with the totals.
pred_counts = pd.crosstab(predict['Survived'], predict['Survived_pred'], margins=True)
pred_counts

In [None]:
# pred_counts: rows are the actual classes, columns the predicted classes.
# accuracy = correct predictions / all predictions
accuracy = (pred_counts.loc[1,1] + pred_counts.loc[0,0]) / pred_counts.loc['All', 'All']
# precision = true positives / all predicted positives
precision = pred_counts.loc[1,1] / pred_counts.loc['All',1]
# recall = true positives / all actual positives
recall = pred_counts.loc[1,1] / pred_counts.loc[1,'All']
accuracy, precision, recall

# naive bayes with sklearn

In [None]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB

## categorical

In [None]:
from sklearn.preprocessing import LabelEncoder

we first have to transform our dataset into a contingency table (which is: counts for each feature outcome given the class outcome)

In [None]:
# Index the target by the (Pclass, Embarked) pair. set_index is chained instead of
# applied with inplace=True, so no column selection of `train` is mutated in place
# (which may trigger pandas' SettingWithCopyWarning).
train_transf = train[['Pclass', 'Embarked', 'Survived']].set_index(['Pclass', 'Embarked'])
# Counts of every (Pclass, Embarked) combination per class outcome.
pd.crosstab(train_transf.index, train_transf['Survived'])

In [None]:
# One-hot encode the categorical features and sum the indicator columns per class:
# this yields the counts of each feature outcome given the class outcome.
# (The unfinished `pd.DataFrame(index=)` statement was removed: it is a syntax error
# that prevented the whole cell from running.)
pd.get_dummies(train[['Survived', 'Pclass', 'Embarked']], columns=['Pclass', 'Embarked']).groupby('Survived').sum()

In [None]:
x_columns = ['Pclass']
y_column = 'Survived'

# Keep only rows where the features and the target are all present.
train_transf = train[x_columns+[y_column]].copy().dropna()
# Shift Pclass down by one.
train_transf['Pclass'] = train_transf['Pclass'] - 1
le = LabelEncoder()
# Only relevant when 'Embarked' is added to x_columns: encode its labels as integers.
if 'Embarked' in x_columns:
    le.fit(list(train_transf['Embarked'].unique()))
    train_transf['Embarked'] = le.transform(train_transf['Embarked'].fillna('nan'))

# Convert once and use the same plain arrays for fitting and for predicting. Fitting
# on an ndarray but predicting on a DataFrame makes scikit-learn complain about
# feature names that were not present during fit.
features = np.asarray(train_transf[x_columns])
labels = np.asarray(train_transf[y_column])

# NOTE(review): class_prior is whatever an earlier cell left in the namespace —
# confirm it is the prior intended for this model.
model = MultinomialNB(alpha=0, fit_prior=True, class_prior=np.array(class_prior))
# model.fit(features, labels)
model.partial_fit(features, labels,
                  classes=np.asarray(train_transf[y_column].unique()))
model.predict_proba(features)

# model.score(features, labels)

In [None]:
# Display the hand-computed predictions next to the sklearn output above.
# NOTE(review): `predict` holds the result of the most recently executed hand-written
# "predict" cell — confirm it was computed from the same feature as this section.
predict

## gaussian

In [None]:
x_columns = ['Age']
# Keep only rows where both the feature and the target are present.
train_transf = train[x_columns+[y_column]].copy().dropna()
# temp is only referenced by the commented-out normalization below.
temp = train_transf[x_columns]
# normalize feature columns (apparently, this is done in GaussianNB already)
# NOTE(review): GaussianNB presumably estimates a per-class mean and variance of each
# feature itself, which would make this standardization unnecessary — confirm against
# the scikit-learn documentation.
# train_transf[x_columns] = (train_transf[x_columns] - temp.mean())/temp.std()
model = GaussianNB()
model.fit(train_transf[x_columns], train_transf[y_column])
# Show the predicted class probabilities of the first 10 rows only.
model.predict_proba(train_transf[x_columns])[:10]
# model.score(train_transf[x_columns], train_transf[y_column])

In [None]:
# Display the hand-computed predictions next to the sklearn output above.
# NOTE(review): `predict` holds the result of the most recently executed hand-written
# "predict" cell — confirm it was computed from the same feature as this section.
predict

## minimal example sklearn categorical

In [None]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB, BernoulliNB

train = pd.read_csv('data/in/train.csv')
# Defined here so the example does not depend on variables from earlier cells.
y_column = 'Survived'

# One indicator column per Pclass value, one row per passenger.
X_test = np.asarray(pd.get_dummies(train['Pclass']))
# y_test = np.asarray(train['Survived'])

# Contingency table used as training data: one row per class outcome, one column per
# Pclass value, entries are counts.
X = np.array(pd.crosstab(train[y_column], train['Pclass']))
y = np.array([0, 1])

# Class prior P(y), computed from the same data as the contingency table ('All' margin
# row of the row-normalized crosstab). It is passed explicitly because X contains only
# one training row per class.
class_prior = pd.crosstab(train['Pclass'], train[y_column], normalize='index', margins=True).loc['All']

clf = MultinomialNB(alpha=0.0000000001, fit_prior=True, class_prior=np.array(class_prior))
clf.fit(X, y)
clf.predict_proba(X_test)

In [None]:
# Display the hand-computed predictions next to the sklearn output above.
# NOTE(review): `predict` holds the result of the most recently executed hand-written
# "predict" cell — confirm it was computed from the same feature as this section.
predict