In [None]:
# Standard library
import copy
import datetime
import itertools
import json
import os
import sys
import time

# Third-party
import arviz as az
import cx_Oracle
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymc3 as pm
import pyspark.sql.functions as F
import pyspark.sql.types as Types
import seaborn as sns
import statsmodels.formula.api as smf
from pyspark.sql.window import Window
from sklearn import preprocessing
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_recall_curve,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split

In [None]:
# DISPLAY DATA
# Bar chart of how many rows fall into each value of the 'Interesting' column,
# with every bar labelled by its percentage share of all rows.
total = len(data)
plt.figure(figsize=(7, 5))
g = sns.countplot(x='Interesting', data=data)
g.set_ylabel('Count', fontsize=14)
for p in g.patches:
    height = p.get_height()
    # Centre the label horizontally on the bar and place it just above the top.
    g.text(p.get_x() + p.get_width() / 2.,
           height + 1.5,
           '{:1.2f}%'.format(height / total * 100),
           ha="center", fontsize=14, fontweight='bold')
# Add vertical headroom once, after all labels are placed, then render a
# single figure (doing this inside the loop would show a partial plot per bar).
plt.margins(y=0.1)
plt.show()

In [None]:
# Correlation of every non-target column with the 'Interesting' column.
# Computed once and reused for both the chart and the printed table.
target_corr = data.drop('Interesting', axis=1).corrwith(data.Interesting)

# Colour palette sized from the column count of `data` (target included), so
# there is always at least one colour per plotted bar.
n_fts = len(data.columns)
colors = cm.rainbow(np.linspace(0, 1, n_fts))

# Horizontal bars sorted ascending, so the strongest positive correlation is on top.
target_corr.sort_values(ascending=True).plot(kind='barh', color=colors, figsize=(12, 6))
plt.title('Correlation to Target (Interesting)')
plt.show()

# Same values as text, strongest positive correlation first.
print('\n', target_corr.sort_values(ascending=False))

In [None]:
# Stratified train/test split, followed by a side-by-side check that the
# class balance looks the same in both parts.
# Everything except the last column is used as features and the last column
# as the label -- assumes the target is the final column of `data`; TODO confirm.
X = data.iloc[:, 0:-1]
y = data.iloc[:, -1]

# 5% of the rows are held out for testing; only the first generated split is consumed.
sss = StratifiedShuffleSplit(n_splits=2, test_size=0.05, random_state=0)
train_index, test_index = next(sss.split(X, y))

fig, axes = plt.subplots(1, 2, figsize=(10, 5))
for split, title, ax in zip([train_index, test_index],
                            ['Train split', 'Test split'],
                            axes.flatten()):
    # `split` holds positional row numbers, so select with .iloc; plain
    # y[split] would do a label lookup and break on a non-default index.
    sns.countplot(x=y.iloc[split], ax=ax).set_title(title)

In [None]:
# Columns of `data` used as predictors. Each name is also the name of the
# corresponding weight variable in the model (and therefore in the trace).
feature_names = [f'feature{i}' for i in range(1, 11)]

with pm.Model() as binomial_regression_model:
    # Define priors: one weight per feature, truncated-normal on [0, 1].
    betas = [
        pm.TruncatedNormal(name, mu=1, sd=0.5, lower=0, upper=1)
        for name in feature_names
    ]

    # Construct the model formula: a weighted average of the feature columns,
    # i.e. the weighted sum divided by the sum of the weights.
    weighted_sum = sum(
        beta * data[name].values for beta, name in zip(betas, feature_names)
    )
    formula = weighted_sum / sum(betas)

    # Compute raw scores as deterministic purely for tracking
    raw = pm.Deterministic('raw', formula)
    # Transform the raw scores with the inverse-logit link function
    p = pm.Deterministic('p', pm.math.invlogit(raw))
    # Likelihood of the observed binary outcome
    outcome = pm.Bernoulli('pred_interesting', p=p, observed=data.Interesting)

In [None]:
# MC SAMPLING
# Sample from the posterior of `binomial_regression_model`: 2 chains on 2
# cores, 2000 tuning steps and 100 retained draws per chain.
# NOTE(review): 100 draws per chain is small next to 2000 tuning steps --
# confirm this is intended.
with binomial_regression_model:
    # Fixed seed so a Restart & Run All reproduces the same trace.
    trace = pm.sample(tune=2000, draws=100, chains=2, init='adapt_diag',
                      cores=2, random_seed=0)
    

In [None]:
# Check the trace of the sampling run: plot the contents of `trace` with
# PyMC3's trace-plot helper so the sampled values can be inspected visually.
pm.plot_trace(trace)

In [None]:
# RANK THE FEATURES
# Order the ten feature weights by their posterior mean, largest first.
feature_names = [f'feature{i}' for i in range(1, 11)]
tt = pm.summary(trace)
# Single .loc call selects the feature rows and the 'mean' column together
# (no chained indexing).
tt.loc[feature_names, 'mean'].sort_values(ascending=False)

In [None]:
# Show how certain the posterior is about each feature weight (95% intervals, printed in the next cell)

In [None]:
# Print, for every feature weight, the 2.5th and 97.5th percentiles of its
# sampled values, i.e. the central 95% interval of the draws in `trace`.
# NOTE(review): the label says "Odds Ratio" but the bounds are the raw sampled
# weights (nothing is exponentiated here) -- confirm the intended wording.
feature_names = [f'feature{i}' for i in range(1, 11)]
for name in feature_names:
    b = trace[name]
    lb, ub = np.percentile(b, [2.5, 97.5])
    print(f'P({lb:.3f} < Odds Ratio - {name} < {ub:.3f}) = 0.95')
