In [1]:
import pandas as pd
import math
import numpy as np

In [38]:
df = pd.read_csv("data/original_dataset.csv")
msk = np.random.rand(len(df)) < 0.8
train = df[msk]
test = df[~msk]

KeyError: 'price'

In [4]:
def norm(x, mean, std):
    return (1 / (math.sqrt(2 * math.pi) * std)) * math.exp(-(math.pow(x - mean, 2))/(2 * math.pow(std, 2)))

In [5]:
def generate_bayesian_model(df: pd.DataFrame, target):
    model = {'classes': df[target].unique(), 'target': target}
    for c in model['classes']:
        model[c] = {}
        for col in df.columns:
            if col != target:
                model[c][col] = {}
                model[c][col]['mean']  = df.where(df[target] == c)[col].mean()
                model[c][col]['std'] = df.where(df[target] == c)[col].std()
    return model

In [6]:
def compute_model(model, row):
    target = None
    p = -1
    for c in model['classes']:
        val = 1
        for col in model[c]:
            val = val * norm(row[col], model[c][col]['mean'], model[c][col]['std'])
        if val > p:
            target = c
            p = val
    row['prediction'] = target
    row['accuracy'] = p
    return row


In [7]:
def predict(model, df: pd.DataFrame):
    return df.apply(lambda x: compute_model(model, x), axis=1, result_type='expand')

In [8]:
model = generate_bayesian_model(train, "stabf")

In [9]:
result = predict(model, test)

In [10]:
result

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf,prediction,accuracy
10,5.930110,6.730873,6.245138,0.533288,2.327092,-0.702501,-1.116920,-0.507671,0.239816,0.563110,0.164461,0.753701,-0.028411,stable,stable,1.297865e-05
23,5.973186,6.043118,5.996045,1.076940,3.828218,-0.989549,-1.079026,-1.759643,0.350475,0.128154,0.548861,0.503974,-0.023013,stable,stable,1.021933e-03
28,1.807819,7.020356,6.676929,5.169648,3.672288,-1.188471,-0.819164,-1.664653,0.127658,0.160127,0.171724,0.838404,-0.008638,stable,stable,6.939109e-05
31,3.432549,0.944068,3.324771,7.735356,4.134624,-1.495949,-1.398690,-1.239985,0.408974,0.392226,0.330909,0.655999,-0.022913,stable,stable,2.142907e-03
32,4.780713,8.453343,2.755188,4.249789,4.112105,-1.154158,-1.158403,-1.799544,0.063362,0.987507,0.791912,0.292634,0.021234,unstable,unstable,3.099247e-05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9976,7.063097,3.897790,3.667961,7.953410,4.632592,-1.146609,-1.850362,-1.635621,0.537054,0.464419,0.565545,0.164245,0.014090,unstable,unstable,1.551174e-04
9979,7.512814,9.616251,8.045889,2.200809,2.377325,-0.688485,-0.703106,-0.985733,0.118933,0.280216,0.932764,0.967001,0.018994,unstable,unstable,4.836866e-07
9980,4.141337,1.112868,8.400888,1.869624,3.671563,-0.699052,-1.513896,-1.458615,0.081811,0.094963,0.662959,0.169682,-0.032192,stable,stable,4.782983e-05
9987,3.176332,5.101491,8.134258,4.985496,3.031084,-0.957564,-1.102893,-0.970628,0.527206,0.774756,0.180754,0.209837,0.010610,unstable,unstable,1.443248e-04


In [11]:
result.where(result[model['target']] == 'stable').dropna().where(result[model['target']] == result["prediction"]).count()

tau1          728
tau2          728
tau3          728
tau4          728
p1            728
p2            728
p3            728
p4            728
g1            728
g2            728
g3            728
g4            728
stab          728
stabf         728
prediction    728
accuracy      728
dtype: int64

In [12]:
print("=== Detailed Accuracy By Class ===")
count = result['prediction'].count()
for c in model['classes']:

    print("Class: " + c)
    df_c = result.where(result[model['target']] == c).dropna()
    df_nc = result.where(result[model['target']] != c).dropna()
    tp_rate = df_c.where(result[model['target']] == result["prediction"])['prediction'].count() / df_c['prediction'].count()
    tn_rate = df_nc.where(result[model['target']] == result["prediction"])['prediction'].count() / df_nc['prediction'].count()
    fp_rate =  df_nc.where(c == result["prediction"])['prediction'].count() / df_nc['prediction'].count()
    fn_rate =  df_c.where(c != result["prediction"])['prediction'].count() / df_c['prediction'].count()
    precision = tp_rate / (tp_rate + fp_rate)
    recall = tp_rate / (tp_rate + fn_rate)
    f_score = 2 * (precision * recall) / (precision + recall)
    mcc = (tp_rate * tn_rate - fp_rate * fn_rate) / math.sqrt((tp_rate + fp_rate) * (tp_rate + fn_rate) * (tn_rate + fp_rate) * (tn_rate + fn_rate))
    print("TP Rate: " + str(tp_rate))
    print("FP Rate: " + str(fp_rate))
    print("Precision: " + str(precision))
    print("Recall: " + str(recall))
    print("F-Score: " + str(f_score))
    print("MCC: " + str(mcc))


    print()

=== Detailed Accuracy By Class ===
Class: unstable
TP Rate: 0.958139534883721
FP Rate: 0.010869565217391304
Precision: 0.9887828037773255
Recall: 0.958139534883721
F-Score: 0.9732200169461062
MCC: 0.9477251936344797

Class: stable
TP Rate: 0.9891304347826086
FP Rate: 0.04186046511627907
Precision: 0.9593978325896141
Recall: 0.9891304347826086
F-Score: 0.9740372887262589
MCC: 0.9477251936344797



In [30]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB


y_train = train["stabf"]
y_test = test["stabf"]

x_train = train.drop(columns=["stabf"])
x_test= test.drop(columns=["stabf"])

gnb = GaussianNB()

y_pred = gnb.fit(x_train, y_train).predict(x_test)
print("Numberbof mislabeled points out of a total %d points : %d" % (x_test.shape[0], (y_test != y_pred).sum()))

Numberbof mislabeled points out of a total 2026 points : 46


In [34]:
from sklearn.metrics import precision_recall_fscore_support
precision_recall_fscore_support(y_test, y_pred, average=None)

(array([0.96495957, 0.98442368]),
 array([0.97282609, 0.97984496]),
 array([0.96887686, 0.98212898]),
 array([ 736, 1290], dtype=int64))