In [2]:
import numpy as np
import pandas as pd

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

In [3]:
cancer = load_breast_cancer()
all_features = pd.DataFrame(data=cancer.data, columns=cancer.feature_names)
all_target = pd.Series(cancer.target, name="diagnosis")
dataset = pd.concat([all_features.iloc[:, 0:4], all_target], axis=1)
train_data, test_data = train_test_split(dataset, test_size=0.2, random_state=41)
train_features = train_data.iloc[:, :-1]
train_target = train_data.iloc[:, -1]
test_features = test_data.iloc[:, :-1]
test_target = test_data.iloc[:, -1]

In [48]:
labels_ = train_target.unique()
for label in labels_:
    _mean = train_features[label==train_target]
    print(label)
    print(_mean.mean())

1
mean radius        12.158343
mean texture       18.015901
mean perimeter     78.143746
mean area         463.924382
dtype: float64
0
mean radius        17.245523
mean texture       21.459767
mean perimeter    113.842616
mean area         952.555814
dtype: float64


In [4]:
from sklearn import datasets
X, y = datasets.make_classification(n_samples = 1000, n_features = 10, n_classes = 2, random_state = 123)
X

array([[ 0.24063119, -0.07970884, -0.05313268, ..., -0.09709308,
         0.06994683,  0.11660277],
       [ 0.75425016, -0.937854  ,  0.21947276, ...,  0.19311463,
        -2.27886416,  0.65102942],
       [ 0.9584009 , -1.31841143,  1.15350536, ...,  0.0423321 ,
         0.79249125,  0.24144309],
       ...,
       [ 0.72025002,  1.47831033, -1.28082896, ...,  1.68993071,
        -1.83623101,  0.11659189],
       [ 0.26260661, -0.83670259,  0.61205712, ..., -1.4899733 ,
         0.91494036,  1.58572634],
       [-0.10781378, -0.94865427,  0.17073559, ...,  0.24775547,
         1.99581849, -0.84668649]])

In [30]:
X[0==0].shape

(1, 1000, 10)

In [27]:
for c in np.unique(y):
    X_c = X[c==y]
    print(X_c.shape)


(502, 10)
(498, 10)


In [5]:
np.unique(y)

array([0, 1])

In [13]:
dict((label,1) for label in np.unique(y))

{0: 1, 1: 1}

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import colors
import math
from scipy.stats import norm
import matplotlib.mlab as mlab
import matplotlib.patches as mpatches
from scipy.optimize import curve_fit
from sklearn.metrics import confusion_matrix, f1_score
from sklearn import metrics
import urllib
from sklearn.naive_bayes import BernoulliNB,GaussianNB, MultinomialNB
from sklearn.metrics import accuracy_score

In [3]:
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()
data = pd.DataFrame(data=cancer.data, columns=cancer.feature_names) # data from library

In [16]:
diagnosis = pd.Series(cancer.target, name= "diagnosis")
type(diagnosis.unique())

numpy.ndarray

In [5]:
dataset = pd.concat([data.iloc[:,0:4], diagnosis], axis=1)

In [33]:
labels = diagnosis.unique()
labels

array([0, 1])

In [37]:
dataset.head(20)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,diagnosis
0,17.99,10.38,122.8,1001.0,0
1,20.57,17.77,132.9,1326.0,0
2,19.69,21.25,130.0,1203.0,0
3,11.42,20.38,77.58,386.1,0
4,20.29,14.34,135.1,1297.0,0
5,12.45,15.7,82.57,477.1,0
6,18.25,19.98,119.6,1040.0,0
7,13.71,20.83,90.2,577.9,0
8,13.0,21.82,87.5,519.8,0
9,12.46,24.04,83.97,475.9,0


In [39]:
for label in labels:   
    data_label = dataset[label == 1]

KeyError: False

In [12]:
for col in dataset:
    print(dataset[col])

0      17.99
1      20.57
2      19.69
3      11.42
4      20.29
       ...  
564    21.56
565    20.13
566    16.60
567    20.60
568     7.76
Name: mean radius, Length: 569, dtype: float64
0      10.38
1      17.77
2      21.25
3      20.38
4      14.34
       ...  
564    22.39
565    28.25
566    28.08
567    29.33
568    24.54
Name: mean texture, Length: 569, dtype: float64
0      122.80
1      132.90
2      130.00
3       77.58
4      135.10
        ...  
564    142.00
565    131.20
566    108.30
567    140.10
568     47.92
Name: mean perimeter, Length: 569, dtype: float64
0      1001.0
1      1326.0
2      1203.0
3       386.1
4      1297.0
        ...  
564    1479.0
565    1261.0
566     858.1
567    1265.0
568     181.0
Name: mean area, Length: 569, dtype: float64
0      0
1      0
2      0
3      0
4      0
      ..
564    0
565    0
566    0
567    0
568    1
Name: diagnosis, Length: 569, dtype: int64


# Likelihood

In [8]:
def gaussian_distribution_from_single_datapoint(x: float, mu: float, sigma: float) -> float:
    return (1 / np.sqrt(2 * np.pi * sigma**2)) * np.exp(
        -((x - mu) ** 2) / (2 * sigma**2)
    )

In [9]:
gaussian_distribution_from_single_datapoint(dataset.iloc[3,0], dataset.iloc[:,0].mean(), dataset.iloc[:,0].var())*100

3.136944078937883

In [10]:
def gaussian_distribution(feature: pd.Series) -> pd.Series:
    mu:float = feature.mean()
    sigma_squared:float = feature.var()
    p: pd.Series = (1 / np.sqrt(2 * np.pi * sigma_squared)) * np.exp(
        -((feature - mu) ** 2) / (2 * sigma_squared)
    )
    p.name = "gaussian probability distribution"
    return p
gaussian_distribution(dataset.iloc[:,0])

0      0.062084
1      0.021286
2      0.032570
3      0.084278
4      0.024536
         ...   
564    0.012243
565    0.026536
566    0.088503
567    0.020956
568    0.022130
Name: gaussian probability distribution, Length: 569, dtype: float64