# Bajesova formula

$$ P(y|X) = \frac{P(X|y) * P(y)}{P(y)} $$

**Curse of dimensionality** Kada imamo mnogo atributa (npr 100) udaljenosti izmedju samih instanci postaju besmislene jer su uvek blizu po nekom atributu

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('baloons.csv')
df.head()

Unnamed: 0,color,size,act,age,inflated
0,YELLOW,SMALL,STRETCH,ADULT,T
1,YELLOW,SMALL,STRETCH,ADULT,T
2,YELLOW,SMALL,STRETCH,CHILD,F
3,YELLOW,SMALL,DIP,ADULT,F
4,YELLOW,SMALL,DIP,CHILD,F


In [3]:
df.describe()

Unnamed: 0,color,size,act,age,inflated
count,76,76,76,76,76
unique,2,2,2,2,2
top,YELLOW,SMALL,STRETCH,ADULT,F
freq,40,40,38,38,41


In [4]:
X = df[df.columns[:-1]]
X.shape

(76, 4)

In [5]:
y = df.iloc[:,-1]
y.shape

(76,)

In [6]:
from sklearn.model_selection import train_test_split

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3, random_state=13, stratify=y)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(50, 4)
(26, 4)
(50,)
(26,)


In [10]:
from sklearn.naive_bayes import CategoricalNB
from sklearn.preprocessing import OrdinalEncoder

In [11]:
oe = OrdinalEncoder()
oe.fit(X_train)
print(oe.categories_)
X_train = oe.transform(X_train)
X_test = oe.transform(X_test)
model = CategoricalNB()
model.fit(X_train, y_train)

[array(['PURPLE', 'YELLOW'], dtype=object), array(['LARGE', 'SMALL'], dtype=object), array(['DIP', 'STRETCH'], dtype=object), array(['ADULT', 'CHILD'], dtype=object)]


0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,
,min_categories,


In [12]:
y_train_pred = model.predict(X_train)

In [13]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [15]:
model = CategoricalNB()
model.fit(X_train, y_train)

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,
,min_categories,


In [16]:
y_test_pred = model.predict(X_test)

In [17]:
accuracy_score(y_test, y_test_pred)

0.8461538461538461

In [18]:
confusion_matrix(y_test, y_test_pred)

array([[12,  2],
       [ 2, 10]])

In [21]:
from sklearn.pipeline import Pipeline
# (transformation_1, transformation_2, ..., transformation_n, classifier)
# transformacije su objekti koji imaju fit i transform

In [23]:
pipe = Pipeline([('ordinal encoder', OrdinalEncoder()), ('classifier', CategoricalNB())])

In [24]:
pipe.fit(X_train, y_train)

0,1,2
,steps,"[('ordinal encoder', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,
,min_categories,


In [25]:
pipe['ordinal encoder'].categories_

[array([0., 1.]), array([0., 1.]), array([0., 1.]), array([0., 1.])]

# Klasifikacija clanaka (TF matrica)

nemam fajl zamislimo da ga imamo

In [26]:
corpus = ['Chinese Beijing Chinese',
          'Chinese Chinese Shanghai',
          'Chinese Macao',
          'Tokyo Japan Chinese']
classes = ['yes', 'yes', 'yes', 'no']

In [27]:
from sklearn.feature_extraction.text import CountVectorizer

In [28]:
vectorizer = CountVectorizer()

In [29]:
vectorizer.fit(corpus)

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'
,ngram_range,"(1, ...)"


In [30]:
X_train = vectorizer.transform(corpus)
type(X_train)

scipy.sparse._csr.csr_matrix

In [31]:
X_train.toarray()

array([[1, 2, 0, 0, 0, 0],
       [0, 2, 0, 0, 1, 0],
       [0, 1, 0, 1, 0, 0],
       [0, 1, 1, 0, 0, 1]])

In [32]:
vectorizer.get_feature_names_out()

array(['beijing', 'chinese', 'japan', 'macao', 'shanghai', 'tokyo'],
      dtype=object)

In [33]:
X_train = pd.DataFrame(X_train.toarray(), columns=vectorizer.get_feature_names_out())

In [34]:
X_train

Unnamed: 0,beijing,chinese,japan,macao,shanghai,tokyo
0,1,2,0,0,0,0
1,0,2,0,0,1,0
2,0,1,0,1,0,0
3,0,1,1,0,0,1


In [35]:
from sklearn.naive_bayes import MultinomialNB

In [36]:
model = MultinomialNB()

In [37]:
model.fit(X_train, classes)

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [38]:
model.classes_

array(['no', 'yes'], dtype='<U3')

In [39]:
test_doc = 'Chinese Chinese Chinese Tokyo Japan'

In [40]:
X_test = vectorizer.transform([test_doc])
X_test.toarray()

array([[0, 3, 1, 0, 0, 1]])

In [41]:
model.predict(X_test)



array(['yes'], dtype='<U3')

In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer

In [43]:
vectorizer = TfidfVectorizer()

In [44]:
vectorizer.fit(corpus)

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'


In [45]:
X_train = vectorizer.transform(corpus)

In [46]:
X_train.toarray()

array([[0.69183461, 0.722056  , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.722056  , 0.        , 0.        , 0.69183461,
        0.        ],
       [0.        , 0.46263733, 0.        , 0.88654763, 0.        ,
        0.        ],
       [0.        , 0.34618161, 0.66338461, 0.        , 0.        ,
        0.66338461]])

In [47]:
vectorizer.get_feature_names_out()

array(['beijing', 'chinese', 'japan', 'macao', 'shanghai', 'tokyo'],
      dtype=object)