# Naive Bayers

In [1]:
import pandas as pd

In [8]:
df = pd.read_csv('./balloons.csv')

In [9]:
df.describe()

Unnamed: 0,color,size,act,age,inflated
count,76,76,76,76,76
unique,2,2,2,2,2
top,YELLOW,SMALL,STRETCH,ADULT,F
freq,40,40,38,38,41


In [7]:
X = df[df.columns[:-1]]
X.shape

(76, 4)

In [10]:
y = df.iloc[:, -1]
y.shape

(76,)

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3, random_state=42, stratify=y)

In [14]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(50, 4)
(50,)
(26, 4)
(26,)


In [15]:
from sklearn.naive_bayes import CategoricalNB

In [16]:
model = CategoricalNB()

In [17]:
from sklearn.preprocessing import OrdinalEncoder

In [18]:
encoder = OrdinalEncoder()

In [19]:
encoder.fit(X_train)

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,


In [20]:
encoder.categories_

[array(['PURPLE', 'YELLOW'], dtype=object),
 array(['LARGE', 'SMALL'], dtype=object),
 array(['DIP', 'STRETCH'], dtype=object),
 array(['ADULT', 'CHILD'], dtype=object)]

In [21]:
X_train = encoder.transform(X_train)
X_test = encoder.transform(X_test)

In [22]:
model = CategoricalNB()

In [23]:
model.fit(X_train, y_train)

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,
,min_categories,


In [24]:
y_train_pred = model.predict(X_train)

In [25]:
from sklearn.metrics import confusion_matrix

In [26]:
confusion_matrix(y_train, y_train_pred)

array([[21,  6],
       [ 4, 19]])

In [27]:
y_test_pred = model.predict(X_test)

In [28]:
confusion_matrix(y_test, y_test_pred)

array([[9, 5],
       [3, 9]])

In [29]:
from sklearn.pipeline import Pipeline
# (transofrmation_1, transofrmation_2, ..., transofrmation_n, classifier)

In [31]:
pipe = Pipeline([('ordinal encoder', OrdinalEncoder()), ('classifier', CategoricalNB())])

In [32]:
pipe.fit(X_train, y_train)

0,1,2
,steps,"[('ordinal encoder', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,
,min_categories,


In [34]:
pipe['ordinal encoder'].categories_

[array([0., 1.]), array([0., 1.]), array([0., 1.]), array([0., 1.])]

In [36]:
pipe['classifier'].class_count_

array([27., 23.])

In [37]:
corpus = ['Chinese Beijing chinese',
          'Chinese Chinese Shanghai',
          'Chinese Macao',
          'Tokyo Japan Chinese']
classes = ['yes', 'yes', 'yes', 'no']

In [38]:
from sklearn.feature_extraction.text import CountVectorizer

In [39]:
vectorizer = CountVectorizer()

In [40]:
vectorizer.fit(corpus)

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'
,ngram_range,"(1, ...)"


In [41]:
vectorizer.fit(corpus)

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'
,ngram_range,"(1, ...)"


In [42]:
X_train = vectorizer.transform(corpus)

In [43]:
type(X_train)

scipy.sparse._csr.csr_matrix

In [44]:
X_train.toarray()

array([[1, 2, 0, 0, 0, 0],
       [0, 2, 0, 0, 1, 0],
       [0, 1, 0, 1, 0, 0],
       [0, 1, 1, 0, 0, 1]])

In [46]:
vectorizer.get_feature_names_out()

array(['beijing', 'chinese', 'japan', 'macao', 'shanghai', 'tokyo'],
      dtype=object)

In [48]:
X_train = pd.DataFrame(X_train.toarray(), columns=vectorizer.get_feature_names_out())

In [49]:
X_train

Unnamed: 0,beijing,chinese,japan,macao,shanghai,tokyo
0,1,2,0,0,0,0
1,0,2,0,0,1,0
2,0,1,0,1,0,0
3,0,1,1,0,0,1


In [52]:
from sklearn.naive_bayes import MultinomialNB

In [53]:
model = MultinomialNB()

In [54]:
model.fit(X_train, classes)

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [55]:
model.classes_

array(['no', 'yes'], dtype='<U3')

In [56]:
test_doc = 'Chinese Chinese Chinese Tokyo Japan'

In [58]:
X_test = vectorizer.transform([test_doc])
X_test.toarray()

array([[0, 3, 1, 0, 0, 1]])

In [59]:
model.predict(X_test)



array(['yes'], dtype='<U3')