In [2]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
import pandas as pd
import collections
import numpy as np


In [3]:
# Toy corpus: four short sentences, each labelled 'p' or 'n'
# (presumably positive / negative sentiment).
data = collections.OrderedDict()
data['words'] = [
    'great was great',
    'amazing go again',
    'great again',
    'was terrible not great',
]
data['class'] = ['p', 'p', 'p', 'n']

# Column order follows the insertion order of the mapping: 'words', then 'class'.
df = pd.DataFrame(data)

In [4]:
# Render the toy dataset as a table.
df

Unnamed: 0,words,class
0,great was great,p
1,amazing go again,p
2,great again,p
3,was terrible not great,n


In [5]:
# Bag-of-words vectorizer created with default settings; it is fitted on the corpus in a later cell.
cv = CountVectorizer() 




# - creates a dictionary of words and the number of occurrences in each entry

In [6]:
# Learn the vocabulary from the 'words' column.
# scikit-learn's fit() returns the estimator itself, so `bow` refers to the same object as `cv`.
bow = cv.fit(df['words'])  

# - makes a dictionary out of the words that are in the data given.

In [7]:
# vocabulary_ maps each learned token to its column index in the count matrix.
print(bow.vocabulary_) 

{'great': 3, 'was': 6, 'amazing': 1, 'go': 2, 'again': 0, 'terrible': 5, 'not': 4}


In [8]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so it is
# converted to a list to keep the plain-list display.
bow.get_feature_names_out().tolist()

['again', 'amazing', 'go', 'great', 'not', 'terrible', 'was']

In [9]:
# Convert each sentence into a row of word counts over the fitted vocabulary:
# one row per input sentence, one column per vocabulary word.
bow_transform = bow.transform(df['words'])


# - maps the frequency of words to the dictionary created

In [10]:
# Dense, labelled view of the count matrix: rows are sentences, columns are vocabulary words.
# toarray() yields a regular ndarray (todense() returns the discouraged np.matrix type), and
# get_feature_names_out() replaces get_feature_names(), which was removed in scikit-learn 1.2.
count_vect_df = pd.DataFrame(bow_transform.toarray(), columns=bow.get_feature_names_out())

In [11]:
# Show the per-sentence word counts (rows: sentences, columns: vocabulary words).
count_vect_df

Unnamed: 0,again,amazing,go,great,not,terrible,was
0,0,0,0,2,0,0,1
1,1,1,1,0,0,0,0
2,1,0,0,1,0,0,0
3,0,0,0,1,1,1,1


In [12]:
# Total number of times each vocabulary word occurs across the whole corpus.
# The sparse matrix is summed directly (no need to densify it first) and the result is
# reshaped to a single row so that it lines up with the vocabulary columns.
# get_feature_names_out() replaces get_feature_names(), which was removed in scikit-learn 1.2;
# `bow` is used for the column names, as in the other cells.
sum_df = pd.DataFrame(
    np.asarray(bow_transform.sum(axis=0)).reshape(1, -1),
    columns=bow.get_feature_names_out(),
)

In [13]:
# Show the corpus-wide total count of each word (a single-row table).
sum_df

Unnamed: 0,again,amazing,go,great,not,terrible,was
0,2,1,1,4,1,1,2


In [14]:
# Train multinomial naive Bayes on the word-count matrix, using the 'class' column as labels.
# No arguments are passed, so scikit-learn's defaults apply (alpha=1.0, i.e. the add-one
# smoothing used in the worked example further down).
classifier_mnb = MultinomialNB().fit(bow_transform, df['class'])

In [15]:
# How the theory works 


\begin{equation*}
\text{Probability of specific word in a class} = 
\frac{\text{Count of specific word in class} + 1}{\text{Number of words in class} + \text{Vocabulary size}}
\end{equation*}

\begin{equation*}
\text{Probability of Sentence In a given Class} = \text{Fraction of Entries in Class} \times \prod_{\text{words in sentence}} \text{Probability of word in a class}
\end{equation*}

# Factors which affect Naive Bayes Algorithm
    - Balance of Dataset 
    - Occurrence of words in dataset
    - Vocabulary size


# 'great' in [p] class 
    - Count of Word in [P] Class = 3
    - Number of Words in [P] Class = 8
    - Vocabulary Size = 7 
    - (3+1)/(8+7) = 4/15 ~ 0.266


# 'great' in [n] class 
    - Count of great in [N] Class = 1
    - Number of Words in [N] Class = 4
    - Vocabulary Size = 7 
    - (1+1)/(4+7) = 2/11  ~ 0.1818

In [16]:
#################

# 'was' in [p] class 
    - Count of Word in [P] Class = 1
    - Number of Words in [P] Class = 8
    - Vocabulary Size = 7 
    - (1+1)/(8+7) = 2/15 ~ 0.133

# 'was' in [n] class 
    - Count of Word in [N] Class = 1
    - Number of Words in [N] Class = 4
    - Vocabulary Size = 7 
    - (1+1)/(4+7) = 2/11  ~ 0.1818

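# Probability of "was great" in [p] dataset

    - 3/4 * 4/15 * 2/15 = 0.02666666666

# Probability of "was great" in [n] dataset 
    - 1/4 * 2/11 * 2/11 = 0.0082644628

# Normalising the two scores gives the probability of class [p]
    - 0.02666666666 / (0.02666666666 + 0.0082644628) ~ 0.76341 (the value predict_proba reports in the next cell)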


In [17]:
# Score an unseen sentence using the vocabulary learned from the training data.
bow_comments = bow.transform(['was great'])
predictions = classifier_mnb.predict(bow_comments)
print(predictions)

# The columns of predict_proba follow classifier_mnb.classes_, so the 'p' column is
# looked up by label instead of relying on a hard-coded position.
positive_col = list(classifier_mnb.classes_).index('p')
print(round(classifier_mnb.predict_proba(bow_comments)[0][positive_col], 5))


['p']
0.76341


In [None]:
# Resources 
# https://www.youtube.com/watch?v=km2LoOpdB3A
# https://towardsdatascience.com/unfolding-naïve-bayes-from-scratch-2e86dcae4b01

# Extra Notes (not important)

In [None]:
# dataset size 
# depends on your problem, what you have to do is test and see if your accuracy is affected by the data size
# It basically depends on the complexity of your problem; this one has only 2 classes, so less data is needed. 
# the more complex the problem the more data is needed
# the more complex the algorithm the bigger the dataset that would be needed.

# Naive Bayes 
# rule of thumb: dataset size should be at least 1000 for a naive Bayes classifier,
# although the naive Bayes algorithm can still work with a small dataset
# dataset has to be balanced - should use cross validation so that the data is not wasted
# https://www.researchgate.net/figure/Training-set-size-Performance-of-naive-Bayes-classifier-and-structure-with-parameters_fig5_6834727

# how to improve sentiment analysis
# add stop words to count vectorizer
# add tf-idf 
# use other algorithms (svm), nb & svm, rnn lstm
# https://blog.paralleldots.com/data-science/breakthrough-research-papers-and-models-for-sentiment-analysis/

# Other functionalities 

In [None]:
# Token -> column-index mapping learned by the vectorizer.
print(bow.vocabulary_) 

In [41]:
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() is the
# replacement and returns a NumPy array, converted here to keep the plain-list display.
bow.get_feature_names_out().tolist()  # unique tokens learned by the count vectorizer

['beijing', 'chinese', 'japan', 'macao', 'shanghai', 'tokyo']

In [None]:
# CountVectorizer sets stop_words to None by default. To filter out English stop words, construct it as
# CountVectorizer(stop_words='english')

In [32]:
bow_transform.shape  # (number of input sentences, vocabulary size i.e. number of unique words)

(4, 6)

In [44]:
# Printing the sparse matrix lists only its non-zero entries, as "(row, column)  count".
print(bow_transform)

  (0, 0)	1
  (0, 1)	2
  (1, 1)	2
  (1, 4)	1
  (2, 1)	1
  (2, 3)	1
  (3, 1)	1
  (3, 2)	1
  (3, 5)	1


In [37]:
# Expand the sparse count matrix into a full dense matrix (zero entries included).
bow_transform.todense()