# Naive Bayes

In [1]:
# Naive bayes machine learning algorithm works on conditional probablity of independent variables in a dataset

In [2]:
# Pros:
#-> It is easy and fast to predict class of test data set. It also perform well in multi class prediction.
#-> When assumption of independence holds, a Naive Bayes classifier performs better 
# compare to other models like logistic regression and you need less training data.
#-> It perform well in case of categorical input variables compared to numerical variable(s). 
# For numerical variable, normal distribution is assumed (bell curve, which is a strong assumption).

In [3]:
# Cons:
#-> If categorical variable has a category (in test data set), which was not observed in training data set, 
# then model will assign a 0 (zero) probability and will be unable to make a prediction. 
# This is often known as “Zero Frequency”. To solve this, we can use the smoothing technique. 
# One of the simplest smoothing techniques is called Laplace estimation.
#-> On the other side naive Bayes is also known as a bad estimator, 
# so the probability outputs from predict_proba are not to be taken too seriously.
#-> Another limitation of Naive Bayes is the assumption of independent predictors. 
# In real life, it is almost impossible that we get a set of predictors which are completely independent.

In [4]:
# 4 Applications of Naive Bayes Algorithms
#-> Real time Prediction: Naive Bayes is an eager learning classifier and it is sure fast. 
# Thus, it could be used for making predictions in real time.
#-> Multi class Prediction: This algorithm is also well known for multi class prediction feature. 
# Here we can predict the probability of multiple classes of target variable.
#-> Text classification/ Spam Filtering/ Sentiment Analysis: Naive Bayes classifiers mostly used in text 
# classification (due to better result in multi class problems and independence rule) have 
# higher success rate as compared to other algorithms. As a result, it is widely 
# used in Spam filtering (identify spam e-mail) and Sentiment Analysis (in social media analysis, 
# to identify positive and negative customer sentiments)
#-> Recommendation System: Naive Bayes Classifier and Collaborative Filtering together 
# builds a Recommendation System that uses machine learning and data mining techniques to filter unseen 
# information and predict whether a user would like a given resource or not.

In [5]:
# There are three types of Naive Bayes model under the scikit-learn library:
#->  Gaussian: It is used in classification and it assumes that features follow a normal distribution.

#-> Multinomial: It is used for discrete counts. 
# For example, let’s say,  we have a text classification problem. 
# Here we can consider Bernoulli trials which is one step further and instead of “word occurring in the document”, 
# we have “count how often word occurs in the document”, you can think of 
# it as “number of times outcome number x_i is observed over the n trials”.

#-> Bernoulli: The binomial model is useful if your feature vectors are binary (i.e. zeros and ones). 
# One application would be text classification with ‘bag of words’ model where 
# the 1s & 0s are “word occurs in the document” and “word does not occur in the document” respectively.

In [6]:
import pandas as pd

In [7]:
from sklearn.datasets import load_wine

In [8]:
wine = load_wine()

In [9]:
dir(wine)

['DESCR', 'data', 'feature_names', 'target', 'target_names']

In [41]:
wine.feature_names

['alcohol',
 'malic_acid',
 'ash',
 'alcalinity_of_ash',
 'magnesium',
 'total_phenols',
 'flavanoids',
 'nonflavanoid_phenols',
 'proanthocyanins',
 'color_intensity',
 'hue',
 'od280/od315_of_diluted_wines',
 'proline']

In [10]:
df = pd.DataFrame(wine.data,columns = wine.feature_names) # input variables
df.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


In [11]:
target = wine.target #output variable

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
X_train, X_test, y_train, y_test = train_test_split(df,target,test_size = 0.3)

In [14]:
len(X_train)

124

In [15]:
len(X_test)

54

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 13 columns):
alcohol                         178 non-null float64
malic_acid                      178 non-null float64
ash                             178 non-null float64
alcalinity_of_ash               178 non-null float64
magnesium                       178 non-null float64
total_phenols                   178 non-null float64
flavanoids                      178 non-null float64
nonflavanoid_phenols            178 non-null float64
proanthocyanins                 178 non-null float64
color_intensity                 178 non-null float64
hue                             178 non-null float64
od280/od315_of_diluted_wines    178 non-null float64
proline                         178 non-null float64
dtypes: float64(13)
memory usage: 18.2 KB


# Assuming Gaussian distribution 

In [34]:
# Gaussian distribution method is used where all the features or variables are continuous

In [19]:
from sklearn.naive_bayes import GaussianNB

In [20]:
model = GaussianNB()

In [21]:
model.fit(X_train,y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [22]:
model.score(X_test,y_test)

0.9629629629629629

In [23]:
model.predict(X_test)

array([1, 1, 1, 1, 1, 0, 2, 2, 1, 2, 0, 2, 1, 2, 0, 2, 0, 0, 0, 2, 1, 1,
       0, 0, 0, 1, 1, 0, 0, 0, 2, 2, 2, 0, 0, 1, 1, 2, 1, 1, 0, 1, 2, 2,
       1, 0, 1, 1, 0, 1, 0, 1, 1, 2])

In [24]:
y_test

array([1, 1, 1, 1, 1, 0, 2, 2, 1, 2, 0, 2, 1, 2, 0, 2, 0, 0, 0, 2, 1, 1,
       0, 0, 1, 1, 1, 0, 0, 0, 2, 2, 2, 0, 0, 1, 1, 1, 1, 1, 0, 1, 2, 2,
       1, 0, 1, 1, 0, 1, 0, 1, 1, 2])

# Assuming Multinomial distribution

In [35]:
# Used when it is needed to find out the number of times a class of target feature has occured

In [36]:
from sklearn.naive_bayes import MultinomialNB

In [37]:
model2 = MultinomialNB()

In [38]:
model2.fit(X_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [39]:
model2.score(X_test,y_test)

0.8888888888888888

In [40]:
# We see that gaussian classifier works better than multinomial classifier