# Machine learing Naive bayse model

This model is very useful in email spam detector and other probability approach problems

### Spam Data set

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv(r'C:\Users\Ayush Yadav\Desktop\PYFe\Untitled Folder\Data\ML data\spam.csv')

In [3]:
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
df.groupby('Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [6]:
df['spam'] = df['Category'].apply(lambda x: 1 if x=='spam' else 0)

In [7]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [10]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(df.Message, df.spam,test_size=0.2)

In [11]:
len(x_train)

4457

In [12]:
len(x_test)

1115

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

In [14]:
v=CountVectorizer()

In [17]:
x_train_count = v.fit_transform(x_train.values)

In [19]:
x_train_count.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [25]:
x_train_count.toarray().shape

(4457, 7712)

In [32]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()

In [33]:
model.fit(x_train_count, y_train)

MultinomialNB()

In [34]:
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]
emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1], dtype=int64)

In [35]:
x_test_count = v.transform(x_test)
model.score(x_test_count, y_test)

0.9838565022421525

### Sklearn pipeline 

Instead of making the array every time we can simply do it by sklearn library

In [39]:
from sklearn.pipeline import Pipeline
pl = Pipeline([
    ('vectorize', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [40]:
pl.fit(x_train,y_train)

Pipeline(steps=[('vectorize', CountVectorizer()), ('nb', MultinomialNB())])

In [41]:
pl.score(x_test,y_test)

0.9838565022421525

In [42]:
pl.predict(emails)

array([0, 1], dtype=int64)

### Wine dataset

In [44]:
from sklearn.datasets import load_wine

In [45]:
wine = load_wine()

In [46]:
dir(wine)

['DESCR', 'data', 'feature_names', 'frame', 'target', 'target_names']

In [47]:
wine_df = pd.DataFrame(wine.data, columns= wine.feature_names)

In [48]:
wine_df

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740.0
174,13.40,3.91,2.48,23.0,102.0,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750.0
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835.0
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840.0


In [50]:
wine.target_names

array(['class_0', 'class_1', 'class_2'], dtype='<U7')

In [51]:
wine_df['target'] = wine.target

In [52]:
wine_df.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0


In [53]:
from sklearn.model_selection import train_test_split

In [59]:
x_train,x_test,y_train,y_test = train_test_split(wine_df.drop('target',axis='columns'), wine_df['target'], test_size=0.2)

In [60]:
len(x_train)

142

In [61]:
len(x_test)

36

In [62]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB

In [63]:
model_gauss = GaussianNB()

In [64]:
model_gauss.fit(x_train,y_train)

GaussianNB()

In [65]:
model_gauss.score(x_test, y_test)

0.9444444444444444

In [66]:
model_nb = MultinomialNB()
model_nb.fit(x_train,y_train)
model_nb.score(x_test,y_test)

0.6944444444444444