# Bernoulli Naive Bayes 

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB

In [2]:
data=pd.read_csv("SMSSpamCollection.csv")

In [3]:
data.head()

Unnamed: 0,Class,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
x=np.array(data['sms'])
y=np.array(data['Class'])

In [5]:
cv=CountVectorizer()
x=cv.fit_transform(x)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size= 0.3, random_state= 42)

In [7]:
model=BernoulliNB()

In [8]:
model.fit(X_train, y_train)

In [9]:
print(model.score(X_test, y_test))

0.9814593301435407


# Multinomial Naive Bayes

In [10]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB

In [11]:
data=pd.read_csv("google play store apps reviews training.csv", encoding = 'latin1')

In [26]:
data.isnull().sum()

review      0
polarity    0
dtype: int64

In [25]:
data=data.dropna()

In [12]:

data.head()

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offlin...,0.0
1,com.facebook.katana,"messenger issues ever since the last update, i...",0.0
2,com.facebook.katana,profile any time my wife or anybody has more t...,0.0
3,com.facebook.katana,the new features suck for those of us who don'...,0.0
4,com.facebook.katana,forced reload on uploading pic on replying com...,0.0


In [13]:
data['polarity'].value_counts()

0.0    584
1.0    306
Name: polarity, dtype: int64

In [14]:
def preprocess_data(data):
    data=data.drop('package_name', axis=1)
    data['review']=data['review'].str.strip().str.lower()
    return data

In [15]:
data=preprocess_data(data)

In [16]:
data.head()

Unnamed: 0,review,polarity
0,privacy at least put some option appear offlin...,0.0
1,"messenger issues ever since the last update, i...",0.0
2,profile any time my wife or anybody has more t...,0.0
3,the new features suck for those of us who don'...,0.0
4,forced reload on uploading pic on replying com...,0.0
...,...,...
886,loved it i loooooooooooooovvved it because it ...,1.0
887,all time legendary game the birthday party lev...,1.0
888,ads are way to heavy listen to the bad reviews...,0.0
889,fun works perfectly well. ads aren't as annoyi...,1.0


In [27]:
x=data['review']
y=data['polarity']
x, x_test, y, y_test = train_test_split(x, y,stratify=y, test_size= 0.25, random_state= 42)

In [28]:
vec=CountVectorizer(stop_words='english')
x=vec.fit_transform(x).toarray()
x_test= vec.transform(x_test).toarray()

In [29]:
from sklearn.naive_bayes import MultinomialNB

model=MultinomialNB()
model.fit(x, y)

In [30]:
model.score(x_test, y_test)

0.852017937219731

# Gaussian Naive Bayes

In [32]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [33]:
dataset=pd.read_csv('data.csv')

In [35]:
dataset.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [36]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             5

In [38]:
dataset=dataset.drop(columns=['id','Unnamed: 32'], axis=1)

In [39]:
M = dataset[dataset.diagnosis =='M']
B = dataset[dataset.diagnosis =='B']

In [41]:
for i in dataset.diagnosis:
    if i=="M":
        i==1
    else:
        i==0

In [42]:
x=dataset.drop(["diagnosis"], axis=1)
y=dataset.diagnosis.values

In [43]:
y

array(['M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M',
       'M', 'M', 'M', 'M', 'M', 'M', 'B', 'B', 'B', 'M', 'M', 'M', 'M',
       'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'B', 'M',
       'M', 'M', 'M', 'M', 'M', 'M', 'M', 'B', 'M', 'B', 'B', 'B', 'B',
       'B', 'M', 'M', 'B', 'M', 'M', 'B', 'B', 'B', 'B', 'M', 'B', 'M',
       'M', 'B', 'B', 'B', 'B', 'M', 'B', 'M', 'M', 'B', 'M', 'B', 'M',
       'M', 'B', 'B', 'B', 'M', 'M', 'B', 'M', 'M', 'M', 'B', 'B', 'B',
       'M', 'B', 'B', 'M', 'M', 'B', 'B', 'B', 'M', 'M', 'B', 'B', 'B',
       'B', 'M', 'B', 'B', 'M', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B',
       'M', 'M', 'M', 'B', 'M', 'M', 'B', 'B', 'B', 'M', 'M', 'B', 'M',
       'B', 'M', 'M', 'B', 'M', 'M', 'B', 'B', 'M', 'B', 'B', 'M', 'B',
       'B', 'B', 'B', 'M', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B',
       'M', 'B', 'B', 'B', 'B', 'M', 'M', 'B', 'M', 'B', 'B', 'M', 'M',
       'B', 'B', 'M', 'M', 'B', 'B', 'B', 'B', 'M', 'B', 'B', 'M

In [46]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [51]:
from sklearn.preprocessing import MinMaxScaler
ms = MinMaxScaler()

In [52]:
scaled_train=ms.fit_transform(X_train)
scaled_test=ms.transform(X_test)

In [53]:
from sklearn.naive_bayes import GaussianNB
model=GaussianNB()
model.fit(x, y)

In [54]:
model.score(X_test, y_test)

0.9473684210526315