# Naive Bayes Classifier

In [3]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import PorterStemmer
import re

In [4]:
# Load the training split, keeping only the label and text columns in this
# order: process() reads column 0 as the category and column 1 as the sentence.
train_data = pd.read_csv('traindata.csv')[['category', 'text']]

In [5]:
# Load the test split with the same column selection and order as the
# training data (column 0 = category, column 1 = sentence text).
test_data = pd.read_csv('testdata.csv')[['category', 'text']]

In [6]:
# Shared text-normalisation resources used by clean():
#   stopword - English stop words, held in a set for membership tests
#   porter   - stemmer applied to every token that is kept
stopword = set(stopwords.words('english'))
porter = PorterStemmer()

In [12]:
#Helper Functions
# Used Porter Stemmer
def clean(sent):
    """Tokenize ``sent`` and return the stems of the tokens worth keeping.

    A token is kept only if it is purely alphabetic and is not in the
    module-level ``stopword`` set; every kept token is stemmed with ``porter``.

    Parameters
    ----------
    sent : str
        Text to tokenize.

    Returns
    -------
    list of str
        Stemmed tokens, in the order they appear in ``sent``.
    """
    return [
        porter.stem(token)
        for token in word_tokenize(sent)
        if token.isalpha() and token not in stopword
    ]
def process(data,task='train'):
    """Clean every sentence in ``data`` and group the results by category.

    Parameters
    ----------
    data : pandas.DataFrame
        Read by position: column 0 is the category label and column 1 is the
        raw sentence.
    task : str, optional
        Not used by the function; kept so existing calls such as
        ``process(df, 'test')`` keep working.

    Returns
    -------
    retr : dict
        Maps each category to the list of cleaned token lists of its sentences.
    allsent : list
        Cleaned token lists of all rows, in row order.
    """
    retr = {}
    allsent = []
    for category, sentence in zip(data.iloc[:, 0], data.iloc[:, 1]):
        # Raw string: in a plain literal '\w' and '\s' are invalid escape
        # sequences, which current Python versions warn about.
        # Every character that is neither a word character nor whitespace is
        # replaced by a space before tokenizing.
        cleaned = clean(re.sub(r'[^\w\s]', ' ', sentence.lower()))
        retr.setdefault(category, []).append(cleaned)
        allsent.append(cleaned)
    return retr, allsent
def vectorize(sent,vocab):
    """Return the bag-of-words count vector of ``sent`` over ``vocab``.

    Parameters
    ----------
    sent : iterable of str
        Tokens of one sentence.
    vocab : dict
        Maps each known token to its position in the output vector.

    Returns
    -------
    numpy.ndarray
        Float array of length ``len(vocab)``; entry ``vocab[w]`` is the number
        of times ``w`` occurs in ``sent``.

    Notes
    -----
    Tokens that are not in ``vocab`` are ignored instead of raising
    ``KeyError``, so the function can also be applied to unseen text.
    """
    vector = np.zeros(len(vocab))
    for word in sent:
        position = vocab.get(word)
        if position is not None:  # skip out-of-vocabulary tokens
            vector[position] += 1
    return vector

# Creating Vocabulary- Question 1

In [13]:
# Clean the training data, then give every distinct stem the next free index
# in first-seen order.
train, train_sent = process(train_data, 'train')
vocab = {}
for tokens in train_sent:
    for token in tokens:
        # len(vocab) is evaluated before insertion, so a new token receives
        # the current vocabulary size as its index.
        vocab.setdefault(token, len(vocab))
n = len(vocab)  # vocabulary size, reused by later cells
print('Vocabulary of words in Train data \n', vocab)

Vocabulary of words in Train data 
 {'outer': 0, 'space': 1, 'friendli': 2, 'life': 3, 'extrem': 4, 'temperatur': 5, 'low': 6, 'pressur': 7, 'radiat': 8, 'quickli': 9, 'degrad': 10, 'cell': 11, 'membran': 12, 'destroy': 13, 'dna': 14, 'tenni': 15, 'origin': 16, 'name': 17, 'lawn': 18, 'game': 19, 'two': 20, 'oppos': 21, 'player': 22, 'singl': 23, 'pair': 24, 'doubl': 25, 'use': 26, 'tautli': 27, 'strung': 28, 'racket': 29, 'hit': 30, 'ball': 31, 'specifi': 32, 'size': 33, 'weight': 34, 'bounc': 35, 'net': 36, 'rectangular': 37, 'court': 38, 'one': 39, 'woman': 40, 'frequent': 41, 'flew': 42, 'southwest': 43, 'constantli': 44, 'disappoint': 45, 'everi': 46, 'aspect': 47, 'compani': 48, 'oper': 49, 'fact': 50, 'becam': 51, 'known': 52, 'pen': 53, 'pal': 54, 'flight': 55, 'wrote': 56, 'complaint': 57, 'decemb': 58, 'almost': 59, 'seven': 60, 'year': 61, 'mer': 62, 'outbreak': 63, 'novel': 64, 'coronaviru': 65, 'ncov': 66, 'surfac': 67, 'wuhan': 68, 'hubei': 69, 'region': 70, 'china': 71, 

# Create Prior Distribution of Each label - Question 2 

In [14]:
# Prior of a label = share of all training sentences that carry that label.
pditri = {}
for cat, sentences in train.items():
    pditri[cat] = len(sentences) / len(train_sent)
print('Prior Distribution of Labels:\n', pditri)

Prior Distribution of Labels:
 {'science': 0.25, 'sports': 0.25, 'business': 0.2375, 'covid': 0.2625}


# Create Conditional Distribution of each word - Question 3

In [15]:
# Per-class word counts: add up the bag-of-words vectors of each class's
# training sentences.
n_train = {}
for cat, sentences in train.items():
    counts = np.zeros(n)
    for sent in sentences:
        counts += vectorize(sent, vocab)
    n_train[cat] = counts

# Rows are the vocabulary words ordered by their index; columns are classes.
table = pd.DataFrame(n_train, index=sorted(vocab, key=vocab.get))
# Replace zero counts with 0.1 so no conditional probability is exactly zero,
# then scale each class column so it sums to 1.
table = table.replace(0, 0.1)
table = table.div(table.sum(), axis='columns')
print('Conditional Distribution for each word\n', table)

Conditional Distribution for each word
               science    sports  business     covid
outer        0.011839  0.002144  0.000266  0.003274
space        0.039463  0.000214  0.000266  0.000327
friendli     0.003946  0.000214  0.000266  0.000327
life         0.015785  0.000214  0.000266  0.000327
extrem       0.011839  0.000214  0.000266  0.000327
temperatur   0.003946  0.000214  0.000266  0.000327
low          0.003946  0.000214  0.000266  0.000327
pressur      0.003946  0.002144  0.000266  0.003274
radiat       0.015785  0.000214  0.000266  0.000327
quickli      0.003946  0.000214  0.000266  0.000327
degrad       0.003946  0.000214  0.000266  0.000327
cell         0.007893  0.000214  0.000266  0.003274
membran      0.003946  0.000214  0.000266  0.000327
destroy      0.003946  0.000214  0.000266  0.000327
dna          0.007893  0.000214  0.000266  0.000327
tenni        0.000395  0.036449  0.000266  0.000327
origin       0.000395  0.010720  0.002656  0.016372
name         0.000395  0

# Apply Naive-Bayes Model on Test set

In [16]:
pd.set_option('display.float_format', lambda x: '%.9f' % x)
_, test = process(test_data, 'test')

# Score in log space. Multiplying many probabilities below 1 underflows to 0
# for long sentences, which would make the normaliser 0 and the posterior NaN.
# Summing logs and subtracting the maximum before exponentiating gives the
# same posterior without that failure mode.
classes = list(pditri)
log_prior = np.log(pd.Series(pditri))
# Entries of `table` are strictly positive because zero counts were replaced
# before normalising, so the logarithm is finite.
log_table = np.log(table[classes])

distri = []
for sent in test:
    # Words never seen in training carry no information and are skipped.
    known = [word for word in sent if word in vocab]
    # .loc with a list repeats a row for every occurrence of a word, so a
    # repeated word contributes once per occurrence.
    log_post = log_prior + log_table.loc[known].sum(axis=0)
    post = np.exp(log_post - log_post.max())
    distri.append((post / post.sum()).to_dict())

final = pd.DataFrame(distri)
print('Index Represent i th sentence in test set')
print('Posterior Distribution of each sentence over different labels')
print(final)

Index Represent i th sentence in test set
Posterior Distribution of each sentence over different labels
      business       covid     science      sports
0  0.000000000 0.000000003 0.999999997 0.000000000
1  0.062482240 0.000276345 0.937240822 0.000000593
2  0.000024762 0.000000592 0.999974642 0.000000003
3  0.000000000 0.000000000 1.000000000 0.000000000
4  0.000702233 0.029084731 0.969824190 0.000388846
5  0.000000000 0.000000000 0.000000000 1.000000000
6  0.000000000 0.000000000 0.000000000 1.000000000
7  0.000000000 0.000000000 0.000000000 1.000000000
8  0.000000505 0.000001959 0.000114336 0.999883201
9  0.000000233 0.000000000 0.000000000 0.999999767
10 1.000000000 0.000000000 0.000000000 0.000000000
11 0.999993411 0.000000067 0.000006514 0.000000008
12 0.977089618 0.008206576 0.011352658 0.003351149
13 0.999844490 0.000127630 0.000025646 0.000002235
14 1.000000000 0.000000000 0.000000000 0.000000000
15 0.000000001 0.999999728 0.000000236 0.000000036
16 0.000000001 0.999999755 0.

For each sentence in the test set, the label with the highest posterior probability is the predicted class.

 ## Prediction of last word

In [17]:
# Load the datasets for the last-word prediction task.
# NOTE: this rebinds train_data / test_data, which held the classification
# data earlier in the notebook.
train_data = pd.read_csv('40.csv')
test_data = pd.read_csv('10.csv')

# Text-normalisation resources used by clean().
stopword = set(stopwords.words('english'))
porter = PorterStemmer()

In [18]:
#Helper Functions
# Used Porter Stemmer
def clean(sent):
    """Tokenize ``sent`` and return the stems of the tokens worth keeping.

    A token is kept only if it is purely alphabetic and is not in the
    module-level ``stopword`` set; every kept token is stemmed with ``porter``.

    Parameters
    ----------
    sent : str
        Text to tokenize.

    Returns
    -------
    list of str
        Stemmed tokens, in the order they appear in ``sent``.
    """
    return [
        porter.stem(token)
        for token in word_tokenize(sent)
        if token.isalpha() and token not in stopword
    ]
def process(data,task='train'):
    """Clean every sentence and group its context by its last kept token.

    Parameters
    ----------
    data : pandas.DataFrame
        Read by position: column 0 is the raw sentence.
    task : str, optional
        Not used by the function; kept so existing calls such as
        ``process(df, 'test')`` keep working.

    Returns
    -------
    retr : dict
        Maps each last token to the list of token lists that preceded it
        (the cleaned sentence without its final token).
    allsent : list
        Full cleaned token lists of all rows, in row order. Rows that clean
        to nothing appear here as empty lists.
    """
    retr = {}
    allsent = []
    for sentence in data.iloc[:, 0]:
        # Raw string: in a plain literal '\w' and '\s' are invalid escape
        # sequences, which current Python versions warn about.
        cleaned = clean(re.sub(r'[^\w\s]', ' ', sentence.lower()))
        # A sentence made only of stop words / non-alphabetic tokens cleans to
        # an empty list; it has no last token, so it is not added to `retr`
        # (indexing cleaned[-1] would raise IndexError).
        if cleaned:
            retr.setdefault(cleaned[-1], []).append(cleaned[:-1])
        allsent.append(cleaned)
    return retr, allsent
def vectorize(sent,vocab):
    """Return the bag-of-words count vector of ``sent`` over ``vocab``.

    Parameters
    ----------
    sent : iterable of str
        Tokens of one sentence.
    vocab : dict
        Maps each known token to its position in the output vector.

    Returns
    -------
    numpy.ndarray
        Float array of length ``len(vocab)``; entry ``vocab[w]`` is the number
        of times ``w`` occurs in ``sent``.

    Notes
    -----
    Tokens that are not in ``vocab`` are ignored instead of raising
    ``KeyError``, so the function can also be applied to unseen text.
    """
    vector = np.zeros(len(vocab))
    for word in sent:
        position = vocab.get(word)
        if position is not None:  # skip out-of-vocabulary tokens
            vector[position] += 1
    return vector

# Creating Vocabulary- Question 1

In [19]:
# Clean the training data, then give every distinct stem the next free index
# in first-seen order.
train, train_sent = process(train_data, 'train')
vocab = {}
for tokens in train_sent:
    for token in tokens:
        # len(vocab) is evaluated before insertion, so a new token receives
        # the current vocabulary size as its index.
        vocab.setdefault(token, len(vocab))
n = len(vocab)  # vocabulary size, reused by later cells
print('Vocabulary of words in Train data \n', vocab)

Vocabulary of words in Train data 
 {'midst': 0, 'covid': 1, 'pandem': 2, 'eat': 3, 'healthi': 4, 'food': 5, 'remain': 6, 'import': 7, 'part': 8, 'maintain': 9, 'health': 10, 'specif': 11, 'help': 12, 'protect': 13, 'viru': 14, 'nutriti': 15, 'diet': 16, 'boost': 17, 'immun': 18, 'system': 19, 'fight': 20, 'symptom': 21, 'may': 22, 'abl': 23, 'share': 24, 'meal': 25, 'friend': 26, 'love': 27, 'one': 28, 'lot': 29, 'way': 30, 'well': 31, 'support': 32, 'difficult': 33, 'time': 34, 'strict': 35, 'limit': 36, 'stay': 37, 'unrealist': 38, 'thin': 39, 'depriv': 40, 'rather': 41, 'feel': 42, 'great': 43, 'energi': 44, 'improv': 45, 'mood': 46, 'overli': 47, 'complic': 48, 'overwhelm': 49, 'conflict': 50, 'nutrit': 51, 'advic': 52, 'alon': 53, 'need': 54, 'balanc': 55, 'protein': 56, 'fat': 57, 'carbohydr': 58, 'fiber': 59, 'vitamin': 60, 'miner': 61, 'sustain': 62, 'bodi': 63, 'give': 64, 'get': 65, 'go': 66, 'keep': 67, 'also': 68, 'cognit': 69, 'function': 70, 'much': 71, 'harm': 72, 'peop

# Create Prior Distribution of Each Word in Vocabulary - Question 2

In [20]:
# Every vocabulary word is a candidate label; words that never ended a
# training sentence get an empty context list.
for word in vocab:
    train.setdefault(word, [])

# Weight of a word = number of training sentences ending in it, floored at
# 0.001 so unseen endings keep a small non-zero prior.
pditri = {}
total = 0
for cat, contexts in train.items():
    pditri[cat] = max(len(contexts), 0.001)
    total += len(contexts)

# NOTE: `total` counts only real sentences (the 0.001 floors are not included),
# so the resulting values sum to slightly more than 1.
pditri = {token: weight / total for token, weight in pditri.items()}
print('Prior Distribution of Labels:\n', pditri)

Prior Distribution of Labels:
 {'health': 0.075, 'symptom': 0.025, 'time': 0.025, 'mood': 0.025, 'alon': 0.025, 'bodi': 0.025, 'function': 0.025, 'age': 0.025, 'heart': 0.025, 'waistlin': 0.05, 'weight': 0.025, 'difficulti': 0.025, 'job': 0.025, 'carb': 0.025, 'proposit': 0.025, 'plan': 0.025, 'day': 0.025, 'choic': 0.05, 'food': 0.05, 'anxieti': 0.025, 'altern': 0.025, 'healthi': 0.025, 'meal': 0.025, 'energi': 0.025, 'headach': 0.025, 'need': 0.025, 'stuf': 0.025, 'love': 0.025, 'pizza': 0.025, 'over': 0.025, 'boredom': 0.025, 'fiber': 0.025, 'sugar': 0.025, 'onion': 0.025, 'dish': 0.025, 'midst': 2.5e-05, 'covid': 2.5e-05, 'pandem': 2.5e-05, 'eat': 2.5e-05, 'remain': 2.5e-05, 'import': 2.5e-05, 'part': 2.5e-05, 'maintain': 2.5e-05, 'specif': 2.5e-05, 'help': 2.5e-05, 'protect': 2.5e-05, 'viru': 2.5e-05, 'nutriti': 2.5e-05, 'diet': 2.5e-05, 'boost': 2.5e-05, 'immun': 2.5e-05, 'system': 2.5e-05, 'fight': 2.5e-05, 'may': 2.5e-05, 'abl': 2.5e-05, 'share': 2.5e-05, 'friend': 2.5e-05, 'on

# Create Conditional Distribution of each Word - Question 3

In [21]:
# Per-label word counts: add up the bag-of-words vectors of the contexts that
# preceded each candidate last word.
n_train = {}
for cat, contexts in train.items():
    counts = np.zeros(n)
    for sent in contexts:
        counts += vectorize(sent, vocab)
    n_train[cat] = counts

# Rows are the vocabulary words ordered by their index; columns are labels.
table = pd.DataFrame(n_train, index=sorted(vocab, key=vocab.get))
# Replace zero counts with 0.001 so no conditional probability is exactly
# zero, then scale each label column so it sums to 1.
table = table.replace(0, 0.001)
table = table.div(table.sum(), axis='columns')
print('Conditional Distribution for each word\n', table)

Conditional Distribution for each word
               health     symptom        time        mood        alon  \
midst    0.031950923 0.000081208 0.000069876 0.000054624 0.000096946   
covid    0.031950923 0.000081208 0.000069876 0.000054624 0.000096946   
pandem   0.031950923 0.000081208 0.000069876 0.000054624 0.000096946   
eat      0.031950923 0.000081208 0.069876319 0.054623914 0.096946195   
healthi  0.095852770 0.000081208 0.000069876 0.054623914 0.096946195   
food     0.031950923 0.081208381 0.000069876 0.054623914 0.000096946   
remain   0.031950923 0.000081208 0.000069876 0.000054624 0.000096946   
import   0.031950923 0.000081208 0.000069876 0.000054624 0.000096946   
part     0.031950923 0.000081208 0.000069876 0.000054624 0.000096946   
maintain 0.031950923 0.000081208 0.000069876 0.000054624 0.000096946   
health   0.000031951 0.000081208 0.069876319 0.054623914 0.000096946   
specif   0.000031951 0.081208381 0.000069876 0.000054624 0.000096946   
help     0.000031951 0.1

# Apply Naive-Bayes Model on Test set-- Question 4

In [22]:
pd.set_option('display.float_format', lambda x: '%.9f' % x)
_, test = process(test_data, 'test')

# Score in log space. Multiplying many probabilities below 1 underflows to 0
# for long sentences, which would make the normaliser 0 and the posterior NaN.
# Summing logs and subtracting the maximum before exponentiating gives the
# same posterior without that failure mode.
classes = list(pditri)
log_prior = np.log(pd.Series(pditri))
# Entries of `table` are strictly positive because zero counts were replaced
# before normalising, so the logarithm is finite.
log_table = np.log(table[classes])

distri = []
for sent in test:
    # Words never seen in training carry no information and are skipped.
    known = [word for word in sent if word in vocab]
    # .loc with a list repeats a row for every occurrence of a word, so a
    # repeated word contributes once per occurrence.
    log_post = log_prior + log_table.loc[known].sum(axis=0)
    post = np.exp(log_post - log_post.max())
    distri.append((post / post.sum()).to_dict())

# One row per test sentence (indexed by its text), one column per candidate word.
final = pd.DataFrame(distri, index=test_data['text'])
# Most probable last word per sentence together with its posterior probability.
predict = pd.concat([final.idxmax(axis=1), final.max(axis=1)], axis=1)
predict.columns = ['Word', 'Probability']
predict

Unnamed: 0_level_0,Word,Probability
text,Unnamed: 1_level_1,Unnamed: 2_level_1
Eating healthy food is important for maintainng good _____,health,0.920073115
Following a healthy diet will boost your _____,mood,0.606130712
"Avoid eating chemical additives, added sugars in your diet. Switch to a healthy ____",anxieti,0.987761419
"Your diet should be rich of vitamins D,K, calcium and ______",job,0.99999992
"The healthier the food you eat, the better you’ll feel after a _____.",meal,0.999999889
"Dehydration causes tiredness, low energy and head aches. Drink plenty of ______",headach,0.999999999
People with kidney disease should avoid eating high amounts of ______,age,0.811337231
We should avoid eating transfats. Eating healthy fats and dietary fibre can help us lose ____,weight,0.061582415
Mindless eating is often caused by eating alone. We should avoid eating while we are in front of a TV or a ______,over,1.0
Eating more junk food will make you feel uncomfortable and _____,energi,0.999985304
