In [1]:
import numpy as np
import pandas as pd

In [2]:
dataset = pd.read_csv('traindata.csv')
#Creating a list of sentences
sentences = dataset.iloc[:,0:2].values
sentences

array([['science',
        'Outer space is not friendly to life. Extreme temperatures, low pressure and radiation can quickly degrade cell membranes and destroy DNA.'],
       ['sports',
        'Tennis, original name\xa0lawn tennis, game in which two opposing players (singles) or pairs of players (doubles) use tautly strung\xa0rackets\xa0to hit a\xa0ball\xa0of specified size, weight, and bounce over a\xa0net\xa0on a rectangular court.\xa0'],
       ['business',
        'One woman who frequently flew on Southwest was constantly disappointed with every aspect of the company’s operation. In fact, she became known as the “Pen Pal” because after every flight she wrote in with a complaint.'],
       ['covid',
        'In December 2019, almost seven years after the MERS 2012 outbreak, a novel Coronavirus (2019-nCoV) surfaced in Wuhan in the Hubei region of China.'],
       ['science',
        'Any life-forms that somehow find themselves in the void soon die.\xa0Unless they band together.'],


In [3]:
#Removing punctuations from each word
import string
table = str.maketrans('', '', string.punctuation)
bag_of_words = []
for sentence in sentences[:, 1]:
    words = [word.translate(table) for word in sentence.split()]
    bag_of_words.append(words)
print('All the words are now free from punctuation marks!!!')
bag_of_words

All the words are now free from punctuation marks!!!


[['Outer',
  'space',
  'is',
  'not',
  'friendly',
  'to',
  'life',
  'Extreme',
  'temperatures',
  'low',
  'pressure',
  'and',
  'radiation',
  'can',
  'quickly',
  'degrade',
  'cell',
  'membranes',
  'and',
  'destroy',
  'DNA'],
 ['Tennis',
  'original',
  'name',
  'lawn',
  'tennis',
  'game',
  'in',
  'which',
  'two',
  'opposing',
  'players',
  'singles',
  'or',
  'pairs',
  'of',
  'players',
  'doubles',
  'use',
  'tautly',
  'strung',
  'rackets',
  'to',
  'hit',
  'a',
  'ball',
  'of',
  'specified',
  'size',
  'weight',
  'and',
  'bounce',
  'over',
  'a',
  'net',
  'on',
  'a',
  'rectangular',
  'court'],
 ['One',
  'woman',
  'who',
  'frequently',
  'flew',
  'on',
  'Southwest',
  'was',
  'constantly',
  'disappointed',
  'with',
  'every',
  'aspect',
  'of',
  'the',
  'company’s',
  'operation',
  'In',
  'fact',
  'she',
  'became',
  'known',
  'as',
  'the',
  '“Pen',
  'Pal”',
  'because',
  'after',
  'every',
  'flight',
  'she',
  'wrote',

In [4]:
#Coverting all the words to lower case
bag_of_words = [[word.lower() for word in example] for example in bag_of_words]
print("All the words have been converted to lower case!!!")

All the words have been converted to lower case!!!


In [5]:
#Removing stopwords from the file
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
bag_of_words = [[word for word in example if word not in stop_words] for example in bag_of_words]
print("All the stop-words have been removed!!!")


All the stop-words have been removed!!!


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()
bag_of_words = [[ps.stem(word) for word in example] for example in bag_of_words]
sentences

array([['science',
        'Outer space is not friendly to life. Extreme temperatures, low pressure and radiation can quickly degrade cell membranes and destroy DNA.'],
       ['sports',
        'Tennis, original name\xa0lawn tennis, game in which two opposing players (singles) or pairs of players (doubles) use tautly strung\xa0rackets\xa0to hit a\xa0ball\xa0of specified size, weight, and bounce over a\xa0net\xa0on a rectangular court.\xa0'],
       ['business',
        'One woman who frequently flew on Southwest was constantly disappointed with every aspect of the company’s operation. In fact, she became known as the “Pen Pal” because after every flight she wrote in with a complaint.'],
       ['covid',
        'In December 2019, almost seven years after the MERS 2012 outbreak, a novel Coronavirus (2019-nCoV) surfaced in Wuhan in the Hubei region of China.'],
       ['science',
        'Any life-forms that somehow find themselves in the void soon die.\xa0Unless they band together.'],


In [7]:
#Creating a dictionary of words for each news category
covid = {}
science = {}
business = {}
sports = {}

#Keeping a count of no of training examples in each category
n_covid = 0
n_science = 0
n_business = 0
n_sports = 0
for i in range(sentences.shape[0]):
    if sentences[i,0] == 'covid':
        n_covid += 1
        words = bag_of_words[i]
        for word in words:
            if word not in covid.keys():
                covid.update({word:1})
            else:
                covid[word]+=1
    elif sentences[i,0] == 'science':
        n_science += 1
        words = bag_of_words[i]
        for word in words:
            if word not in science.keys():
                science.update({word:1})
            else:
                science[word]+=1
    elif sentences[i,0] == 'business':
        n_business += 1
        words = bag_of_words[i]
        for word in words:
            if word not in business.keys():
                business.update({word:1})
            else:
                business[word]+=1
    elif sentences[i,0] == 'sports':
        n_sports += 1
        words = bag_of_words[i]
        for word in words:
            if word not in sports.keys():
                sports.update({word:1})
            else:
                sports[word]+=1
                
print(covid)                
print('The dictionary of words for each class was created')
len(sentences)

{'decemb': 2, '2019': 2, 'almost': 2, 'seven': 1, 'year': 3, 'mer': 4, '2012': 2, 'outbreak': 5, 'novel': 1, 'coronaviru': 5, '2019ncov': 2, 'surfac': 1, 'wuhan': 3, 'hubei': 2, 'region': 2, 'china': 5, 'cov': 2, 'larg': 1, 'famili': 3, 'virus': 8, 'known': 2, 'caus': 5, 'ill': 1, 'rang': 1, 'common': 2, 'cold': 2, 'acut': 3, 'respiratori': 6, 'tract': 1, 'infect': 5, 'sever': 2, 'may': 1, 'visibl': 1, 'pneumonia': 3, 'syndrom': 3, 'even': 1, 'death': 1, 'sar': 4, 'group': 1, 'greatli': 1, 'overlook': 1, 'howev': 2, 'sinc': 1, 'studi': 1, 'greater': 2, 'detail': 1, 'propel': 1, 'vaccin': 1, 'research': 2, '31': 1, 'mysteri': 1, 'case': 1, 'detect': 1, 'citi': 1, 'provinc': 2, 'januari': 1, '7': 1, '2020': 1, 'agent': 1, 'identifi': 1, 'new': 2, 'diseas': 2, 'later': 2, 'name': 1, 'covid19': 1, 'viru': 6, 'spread': 1, 'extens': 1, 'gain': 2, 'entri': 2, '210': 1, 'countri': 2, 'territori': 1, 'though': 1, 'expert': 1, 'suspect': 1, 'transmit': 1, 'anim': 2, 'human': 2, 'mix': 1, 'report

80

In [8]:
#Calculating the prior probability
pp_covid = n_covid/sentences.shape[0]
pp_science = n_science/sentences.shape[0]
pp_sports = n_sports/sentences.shape[0]
pp_business = n_business/sentences.shape[0]

print('The prior probabilities were evaluated')
pp_business

The prior probabilities were evaluated


0.2375

In [9]:
#Implementing Naive Bayes using our vocabulary on the test set
test_data = pd.read_csv('testdata.csv')
sample_sent = test_data.iloc[:, 0:2].values

class_cond_prob = []
post_dist_list = []
predicted_label = []
labels = {0:'covid', 1:'sports', 2:'business', 3:'science'}

sample_sent[:, 1] = [sentence.split() for sentence in sample_sent[:, 1]]
sample_sent[:, 1] = [[word.translate(table) for word in example] for example in sample_sent[:, 1]]
sample_sent[:, 1] = [[word.lower() for word in example] for example in sample_sent[:, 1]]
sample_sent[:, 1] = [[word for word in example if word not in stop_words] for example in sample_sent[:, 1]]
sample_sent[:, 1] = [[ps.stem(word) for word in example] for example in sample_sent[:, 1]]

for i in range(sample_sent.shape[0]):
    p_covid = 1
    p_sports = 1
    p_science = 1
    p_business = 1
    words = sample_sent[i,1]
    #print(words)
    for word in words:
        if word in covid.keys():
            p_covid *= covid[word]/sum(covid.values())
        else:
            p_covid *= 0.001
        if word in sports.keys():
            p_sports *= sports[word]/sum(sports.values()) 
        else:
            p_sports *= 0.001
        if word in business.keys():
            p_business *= business[word]/sum(business.values()) 
        else:
            p_business *= 0.001
        if word in science.keys():
            p_science *= science[word]/sum(science.values())
        else:
            p_science *= 0.001
            
    class_cond_prob.append([p_covid, p_sports, p_business, p_science])
    post_dist = [p_covid*pp_covid, p_sports*pp_sports, p_business*pp_business, p_science*pp_science]
    post_dist_list.append(post_dist)
    max_prob = max(post_dist)
    index = post_dist.index(max_prob)
    print(index)
    predicted_label.append(labels[index])
correct_pred = 0
for i in range(len(predicted_label)):
    if predicted_label[i] == sample_sent[i,0]:
        correct_pred += 1
#print(class_cond_prob)
print('Accuracy of Prediction : {}%'.format((correct_pred/len(predicted_label))*100))

3
3
3
3
3
1
1
1
1
1
2
2
2
2
2
0
0
0
0
0
Accuracy of Prediction : 100.0%


In [10]:
#Implementing the algorithm for guessing the last word of the sentence

#Importing the dataset
train_set = pd.read_csv('40.csv')
text = train_set.iloc[:,0].values

In [11]:
#Cleaning the train set

text = [bit.translate(table) for bit in text]
text = [bit.lower() for bit in text]
text = [[word for word in bit.split() if word not in stop_words] for bit in text]


In [12]:
#Building the vocabulary
vocab = dict()
for bit in text:
    for word in bit:
        vocab.update({word:{}})
for bit in text:
    for word in vocab.keys():
        if word in bit:
            for entity in bit:
                if entity in vocab[word].keys():
                    vocab[word][entity] += 1
                else:
                    vocab[word].update({entity:1})
print('The vocabulary was created successfully!!!')
vocab

The vocabulary was created successfully!!!


{'midst': {'midst': 1,
  'covid19': 1,
  'pandemic': 1,
  'eating': 1,
  'healthy': 1,
  'food': 1,
  'remains': 1,
  'important': 1,
  'part': 1,
  'maintaining': 1,
  'health': 1},
 'covid19': {'midst': 1,
  'covid19': 1,
  'pandemic': 1,
  'eating': 1,
  'healthy': 1,
  'food': 1,
  'remains': 1,
  'important': 1,
  'part': 1,
  'maintaining': 1,
  'health': 1},
 'pandemic': {'midst': 1,
  'covid19': 1,
  'pandemic': 1,
  'eating': 1,
  'healthy': 1,
  'food': 1,
  'remains': 1,
  'important': 1,
  'part': 1,
  'maintaining': 1,
  'health': 1},
 'eating': {'midst': 1,
  'covid19': 1,
  'pandemic': 1,
  'eating': 12,
  'healthy': 4,
  'food': 4,
  'remains': 1,
  'important': 1,
  'part': 1,
  'maintaining': 1,
  'health': 2,
  'diet': 2,
  'strict': 1,
  'limitations': 1,
  'staying': 1,
  'unrealistically': 1,
  'thin': 1,
  'depriving': 1,
  'foods': 5,
  'love': 2,
  'rather': 1,
  'it’s': 1,
  'feeling': 1,
  'great': 1,
  'energy': 1,
  'improving': 1,
  'boosting': 1,
  'mood'

In [13]:
#Calculating the prior probabilities of all classes
prior_prob = {word:0 for word in vocab.keys()}
total = 0
for word in vocab.keys():
    count = 0
    for bit in vocab[word].keys():
        count += vocab[word][bit]
    total += count
    prior_prob[word] = count
for word in vocab.keys():
    prior_prob[word] = prior_prob[word]/total
prior_prob = [key for key in prior_prob.values()]
print('The prior probabilities of all the labels have been calculated!!!')
prior_prob

The prior probabilities of all the labels have been calculated!!!


[0.0012086583891880013,
 0.0012086583891880013,
 0.0012086583891880013,
 0.020327436545434566,
 0.02065707065157675,
 0.011207559608834194,
 0.0012086583891880013,
 0.004395121415229096,
 0.0012086583891880013,
 0.0012086583891880013,
 0.00758158444127019,
 0.0014284144599494562,
 0.017470607625535657,
 0.01263597406878365,
 0.0028568289198989123,
 0.0014284144599494562,
 0.0014284144599494562,
 0.014503900670256016,
 0.002746950884518185,
 0.0014284144599494562,
 0.0014284144599494562,
 0.0014284144599494562,
 0.004065487309086914,
 0.0016481705307109108,
 0.0016481705307109108,
 0.0016481705307109108,
 0.005384023733655642,
 0.0016481705307109108,
 0.0016481705307109108,
 0.0016481705307109108,
 0.0016481705307109108,
 0.0058235358751785515,
 0.008350730688935281,
 0.004614877485990551,
 0.0016481705307109108,
 0.0016481705307109108,
 0.0016481705307109108,
 0.002197560707614548,
 0.002197560707614548,
 0.0037358532029447312,
 0.002197560707614548,
 0.002197560707614548,
 0.002197560

In [14]:
#Importing the test data set
test_set = pd.read_csv('10.csv')
test_text = test_set.iloc[:,0].values

#Cleaning the train set
test_text = [bit.translate(table) for bit in test_text]
test_text = [bit.lower() for bit in test_text]
test_text = [[word for word in bit.split() if word not in stop_words] for bit in test_text]


In [15]:
#Calculating the class conditional probabilities
labels = [element for element in vocab.keys()]
label_vocab = [dictionaries for dictionaries in vocab.values()]
ccp = []

for bit in test_text:
    ccp_sent = [1]*len(vocab.keys())
    #print(ccp_sent)
    temp = []
    for word in bit:
        prob = [0.001]*len(vocab.keys())
        for i in range(len(label_vocab)):
            if word in label_vocab[i].keys():
                prob[i] = label_vocab[i][word]/sum(label_vocab[i].values())
        temp.append(prob)
        #print(temp)
    for i in range(len(vocab.keys())):
        for j in range(len(temp)):
            ccp_sent[i] *= temp[j][i]
    ccp.append(ccp_sent)

#print(ccp)
post_prob = []
for term in ccp:
    entity = [term[i]*prior_prob[i] for i in range(len(labels))]
    post_prob.append(entity)

#print(post_prob)
last_word_pred = []
confidence = []
for term in post_prob:
    last_word_pred.append(labels[term.index(max(term))])
    confidence.append(max(term))
    
print(last_word_pred)
print(confidence)

['midst', 'diet', 'feeling', 'enough', 'meal', 'drink', 'disease', 'foods', 'front', 'food']
[8.255299427552772e-14, 8.324093589449046e-10, 1.5142403920397267e-20, 6.053886246872034e-13, 8.8315036153491e-10, 6.867377211295465e-22, 7.062260283827686e-17, 8.069658306616578e-26, 1.0987803538072745e-17, 7.961593757432509e-13]
