In [1]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import PorterStemmer
import re

In [2]:
data=pd.read_excel('root2ai.xlsx')

In [3]:
from sklearn.model_selection import train_test_split
train_data,test_data=train_test_split(data,test_size=0.2,random_state=0)

In [4]:
stopword=set(stopwords.words('english'))
porter=PorterStemmer()

In [5]:
#Helper Functions
# Used Porter Stemmer
def clean(sent):
    retr=[]
    for word in word_tokenize(sent):
        if word not in stopword and word.isalpha():
            retr.append(porter.stem(word))
#            retr.append(word)
    return retr
def process(data,task='train'):
    retr=dict()
    allsent=[]
    for i in range(len(data)):
        category,sentence=data.iloc[i,0],data.iloc[i,1]
        if category not in retr:
            retr[category]=[]
        if(not isinstance(sentence,bool)):
            cleaned=clean(re.sub('[^\w\s]',' ',sentence.lower()))
            retr[category].append(cleaned)
            allsent.append(cleaned)
    return retr,allsent
def vectorize(sent,vocab):
    vector=np.zeros(len(vocab))
    for word in sent:
        vector[vocab[word]]+=1
    return vector
def last(data):
    y_test=[]
    allsent=[]
    for i in range(len(data)):
        category,sentence=data.iloc[i,0],data.iloc[i,1]
        if(not isinstance(sentence,bool)):
            y_test.append(category)
            cleaned=clean(re.sub('[^\w\s]',' ',sentence.lower()))
            allsent.append(cleaned)
    return y_test,allsent

## Create Vocabulary

In [6]:
train,train_sent=process(train_data,'train')
vocab={}
n=0
for l in train_sent:
    for word in l:
        if word not in vocab:
            vocab[word]=n
            n+=1
print('Vocabulary of words in Train data \n',vocab)

Vocabulary of words in Train data 
 {'technolog': 0, 'need': 1, 'transact': 2, 'intermediari': 3, 'clearinghous': 4, 'financi': 5, 'establish': 6, 'thu': 7, 'quick': 8, 'secur': 9, 'inexpens': 10, 'phase': 11, 'opportun': 12, 'creat': 13, 'term': 14, 'cash': 15, 'flow': 16, 'asset': 17, 'base': 18, 'trade': 19, 'exit': 20, 'relev': 21, 'import': 22, 'digit': 23, 'receipt': 24, 'good': 25, 'payment': 26, 'bank': 27, 'export': 28, 'smart': 29, 'contract': 30, 'futur': 31, 'friction': 32, 'walker': 33, 'chief': 34, 'market': 35, 'offic': 36, 'space': 37, 'famous': 38, 'accur': 39, 'like': 40, 'video': 41, 'tablet': 42, 'changa': 43, 'member': 44, 'acceler': 45, 'definit': 46, 'data': 47, 'continu': 48, 'evolv': 49, 'accord': 50, 'kryder': 51, 'summar': 52, 'year': 53, 'gener': 54, 'busi': 55, 'main': 56, 'use': 57, 'studi': 58, 'classif': 59, 'countri': 60, 'last': 61, 'count': 62, 'list': 63, 'wherea': 64, 'hand': 65, 'rival': 66, 'natur': 67, 'budget': 68, 'anoth': 69, 'massiv': 70, 'tr

# Create Prior Distribution of Each label 

In [7]:
pditri={cat:len(train[cat])/len(train_sent) for cat in train}
print('Prior Distribution of Labels:\n',pditri)

Prior Distribution of Labels:
 {'Blockchain': 0.05996696035242291, 'FinTech': 0.3765969162995595, 'Stock Trading': 0.033700440528634364, 'Bigdata': 0.1, 'Neobanks': 0.04774229074889868, 'Cyber Security': 0.11563876651982379, 'credit reporting': 0.07797356828193833, 'Reg Tech': 0.09713656387665198, 'Robo Advising': 0.03303964757709251, 'Microservices': 0.0433920704845815, 'Data Security': 0.014812775330396476}


# Create Conditional Distribution of each word

In [8]:
n_train=dict()
for cat in train:
    n_train[cat]=np.zeros(n)
    for sent in train[cat]:
        n_train[cat]+=vectorize(sent,vocab)
table=pd.DataFrame(n_train,index=[i for i,j in sorted(vocab.items(),key=lambda pair:pair[1])])
table=table.replace(0,0.1)
table=table/table.sum()
print('Conditional Distribution for each word\n',table)

Conditional Distribution for each word
                Blockchain   FinTech  Stock Trading   Bigdata  Neobanks  \
technolog        0.011018  0.007653       0.004132  0.006746  0.004523   
need             0.005984  0.005667       0.001127  0.003644  0.005088   
transact         0.004844  0.002087       0.000563  0.001241  0.002261   
intermediari     0.000475  0.000085       0.000188  0.000008  0.000141   
clearinghous     0.000095  0.000002       0.000019  0.000008  0.000014   
financi          0.003704  0.019123       0.004507  0.007211  0.007350   
establish        0.000950  0.001493       0.000376  0.000233  0.001979   
thu              0.001900  0.000950       0.000939  0.000310  0.000014   
quick            0.000095  0.000238       0.000188  0.000155  0.000141   
secur            0.003229  0.002223       0.000376  0.002714  0.003251   
inexpens         0.000190  0.000017       0.000019  0.000008  0.000014   
phase            0.000665  0.000441       0.000188  0.000008  0.000014  

# Apply Naive-Bayes Model on Test set

In [9]:
pd.set_option('display.float_format', lambda x: '%.9f' % x)
y_test,test=last(test_data)
distri=[]
for sent in test:
    temp=dict(pditri)
    for word in sent:
        if word not in vocab:
            continue
        for clas in temp:
            temp[clas]*=table.loc[word,clas]
    k=sum(temp.values())
    for clas in temp:
        temp[clas]/=k
    distri.append(temp)
final=pd.DataFrame(distri)
print('Index Represent i th sentence in test set')
print('Posterior Distribution of each sentence over different labels')
print(final)

  del sys.path[0]


Index Represent i th sentence in test set
Posterior Distribution of each sentence over different labels
         Bigdata  Blockchain  Cyber Security  Data Security     FinTech  \
0    0.000763894 0.000144147     0.000586949    0.000000683 0.222831941   
1    0.000000003 0.003340585     0.992539680    0.000000261 0.000456309   
2    0.000000000 0.000000000     0.000000000    0.000006582 0.000000000   
3    0.914234274 0.008394707     0.009342349    0.007205938 0.009417752   
4    0.001225683 0.007054471     0.001539021    0.000000019 0.936244720   
5    0.000000212 0.000000129     0.055402422    0.000000008 0.025862535   
6    0.000000000 0.000000000     0.000036349    0.000000000 0.000000051   
7    0.001090294 0.142687791     0.001289923    0.000076486 0.680709823   
8    0.003436377 0.014317383     0.812382808    0.001648137 0.030162212   
9    0.000000077 0.000056421     0.000001375    0.000000000 0.994133750   
10   0.100000000 0.059966960     0.115638767    0.014812775 0.376596916

In [10]:
y_pred=[]
for i in range(0,len(final)):
    mx=final.iloc[i,:]
    val=0
    sen=""
    for j in range(0,11):
        if(mx[j]>val):
            val=mx[j]
            sen=final.columns[j]
    y_pred.append(sen)

In [11]:
from sklearn.metrics import accuracy_score
accuracy=accuracy_score(y_test,y_pred)

In [12]:
accuracy

0.6513215859030838