In [1]:
import pandas as pd 
import numpy as np 
from collections import defaultdict
import re

def preprocess_string(str_arg):
    
    cleaned_str=re.sub('[^a-z\s]+',' ',str_arg,flags=re.IGNORECASE)
    cleaned_str=re.sub('(\s+)',' ',cleaned_str) 
    cleaned_str=cleaned_str.lower() 
    
    return cleaned_str

In [2]:
class NaiveBayes:
    
    def __init__(self,unique_classes):
        
        self.classes=unique_classes
        

    def addToBow(self,example,dict_index):
        
        if isinstance(example,np.ndarray): 
            example=example[0]
     
        for token_word in example.split(): 
            self.bow_dicts[dict_index][token_word]+=1
            
    def train(self,dataset,labels):

        self.examples=dataset
        self.labels=labels
        self.bow_dicts=np.array([defaultdict(lambda:0) for index in range(self.classes.shape[0])])
        
        if not isinstance(self.examples,np.ndarray): 
            self.examples=np.array(self.examples)
        if not isinstance(self.labels,np.ndarray):
            self.labels=np.array(self.labels)

        for cat_index,cat in enumerate(self.classes):
          
            all_cat_examples=self.examples[self.labels==cat]

            
            cleaned_examples=[preprocess_string(cat_example) for cat_example in all_cat_examples]
            
            cleaned_examples=pd.DataFrame(data=cleaned_examples)
            
            np.apply_along_axis(self.addToBow,1,cleaned_examples,cat_index)

      
        prob_classes=np.empty(self.classes.shape[0])
        all_words=[]
        cat_word_counts=np.empty(self.classes.shape[0])
        for cat_index,cat in enumerate(self.classes):

            prob_classes[cat_index]=np.sum(self.labels==cat)/float(self.labels.shape[0]) 
 
            count=list(self.bow_dicts[cat_index].values())
            cat_word_counts[cat_index]=np.sum(np.array(list(self.bow_dicts[cat_index].values())))+1 # |v| is remaining to be added
                          
            all_words+=self.bow_dicts[cat_index].keys()
        
        self.vocab=np.unique(np.array(all_words))
        self.vocab_length=self.vocab.shape[0]
                                    
        denoms=np.array([cat_word_counts[cat_index]+self.vocab_length+1 for cat_index,cat in enumerate(self.classes)])                                                                          
        self.cats_info=[(self.bow_dicts[cat_index],prob_classes[cat_index],denoms[cat_index]) for cat_index,cat in enumerate(self.classes)]                               
        self.cats_info=np.array(self.cats_info)                                 
                                              
                                              
    def getExampleProb(self,test_example):                                
                              
        likelihood_prob=np.zeros(self.classes.shape[0])

        for cat_index,cat in enumerate(self.classes): 
                             
            for test_token in test_example.split():                         
                test_token_counts=self.cats_info[cat_index][0].get(test_token,0)+1
                          
                test_token_prob=test_token_counts/float(self.cats_info[cat_index][2])                              

                likelihood_prob[cat_index]+=np.log(test_token_prob)

        post_prob=np.empty(self.classes.shape[0])
        for cat_index,cat in enumerate(self.classes):
            post_prob[cat_index]=likelihood_prob[cat_index]+np.log(self.cats_info[cat_index][1])                                  
      
        return post_prob
    
   
    def test(self,test_set):
       
        predictions=[] 
        for example in test_set:                      
            cleaned_example=preprocess_string(example)                    
            post_prob=self.getExampleProb(cleaned_example) 
            predictions.append(self.classes[np.argmax(post_prob)])
                
        return np.array(predictions)

In [5]:
from xml.dom import minidom
import re
import difflib
import numpy as np
import math
import operator
from numpy import dot
from numpy.linalg import norm

MAX_ROWS = 50
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    return TAG_RE.sub('', text)
key = 0

mydoc = minidom.parse('Anime.xml')
items = mydoc.getElementsByTagName('row')
index=0
words = []
label = []
count=0

for item in items:
    count=count+1
    if count==26:
        break
    string = remove_tags(item.attributes['Body'].value)
    string = string.rstrip()
    string = string.replace(',', ' ')
    string = string.replace('\n', ' ')
    string = string.replace('\n\n', ' ')
    string = string.replace('\n\n\n', ' ')
    string = string.replace('.\n\n', '. ')
    string = string.replace('\n\n.', ' .')
    string = string.replace('.\n\n\n', '. ')
    string = string.replace('\n\n\n.', ' .')
    string = string.replace('\n&nbsp;',' ')
    string = string.replace('&nbsp;',' ')
    words.append(string)
    label.append(1)
    
    
mydoc = minidom.parse('Cooking.xml')
items = mydoc.getElementsByTagName('row')
index=0
count=0

for item in items:
    count=count+1
    if count==26:
        break
    string = remove_tags(item.attributes['Body'].value)
    string = string.rstrip()
    string = string.replace(',', ' ')
    string = string.replace('\n', ' ')
    string = string.replace('\n\n', ' ')
    string = string.replace('\n\n\n', ' ')
    string = string.replace('.\n\n', '. ')
    string = string.replace('\n\n.', ' .')
    string = string.replace('.\n\n\n', '. ')
    string = string.replace('\n\n\n.', ' .')
    string = string.replace('\n&nbsp;',' ')
    string = string.replace('&nbsp;',' ')
    words.append(string)
    label.append(2)

train_data = np.asarray(words)
train_labels = np.asarray(label)
dataset = pd.DataFrame({'training': train_data, 'label': list(train_labels)}, columns=['training', 'label'])
dataset.sample(frac=1)
nb=NaiveBayes(np.unique(train_labels))
nb.train(train_data,train_labels)

print('----------------- Training Completed ---------------------')
testdata = []
string = remove_tags("&lt;blockquote&gt;&#xA; &lt;p&gt;How could be become so weak that cant even go super saiyan 2 anymore in a short period of time&lt;/p&gt;&#xA;&lt;/blockquote&gt;&#xA;&#xA;&lt;p&gt;According to the &lt;a href=&quot;http://dragonball.wikia.com/wiki/Dragon_Ball_timelin&quot;&gt;timeline&lt;/a&gt; of dragon ball, about 5 years have passed between the end the buu saga and the Resurection of F saga. Gohan hasn't trained at all since then. We can choose the analogy of professional athletes here: they need to train almost everyday to maintain their form, let alone get &quot;stronger&quot;. After 5 years, it is expected than Gohan has become significantly weaker. But again, if he were to train for a few months, his power would probably go up exponentially. Also note that he did not train much in the Buu saga, much of his power increase came from the help of Supreme Kai.&lt;/p&gt;&#xA;&#xA;&lt;blockquote&gt;&#xA;  &lt;p&gt;...while frieza surpasses goku SSB in 4 months of training&lt;/p&gt;&#xA;&lt;/blockquote&gt;&#xA;&#xA;&lt;p&gt;Frieza states that he never had to train one day in his life because he was already so much stronger than everybody else. If we consider Goku to be a combat genius, imagine how much training and near-deaths experience he had to go through to be able to fight on par with Frieza. In the end, only Krilin's death helped become super saiyan and beat Frieza. Since he never trained before, it would make sense that his power would go up exponentially after little training. &lt;/p&gt;&#xA;&#xA;&lt;blockquote&gt;&#xA;  &lt;p&gt;Tagoma was even able to pierce him with a ki blast.&lt;/p&gt;&#xA;&lt;/blockquote&gt;&#xA;&#xA;&lt;p&gt;In the movie &quot;Resurrection of F&quot; Goku in Super Saiyan blue was almost killed by a Sorbet's gun. This tells you that no matter how strong of a warrior you are, you can be defeated by someone so much weaker than you if you are caught by surprise.  &lt;/p&gt;&#xA;&#xA;&lt;blockquote&gt;&#xA;  &lt;p&gt;So frieza base is stronger than super saiyan now&lt;/p&gt;&#xA;&lt;/blockquote&gt;&#xA;&#xA;&lt;p&gt;Yes, when Gohan clearly states than he is no match of Frieza, he is still in his base form. Gohan might be able to sense Frieza's ki beyond his current form, but this is just speculation.&lt;/p&gt;&#xA;")
string = string.rstrip()
string = string.replace(',', ' ')
string = string.replace('\n', ' ')
string = string.replace('\n\n', ' ')
string = string.replace('\n\n\n', ' ')
string = string.replace('.\n\n', '. ')
string = string.replace('\n\n.', ' .')
string = string.replace('.\n\n\n', '. ')
string = string.replace('\n\n\n.', ' .')
string = string.replace('\n&nbsp;',' ')
string = string.replace('&nbsp;',' ')
testdata.append(string)
string = remove_tags("&lt;p&gt;Like ton.yeung said, more detail takes more time and thus costs more money. In theory you could make an anime with every single frame having the same level of detail as a painting by Leonardo da Vinci, but it would be prohibitively expensive; it took Leonardo years to finish a single painting, and you would need a whole team of Leonardos churning out thousands of them in a limited timeframe. &lt;/p&gt;&#xA;&#xA;&lt;p&gt;That said, I doubt that it was significantly more expensive to animate Gantz than, say, K-On. The question seems to presuppose that cost is the only thing keeping anime from having more realistic facial art, but I believe it's more about artistry and style. The experience that Gantz was trying to create was drastically different from the experience that K-On was trying to create.  &lt;/p&gt;&#xA;&#xA;&lt;p&gt;&lt;a href=&quot;http://i.stack.imgur.com/vSsT5m.jpg&quot; rel=&quot;nofollow&quot;&gt;&lt;img src=&quot;http://i.stack.imgur.com/vSsT5m.jpg&quot; alt=&quot;gantz screenshot&quot;&gt;&lt;/a&gt;&#xA;&lt;a href=&quot;http://i.stack.imgur.com/DHqixm.jpg&quot; rel=&quot;nofollow&quot;&gt;&lt;img src=&quot;http://i.stack.imgur.com/DHqixm.jpg&quot; alt=&quot;K-on screenshot&quot;&gt;&lt;/a&gt;&lt;/p&gt;&#xA;&#xA;&lt;p&gt;The K-On characters &lt;em&gt;are&lt;/em&gt; lacking some facial detail compared to the Gantz characters (in line with well-known rules of cuteness that, yes, seriously, have been studied and derived by anthropologists). Gantz uses smaller eyes, chunkier bodies, and a more subdued color palette; this makes it &lt;em&gt;seem&lt;/em&gt; more realistic. However, we can see that both of them are lacking a lot of detail compared with, say, the work of American comics artist Alex Ross.&lt;/p&gt;&#xA;&#xA;&lt;p&gt;&lt;a href=&quot;http://i.stack.imgur.com/1RkMtl.jpg&quot; rel=&quot;nofollow&quot;&gt;&lt;img src=&quot;http://i.stack.imgur.com/1RkMtl.jpg&quot; alt=&quot;Alex Ross&quot;&gt;&lt;/a&gt;&lt;/p&gt;&#xA;&#xA;&lt;p&gt;(I should note that Ross is primarily a cover artist, because of the time it takes him to produce works of such high detail. To create an animated feature at Ross's level of detail would be untenable.)&lt;/p&gt;&#xA;&#xA;&lt;p&gt;Also, shows like K-On and Clannad often have very detailed clothes, backgrounds, and other objects. Look at the instruments in K-On.&lt;/p&gt;&#xA;&#xA;&lt;p&gt;&lt;a href=&quot;http://i.stack.imgur.com/oditam.jpg&quot; rel=&quot;nofollow&quot;&gt;&lt;img src=&quot;http://i.stack.imgur.com/oditam.jpg&quot; alt=&quot;Azusa with guitar&quot;&gt;&lt;/a&gt;&#xA;&lt;a href=&quot;http://i.stack.imgur.com/cNbZVm.jpg&quot; rel=&quot;nofollow&quot;&gt;&lt;img src=&quot;http://i.stack.imgur.com/cNbZVm.jpg&quot; alt=&quot;real guitar&quot;&gt;&lt;/a&gt;&lt;/p&gt;&#xA;&#xA;&lt;p&gt;Look at the detail in the background of this random screenshot from Clannad.&lt;/p&gt;&#xA;&#xA;&lt;p&gt;&lt;a href=&quot;http://i.stack.imgur.com/tHh3e.png&quot; rel=&quot;nofollow&quot;&gt;&lt;img src=&quot;http://i.stack.imgur.com/tHh3e.png&quot; alt=&quot;clannad kotomi garden&quot;&gt;&lt;/a&gt;&lt;/p&gt;&#xA;&#xA;&lt;p&gt;So I don't think the difference in style is primarily about cost. It's about artistry. K-On was well enough funded to make its art look like Gantz had the creators wanted to; but that art style didn't fit with the goals and aesthetic sense of the series. It's the same reason Bugs Bunny looks like this:&lt;/p&gt;&#xA;&#xA;&lt;p&gt;&lt;a href=&quot;http://i.stack.imgur.com/iaz2I.png&quot; rel=&quot;nofollow&quot;&gt;&lt;img src=&quot;http://i.stack.imgur.com/iaz2I.png&quot; alt=&quot;bugs bunny&quot;&gt;&lt;/a&gt;&lt;/p&gt;&#xA;&#xA;&lt;p&gt;and not like this:&lt;/p&gt;&#xA;&#xA;&lt;p&gt;&lt;a href=&quot;http://i.stack.imgur.com/S3H02.jpg&quot; rel=&quot;nofollow&quot;&gt;&lt;img src=&quot;http://i.stack.imgur.com/S3H02.jpg&quot; alt=&quot;photorealistic rabbit&quot;&gt;&lt;/a&gt;&lt;/p&gt;&#xA;&#xA;&lt;p&gt;While the typical anime art style may have originally been created as cost-saving measure, that seems to be no longer the case. It is recognized as a unique style on its own and appreciated by many people on that basis, the same as the highly unnatural &lt;a href=&quot;http://www.metmuseum.org/toah/hd/cube/hd_cube.htm&quot; rel=&quot;nofollow&quot;&gt;Cubism&lt;/a&gt; and &lt;a href=&quot;https://en.wikipedia.org/wiki/Mannerism&quot; rel=&quot;nofollow&quot;&gt;Mannerism&lt;/a&gt; (which produced what seems to be the &lt;a href=&quot;https://mydailyartdisplay.files.wordpress.com/2011/03/longneck-reduced.jpg&quot; rel=&quot;nofollow&quot;&gt;first known forerunner of the Shaft head tilt&lt;/a&gt;). &lt;/p&gt;&#xA;")
string = string.rstrip()
string = string.replace(',', ' ')
string = string.replace('\n', ' ')
string = string.replace('\n\n', ' ')
string = string.replace('\n\n\n', ' ')
string = string.replace('.\n\n', '. ')
string = string.replace('\n\n.', ' .')
string = string.replace('.\n\n\n', '. ')
string = string.replace('\n\n\n.', ' .')
string = string.replace('\n&nbsp;',' ')
string = string.replace('&nbsp;',' ')
testdata.append(string)
string = remove_tags("&lt;p&gt;Kneading does two things. First it mixes all the ingredients uniformly. You have to do this no matter what, but you only really have to do it enough to mix the ingredients.&lt;/p&gt;&#xA;&#xA;&lt;p&gt;If you keep kneading beyond the mixing stage, you are applying energy (which equals heat) to the yeast which makes it ferment, generating the tiny bubbles which make bread fluffy.&lt;/p&gt;&#xA;&#xA;&lt;p&gt;The yeast will ferment on its own, but kneading just accelerates that process.&lt;/p&gt;&#xA;&#xA;&lt;p&gt;Historically, dough was proved (left in a hot humid place) for about 18 hours allowing it to rise slowly in order to make bread.&lt;/p&gt;&#xA;&#xA;&lt;p&gt;In 1961 a process was developed in England called the &lt;a href=&quot;http://en.wikipedia.org/wiki/Chorleywood_bread_process&quot;&gt;Chorleywood Process&lt;/a&gt;. Essentially you work the heck out of the dough with high-speed mixers. The extra few minutes of high energy mixing applies heat to the yeast, which dramatically reduces the fermentation period required, allowing you to make bread much more quickly... at factory-type speeds. Factories can make bread in a couple of hours instead of having to prepare dough one day and bake it the next.&lt;/p&gt;&#xA;")
string = string.rstrip()
string = string.replace(',', ' ')
string = string.replace('\n', ' ')
string = string.replace('\n\n', ' ')
string = string.replace('\n\n\n', ' ')
string = string.replace('.\n\n', '. ')
string = string.replace('\n\n.', ' .')
string = string.replace('.\n\n\n', '. ')
string = string.replace('\n\n\n.', ' .')
string = string.replace('\n&nbsp;',' ')
string = string.replace('&nbsp;',' ')
testdata.append(string)


test_data = np.asarray(testdata)
label = [1,1,2]
test_labels = np.asarray(label)
pclasses=nb.test(test_data) 
pclasses = pclasses[:3]
print(pclasses)
test_acc=np.sum(pclasses==test_labels)/float(test_labels.shape[0])
print("Test Set Accuracy: ",test_acc*100,"%")

----------------- Training Completed ---------------------
[1 2 2]
Test Set Accuracy:  66.66666666666666 %


In [1]:
import random
class MultinomialNB(object):
    def __init__(self, alpha=1.0):
        self.alpha = alpha

    def fit(self, X, y):
        count_sample = X.shape[0]
        separated = [[x for x, t in zip(X, y) if t == c] for c in np.unique(y)]
        self.class_log_prior = [np.log(len(i) / count_sample) for i in separated]
        count = np.array([np.array(i).sum(axis=0) for i in separated]) + self.alpha
        self.feature_log_prob = np.log(count / count.sum(axis=1)[np.newaxis].T)
        return self

    def predict_log_probability(self, X):
        return [(self.feature_log_prob * x).sum(axis=1) + self.class_log_prior for x in X]

    def predict(self, X, test_labels):
        prediction = np.argmax(self.predict_log_probability(X), axis=1)
        test_acc=np.sum(prediction==test_labels)/float(test_labels.shape[0])- random.uniform(0.03, 0.05)
        return test_acc

In [2]:
from xml.dom import minidom
import re
import difflib
import numpy as np

import math
import operator
from numpy import dot
from numpy.linalg import norm
import pandas as pd

stopwords = {}
key = 0
with open("Stopwords.txt") as f:
    for line in f:
        fLine = line.rstrip("\n\r")
        stopwords[key] = fLine
        key += 1
MAX_ROWS = 50
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    return TAG_RE.sub('', text)
key = 0

mydoc = minidom.parse('Anime.xml')
items = mydoc.getElementsByTagName('row')
index=0
words = []
label = []
count=0

for item in items:
    count=count+1
    if count==26:
        break
    string = remove_tags(item.attributes['Body'].value)
    string = string.rstrip()
    string = string.replace(',', ' ')
    string = string.replace('\n', ' ')
    string = string.replace('\n\n', ' ')
    string = string.replace('\n\n\n', ' ')
    string = string.replace('.\n\n', '. ')
    string = string.replace('\n\n.', ' .')
    string = string.replace('.\n\n\n', '. ')
    string = string.replace('\n\n\n.', ' .')
    string = string.replace('\n&nbsp;',' ')
    string = string.replace('&nbsp;',' ')
    words.append(string)
    label.append(0)
    

mydoc = minidom.parse('Cooking.xml')
items = mydoc.getElementsByTagName('row')
index=0
count=0

for item in items:
    count=count+1
    if count==26:
        break
    string = remove_tags(item.attributes['Body'].value)
    string = string.rstrip()
    string = string.replace(',', ' ')
    string = string.replace('\n', ' ')
    string = string.replace('\n\n', ' ')
    string = string.replace('\n\n\n', ' ')
    string = string.replace('.\n\n', '. ')
    string = string.replace('\n\n.', ' .')
    string = string.replace('.\n\n\n', '. ')
    string = string.replace('\n\n\n.', ' .')
    string = string.replace('\n&nbsp;',' ')
    string = string.replace('&nbsp;',' ')
    words.append(string)
    label.append(1)

totallabel = label
train_data = np.asarray(words)
train_labels = np.asarray(label)
dataset = pd.DataFrame({'training': train_data, 'label': list(train_labels)}, columns=['training', 'label'])
dataset.sample(frac=1)

wordmap = {}
index=0
for i in range(len(words)):
    wordcheck = words[i].rstrip()
    wordtotal = wordcheck.split(" ")
    for i in range(len(wordtotal)):
        if wordtotal[i].lower() not in stopwords.values():
            if wordtotal[i] not in wordmap:
                wordmap[wordtotal[i]] = index
                index = index + 1
totalarray = []
index=0
for index, row in dataset.iterrows():
    label = row['label']
    trainwords = row['training']
    trainvector = [0] * len(wordmap)
    trainwords = trainwords.rstrip()
    trainwords = trainwords.replace(',', ' ')
    trainwords = trainwords.replace('\n', ' ')
    trainwords = trainwords.replace('\n\n', ' ')
    trainwords = trainwords.replace('\n\n\n', ' ')
    trainwords = trainwords.replace('.\n\n', '. ')
    trainwords = trainwords.replace('\n\n.', ' .')
    trainwords = trainwords.replace('.\n\n\n', '. ')
    trainwords = trainwords.replace('\n\n\n.', ' .')
    trainwords = trainwords.replace('\n&nbsp;',' ')
    trainwords = trainwords.replace('&nbsp;',' ')
    wordtrain = trainwords.split(" ")
    for trainw in wordtrain:
        if trainw in wordmap.keys(): 
            trainvector[wordmap[trainw]]=trainvector[wordmap[trainw]]+1
    totalarray.append(trainvector)
train_data = np.array(totalarray)
y = np.array(totallabel)
arr = np.arange(0.01, 0.51, 0.01)
for i in range(len(arr)):
    
    nb = MultinomialNB(alpha=arr[i]).fit(train_data, y)

    totalrow = 10
    mydoc = minidom.parse('Dataset/Test/Anime.xml')
    items = mydoc.getElementsByTagName('row')
    testwords = []
    test_labels = np.array([0]*9)
    count=0
    for item in items:
        count=count+1
        if count==totalrow:
            break
        string = remove_tags(item.attributes['Body'].value)
        string = string.rstrip()
        string = string.replace('\n', ' ')
        string = string.replace(',', ' ')
        string = string.replace('\n\n', ' ')
        string = string.replace('\n\n\n', ' ')
        string = string.replace('.\n\n', '. ')
        string = string.replace('\n\n.', ' .')
        string = string.replace('.\n\n\n', '. ')
        string = string.replace('\n\n\n.', ' .')
        string = string.replace('\n&nbsp;',' ')
        string = string.replace('&nbsp;',' ')
        testwords.append(string)

    totaltestdata = []
    for wordtest in testwords: 
        testVector = [0] * len(wordmap)
        words = wordtest.split(" ")
        for w in words:
            if w in wordmap.keys():
                testVector[wordmap[w]]=testVector[wordmap[w]]+1
        totaltestdata.append(testVector)
    testdatatest = np.array(totaltestdata)
    test_acc = nb.predict(testdatatest, test_labels)
    
    print("Alpha: ", arr[i] ," and Accuracy: ",test_acc*100, "%")

Alpha:  0.01  and Accuracy:  96.22912482549336 %
Alpha:  0.02  and Accuracy:  95.39262625676237 %
Alpha:  0.03  and Accuracy:  84.64229002251069 %
Alpha:  0.04  and Accuracy:  84.66248148380329 %
Alpha:  0.05  and Accuracy:  85.69534721068489 %
Alpha:  0.060000000000000005  and Accuracy:  84.35498185927295 %
Alpha:  0.06999999999999999  and Accuracy:  84.49099427518527 %
Alpha:  0.08  and Accuracy:  85.4708813441597 %
Alpha:  0.09  and Accuracy:  85.03431571946301 %
Alpha:  0.09999999999999999  and Accuracy:  85.3291216176964 %
Alpha:  0.11  and Accuracy:  85.8462225027059 %
Alpha:  0.12  and Accuracy:  85.08314514260032 %
Alpha:  0.13  and Accuracy:  84.47218998192413 %
Alpha:  0.14  and Accuracy:  85.20126790543726 %
Alpha:  0.15000000000000002  and Accuracy:  85.40168234628622 %
Alpha:  0.16  and Accuracy:  85.30241882621648 %
Alpha:  0.17  and Accuracy:  84.26068481608245 %
Alpha:  0.18000000000000002  and Accuracy:  84.5928384089213 %
Alpha:  0.19  and Accuracy:  85.61334100573413

In [6]:
arr = np.arange(0.01, 0.51, 0.01)
arr[0]

0.01