[View in Colaboratory](https://colab.research.google.com/github/Carlitoshsh/-ML-Lab2/blob/master/Lab_2_Item_3.ipynb)

#Lab2
##Item 3



In [0]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [30]:
df = pd.read_table('/content/gdrive/My Drive/Colab Notebooks/TableLab2.txt',  
                   sep=':', 
                   header=None,
                   names=['c = China?','words in document'])
# label spam as 1, not spam as 0
df['c = China?'] = df['c = China?'].replace(["no","yes"],[0,1])
data = df.values

print(data)

[[1 'Chinese Beijing Chinese']
 [1 'Chinese Chinese Shanghai']
 [1 'Chinese Macao']
 [0 'Tokyo Japan Chinese']
 [1 'Taipei Taiwan']
 [1 'Macao Taiwan Shanghai']
 [0 'Japan Sapporo']
 [0 'Sapporo Osaka Taiwan']]


In [0]:
class ngrams_bayes():
    
    def __init__(self, data, n=2, split=0.75):
        
        # split into training and testing data
        self.train_data, self.test_data = train_test_split(data,
                                                          train_size=split)
        # convert into n grams
        self.train_data = [[item[0], self.ngrams(n, item[1])] for item in self.train_data]
        self.test_data = [[item[0], self.ngrams(n, item[1])] for item in self.test_data]
        
        # count unique n grams in training data
        flattened = [gram for message in self.train_data for gram in message[1]]
        self.unique = len(set(flattened))
        
        # init dicts
        self.trainPositive = {}
        self.trainNegative = {}
        # counters
        self.posGramCount = 0
        self.negGramCount = 0
        self.spamCount = 0
        # priors
        self.pA = 0
        self.pNotA = 0
        
    def ngrams(self, n, text):
        text = text.split(' ')
        grams = []
        for i in range(len(text)-n+1):
            gram = ' '.join(text[i:i+n])
            grams.append(gram)
        return grams 
    
    def train(self):
        
        for item in self.train_data:
            label = item[0]
            grams = item[1]
            if label == 1:
                self.spamCount += 1   
            for gram in grams:
                if label == 1:
                    self.trainPositive[gram] = self.trainPositive.get(gram, 0) + 1
                    self.posGramCount += 1
                else:
                    self.trainNegative[gram] = self.trainNegative.get(gram, 0) + 1
                    self.negGramCount += 1
                    
        self.pA = self.spamCount/float(len(self.train_data))
        self.pNotA = 1.0 - self.pA
        
    def classify(self, text, alpha=1.0):
        
        self.alpha = alpha
        isSpam = self.pA * self.conditionalText(text, 1)
        notSpam = self.pNotA * self.conditionalText(text, 0)
        if (isSpam > notSpam):
            return 1
        else:
            return 0
        
    def conditionalText(self, grams, label):
        result = 1.0
        for ngram in grams:
            result *= self.conditionalNgram(ngram, label)
        return result
    
    def conditionalNgram(self, ngram, label):
        alpha = self.alpha
        if label == 1:
            return ((self.trainPositive.get(ngram,0)+alpha) /
                    float(self.posGramCount+alpha*self.unique))
        else:
            return ((self.trainNegative.get(ngram,0)+alpha) /
                    float(self.negGramCount+alpha*self.unique))
            
    def evaluate_test_data(self):
        results = []
        for test in self.test_data:
            label = test[0]
            text = test[1]
            ruling = self.classify(text)
            if ruling == label:
                results.append(1) 
            else:
                results.append(0) 
                
        print("Evaluated {} test cases. {:.2f}% Accuracy".format(len(results), 100.0*sum(results)/float(len(results))))
        return sum(results)/float(len(results))

In [32]:
unigram_bayes = ngrams_bayes(data,1)



In [0]:
unigram_bayes.train()

In [34]:
unigram_bayes.evaluate_test_data()

Evaluated 2 test cases. 50.00% Accuracy


0.5

In [49]:
results = []

for _ in range(10):
    unigram = ngrams_bayes(data, 1, 0.9)
    unigram.train()
    results.append(unigram.evaluate_test_data())
    
print("Average Accuracy: {:.2f}".format(sum(results)/float(len(results))))

Evaluated 1 test cases. 100.00% Accuracy
Evaluated 1 test cases. 0.00% Accuracy
Evaluated 1 test cases. 100.00% Accuracy
Evaluated 1 test cases. 100.00% Accuracy
Evaluated 1 test cases. 0.00% Accuracy
Evaluated 1 test cases. 100.00% Accuracy
Evaluated 1 test cases. 100.00% Accuracy
Evaluated 1 test cases. 0.00% Accuracy
Evaluated 1 test cases. 0.00% Accuracy
Evaluated 1 test cases. 100.00% Accuracy
Average Accuracy: 0.60




In [35]:
bigram_sms= ngrams_bayes(data,2) 
bigram_sms.train()
bigram_sms.evaluate_test_data()


Evaluated 2 test cases. 50.00% Accuracy




0.5

In [51]:
results = []
for _ in range(10):
    bigram_net = ngrams_bayes(data, 2, 0.9)
    bigram_net.train()
    results.append(bigram_net.evaluate_test_data())
    
print("Average Accuracy: {:.2f}".format(sum(results)/float(len(results))))

Evaluated 1 test cases. 100.00% Accuracy
Evaluated 1 test cases. 0.00% Accuracy
Evaluated 1 test cases. 0.00% Accuracy
Evaluated 1 test cases. 0.00% Accuracy
Evaluated 1 test cases. 100.00% Accuracy
Evaluated 1 test cases. 0.00% Accuracy
Evaluated 1 test cases. 100.00% Accuracy
Evaluated 1 test cases. 100.00% Accuracy
Evaluated 1 test cases. 0.00% Accuracy
Evaluated 1 test cases. 0.00% Accuracy
Average Accuracy: 0.40




In [36]:
trigram_sms = ngrams_bayes(data,3) 
trigram_sms.train()
trigram_sms.evaluate_test_data()

Evaluated 2 test cases. 50.00% Accuracy




0.5

In [53]:
results = []
for _ in range(10):
    trigram_net = ngrams_bayes(data, 3, 0.9)
    trigram_net.train()
    results.append(trigram_net.evaluate_test_data())
    
print("Average Accuracy: {:.2f}".format(sum(results)/float(len(results))))


Evaluated 1 test cases. 0.00% Accuracy
Evaluated 1 test cases. 0.00% Accuracy
Evaluated 1 test cases. 100.00% Accuracy
Evaluated 1 test cases. 0.00% Accuracy
Evaluated 1 test cases. 100.00% Accuracy
Evaluated 1 test cases. 0.00% Accuracy
Evaluated 1 test cases. 100.00% Accuracy
Evaluated 1 test cases. 0.00% Accuracy
Evaluated 1 test cases. 0.00% Accuracy
Evaluated 1 test cases. 0.00% Accuracy
Average Accuracy: 0.30




#2pac and Biggie

In [0]:
url_data_biggie = "https://raw.githubusercontent.com/NoahLidell/math-of-intelligence/master/probability_theory/2pac_lyrics.csv"
biggie_df = pd.read_csv(url_data_biggie, usecols=[1], encoding='latin-1', header=None)
biggie_df.columns = ["lyrics"]
biggie_df["lyrics"] = biggie_df["lyrics"].str.replace('[^\w\s]','')
biggie_df["lyrics"] = biggie_df["lyrics"].str.lower()

In [38]:
biggie_df.tail()

Unnamed: 0,lyrics
11,i aint got no motherfucking friends\nthats why...
12,troublesome nigga\nhahaha troublesome 19mother...
13,change shit\ni guess change is good for any of...
14,i see no changes wake up in the morning and i ...
15,out on bail fresh out of jail california dream...


In [0]:
url_data_pac = "https://raw.githubusercontent.com/NoahLidell/math-of-intelligence/master/probability_theory/2pac_lyrics.csv"
pac_df = pd.read_csv(url_data_pac, usecols=[1], encoding='latin-1', header=None)
pac_df.columns = ["lyrics"]
pac_df["lyrics"] = pac_df["lyrics"].str.replace('[^\w\s]','')
pac_df["lyrics"] = pac_df["lyrics"].str.lower()

In [40]:
pac_df.head()

Unnamed: 0,lyrics
0,little something for my godson elijah\nand a l...
1,yo mo bee mayn drop that shit\nyou know what t...
2,rest in peace to my motherfucker biggy smallz\...
3,makaveli in this killuminati\nall through your...
4,its just me against the world\nnothin to lose\...


In [0]:
biggie_lyrics = biggie_df["lyrics"].values
biggie_lyrics = [ song.split('\n') for song in biggie_lyrics]
biggie_lyrics = [line for song in biggie_lyrics for line in song]
pac_lyrics = pac_df["lyrics"].values
pac_lyrics = [ song.split('\n') for song in pac_lyrics]
pac_lyrics = [line for song in pac_lyrics for line in song]

rap_lines = [] 

for line in biggie_lyrics:
    if len(line.split()) > 3:
        rap_lines.append(np.array([0,str(line)]))
        
for line in pac_lyrics:
    if len(line.split()) > 3:
        rap_lines.append(np.array([1,str(line)]))
        
rap_lines = np.array(rap_lines)


In [0]:
rap_lines = pd.DataFrame(rap_lines)
rap_lines.columns = ["label","line"]
rap_lines.head()
rap_lines['label'] = rap_lines['label'].replace(['0','1'],[0,1])

In [43]:
bayes_biggie_vs_pac = ngrams_bayes(rap_lines.values, 1, 0.9)



In [0]:
bayes_biggie_vs_pac.train() 

In [45]:
bayes_biggie_vs_pac.evaluate_test_data()

Evaluated 221 test cases. 9.50% Accuracy


0.09502262443438914

Since we have a small data set, let's run multiple trials with different train-test splits to get a better idea of what our average classification accuracy using this method.

In [46]:
results = []

for _ in range(10):
    unigram = ngrams_bayes(rap_lines.values, 1, 0.9)
    unigram.train()
    results.append(unigram.evaluate_test_data())
    
print("Average Accuracy: {:.2f}".format(sum(results)/float(len(results))))



Evaluated 221 test cases. 9.05% Accuracy
Evaluated 221 test cases. 5.43% Accuracy
Evaluated 221 test cases. 4.98% Accuracy
Evaluated 221 test cases. 6.79% Accuracy
Evaluated 221 test cases. 7.69% Accuracy
Evaluated 221 test cases. 9.50% Accuracy
Evaluated 221 test cases. 9.50% Accuracy
Evaluated 221 test cases. 5.88% Accuracy
Evaluated 221 test cases. 9.05% Accuracy
Evaluated 221 test cases. 10.41% Accuracy
Average Accuracy: 0.08


not bad, but how do bigram and trigram compare?

##### Bigram

In [47]:
results = []
for _ in range(10):
    bigram_net = ngrams_bayes(rap_lines.values, 2, 0.9)
    bigram_net.train()
    results.append(bigram_net.evaluate_test_data())
    
print("Average Accuracy: {:.2f}".format(sum(results)/float(len(results))))



Evaluated 221 test cases. 6.33% Accuracy
Evaluated 221 test cases. 5.43% Accuracy
Evaluated 221 test cases. 8.14% Accuracy
Evaluated 221 test cases. 5.88% Accuracy
Evaluated 221 test cases. 6.79% Accuracy
Evaluated 221 test cases. 4.98% Accuracy
Evaluated 221 test cases. 9.05% Accuracy
Evaluated 221 test cases. 6.79% Accuracy
Evaluated 221 test cases. 8.14% Accuracy
Evaluated 221 test cases. 4.07% Accuracy
Average Accuracy: 0.07


#### Trigram

In [52]:
results = []
for _ in range(10):
    trigram_net = ngrams_bayes(rap_lines.values, 3, 0.9)
    trigram_net.train()
    results.append(trigram_net.evaluate_test_data())
    
print("Average Accuracy: {:.2f}".format(sum(results)/float(len(results))))




Evaluated 221 test cases. 9.50% Accuracy
Evaluated 221 test cases. 6.33% Accuracy
Evaluated 221 test cases. 5.43% Accuracy
Evaluated 221 test cases. 5.88% Accuracy
Evaluated 221 test cases. 5.43% Accuracy
Evaluated 221 test cases. 4.52% Accuracy
Evaluated 221 test cases. 4.98% Accuracy
Evaluated 221 test cases. 6.79% Accuracy
Evaluated 221 test cases. 5.88% Accuracy
Evaluated 221 test cases. 6.33% Accuracy
Average Accuracy: 0.06



Unigrams seem to have a slight edge on bigrams and trigrams but using trigrams doesn't yeild horrible results on this rap data like it did when classing sms messages. My guess is that, while these raps contain highly colloquial words, there are phrases unique to Biggie and Pac that they use repeatedly while with sms messages the sequence of words is more arbitrary. 