In [31]:
import pandas as pd

In [32]:
! ls data/

train


# Get the csv file with our data and set it to a pandas data frame

In [33]:
train_data = pd.read_csv("data/train", sep="\t", names=["true_category", "message"])

In [34]:
training_data = pd.read_csv("data/train", sep="\t", names=["true_category", "message", "label_num"])

In [35]:
for i in range(len(training_data)):
    label = training_data['true_category'][i]
    if label == 'spam':
        training_data.loc[i,"label_num"]=int(1)
    else:
        training_data.loc[i,"label_num"]=int(0)

In [36]:
training_data.astype({'label_num': 'int64'}).dtypes

true_category    object
message          object
label_num         int64
dtype: object

In [37]:
training_data

Unnamed: 0,true_category,message,label_num
0,ham,"Go until jurong point, crazy.. Available only ...",0.0
1,ham,Ok lar... Joking wif u oni...,0.0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1.0
3,ham,U dun say so early hor... U c already then say...,0.0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0.0
...,...,...,...
4569,ham,hiya hows it going in sunny africa? hope u r a...,0.0
4570,ham,At WHAT TIME should i come tomorrow,0.0
4571,spam,Wanna have a laugh? Try CHIT-CHAT on your mobi...,1.0
4572,ham,"CHA QUITEAMUZING THATSCOOL BABE,PROBPOP IN & ...",0.0


- add a new columns to `df` called `predicted_category`
- for each row in `df`, if `df["predicted_category"] == df["true_category"]` then you were correct!
- for example, if the number of columns that match is 4400, then the accuracy is 4400/4573

In [38]:
# import the functions ill be using
from version_two import train_func as set_v2
from version_two import test_func as alg_v2

# The variables spam, ham and len_df will be used to get the fraction of how many were correct

In [39]:
def set_data(algorithm, data):
    """
    This function uses the training data set to pass in the labels and 
    messages to another function which will create a dictionary that
    looks like this.
    dict = {
        'how': {'spam': 10, 'ham': 450},
        'free': {'spam': 154, 'ham': 50},
    }
    The point of this is to see what the probability of a word being in a
    spam labeled message and the word being in a ham labeled message. Once 
    we have that we can run our classifier algorithm. 
    """
    for i in range(len(train_data)):
        # get the message 
        sms = data['message'][i]
        # get real label
        label = data['true_category'][i]
        # call function
        algorithm(label, sms)

In [40]:
def get_f1_score(correct_ham, correct_spam, incorrect_ham, incorrect_spam):
    """
    """
    all_incorrect = (incorrect_ham+incorrect_spam)
    all_correct = (correct_ham+correct_spam)
    precision = (all_correct)/(all_correct+all_incorrect)
    recall = (all_correct)/(all_correct+all_incorrect)
    
    score = (2*((precision*recall)/(precision+recall)))
    precision_ham = (correct_ham)/(correct_ham+incorrect_ham)
    precision_spam = (correct_spam)/(correct_spam+incorrect_spam)
    recall_ham = (correct_ham)/(correct_ham+incorrect_ham) 
    recall_spam = (correct_spam)/(correct_spam+incorrect_spam) 

    
    ham_score = 2 * ((precision_ham * recall_ham)/(precision_ham+recall_ham))
    spam_score = 2 * ((precision_spam * recall_spam)/(precision_spam+recall_spam))
    
    print("f1: ", +score)
    print('all correct/2: ',+ (ham_score+spam_score)/2)

In [41]:
def set_as_spam(sms):
    train_func(sms, "spam")

In [42]:
def get_accuracy(algorithm, data):
    """
    This functon has to run after the set_data function and will determine
    the accuracy of the algorithm. 
    
    Since we now have our dictonary with the training data inputed in. We
    can now pass in the test data into our algorithm which will return 
    ham or spam.
    
    This function accepts an algorithm and a dataset and produces metrics
    which show how good the algorithm is. For instance, it prints the percentage
    of spam texts that were accurately identified as spam.
    
    https://en.wikipedia.org/wiki/F1_score
    RMSE
    """
    # ammount correct for each label
    correct_ham = 0
    correct_spam = 0
    correct = 0
    
    spam = len(data.groupby(['true_category']).get_group('spam'))
    ham = len(data.groupby(['true_category']).get_group('ham'))
    
    
    len_of_data = len(data)

    for i in range(len(data)):
        # get the message and label 
        sms = data['message'][i]
        label = data['true_category'][i]
        
        # call function
        prediction = algorithm(sms)

        
        if prediction == label:
            # If prediction is correct we add 1
            correct += 1
            if prediction == 'spam':
                # If the predicted label is spam and acutal label
                # is spam then we add to amount correct for spam
                correct_spam +=1
            elif prediction =='ham':
                # If the predicted label is ham and acutal label
                # is spam then we add to amount correct for ham
                correct_ham +=1
    
    incorrect_ham = ham-correct_ham
    incorrect_spam = spam-correct_spam
#     get_f1_score(correct_ham, correct_spam, incorrect_ham, incorrect_spam)
    print((correct_ham+correct_spam)/2)
    print('Correct Ham: ', + correct_ham/ham)
    print('Correct Spam: ', + correct_spam/spam)
    print('Correct: ', + correct/len_of_data)

    


In [43]:
set_data(set_v2, train_data)

In [44]:
get_accuracy(alg_v2, train_data)

2174.0
Correct Ham:  0.9992424242424243
Correct Spam:  0.6368078175895765
Correct:  0.9505902929602099


In [45]:
from version_three import train_func as set_v3
from version_three import driver_func as alg_v3

In [46]:
set_data(set_v3, train_data)

In [47]:
get_accuracy(alg_v3, train_data)

1901.5
Correct Ham:  0.8282828282828283
Correct Spam:  0.8517915309446255
Correct:  0.8314385658067337


In [48]:
from version_four import train_func as set_v4
from version_four import driver_func as alg_v4
from version_four import test_func as alg_v5

In [49]:
%matplotlib inline 
import matplotlib.pyplot as plt

In [50]:
set_data(set_v4, train_data)
ham_points = {}
spam_points = {}
def get_points():
    for i in range(len(train_data)):
        sms = train_data['message'][i]
        label = train_data['true_category'][i]
        ham, spam =  alg_v4(sms)
        if label == 'spam':
            spam_points[i] = {'x' : ham, 'y' : spam}
        else:
            ham_points[i] = {'x' : ham, 'y' : spam}  

In [51]:
get_points()

In [52]:
spam_x = []
spam_y = []
for key in spam_points:
    val = key
    x = spam_points[val]['x']
    y = spam_points[val]['y']
    spam_x.append(x)
    spam_y.append(y)

In [53]:
ham_x = []
ham_y = []
for key in ham_points:
    val = key
    x = ham_points[val]['x']
    y = ham_points[val]['y']
    ham_x.append(x)
    ham_y.append(y)

In [54]:
# %matplotlib inline 
# plt.scatter(spam_x, spam_y, color='r')
# plt.scatter(ham_x, ham_y, color='b')
# plt.xlabel('# Spam')
# plt.ylabel('# Ham')
# plt.grid(True)
# plt.show();

In [55]:
get_accuracy(alg_v5, train_data)

2264.5
Correct Ham:  0.9959595959595959
Correct Spam:  0.9527687296416938
Correct:  0.990161783996502


In [56]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [57]:
x = training_data.message

In [58]:
y = training_data.label_num

In [59]:
vectorizer = CountVectorizer()

In [60]:
counts = vectorizer.fit_transform(x.values)

In [61]:
classifier = MultinomialNB()
targets = y.values

In [65]:
classifier.fit(counts, targets)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [67]:
example_count = vectorizer.transform(examples)
predictions = classifier.predict(example_count)
predictions

array([0., 0., 1., ..., 1., 0., 0.])

In [85]:
def get_acuraccy_naive_byes():
    for i in range(len(predictions)):
        correct = 0
        correct_spam = 0
        correct_ham = 0
        prediction = predictions[i]
        real_label = training_data['label_num'][i]
        spam = len(training_data.groupby(['true_category']).get_group('spam'))
        ham = len(training_data.groupby(['true_category']).get_group('ham'))
        len_of_data = len(training_data)
        if int(prediction) == int(real_label):
            # If prediction is correct we add 1
            correct += 1
            if int(prediction) == int(real_label):
                # If the predicted label is spam and acutal label
                # is spam then we add to amount correct for spam
                correct_spam +=1
            elif int(prediction) == int(real_label):
                 # If the predicted label is ham and acutal label
                # is spam then we add to amount correct for ham
                correct_ham +=1
        else:
            pass

    print((correct_ham+correct_spam)/2)
    print('Correct Ham: ', + correct_ham/ham)
    print('Correct Spam: ', + correct_spam/spam)
    print('Correct: ', + correct/len_of_data)

In [86]:
get_acuraccy_naive_byes()

0.5
Correct Ham:  0.0
Correct Spam:  0.0016286644951140066
Correct:  0.00021862702229995628


In [87]:
predictions

array([0., 0., 1., ..., 1., 0., 0.])