In [83]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [93]:
app_freqs = pd.read_csv('data/appFreqs.csv', header=None, names=['word', 'count'])
other_freqs = pd.read_csv('data/otherFreqs.csv', header=None, names=['word','count'])
test = pd.read_csv('data/test.csv', header=None, names=['label','tweet'])

app_total = np.sum(app_freqs['count'])
other_total = np.sum(other_freqs['count'])

app_freqs['freq'] = np.log(appFreqs['count'] / app_total)
other_freqs['freq'] = np.log(otherFreqs['count'] / other_total)

In [88]:
def freq(word, df):
    try: 
        val = df.loc[df['word']==word, 'freq'].values.index[0]
    except AttributeError:
        val = 1/np.log(sum(df['count']))
    return val

In [102]:
def naive_bayes_mandrilla_classifier(test):
    
    def prob_tweet(tweet):
        words = tweet.split()
        print(words)
        app_prob = sum([freq(word, app_freqs) for word in words])
        other_prob = sum([freq(word, other_freqs) for word in words])
        
        return app_prob, other_prob
    
    def predictor(tweet):
        
        app_prob, other_prob = prob_tweet(tweet)
        #print(app_prob, other_prob)
        
        app_prior = len(app_freqs) / (len(app_freqs) + len(other_freqs))
        other_prior = len(other_freqs) / (len(app_freqs) + len(other_freqs))
    
        app_posterior = app_prob * app_prior
        other_posterior = other_prob * other_prior
       # print(app_posterior, other_posterior)
        
        if app_posterior > other_posterior:
            return 'APP'
        else:
            return 'OTHER'
        
    return test['tweet'].apply(lambda tweet: predictor(tweet))

In [103]:
preds = naive_bayes_mandrilla_classifier(test)
pd.crosstab(test['label'], preds)

['just', 'love', '@mandrillapp', 'transactional', 'email', 'service', '-', 'http://mandrill.com', 'sorry', '@sendgrid', 'and', '@mailjet', '#timetomoveon']
['@rossdeane', 'mind', 'submitting', 'a', 'request', 'at', 'http://help.mandrill.com', 'with', 'account', 'details', 'if', 'you', "haven't", 'already', 'glad', 'to', 'take', 'a', 'look']
['@veroapp', 'any', 'chance', "you'll", 'be', 'adding', 'mandrill', 'support', 'to', 'vero']
['@elie__', '@camj59', 'jparle', 'de', 'relai', 'smtp', '1', 'million', 'de', 'mail', 'chez', 'mandrill', '/', 'mois', 'comparé', 'à', '1', 'million', 'sur', 'lite', 'sendgrid', 'y', 'a', 'pas', 'photo', 'avec', 'mailjet']
['would', 'like', 'to', 'send', 'emails', 'for', 'welcome', 'password', 'resets', 'payment', 'notifications', 'etc', 'what', 'should', 'i', 'use', 'was', 'looking', 'at', 'mailgun/mandrill']
['from', 'coworker', 'about', 'using', 'mandrill', '"i', 'would', 'entrust', 'email', 'handling', 'to', 'a', 'pokemon".']
['@mandrill', 'realised', 'i

tweet,OTHER
label,Unnamed: 1_level_1
APP,10
OTHER,10
