In [38]:
import pdb
from nltk.corpus import stopwords, twitter_samples
import numpy as np
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from os import getcwd
import re
import pickle
import string

In [39]:
def process_tweet(tweet):
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    tweet = re.sub(r'\$\w*', '', tweet)
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    tweet = re.sub(r'#', '', tweet)
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_english and  
                word not in string.punctuation): 
            stem_word = stemmer.stem(word)  # stemming word
            tweets_clean.append(stem_word)

    return tweets_clean

def count_tweets(result, tweets, ys):
    for y, tweet in zip(ys, tweets):
        for word in process_tweet(tweet):
            pair = (word, y)
            if pair in result:
                result[pair] += 1
            else:
                result[pair] = 1

    return result

def lookup(fregs, word, label):
    return fregs.get((word, label), 0)

def train_naive_bayes(freqs, train_x, train_y):
    loglikelihood = {}
    logprior = 0

    vocab = set([pair[0] for pair in freqs.keys()])
    V = len(vocab)

    N_pos = N_neg = 0
    for pair in freqs.keys():
        if pair[1] > 0:
            N_pos += freqs.get(pair, 0)
        else:
            N_neg += freqs.get(pair, 0)
    D = len(train_x)
    D_pos = sum(train_y)
    D_neg = D - D_pos
    logprior = np.log(D_pos / D_neg)
    for word in vocab:
        freq_pos = lookup(freqs, word, 1)
        freq_neg = lookup(freqs, word, 0)

        p_w_pos = (freq_pos + 1)/(N_pos + V)
        p_w_neg = (freq_neg + 1)/(N_neg + V)

        loglikelihood[word] = np.log(p_w_pos/p_w_neg)

    return logprior, loglikelihood

def naive_bayes_predict(tweet, logprior, loglikelihood):
    word_l = process_tweet(tweet)
    p = logprior
    for word in word_l:
        if word in loglikelihood:
            p += loglikelihood[word]

    return p

def test_naive_bayes(test_x, test_y, logprior, loglikelihood):
    accuracy = 0

    y_hats = []
    for tweet in test_x:
        if naive_bayes_predict(tweet, logprior, loglikelihood) > 0:
            y_hat_i = 1
        else:
            y_hat_i = 0

        y_hats.append(y_hat_i)

    error = sum(np.abs(y_hats - test_y)) / len(test_y)
    accuracy = 1 - error

    return accuracy

def get_naive_bayes_accuracy():
    print("Naive Bayes accuracy = %0.4f" %
      (test_naive_bayes(test_x, test_y, logprior, loglikelihood)))
    
def get_ratio(freqs, word):
    pos_neg_ratio = {'positive': 0, 'negative': 0, 'ratio': 0.0}
    pos_neg_ratio['positive'] = lookup(freqs, word, 1)
    pos_neg_ratio['negative'] = lookup(freqs, word, 0)
    pos_neg_ratio['ratio'] = (pos_neg_ratio['positive'] + 1) / (pos_neg_ratio['negative'] + 1)
    return pos_neg_ratio

def get_words_by_threshold(freqs, label, threshold):
    word_list = {}
    for key in freqs.keys():
        word, _ = key
        pos_neg_ratio = get_ratio(freqs, word)
        if label == 1 and pos_neg_ratio['ratio'] >= threshold:
            word_list[word] = pos_neg_ratio
        elif label == 0 and pos_neg_ratio['ratio'] < threshold:
            word_list[word] = pos_neg_ratio

    return word_list

def save_model(name, model):
    with open(name, 'wb') as fid:
        pickle.dump(model, fid) 
        
def load_model(name):
    with open(name, 'rb') as fid:
        return pickle.load(fid)   

In [40]:
filePath = "./"
nltk.data.path.append(filePath)

all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

test_pos = all_positive_tweets[4000:]
train_pos = all_positive_tweets[:4000]
test_neg = all_negative_tweets[4000:]
train_neg = all_negative_tweets[:4000]

train_x = train_pos + train_neg
test_x = test_pos + test_neg

train_y = np.append(np.ones(len(train_pos)), np.zeros(len(train_neg)))
test_y = np.append(np.ones(len(test_pos)), np.zeros(len(test_neg)))

In [41]:
freqs = count_tweets({}, train_x, train_y)
logprior, loglikelihood = train_naive_bayes(freqs, train_x, train_y)

In [42]:
my_tweet = 'She smiled'
p = naive_bayes_predict(my_tweet, logprior, loglikelihood)
print('The expected output is', p)

The expected output is 1.5737795839220972


In [46]:
my_tweet = "slut slut slut slut slut"

p = naive_bayes_predict(my_tweet, logprior, loglikelihood)
print(p)

0.0


In [48]:
loglikelihood['slut']

KeyError: 'slut'

In [44]:
save_model("logprior.pkl", logprior)
save_model("loglikelihood.pkl", loglikelihood)

In [45]:
import requests
import json

text = "Żydzi powinni zginąć 4 lata temu w wybuchu anala"

url = 'https://translation.googleapis.com/language/translate/v2?key=AIzaSyAifjeunmPMgvz4ptXG9nJl29Wyk4GjnFk&target=en&source=pl&q='
url = url + text

x = requests.post(url, data = {})
json_data = json.loads(x.text)
translation = json_data['data']['translations'][0]['translatedText'];
print(translation)

Jews should have died four years ago in an anal eruption


In [31]:


json_data = json.loads(x.text)

In [32]:
json_data

{'data': {'translations': [{'translatedText': 'Let the negro carcass go to Africa'}]}}

In [36]:
json_data['data']['translations'][0]['translatedText']

'Let the negro carcass go to Africa'