In [1]:
import re
import string
import numpy as np

In [3]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

In [4]:
def process_tweet(tweet):
  """
    Clean, tokenize and stem a single tweet.

    Input:
      tweet: a string containing a tweet
    Output:
      tweets_clean: a list of stemmed tokens from the tweet, with stock
        tickers, a leading "RT" marker, hyperlinks, '#' signs, English
        stopwords and punctuation tokens removed
  """

  stemmer = PorterStemmer()
  # Use a set: NLTK returns a list, which would be scanned linearly for
  # every token of the tweet.
  stopwords_english = set(stopwords.words('english'))

  # remove stock market tickers like $GE
  tweet = re.sub(r'\$\w*', '', tweet)

  # remove old style retweet text "RT" (only at the start of the tweet)
  tweet = re.sub(r'^RT[\s]+', '', tweet)

  # remove hyperlinks
  tweet = re.sub(r'https?://[^\s\n\r]+', '', tweet)

  # remove hashtags
  # only removing the hash # sign from the word
  tweet = re.sub(r'#', '', tweet)

  # tokenize the tweet; the tokenizer is configured to lowercase the text,
  # strip @handles and shorten exaggerated character repetitions
  tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
  tweet_tokens = tokenizer.tokenize(tweet)

  # Drop stopwords and punctuation, then stem what is left.
  # NOTE: `in string.punctuation` is a substring test against the
  # punctuation string, so single punctuation characters are dropped while
  # multi-character tokens such as ':)' are kept.
  tweets_clean = [
    stemmer.stem(word)
    for word in tweet_tokens
    if word not in stopwords_english and word not in string.punctuation
  ]

  return tweets_clean



In [6]:
def build_freqs(tweets, ys):
  """
    Build frequencies:
    Input:
      tweets: a list of tweets
      ys: an m x 1 array (or a plain list) with the sentiment label of
          each tweet (either 0 or 1)
    Output:
      freqs: a dictionary mapping each (word, sentiment) pair to its frequency.
  """
  # Convert the labels to a flat Python list so they can be zipped with the
  # tweets. np.squeeze drops the singleton axis of an m x 1 array; without
  # it tolist() would produce a list of one-element lists.
  # With a single label, squeeze collapses all the way to a 0-d array whose
  # tolist() is a bare scalar that zip cannot iterate, so atleast_1d
  # restores the 1-D shape in that case.
  yslist = np.atleast_1d(np.squeeze(ys)).tolist()

  # start with an empty dictionary and populate it by looping over all tweets
  # and over all processed words in each tweet.
  freqs = {}
  for y, tweet in zip(yslist, tweets):
    for word in process_tweet(tweet):
      pair = (word, y)
      freqs[pair] = freqs.get(pair, 0) + 1

  return freqs
