# Preprocess tweet data and output to couchdb

## load data from couchdb and initialise to py obj

In [1]:
import couchdb
# from collections import Counter

server = couchdb.Server('http://admin:123456@localhost:5984/')
db = server['total_tweets']

suburb_text_dict = {}
for doc_id in db:
    suburb = db[doc_id]['suburb']
    text = db[doc_id]['doc']['doc']['text']
    if suburb not in suburb_text_dict.keys():
        text_list = []
        suburb_text_dict.update({
            suburb: text_list
        })
    else:
        text_list = suburb_text_dict.get(suburb)
    text_list.append(text)

suburb_info_list = []
for suburb in suburb_text_dict.keys():
    text_list = suburb_text_dict.get(suburb)
#     print(suburb + ": " + str(len(text_list)))
    suburb_info_list.append({
        suburb: len(text_list)
    })


# processed_db = server.create('processed_data')
# for suburb_info in suburb_info_list:
#     processed_db.save(suburb_info)



## Preprocess data with nltk
Operations including tokenize twitter texts, lemmatize texts and load seed words

In [2]:
import nltk
nltk.download('stopwords')

tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize(word):
    lemma = lemmatizer.lemmatize(word,'v')
    if lemma == word:
        lemma = lemmatizer.lemmatize(word,'n')
    return lemma

def pre_process(text) -> str:
    # lower cased
    text = text.lower()
    # tokenize
    words = tokenizer.tokenize(text)
    # check if word is alphabetic
    words = [w for w in words if w.isalpha()]
    # lemmatize 
    words = [lemmatize(w) for w in words]
    # remove stop words
#     stop_words = nltk.corpus.stopwords.words('english')
#     words = [w for w in words if not w in stop_words]
    # return result
    processed_comment = " ".join(words)
    return processed_comment

for key in suburb_text_dict.keys():
    text_list = suburb_text_dict.get(key)
    processed_text_list = []
    for text in text_list:
        processed_text_list.append(pre_process(text))
    suburb_text_dict.update({
        key: processed_text_list
    })


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/wenbin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## load wordnet generated vocabularies and filter texts

In [3]:
food_words = []
with open("./FindWords/food_words.txt", 'r') as f:
    words = f.readlines()
    for word in words:
        word = word.split()[0]
        word = lemmatize(word)
        food_words.append(word)

def containKeyword(text) -> bool:
    for word in food_words:
        if word in text:
            return True
    return False
        
for key in suburb_text_dict.keys():
    text_list = suburb_text_dict[key]
    processed_text_list = [text for text in text_list if containKeyword(text)]
    suburb_text_dict.update({
        key: processed_text_list
    })

In [4]:
suburb_text_dict

{'Abbotsford': ['sunset goldenlight rail track bridge passingsun signal greenlight lightpost sky http t co',
  'sunset goldenlight rail track bridge signal greenlight lightpost sky crossroad http t co',
  'sunset goldenlight rail track bridge signal greenlight lightpost sky crossroad http t co',
  'come wrestle a wrangler absolutely anyone over of any fitness or skill level join the fun http t co',
  'birthday celebration in drink can you tell haha http t co',
  'amp b lovely atthesource drink a sun cat ipa by moondogbrewing moon dog brewery http t co photo',
  'the essendonvfl boy didn t get the win at vic park yesterday but my girl have fun http t co',
  'don t mess with jimmywantsapup axethrowing tourlyfe priscillaoz http t co',
  'last minute shop before claw be always perfect for the ever important http t co',
  'pre season skcc topbiketours http t co',
  'pic next to meg s as hbdcow iridoscyclitis collingwood http t co wzapddange',
  'real environmental law would force him to res

## Sentiment analysis by nltk vader sentiment analyser

In [5]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

analyzer = SentimentIntensityAnalyzer()

suburb_sentiment_dict = {}

for key in suburb_text_dict.keys():
    text_list = suburb_text_dict.get(key)
    if key not in suburb_sentiment_dict:
        sentiment_dict = {
            'pos': 0,
            'neg': 0,
            'neu': 0,
            'total': 0
        }
        suburb_sentiment_dict.update({
            key: sentiment_dict
        })
    else:
        sentiment_dict = suburb_sentiment_dict.get(key)
    
    for text in text_list:
        sentiment_results = analyzer.polarity_scores(text)
        neg_value = sentiment_results['neg']
        pos_value = sentiment_results['pos']
        if pos_value > neg_value:
            sentiment_dict['pos'] += 1
        elif pos_value < neg_value:
            sentiment_dict['neg'] += 1
        else:
            sentiment_dict['neu'] += 1
    sentiment_dict['total'] = sentiment_dict['pos'] + sentiment_dict['neg'] + sentiment_dict['neu']
    suburb_sentiment_dict.update({
        key: sentiment_dict
    })
    

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/wenbin/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [6]:
suburb_sentiment_dict['Melbourne']

{'neg': 765, 'neu': 5107, 'pos': 6171, 'total': 12043}

In [7]:
suburb_sentiment_list = []
for suburb in suburb_sentiment_dict.keys():
    suburb_sentiment_list.append({
        suburb: suburb_sentiment_dict[suburb]
    })

processed_db = server.create('processed_data')
for suburb_info in suburb_sentiment_list:
    processed_db.save(suburb_info)
