In [1]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import string
import re
import io
import csv

import numpy as np
from collections import Counter

from nltk.corpus import stopwords


## Stopwords 

We use NLTK and combine its English, French and German stopword lists (the three most used languages in our dataset) into one larger stopword list that fits the data.

In [2]:
# Concatenate the NLTK stopword lists for the three languages, in this order.
stopwords_3 = [
    word
    for language in ('english', 'french', 'german')
    for word in stopwords.words(language)
]

## Word Frequency Analysis

Use CountVectorizer from sklearn to tokenize the tweets, drop single-letter words, strip punctuation, remove stopwords and compute the relative frequency of each remaining word.

In [3]:
def wf_analyzer(filename):
    """Compute the relative frequency of every non-stopword token in a text file.

    The file is read as UTF-8 and every line is handed to ``CountVectorizer``
    as one document. Unigram counts are summed over all lines and divided by
    the total number of counted tokens.

    Parameters
    ----------
    filename : str
        Path of the text file to analyse.

    Returns
    -------
    collections.Counter
        Maps each token to its share of all counted tokens.
    """
    ngram_vectorizer = CountVectorizer(analyzer='word', stop_words=stopwords_3, ngram_range=(1, 1), min_df=1)

    with io.open(filename, 'r', encoding='utf8') as fin:
        X = ngram_vectorizer.fit_transform(fin)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on releases that predate it.
    if hasattr(ngram_vectorizer, 'get_feature_names_out'):
        vocab = ngram_vectorizer.get_feature_names_out()
    else:
        vocab = ngram_vectorizer.get_feature_names()

    # Column sums flattened to 1-D without relying on np.matrix's `.A1`.
    counts = np.asarray(X.sum(axis=0)).ravel() / X.sum()

    # NOTE(review): frequent tokens such as 'lt', 'gt' and 'amp' in the
    # results look like HTML-entity residue (&lt; &gt; &amp;) -- consider
    # unescaping the tweets before vectorizing; confirm against the raw data.
    return Counter(dict(zip(vocab, counts)))

In [4]:
# Build one word-frequency table per sentiment class from the extracted tweets
# (`xx` holds the frequencies of the sad tweets).
happy = wf_analyzer("happy_tweets.txt")
xx = wf_analyzer("sad_tweets.txt")

In [7]:
# Most frequent words used in the tweets.
# NOTE: despite the `_4000` suffix, both lists hold the top 3000 entries.
happy_4000 = happy.most_common(3000)
sad_4000 = xx.most_common(3000)

# Save each lexicon as "word<TAB>frequency" lines.
# `with` guarantees the files are flushed and closed; UTF-8 is explicit
# because the words contain non-ASCII characters (e.g. accented letters).
with open('happy.txt', 'w', encoding='utf8') as happyfile:
    for item in happy_4000:
        happyfile.write("%s\t%s\n" % (item[0], item[1]))

with open('sad.txt', 'w', encoding='utf8') as sadfile:
    for item in sad_4000:
        sadfile.write("%s\t%s\n" % (item[0], item[1]))

In [8]:
# Preview only the head of the ranked list instead of dumping every entry.
happy_4000[:30]

[('lt', 0.023725393905637077),
 ('love', 0.005677414623220264),
 ('les', 0.0053803377211847792),
 ('follow', 0.0047952275425719379),
 ('ça', 0.0042579370821349497),
 ('merci', 0.0041914106866415991),
 ('si', 0.0039824162188626736),
 ('non', 0.0039265439574706785),
 ('bien', 0.0037166822927300154),
 ('please', 0.003496662035008989),
 ('di', 0.0032595456086136941),
 ('che', 0.0029606104273744846),
 ('oui', 0.0028693069758314684),
 ('va', 0.0027077605703876521),
 ('trop', 0.0026621707872562681),
 ('good', 0.0026025819674568235),
 ('bonne', 0.0025403915567721639),
 ('mi', 0.002518711632728707),
 ('plus', 0.0024053566013014891),
 ('bon', 0.0023919769910346702),
 ('haha', 0.0023063722623830772),
 ('fait', 0.0022910104876322848),
 ('gt', 0.0021671252073839592),
 ('aussi', 0.0021610548286517912),
 ('tout', 0.0021298357380292133),
 ('happy', 0.0020619466044531306),
 ('much', 0.0020551329140394729),
 ('amp', 0.0019809256311707256),
 ('ja', 0.0019804300900497325),
 ('aime', 0.0019576351984840405)

In [9]:
# Preview only the head of the ranked list instead of dumping every entry.
sad_4000[:30]

[('temp', 0.013703066734039954),
 ('non', 0.0071289233818790853),
 ('min', 0.0070547028103357017),
 ('max', 0.0069092604782203856),
 ('les', 0.0067908074448481178),
 ('ça', 0.005614523841296926),
 ('trop', 0.0052779073097516836),
 ('plus', 0.0049810250235781516),
 ('si', 0.0043220363126020532),
 ('genève', 0.0036308103836828697),
 ('fait', 0.0035888270300825724),
 ('météo', 0.0034418852924815312),
 ('mi', 0.0033046896905377016),
 ('che', 0.0031450030063079988),
 ('di', 0.0030962723280219392),
 ('follow', 0.0029380850492779615),
 ('tout', 0.0025774780299611206),
 ('va', 0.0025220000269892988),
 ('please', 0.002512253891332087),
 ('faire', 0.0024747687541889639),
 ('ch', 0.0024477794554459155),
 ('oh', 0.0023728091811596702),
 ('quand', 0.0023728091811596702),
 ('oui', 0.0023638127482453206),
 ('mal', 0.002158394196701008),
 ('vais', 0.0020841736251576248),
 ('bien', 0.0020676801648146512),
 ('sais', 0.0020421902715573276),
 ('comme', 0.0020219482975000411),
 ('aussi', 0.0019859625658426

## Remove Neutral Words

Assumptions:
1. The dataset is big enough.
2. The word frequencies in both lexicons are accurate.
3. Neutral words have the same or a similar frequency in both lexicons (e.g. the neutral word "kitchen" accounts for 0.005% of the happy lexicon and 0.0049% of the sad lexicon).

Approach:


Subtract one lexicon from the other, so that the frequencies of neutral words cancel out (or are at least minimised).
Then take the top/bottom 2000/3000/4000 words (a tunable choice) after the subtraction to build the happy/sad lexicons used later for sentiment analysis.

In [129]:
# Per-word frequency difference (happy minus sad). The subtraction runs on a
# copy so that `happy` stays intact and re-running this cell does not
# subtract the sad frequencies a second time.
freq_diff = Counter(happy)
freq_diff.subtract(xx)

# Rank once from most happy-leaning to most sad-leaning, then take both ends.
ranked = freq_diff.most_common()

# 2000 most sad-leaning words, most negative difference first.
l = ranked[:-2000-1:-1]

# 2000 most happy-leaning words, largest difference first.
li = ranked[:2000]

In [130]:
# Keep only the words from each ranked (word, difference) list.
happy_lexicon = [word for word, _weight in li]
sad_lexicon = [word for word, _weight in l]

In [131]:
from nltk import word_tokenize
from string import punctuation

In [141]:
# Count the sentiment score of each tweet in the entire dataset.
# Liu_Hu lexicon approach
# Positive word adds 1, negative word subtracts 1.
def sentiment_score(df):
    """Score every tweet in ``df`` and append the results to 'sentiment_sample.txt'.

    Rows whose ``text`` is null are skipped. URLs, @mentions and #hashtags are
    replaced by a space, punctuation is turned into spaces, and every
    remaining token adds 1 to the score if it is in ``happy_lexicon`` and
    subtracts 1 if it is in ``sad_lexicon``.

    Tokens are lower-cased before the lookup: the lexicons are derived from
    CountVectorizer output, which lower-cases by default, so a case-sensitive
    comparison would never match capitalised words.

    Parameters
    ----------
    df : pandas.DataFrame
        Must expose a ``text`` column.

    Returns
    -------
    None
        One "cleaned_text<TAB>score" line per tweet is appended to
        'sentiment_sample.txt' and "done" is printed.
    """
    df = df[df.text.notnull()]

    # Sets give constant-time membership tests; built once per call instead
    # of scanning the lexicon lists for every token.
    happy_words = set(happy_lexicon)
    sad_words = set(sad_lexicon)

    # Loop-invariant: maps every punctuation character to a space.
    tbl = str.maketrans({ord(ch): " " for ch in punctuation})

    tweets = []

    for text in df.text:
        # str() guards against non-string cells (e.g. an all-digit tweet
        # that the parser read as a number), which would make re.sub raise.
        # match url in string and replace with a space
        t = re.sub(r"http\S+", " ", str(text))
        # remove @username
        t = re.sub(r"@[^\s]+", " ", t)
        # remove hashtags
        t = re.sub(r'#(\w+)', " ", t)

        score = 0
        for it in t.translate(tbl).lower().split():
            if it in happy_words:
                score += 1
            if it in sad_words:
                score -= 1
        tweets.append((t, score))

    # Append mode: results of successive chunks accumulate in one file.
    # `with` makes sure every chunk is flushed and its handle released.
    with open('sentiment_sample.txt', 'a', encoding='utf8') as sentiment_file:
        for item in tweets:
            sentiment_file.write("%s\t%s\n" % (item[0], item[1]))

    print("done")
    return

In [142]:
# read schema
schema_rawfile = pd.read_csv("twitter-swisscom/schema_home.txt", header=None, sep=r'\s+')
# extract schema columns
data_columns = schema_rawfile[1].values

# Positional indices of the columns to load; the scoring step reads the
# `text` column, so one of them is expected to carry that name in the schema.
col_list = [0, 3]

# The 'U' open mode was removed in Python 3.11 (it raises ValueError); plain
# text mode already applies universal newlines. The file is decoded here as
# UTF-8 and closed by `with`, even if the run is interrupted.
# NOTE: sentiment_score appends to its output file, so re-running this cell
# adds the same rows again unless 'sentiment_sample.txt' is removed first.
with open("data_clean/twex.tsv", 'r', encoding='utf-8') as tweets_file:
    chunks = pd.read_table(tweets_file, sep='\t', escapechar="\\", na_values='N', quoting=csv.QUOTE_NONE, header=None, names=data_columns, chunksize=10000, engine='c', usecols=col_list)
    for data in chunks:
        sentiment_score(data)



done


KeyboardInterrupt: 