In [16]:
import parse as p
import files as f
import pandas as pd

In [2]:
# Open the tweet TSV through the project's TweetReader; later cells iterate over `tr`.
# NOTE(review): the second argument "B" presumably selects the task/subset — confirm in files.TweetReader.
tr = f.TweetReader("../../1-Input/tweeter-dev-full-B.tsv", "B")

In [3]:
# Grab the first record from the reader. The built-ins are used instead of
# calling __iter__()/.next() by hand, so the line is not tied to Python 2.
t = next(iter(tr))

In [4]:
# Show the record fetched above (a dict with sentiment, sid, text and uid keys).
t

{'sentiment': u'neutral',
 'sid': u'260097528899452929',
 'text': u"Won the match #getin . Plus, tomorrow is a very busy day, with Awareness Day's and debates. Gulp. Debates...",
 'uid': u'595739778'}

In [5]:
class Tweet(object):
    """Holds a tweet and the processed tokens.

    A freshly built instance only stores the raw message and its ids; call
    process() to fill the token lists and the counters that the
    get_*_features() methods read.
    """
    def __init__(self, msg, sent=None, sid=0, uid=0):
        """
        Params:
            msg:  raw tweet text
            sent: sentiment label, if known
            sid:  id stored as given (see the reader's 'sid' field)
            uid:  id stored as given (see the reader's 'uid' field)
        """
        super(Tweet, self).__init__()

        # original message, cleaned message
        self.msg = msg
        self.msg_cleaned = None
        self.sent = sent
        self.sid = sid
        self.uid = uid

        # processed message and different tokens (filled by process())
        self.tokens = None
        self.bigrams = None
        self.trigrams = None
        self.ngrams = None  # never populated by this class
        self.non_contiguous = None

        # additional features (filled by _tokenize_clean())
        self.all_caps_num = 0
        self.mentions_num = 0
        self.hash_num = 0
        self.pos_num_dic = None  # stays None: the POS count is commented out below

    def __repr__(self):
        """Message followed by every token list that is non-empty."""
        parts = ["msg: " + self.msg + "\n"]
        sections = (
            ("tokens", self.tokens),
            ("bigrams", self.bigrams),
            ("trigrams", self.trigrams),
            ("non_contiguous", self.non_contiguous),
        )
        for label, grams in sections:
            # None (not processed yet) and empty lists are both skipped
            if grams:
                parts.append("\n\n" + label + ": " + "|".join(grams))
        return "".join(parts)

    def process(self, token_type='informal'):
        """
        Creates tokens, bigrams, trigrams and non-contiguous n-grams.

        Params:
            token_type: forwarded unchanged to p.tokenize
        """
        # the parse helpers return sequences of tokens; each is joined into one string
        self.tokens = self._tokenize_clean(token_type)
        self.bigrams = [" ".join(gram) for gram in p.bigrams(self.tokens)]
        self.trigrams = [" ".join(gram) for gram in p.trigrams(self.tokens)]
        # NOTE(review): non_contiguous receives the already-joined trigram
        # strings, not the raw tokens — confirm that is what p.non_contiguous expects.
        self.non_contiguous = [" ".join(gram) for gram in p.non_contiguous(self.trigrams)]

    def _tokenize_clean(self, token_type):
        """
        Tokenize and clean the tweet.

        Side effects: updates all_caps_num, mentions_num, hash_num and
        msg_cleaned. Returns the list of normalized tokens.
        """
        tokens = p.tokenize(self.msg, token_type)

        # count on the original tokens, before normalization rewrites them
        self.all_caps_num = p.count_all_caps(tokens)
        self.mentions_num = p.count_mentions(tokens)
        self.hash_num = p.count_hash(tokens)
        #self.pos_num_dic = p.count_pos(self.msg)

        # normalize data (order kept as originally written)
        tokens = p.normalize_lower(tokens)
        tokens = p.normalize_mentions(tokens)
        tokens = p.normalize_hash(tokens)
        tokens = p.normalize_url(tokens)

        # save the normalized message
        self.msg_cleaned = " ".join(tokens)

        # TODO: negated tokens are not created yet

        # finally return the data
        return tokens

    def get_token_features(self):
        """
        Create a dictionary with all the words marked as true.

        Returns an empty dictionary when the tweet has not been processed
        yet (tokens is still None) instead of failing on the iteration.
        """
        return dict.fromkeys(self.tokens or [], True)

    def get_stats_features(self):
        """
        Create a dictionary with the statistical features
        """
        return {
            "STS_ALL_CAPS": self.all_caps_num,
            "STS_MENTIONS": self.mentions_num,
            "STS_HASH": self.hash_num,
        }

    def get_all_features(self):
        """
        Creates a single dictionary with all the features.

        Token features are added first, so a token spelled exactly like a
        statistic key would be overwritten by the statistic.
        """
        features = {}
        features.update(self.get_token_features())
        features.update(self.get_stats_features())

        return features

In [6]:
# Smoke test on the single record fetched earlier: build a Tweet from its text,
# process it, then show the combined features, the raw text and the counters.
tw = Tweet(t["text"])
tw.process()
print tw.get_all_features()
print t["text"]
print tw.get_stats_features()

{u'and': True, u"day's": True, u'tomorrow': True, u'...': True, u'very': True, u'is': True, '#HASH': True, u'with': True, u'day': True, 'STS_MENTIONS': 0, 'STS_HASH': 1, u'a': True, u'busy': True, u'debates': True, 'STS_ALL_CAPS': 0, u',': True, u'.': True, u'gulp': True, u'won': True, u'plus': True, u'match': True, u'the': True, u'awareness': True}
Won the match #getin . Plus, tomorrow is a very busy day, with Awareness Day's and debates. Gulp. Debates...
{'STS_ALL_CAPS': 0, 'STS_MENTIONS': 0, 'STS_HASH': 1}


In [7]:
# NOTE(review): `all_tokens` is never filled or read in the visible cells.
all_tokens = []
for t in tr:
    # one Tweet per record yielded by the reader, with label and ids attached
    tw =Tweet(t["text"], t["sentiment"], t["sid"], t["uid"])
    tw.process()
    print tw.msg, tw.sent
    print tw.msg_cleaned
    # `anew` and `nhash` are instantiated in a cell further down the notebook;
    # this cell raises NameError unless that cell has already been run.
    print anew.process_tweet(tw)
    print nhash.process_tweet(tw)

Won the match #getin . Plus, tomorrow is a very busy day, with Awareness Day's and debates. Gulp. Debates... neutral
won the match #HASH . plus , tomorrow is a very busy day , with awareness day's and debates . gulp . debates ...


NameError: name 'anew' is not defined

In [8]:
# Print the match counters accumulated by `nhash` (needs the lexicon cell below to have run).
nhash.print_match()

NameError: name 'nhash' is not defined

In [9]:
# Print the match counters accumulated by `anew` (needs the lexicon cell below to have run).
anew.print_match()

NameError: name 'anew' is not defined

In [10]:
import json
from config import RESOURCES_DIR
from os.path import join
from utils import prefix_dict

class BaseLexicon(object):
    """
    Base class that holds the minimal information needed to properly
    deal with a tweet and save stats.

    Subclasses pass their prefix and flags to __init__ and then call
    _load_lexicon_json, which fills `data` (token -> lexicon entry),
    `token_num` and `df`.
    """
    def __init__(self, prefix, bigrams=False, trigrams=False, contiguous=False, radicals=False, opinion=True):
        """
        Params:
            prefix:     label used in result keys and DataFrame column names
            bigrams:    also look up the tweet's bigrams
            trigrams:   also look up the tweet's trigrams
            contiguous: also look up the tweet's non-contiguous n-grams
            radicals:   stored only; no method of this class reads it
            opinion:    True when an entry is a single number, False when it
                        is a dict of named values (see process_tweet)
        """
        # save what is contained in this lexicon
        self.prefix = prefix
        self.bigrams = bigrams
        self.trigrams = trigrams
        self.contiguous = contiguous
        self.radicals = radicals
        self.opinion = opinion

        # the lexicon info
        self.data = None
        self.df = None
        self.token_num = 0

        # general information on processed tokens
        self.total_tweets = 0
        self.total_words = 0
        self.total_bigrams = 0
        self.total_trigrams = 0
        self.total_contiguous = 0

        # match info
        self.total_tweets_match = 0
        self.total_words_match = 0
        self.total_bigrams_match = 0
        self.total_trigrams_match = 0
        self.total_contiguous_match = 0

    def _load_lexicon_json(self, file_name):
        """
        Loads the preprocessed json found at RESOURCES_DIR/processed/<file_name>
        and refreshes token_num and df from it.
        """
        # construct path
        path = join(RESOURCES_DIR, 'processed', file_name)

        # load the json
        with open(path, 'r') as handle:
            self.data = json.load(handle)

        # fill other structures
        self.token_num = len(self.data)
        self._create_df()

    def _create_df(self):
        """
        Creates a DF from the data (one row per token). Prefix all names:
        a column labelled 0 becomes the bare prefix, any other column
        becomes "<prefix>_<column>".
        """
        self.df = pd.DataFrame.from_dict(self.data, orient='index')
        self.df.columns = [ self.prefix+"_"+c if c!=0 else self.prefix for c in self.df.columns ]

    def compare_lexicons(self, other):
        """
        Compares this lexicon COVERAGE with another one.

        Returns {self.prefix: {other.prefix: pct}} where pct is the
        percentage of this lexicon's tokens that are also keys of `other`.
        An empty lexicon (token_num == 0) reports 0.0 instead of dividing
        by zero.
        """
        # for now, just a simple difference
        shared = set(other.data.keys()).intersection(self.data.keys())
        if self.token_num:
            per_same = (len(shared) * 100.0) / self.token_num
        else:
            per_same = 0.0

        return {self.prefix: {other.prefix: per_same}}

    def correlate_lexicon(self, other):
        """
        Calculates the CORRELATION between 2 OPINION lexicons.

        Joins both DataFrames on their index, drops rows with missing
        values and returns {first_column: {second_column: correlation}}.
        Returns None unless both lexicons are opinion lexicons.
        """
        if not (self.opinion and other.opinion):
            return None

        joined = self.df.join(other.df).dropna()
        corr = joined.corr()

        return {corr.columns[0]: {corr.columns[1]: corr.iloc[0, 1]}}

    def _match_tokens(self, tokens):
        """
        Returns the (token, entry) pairs, in order, for every token that
        has a truthy entry in the lexicon.
        """
        matches = []
        for token in tokens:
            entry = self.data.get(token, None)
            # NOTE(review): truthiness test, so an entry equal to 0 (or an
            # empty dict) is treated as "no match" — confirm this is intended.
            if entry:
                matches.append((token, entry))
        return matches

    def process_tweet(self, tweet):
        """
        Process a tweet
        Params:
            Tweet Class (already processed, so its token lists are filled)
        Returns:
            None when nothing matched; {'SUM_<prefix>': total} for an
            opinion lexicon; otherwise the per-key sums of the matched
            entries, passed through prefix_dict with "<prefix>_".
        Side effects:
            updates the total_* and total_*_match counters.
        """
        # process the basic tokens
        matches = self._match_tokens(tweet.tokens)
        self.total_words_match += len(matches)
        self.total_words += len(tweet.tokens)

        # process bigrams
        if self.bigrams:
            found = self._match_tokens(tweet.bigrams)
            self.total_bigrams_match += len(found)
            self.total_bigrams += len(tweet.bigrams)
            matches.extend(found)

        # process trigrams
        if self.trigrams:
            found = self._match_tokens(tweet.trigrams)
            self.total_trigrams_match += len(found)
            self.total_trigrams += len(tweet.trigrams)
            matches.extend(found)

        # process contiguous
        if self.contiguous:
            # Tweet stores these under `non_contiguous`; a `contiguous`
            # attribute is still honoured when the object provides one.
            grams = getattr(tweet, 'contiguous', None)
            if grams is None:
                grams = tweet.non_contiguous
            found = self._match_tokens(grams)
            self.total_contiguous_match += len(found)
            self.total_contiguous += len(grams)
            matches.extend(found)

        self.total_tweets += 1

        # check if this tweet had any match
        if not matches:
            # no term found in the dictionary
            return None
        self.total_tweets_match += 1

        if self.opinion:
            # opinion lexicon: every entry is one number, add them up
            summ = 0
            for _token, entry in matches:
                summ += entry
            return {'SUM_'+self.prefix: summ}

        # otherwise every entry is a dict of named values: sum them per key
        sum_dic = {}
        for _token, entry in matches:
            for key, value in entry.items():
                sum_dic[key] = sum_dic.get(key, 0) + value

        # prefix the dictionary keys before returning it
        return prefix_dict(sum_dic, self.prefix+'_')

    def print_match(self):
        """
        Print the variables that are related with the match
        """
        # single-argument print(...) produces the same output on Python 2 and 3
        print("Total tweets %d" % self.total_tweets_match)
        print("Total words %d" % self.total_words_match)
        print("Total bigrams %d" % self.total_bigrams_match)
        print("Total trigrams %d" % self.total_trigrams_match)
        print("Total contiguous %d" % self.total_contiguous_match)

In [14]:
# Run every tweet from the reader through the WNA lexicon.
# NOTE(review): `r` is not defined in the visible cells and the recorded run
# fails with "'module' object has no attribute 'wna'"; the notebook-level
# `wna` instance created in the lexicon cell may be what was intended — confirm.
for t in tr:
    tw = Tweet(t["text"])
    tw.process()
    print r.  wna.process_tweet(tw)

AttributeError: 'module' object has no attribute 'wna'

In [37]:
class AnewLexicon(BaseLexicon):
    """
    Lexicon loaded from 'anew.json' and reported under the 'ANEW' prefix.

    Built with opinion=False, so process_tweet sums the entries' named
    values per key instead of adding up a single score.
    """
    def __init__(self):
        super(AnewLexicon, self).__init__(prefix='ANEW', opinion=False)
        self._load_lexicon_json('anew.json')
        
# module-level instance used by the tweet-processing and comparison cells
anew = AnewLexicon()

class BingLexicon(BaseLexicon):
    """
    Holds Bing and Liu Lexicon Info, loaded from 'bing.json' and reported
    under the 'BING' prefix with the BaseLexicon defaults (opinion=True).
    """
    def __init__(self):
        super(BingLexicon, self).__init__(prefix='BING')
        self._load_lexicon_json('bing.json')
# module-level instance used by the comparison cells
bing = BingLexicon()
        
class DALLexicon(BaseLexicon):
    """
    Lexicon loaded from 'dal.json' and reported under the 'DAL' prefix.

    Built with opinion=False, so process_tweet sums the entries' named
    values per key instead of adding up a single score.
    """
    def __init__(self):
        super(DALLexicon, self).__init__(prefix='DAL', opinion=False)
        self._load_lexicon_json('dal.json')
        
# module-level instance used by the comparison cells
dal = DALLexicon()

class MPQALexicon(BaseLexicon):
    """
    Lexicon loaded from 'mpqa.json' and reported under the 'MPQA' prefix,
    using the BaseLexicon defaults (opinion=True).
    """
    def __init__(self):
        super(MPQALexicon, self).__init__(prefix='MPQA')
        self._load_lexicon_json('mpqa.json')
# module-level instance used by the comparison cells
mpqa = MPQALexicon()
        
class MSOLLexicon(BaseLexicon):
    """
    Lexicon loaded from 'msol.json' and reported under the 'MSOL' prefix,
    using the BaseLexicon defaults (opinion=True).
    """
    def __init__(self):
        super(MSOLLexicon, self).__init__(prefix='MSOL')
        self._load_lexicon_json('msol.json')
# module-level instance used by the comparison cells
msol = MSOLLexicon()

class NRCHashLexicon(BaseLexicon):
    """
    Lexicon loaded from 'nrc_hash.json' and reported under the 'NRCHASH'
    prefix. Built with bigrams=True, so process_tweet also looks up the
    tweet's bigrams; opinion keeps its default (True).
    """
    def __init__(self):
        super(NRCHashLexicon, self).__init__(prefix='NRCHASH', bigrams=True)
        self._load_lexicon_json('nrc_hash.json')
# module-level instance used by the tweet-processing and comparison cells
nhash = NRCHashLexicon()

class Sent140Lexicon(BaseLexicon):
    """
    Lexicon loaded from 'sent140.json' and reported under the 'SENT140'
    prefix. Built with bigrams=True, so process_tweet also looks up the
    tweet's bigrams; opinion keeps its default (True).
    """
    def __init__(self):
        super(Sent140Lexicon, self).__init__(prefix='SENT140', bigrams=True)
        self._load_lexicon_json('sent140.json')
# module-level instance used by the comparison cells
s140 = Sent140Lexicon()

class SentiStrenghtLexicon(BaseLexicon):
    """
    Lexicon loaded from 'sentstrenght.json' and reported under the
    'SSTREN' prefix, using the BaseLexicon defaults (opinion=True).

    The class and file names keep their existing spelling because other
    code refers to them as written.
    """
    def __init__(self):
        super(SentiStrenghtLexicon, self).__init__(prefix='SSTREN')
        self._load_lexicon_json('sentstrenght.json')
# module-level instance used by the correlation and comparison cells
ss = SentiStrenghtLexicon()

class TSLexStrengthLexicon(BaseLexicon):
    """
    Lexicon loaded from 'ts_lex.json' and reported under the 'TSLEX'
    prefix, using the BaseLexicon defaults (opinion=True).
    """
    def __init__(self):
        super(TSLexStrengthLexicon, self).__init__(prefix='TSLEX')
        self._load_lexicon_json('ts_lex.json')
# module-level instance used by the correlation and comparison cells
tslex = TSLexStrengthLexicon()

class WNALexicon(BaseLexicon):
    """
    Lexicon loaded from 'wna.json' and reported under the 'WNA' prefix.

    Built with opinion=False, so process_tweet sums the entries' named
    values per key instead of adding up a single score.
    """
    def __init__(self):
        super(WNALexicon, self).__init__(prefix='WNA', opinion=False)
        self._load_lexicon_json('wna.json')
# module-level instance used by the comparison cells
wna = WNALexicon()
# every lexicon instance created above; the comparison cell walks all ordered pairs of this list
lexs = [wna, tslex, ss, s140, nhash, msol, mpqa, dal, bing, anew]

In [13]:
def correlate(d1, d2):
    """
    Correlation between two opinion lexicons.

    Both arguments must expose `opinion` and `df`. The frames are joined
    on their index, rows with missing values are dropped, and the result
    is {first_column: {second_column: correlation}}. Returns None unless
    both lexicons are flagged as opinion lexicons.
    """
    both_opinion = d1.opinion and d2.opinion
    if not both_opinion:
        return None

    # keep only the tokens that have a value in both lexicons
    paired = d1.df.join(d2.df).dropna()
    matrix = paired.corr()

    first_col = matrix.columns[0]
    second_col = matrix.columns[1]
    return {first_col: {second_col: matrix.iloc[0, 1]}}
# Correlate the SSTREN and TSLEX lexicons (both instances come from the
# lexicon cell) and render the nested dict as a one-cell table.
a =  correlate(ss, tslex)
pd.DataFrame(a)

Unnamed: 0,SSTREN
TSLEX,0.176277


In [11]:
import itertools
# Nested results keyed by the first lexicon of each pair:
#   cover[first][second] -> coverage percentage from compare_lexicons
#   corr[first][second]  -> correlation from correlate_lexicon
cover = {}
corr = {}

# make the comparison of the lexicons (every ordered pair, so both directions)
for comb in itertools.permutations(lexs, 2):
    # check the common values between them
    t = comb[0].compare_lexicons(comb[1])

    # merge the returned {first: {second: value}} into the coverage table
    for first, row in t.items():
        cover.setdefault(first, {}).update(row)

    # check how correlated they are (None unless both are opinion lexicons)
    res = comb[0].correlate_lexicon(comb[1])
    if res:
        # merge under the key that correlate_lexicon itself returned, so the
        # entry is read and written under the same name
        for first, row in res.items():
            corr.setdefault(first, {}).update(row)

In [12]:
# Coverage table: each column is a lexicon, each row the lexicon it was compared
# against; a cell is the percentage of the column lexicon's tokens found in the row lexicon.
pd.DataFrame(cover)

Unnamed: 0,ANEW,BING,DAL,MPQA,MSOL,NRCHASH,SENT140,SSTREN,TSLEX,WNA
ANEW,,6.280407,7.159014,7.022721,1.281414,0.140255,0.076874,4.262542,0.324558,9.347181
BING,41.199226,,15.923999,78.16465,6.553665,0.488317,0.26783,28.253489,1.374041,51.632047
DAL,61.218569,20.757777,,28.135143,7.791885,1.105999,0.608929,13.127122,1.663656,22.997033
MPQA,46.034816,78.107032,21.567519,,7.47644,0.528643,0.28668,28.894757,1.270988,57.344214
MSOL,94.680851,73.816895,67.326397,84.272647,,2.04172,1.091646,36.325915,4.197341,78.041543
NRCHASH,92.166344,48.916409,84.992083,52.994984,18.158377,,28.169051,29.158808,22.049809,43.397626
SENT140,90.715667,48.179272,84.030762,51.608144,17.434555,50.584862,,29.498303,25.626462,43.026706
SSTREN,10.928433,11.042312,3.935761,11.301269,1.260471,0.113764,0.064089,,0.341734,7.344214
TSLEX,52.998066,34.203155,31.768831,31.661257,9.276178,5.479222,3.546128,21.765372,,31.602374
WNA,12.185687,10.260946,3.505994,11.404544,1.376963,0.086096,0.047534,3.73444,0.252302,


In [None]:
# Display whatever `t` currently holds. The name is reused by earlier cells for
# the tweet record, the reader loop variable and the coverage result, so the
# value shown depends on which of those cells ran last.
t