In [1]:
import codecs
import CMUTweetTaggerWindows
import os

class TweetParser(object):
    """Tag tweets with the CMU tweet tagger and parse its output.

    The tagger (``CMUTweetTaggerWindows.runFile``) is given the path of a
    text file with one tweet per line and returns the path of a result file.
    Each result line holds tab-separated columns; the first three are the
    space-separated tokens, tags and confidences of one tweet.

    Attributes:
        tags_to_keep: collection of tag symbols used as a filter when
            parsing; an empty collection disables filtering.
        tags_meaning: dict mapping each tag symbol to its description.
    """

    # Human-readable description of every tag symbol.
    TAGS_MEANING = {
        "N": "common noun",
        "O": "pronoun (personal/WH; not possessive)",
        "^": "proper noun",
        "S": "nominal + possessive(lady's)",
        "Z": "proper noun + possessive (Palin's, Alaska's)",
        "V": "verb incl. copula, auxiliaries",
        "A": "adjective",
        "R": "adverb",
        "!": "interjection",
        "D": "determiner",
        "P": "pre- or postposition, or subordinating conjunction",
        "&": "coordinating conjunction",
        "T": "verb particle",
        "X": "existential _there_, predeterminers",
        "#": "hashtag (indicates topic/category for tweet)",
        "@": "at-mention (indicates another user as a recipient of a tweet)",
        "~": "discourse marker, indications of continuation of a message across multiple tweets",
        "U": "URL or email address",
        "E": "emoticon",
        "$": "numeral",
        ",": "punctuation",
        "G": "other abbreviations, foreign words, possessive endings, symbols, garbage",
        "L": "nominal + verbal (e.g. _i'm_), verbal + nominal (_let's_, _lemme_)",
        "M": "proper noun + verbal",
        "Y": " 'X' + verbal",
    }

    def __init__(self, tags_to_keep=None):
        """Create a parser.

        Args:
            tags_to_keep: optional collection of tag symbols to keep when
                parsing. ``None`` (the default) means "no filter".
        """
        # A fresh list per instance: a mutable default argument would be
        # shared by every parser created without an explicit filter.
        self.tags_to_keep = [] if tags_to_keep is None else tags_to_keep
        # Per-instance copy so reassigning or editing it never touches the class table.
        self.tags_meaning = dict(self.TAGS_MEANING)

    @property
    def tags_to_keep(self):
        """Tag symbols retained by ``parse_tags_from_file`` (empty = keep all)."""
        return self.__tags_to_keep

    @tags_to_keep.setter
    def tags_to_keep(self, tags_to_keep):
        self.__tags_to_keep = tags_to_keep

    @property
    def tags_meaning(self):
        """Dict mapping each tag symbol to its description."""
        return self.__tags_meaning

    @tags_meaning.setter
    def tags_meaning(self, tags_meaning):
        self.__tags_meaning = tags_meaning

    def split_result(self, line):
        """Split one line of tagger output into (token, tag, confidence) triples.

        Args:
            line: one line of the tagger result file.

        Returns:
            A list of ``(token, tag, confidence)`` string triples, one per
            token. A blank line yields an empty list.

        Raises:
            IndexError: if the line has fewer than three tab-separated
                columns, or fewer tags/confidences than tokens.
        """
        line = line.strip()  # drop the trailing newline
        if not line:
            # Always return a list so callers can iterate the result safely.
            return []
        parts = line.split('\t')
        tokens = parts[0].split(" ")
        tags = parts[1].split(" ")
        confidence = parts[2].split(" ")
        return [(tokens[i], tags[i], confidence[i]) for i in range(len(tokens))]

    def parse_tags_from_file(self, filename):
        """Parse a tagger result file.

        Args:
            filename: path of the file produced by the tagger.

        Returns:
            A list with one entry per line of the file; each entry is the
            list of ``(token, tag, confidence)`` triples for that line,
            restricted to ``tags_to_keep`` when that filter is non-empty.
        """
        tags = []
        with open(filename, "r") as f:
            for line in f:
                result = self.split_result(line)
                if self.tags_to_keep:
                    # A tag filter was given: keep only triples whose tag is listed.
                    result = [triple for triple in result
                              if triple[1] in self.tags_to_keep]
                tags.append(result)
        return tags

    def tokenize_tweets(self, tweets):
        """Tag a sequence of tweets.

        Args:
            tweets: iterable of mappings, each with a ``'text'`` entry.

        Returns:
            The parsed tagger output (see ``parse_tags_from_file``).
        """
        file_name = "temp-file.txt"
        # One tweet per line: embedded newlines would split a tweet in two.
        texts = [tweet['text'].replace('\n', ' ') for tweet in tweets]
        tagged_file = None
        try:
            with codecs.open(file_name, 'w', 'utf-8') as out:
                out.write("\n".join(texts))
            tagged_file = CMUTweetTaggerWindows.runFile(file_name)
            return self.parse_tags_from_file(tagged_file)
        finally:
            # Clean up both temporary files even when tagging or parsing fails.
            if os.path.exists(file_name):
                os.remove(file_name)
            if tagged_file is not None and os.path.exists(tagged_file):
                os.remove(tagged_file)

    def tokenize_tweets_from_file(self, file_name):
        """Tag the tweets stored one per line in ``file_name``.

        The input file is left untouched; the tagger's result file is
        deleted after parsing.

        Returns:
            The parsed tagger output (see ``parse_tags_from_file``).
        """
        filename = CMUTweetTaggerWindows.runFile(file_name)
        try:
            return self.parse_tags_from_file(filename)
        finally:
            if os.path.exists(filename):
                os.remove(filename)

In [23]:
import pymongo
import codecs

# With no arguments MongoClient connects to localhost.
client = pymongo.MongoClient()

# Select the database (per the original note, it is created if missing).
db = client['inforet']
col = db['rumors_raw']

# Load every record, projecting only the listed fields; the parser reads
# each record's 'text' entry.
tweets = list(col.find({}, ['id', 'rumor', 'text']))

# Tag symbols of interest: nouns, proper nouns, their possessive forms,
# adjectives, hashtags and numerals.
tags_to_keep = ['N', '^', 'S', 'Z', 'A', '#', '$']

# Start without a tag filter; pass tags_to_keep to the constructor (or set
# parser.tags_to_keep) to keep only those tags.
parser = TweetParser()

In [None]:
# Tag the example tweets file with no tag filter set yet.
# NOTE: these are bare expressions; in a notebook only the last expression
# of a cell is displayed, so the intermediate results are not shown.
parser.tokenize_tweets_from_file("examples/example_tweets.txt")

# Dictionary of tag symbol -> description.
parser.tags_meaning

# Enable filtering: from here on only the listed tags are kept.
parser.tags_to_keep = tags_to_keep

parser.tags_to_keep

# Tag the tweets loaded from MongoDB; this value is the cell's output.
parser.tokenize_tweets(tweets)

Sample output

[

[('Bolivian', '^', '0,9677'),
  ('news', 'N', '0,8951'),
  ('stills', 'N', '0,9608'),
  ('pics', 'N', '0,9922'),
  ('Air', '^', '0,9132'),
  ('France', '^', '0,9988'),
  ('disaster', 'N', '0,9832')],
  
  
 [('Air', '^', '0,6493'),
  ('France', '^', '0,9990'),
  ('jet', 'N', '0,9638'),
  ('crash', 'N', '0,8447'),
  ('Search', 'N', '0,8679'),
  ('teams', 'N', '0,9963'),
  ('black', 'A', '0,9503'),
  ('box', 'N', '0,9922'),
  ('signals', 'N', '0,5163')],
  
  
 [('News', 'N', '0,5745'),
  ('Outlets', 'N', '0,9900'),
  ('Shocking', 'A', '0,9778'),
  ('Air', '^', '0,7680'),
  ('France', '^', '0,9988'),
  ('Crash', 'N', '0,7106'),
  ('Pictures', 'N', '0,9732'),
  ('Stills', 'N', '0,6595')]
  
  ]

In [1]:
# Description of every tag symbol, grouped by category.
tags_meaning = {
    # Nominal and nominal + possessive
    "N": "common noun",
    "O": "pronoun (personal/WH; not possessive)",
    "^": "proper noun",
    "S": "nominal + possessive(lady's)",
    "Z": "proper noun + possessive (Palin's, Alaska's)",
    # Other open-class words
    "V": "verb incl. copula, auxiliaries",
    "A": "adjective",
    "R": "adverb",
    "!": "interjection",
    # Closed-class words
    "D": "determiner",
    "P": "pre- or postposition, or subordinating conjunction",
    "&": "coordinating conjunction",
    "T": "verb particle",
    "X": "existential _there_, predeterminers",
    # Twitter / online-specific
    "#": "hashtag (indicates topic/category for tweet)",
    "@": "at-mention (indicates another user as a recipient of a tweet)",
    "~": "discourse marker, indications of continuation of a message across multiple tweets",
    "U": "URL or email address",
    "E": "emoticon",
    # Miscellaneous
    "$": "numeral",
    ",": "punctuation",
    "G": "other abbreviations, foreign words, possessive endings, symbols, garbage",
    # Compounds
    "L": "nominal + verbal (e.g. _i'm_), verbal + nominal (_let's_, _lemme_)",
    "M": "proper noun + verbal",
    "Y": " 'X' + verbal",
}

In [2]:
# Tag symbols to keep: nouns, proper nouns, their possessive forms,
# adjectives, hashtags and numerals.
tags_to_keep = ['N', '^', 'S', 'Z', 'A', '#', '$']

In [10]:
from prettytable import PrettyTable
# Render every tag symbol with its description as a text table.
x = PrettyTable()

x.field_names = ["Symbol", "Meaning"]
for symbol, meaning in tags_meaning.items():
    x.add_row([symbol, meaning])
# A parenthesised print works as both the Python 2 statement and the Python 3 function.
print(x)

+--------+-----------------------------------------------------------------------------------+
| Symbol |                                      Meaning                                      |
+--------+-----------------------------------------------------------------------------------+
|   !    |                                    interjection                                   |
|   #    |                    hashtag (indicates topic/category for tweet)                   |
|   $    |                                      numeral                                      |
|   &    |                              coordinating conjunction                             |
|   ,    |                                    punctuation                                    |
|   A    |                                     adjective                                     |
|   @    |           at-mention (indicates another user as a recipient of a tweet)           |
|   E    |                                      em

In [12]:
from prettytable import PrettyTable
# Render only the tags listed in tags_to_keep, with their descriptions.
x = PrettyTable()

x.field_names = ["Symbol", "Meaning"]
for symbol, meaning in tags_meaning.items():
    if symbol in tags_to_keep:
        x.add_row([symbol, meaning])
# A parenthesised print works as both the Python 2 statement and the Python 3 function.
print(x)

+--------+----------------------------------------------+
| Symbol |                   Meaning                    |
+--------+----------------------------------------------+
|   #    | hashtag (indicates topic/category for tweet) |
|   $    |                   numeral                    |
|   A    |                  adjective                   |
|   N    |                 common noun                  |
|   S    |         nominal + possessive(lady's)         |
|   Z    | proper noun + possessive (Palin's, Alaska's) |
|   ^    |                 proper noun                  |
+--------+----------------------------------------------+
