In [1]:
from nltk import ne_chunk, pos_tag, word_tokenize

In [2]:
from nltk.tree import Tree
import re
from langdetect import detect_langs

In [3]:
def get_continuous_chunks(text):
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    prev = None
    continuous_chunk = []
    current_chunk = []

    for i in chunked:
        if type(i) == Tree:
            current_chunk.append(" ".join([token for token, pos in i.leaves()]))
        elif current_chunk:
            named_entity = " ".join(current_chunk)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)
                current_chunk = []
        else:
            continue

    if continuous_chunk:
        named_entity = " ".join(current_chunk)
        if named_entity not in continuous_chunk:
            continuous_chunk.append(named_entity)

    return continuous_chunk



In [4]:
def removeEmoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'',text)

In [5]:
def cleantext(text):
    res = ''
    for s in text.split():
        if  s.startswith('@') or  s.startswith('#') or s.startswith('RT@') or s=='RT' or s.startswith('http') : 
            continue
        else:   
            res +=  s + ' '
    return res

In [6]:
def isEnglish(string):
    try: 
        res = detect_langs(string)
    except Exception as e:
        return False
    for item in res:
        if item.lang == "en":
            return True
    return False

In [8]:
txt = "RT @VintageUV: Come on England. #worldcup #worldcup2018 #england #ourboys #vintage #russia #believe #comeonengland #football https://t.co/a…" 
txt = cleantext(removeEmoji(txt))
print(isEnglish(txt))

True


In [11]:
f = open('stream_text.txt','r',encoding='utf-8')
f.close()

In [12]:
tweets = list(open('stream_text_all.txt','r',encoding='utf-8').read().splitlines())
print(len(tweets))
tweets = list(set(tweets))
print(len(tweets))

21886
11627


In [13]:
all_nes =[]
for tw in tweets:
    data = cleantext(removeEmoji(tw))
    if(len(data)<3):
        continue
    nes = []
    if(isEnglish(data)):
        nes = get_continuous_chunks(data)
        all_nes.extend(nes)

In [14]:
from collections import Counter
x=Counter(all_nes)
x.most_common()

[('', 4316),
 ('Russia', 1047),
 ('FIFA', 723),
 ('England', 204),
 ('Saudi Arabia', 171),
 ('Happy', 109),
 ('New', 65),
 ('Brazil', 61),
 ('FREE', 56),
 ('Germany', 52),
 ('Good', 51),
 ('Moscow', 51),
 ('Spain', 51),
 ('Football', 47),
 ('Russian', 47),
 ('Argentina', 44),
 ('BETBRIGHT Offer Deposit', 43),
 ('BETBRIGHT Offer Double', 42),
 ('CORAL', 41),
 ('QUESTIONS CORRECTLY', 41),
 ('Which', 39),
 ('France', 37),
 ('Saudi', 37),
 ('Nigeria', 35),
 ('Portugal', 30),
 ('Fifa', 29),
 ('Iran', 26),
 ('Ready', 24),
 ('English', 24),
 ('Egypt', 23),
 ('Golden', 23),
 ('Messi', 23),
 ('Great', 22),
 ('UNIBET Offer Bet', 22),
 ('Bundle', 22),
 ('Super Eagles', 20),
 ('Belgium', 20),
 ('Bet', 19),
 ('Scotland', 18),
 ('Saudi Arabia New Customers', 17),
 ('Christmas', 15),
 ('African', 15),
 ('Africa', 15),
 ('Ball', 15),
 ('Watch', 15),
 ('RUSSIA', 15),
 ('Match', 14),
 ('Morocco', 14),
 ('Don', 13),
 ('Australia', 12),
 ('Happy World', 12),
 ('Robbie Williams', 12),
 ('LIVE', 12),
 ('WAT

In [24]:

print (get_continuous_chunks(txt))

['England', '']


In [6]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Nectec\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [8]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Nectec\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [10]:
nltk.download('maxent_ne_chunker')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\Nectec\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping chunkers\maxent_ne_chunker.zip.


True

In [12]:
nltk.download('words')

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Nectec\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\words.zip.


True