In [1]:
import string
import collections
from itertools import groupby

def load_tweets(path='data/TweetData.txt'):
    """Lazily yield the text of every tweet in ``path`` that is tagged as English.

    Each line is split on whitespace. The last field is compared against the
    language code ``'en'``; the first two fields and the language code are
    dropped, and the remaining fields are re-joined with single spaces.

    Args:
        path: Path to the tweet dump. The file is read as ASCII and bytes that
            cannot be decoded are silently dropped.

    Yields:
        str: The text of one English tweet (may be empty if the line holds
        nothing between the two leading fields and the language code).
    """
    with open(path, encoding='ascii', errors='ignore') as csvfile:
        for row in csvfile:
            elements = row.split()
            # Blank / whitespace-only lines produce no fields; indexing
            # elements[-1] on them used to raise IndexError.
            if elements and elements[-1] == 'en':
                yield ' '.join(elements[2:-1])

def load_abbreviations(path='data/Abbreviations.txt'):
    """Load an abbreviation -> expansion mapping from a comma-separated file.

    Each line is split on ``','``. The first field (stripped) is the key and
    the second field (stripped) is the value; any further fields are ignored.
    If a key occurs more than once, the last occurrence wins.

    Args:
        path: Path to the abbreviation file. The file is read as ASCII and
            bytes that cannot be decoded are silently dropped.

    Returns:
        dict: Mapping from abbreviation to its expansion.
    """
    abbreviations = {}
    with open(path, encoding='ascii', errors='ignore') as csvfile:
        for row in csvfile:
            abbreviation = row.split(',')
            # Lines without a comma (e.g. blank lines) have no expansion
            # field; skip them instead of failing with IndexError.
            if len(abbreviation) < 2:
                continue
            abbreviations[abbreviation[0].strip()] = abbreviation[1].strip()
    return abbreviations
                
def preprocess(nlp, tweets, abbreviations):
    """Clean raw tweets, deduplicate them and collect their trailing hashtags.

    For every tweet the function:

    1. drops non-printable characters and splits the text on spaces;
    2. removes a leading ``RT`` marker together with the word after it;
    3. removes standalone ``-`` words and words starting with ``http``;
    4. skips leading words that start with ``@`` or ``.``;
    5. strips trailing ``#hashtag`` words, feeding each one (lower-cased and
       expanded through ``abbreviations`` when present) to ``nlp`` and
       counting the items of the result in the hashtag counter;
    6. drops a final word ending in ``...``;
    7. turns remaining ``@name`` words into ``Name`` and removes every
       character outside letters, digits and ``-.,;:!?'``.

    A tweet that has no content left after these steps is recorded as the
    empty string rather than raising ``IndexError``.

    Args:
        nlp: Callable applied to a string. Its result must be iterable over
            hashable items (presumably a spaCy pipeline - confirm with caller).
        tweets: Iterable of raw tweet strings.
        abbreviations: Mapping from lower-case abbreviation to its expansion.

    Returns:
        tuple: ``(results, hashtags)`` where ``results`` is a list of
        ``(nlp(cleaned_text), occurrences)`` pairs, one per distinct cleaned
        tweet, and ``hashtags`` is a ``collections.Counter`` of the items
        produced by ``nlp`` for the trailing hashtags.
    """
    hashtags = collections.Counter()
    unique_tweets = collections.Counter()
    VALID_CHARACTERS = string.ascii_letters + string.digits + "-.,;:!?'"

    for tweet in tweets:
        # Keep printable characters only, then split on single spaces and
        # discard the empty pieces produced by consecutive spaces.
        printable = ''.join(c for c in tweet if c in string.printable)
        sentence = [word for word in printable.split(' ') if word]

        # Drop the retweet marker and the word that follows it. The emptiness
        # guard protects against blank tweets.
        if sentence and sentence[0] == 'RT':
            sentence = sentence[2:]

        sentence = [word for word in sentence if word != '-' and not word.startswith('http')]

        # Index of the first word that is neither a mention nor dot-prefixed
        # (or of the last word when every word is one of those).
        start_sentence = 0
        for j, word in enumerate(sentence):
            start_sentence = j
            if not word.startswith('@') and not word.startswith('.'):
                break

        # Walk backwards over the trailing hashtags, counting each of them;
        # stop at the first word that is not a hashtag.
        end_sentence = len(sentence) - 1
        for j, word in reversed(list(enumerate(sentence))):
            end_sentence = j
            if word.startswith('#'):
                tmp_hashtag = word[1:].lower()
                if tmp_hashtag in abbreviations:
                    tmp_hashtag = abbreviations[tmp_hashtag]
                hashtags.update(nlp(tmp_hashtag))
            else:
                break

        # Remove '#' and '...' at the end. The slice can be empty (for example
        # "@user #tag" or a retweet containing only a link), so guard the
        # access to the last word.
        sentence = sentence[start_sentence:end_sentence + 1]
        if sentence and sentence[-1].endswith('...'):
            sentence = sentence[:-1]

        # In-sentence mentions become capitalised names.
        sentence = [word[1:].capitalize() if word.startswith('@') else word for word in sentence]

        cleaned = ' '.join(
            ''.join(c for c in word if c in VALID_CHARACTERS) for word in sentence
        )
        unique_tweets[cleaned] += 1

    results = [(nlp(text), occurrences) for text, occurrences in unique_tweets.items()]

    return (results, hashtags)

In [2]:
import collections
import numpy as np

from sklearn.cluster import DBSCAN

class ConceptExtractor:
    """Accumulate weighted word samples and group them into concepts.

    Samples are stored in a ``collections.Counter`` keyed by ``Sample``;
    ``extract`` clusters their vectors with DBSCAN and reports one
    representative sample per cluster together with the cluster's total
    weight.
    """

    class Sample:
        """A word together with its vector.

        Hashing and equality use ``word`` only, so two samples with the same
        word but different vectors are the same counter key (the first one
        inserted is the one that is kept).
        """

        def __init__(self, word, vector):
            self.word = word
            self.vector = vector

        def __hash__(self):
            return hash(self.word)

        def __eq__(self, obj):
            # Objects without a ``word`` attribute are simply "not equal"
            # instead of raising AttributeError during dict/Counter lookups.
            if not hasattr(obj, 'word'):
                return NotImplemented
            return self.word == obj.word

        def __repr__(self):
            return self.word

    def __init__(self):
        self.samples = collections.Counter()

    def addSample(self, data, weight = 1):
        """Add ``weight`` to the count of a sample.

        Args:
            data: Either a ``ConceptExtractor.Sample`` or any object exposing
                ``lemma_`` and ``vector`` attributes, from which a ``Sample``
                is built.
            weight: Amount added to the sample's count (default 1).
        """
        if isinstance(data, ConceptExtractor.Sample):
            sample = data
        else:
            sample = ConceptExtractor.Sample(data.lemma_, data.vector)
        self.samples.update({sample: weight})

    def extract(self, threshold=0.3, exclude=()):
        """Cluster the collected samples and score each cluster.

        Args:
            threshold: Passed to DBSCAN as ``eps``.
            exclude: Iterable of cluster labels to drop from the result.
                scikit-learn's DBSCAN labels noise points ``-1``. Labels that
                the clustering did not produce are ignored.

        Returns:
            collections.Counter: For every remaining cluster, the sample with
            the highest individual count mapped to the sum of the counts of
            all samples in that cluster. Empty when no samples were added.
        """
        fixed_map = list(self.samples.items())
        if not fixed_map:
            # Nothing to cluster; fixed_map[0] below would raise IndexError.
            return collections.Counter()

        features = np.empty((len(fixed_map), len(fixed_map[0][0].vector)), dtype=np.float32)
        # sample_weight is documented as shape (n_samples,), so keep it 1-D.
        weights = np.empty(len(fixed_map), dtype=np.uint32)
        for i, (sample, occurrences) in enumerate(fixed_map):
            features[i] = sample.vector
            weights[i] = occurrences

        clusters = DBSCAN(eps=threshold).fit_predict(features, sample_weight=weights)

        # Cluster label -> list of (sample, count) pairs belonging to it.
        interpretation = {}
        for entry, cluster in zip(fixed_map, clusters):
            interpretation.setdefault(cluster, []).append(entry)

        for label in exclude:
            # pop() instead of del: an excluded label may be absent.
            interpretation.pop(label, None)

        results = collections.Counter()
        for members in interpretation.values():
            score = sum(n for _, n in members)
            representative = max(members, key=lambda t: t[1])
            results.update({representative[0]: score})
        return results

In [3]:
import spacy

# Entity labels that are skipped when collecting concept samples.
EXCLUDE_ENTITIES = ['DATE', 'TIME', 'PERCENT', 'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL']

# Build the pipeline inputs: language model, abbreviation table, cleaned tweets.
nlp = spacy.load('en')
abbreviations = load_abbreviations()
concepts = ConceptExtractor()
tweets, hashtags = preprocess(nlp, load_tweets(), abbreviations)

# Every hashtag token contributes a sample weighted by how often it was seen.
for hashtag, weight in hashtags.items():
    concepts.addSample(hashtag, weight)

# Every kept named entity contributes a sample weighted by the number of
# identical tweets it appeared in.
for tweet in tweets:
    doc, occurrences = tweet
    for entity in doc.ents:
        if entity.label_ in EXCLUDE_ENTITIES:
            continue

        lemma = entity.lemma_
        # Strip a trailing possessive marker before the abbreviation lookup.
        if lemma.endswith(" 's"):
            lemma = lemma[:-3]
        lemma = abbreviations.get(lemma, lemma)

        concepts.addSample(ConceptExtractor.Sample(lemma, entity.vector), occurrences)

# Cluster labels -1 and 0 are dropped before ranking the ten heaviest concepts.
concepts.extract(exclude=[-1, 0]).most_common(10)

[(manchester united, 1204),
 (arturo vidals, 1018),
 (new zealand, 682),
 (paul scholes, 147),
 (the worlds first climate change refugees, 118),
 (united, 88),
 (australian, 67),
 (james wilson, 46),
 (tuvalu, 43),
 (napoli, 37)]