In [1]:
import json
import itertools
import numpy as np
import collections
import pandas as pd
import networkx as nx
from operator import itemgetter
from collections import defaultdict

import preprocessor as p
from kneed import  KneeLocator
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from scipy.spatial.distance import cdist
from keras.models import model_from_json
from keras.preprocessing import sequence

from src.preprocessing import normalize,\
                              substitute_label_,\
                              replace_word_index_twitter_


# Lexicon polarity
# Every JSON input is read inside a `with` block so its file handle is closed as
# soon as the content has been parsed.
with open('outputfile.json') as json_file:
    data = json.load(json_file)
with open('data/lexicon_polarity.json') as json_file:
    vocabolario_lexicon = json.load(json_file)
with open('data/vocabolario_twitter.json') as json_file:
    vocabolario_index_twitter = json.load(json_file)


# Load Sentiment Analysis model
# The architecture is stored as JSON text; the `with` block closes the file, so no
# explicit close() is needed.
with open('src/model.json', 'r') as json_file:
    loaded_model_json = json_file.read()

loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("src/model.h5")

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead


In [2]:
""" Da aggiungere quando c'è il database: check, tramite id, se il tweet è già nel db e aggiornare il 
count dei retweet"""

class Tweet(object):
    """A single post from the Twitter streaming API, reduced to the fields used here.

    For a retweet, the text, tweet id, publication date and author are read from
    the embedded ``retweeted_status`` (the original post); the ``*_retweet``
    attributes always describe the received post itself.

    Note: ``is_a_retweet`` and ``sentiment`` are methods on the class, but
    ``__init__`` overwrites them on every instance with their result, so on a
    constructed instance they are plain values.

    Attributes:
    is_a_retweet: True when the payload carries a non-empty 'retweeted_status'
    tweet_text: raw text of the original post
    id_tweet: id of the original post
    id_retweet: id of the received post
    num_retweet: 'retweet_count' of the received post at construction time
    list_hashtags: lower-cased hashtags (without '#') found in the cleaned text
    data_tweet: 'created_at' of the original post
    data_retweet: 'created_at' of the received post
    user_tweet_id: id of the author of the original post
    user_info: dict with the author's 'name' and 'followers_count'
    normalized_text: value returned by replace_word_index_twitter_ for the text
    padding: normalized_text padded with zeros / truncated to 40 entries
    sentiment: 'positive' or 'negative'
    changable_attributes: state updated as further retweets arrive
                          ('num_retweet' and 'list_user_retweet')
    """

    def __init__(self,
                 tweet_object,
                 vocabolario_lexicon,
                 vocabolario_index_twitter):

        # The order matters: most getters read self.is_a_retweet, get_hashtag
        # reads self.tweet_text, and the sentiment pipeline needs
        # normalized_text before padding and prediction.
        self.is_a_retweet = self.is_a_retweet(tweet_object)
        self.tweet_text = self.get_text(tweet_object)
        self.id_tweet = self.get_id_tweet(tweet_object)
        self.id_retweet = self.get_id_retweet(tweet_object)
        self.num_retweet = self.get_number_retweets(tweet_object)
        self.list_hashtags = self.get_hashtag()
        self.data_tweet = self.get_date_tweet(tweet_object)
        self.data_retweet = self.get_date_retweet(tweet_object)
        self.user_tweet_id = self.get_user_tweet(tweet_object)
        self.user_info = self.get_info_user_tweet(tweet_object)
        self.normalized_text = self._textNormalization(vocabolario_lexicon,
                                                        vocabolario_index_twitter)
        self.padding = self._textPadding()
        self.sentiment = self.sentiment()
        self.changable_attributes = {'num_retweet': self.num_retweet,
                                     'list_user_retweet': []}

    def _sourceStatus(self, tweet_object):
        """Return the payload describing the original post.

        :return: tweet_object['retweeted_status'] for a retweet, otherwise
                 tweet_object itself
        """
        if self.is_a_retweet:
            return tweet_object['retweeted_status']

        return tweet_object

    def get_text(self, tweet_object):
        """Get text of tweet without preprocessing.

        :return: 'text' of the original post
        """
        return self._sourceStatus(tweet_object)['text']

    def get_cleaned_text(self):
        """Get the tweet's text after text_cleaning.

        :return: self.tweet_text without url, emoji and mentions
        """
        return self.text_cleaning(self.tweet_text)

    @staticmethod
    def text_cleaning(text_tweet):
        """Return text without url, emoji and mentions.

        :param text_tweet: text to clean
        :return: result of p.clean with the URL, EMOJI and MENTION options set
        """
        p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.MENTION)
        return p.clean(text_tweet)

    def get_hashtag(self):
        """Return the list of hashtags in the tweet.

        :return: hashtags of the cleaned text, lower-cased and without the
                 leading character; [] when the parser reports none
        """
        tweet_text = self.get_cleaned_text()
        # NOTE(review): set_options is called on the shared module object `p`;
        # presumably the HASHTAG-only setting is still active when
        # _textNormalization later calls p.tokenize -- confirm this is intended.
        p.set_options(p.OPT.HASHTAG)
        hashtags_ = p.parse(tweet_text).hashtags
        if hashtags_ is None:
            return []

        # match[1:] drops the first character of the matched token (the '#')
        return [i.match[1:].lower() for i in hashtags_]

    def is_a_retweet(self, tweet_object):
        """Tell if the post is a retweet or not.

        The original implementation used ``assert`` for the lookup, which is
        removed entirely under ``python -O`` (every post would then look like a
        retweet) and raised AssertionError for a present but empty value.

        :return: True when 'retweeted_status' is present and non-empty
        """
        return bool(tweet_object.get('retweeted_status'))

    def get_id_tweet(self, tweet_object):
        """Return the id of the original post.

        :return: 'id' of the original post
        """
        return self._sourceStatus(tweet_object)['id']

    def get_id_retweet(self, tweet_object):
        """Return the id of the received post.

        :return: tweet_object['id']
        """
        return tweet_object['id']

    def get_number_retweets(self, tweet_object):
        """Number of retweets reported on the received post.

        The count of the embedded original post is deliberately not used
        (that variant was disabled in the original code).

        :return: tweet_object['retweet_count']
        """
        return tweet_object['retweet_count']

    def get_date_tweet(self, tweet_object):
        """Publication date of the original post.

        :return: 'created_at' of the original post
        """
        return self._sourceStatus(tweet_object)['created_at']

    def get_date_retweet(self, tweet_object):
        """Publication date of the received post.

        :return: tweet_object['created_at']
        """
        return tweet_object['created_at']

    def get_user_tweet(self, tweet_object):
        """Return the id of the user that wrote the original post.

        :return: user id of the original post
        """
        return self._sourceStatus(tweet_object)['user']['id']

    def get_info_user_tweet(self, tweet_object):
        """Return info about the user that wrote the original post.

        :return: dict with the keys 'name' and 'followers_count'
        """
        user = self._sourceStatus(tweet_object)['user']
        return {'name': user['name'],
                'followers_count': user['followers_count']}

    def get_user_retweet(self, tweet_object):
        """Return the id of the user that published the received post.

        :return: tweet_object['user']['id']
        """
        return tweet_object['user']['id']

    def sentiment(self):
        """Tell if the tweet is positive or negative.

        :return: 'negative' when the predicted class is 0, else 'positive'
        """
        if self._predictSentiment() == 0:
            return 'negative'

        return 'positive'

    def _textPadding(self):
        """Return the token sequence as an array of exactly 40 entries.

        Shorter sequences are right-padded with zeros. Longer sequences are cut
        to their first 40 entries; the original returned them unpadded and
        therefore longer than 40.

        :return: numpy array of length 40
        """
        max_length = 40
        tokens = self.normalized_text[:max_length]
        pad_text = np.append(np.array(tokens),
                             np.array([0] * (max_length - len(tokens))))

        return pad_text

    def _textNormalization(self, vocabolario_lexicon, vocabolario_index_twitter):
        """Return the normalized text for padding.

        Keyword Arguments:
        vocabolario_lexicon -- passed to substitute_label_
        vocabolario_index_twitter -- passed to replace_word_index_twitter_

        :return: value returned by replace_word_index_twitter_
                 (presumably a list of word indices -- TODO confirm)
        """
        token_tweet = p.tokenize(self.tweet_text)
        split_normalize_tweet = normalize(token_tweet).split()
        replace_and_split_lexicon = substitute_label_(split_normalize_tweet,
                                                      vocabolario_lexicon).split()
        to_pad = replace_word_index_twitter_(replace_and_split_lexicon,
                                             vocabolario_index_twitter)

        return to_pad

    def _predictSentiment(self):
        """Return the sentiment prediction for the tweet.

        :return: output of loaded_model.predict_classes for a batch holding
                 only this tweet's padded sequence
        """
        return loaded_model.predict_classes(np.array([self._textPadding(), ]))

    def _updateNumberRetweet(self, tweet_object):
        """Raise the tracked retweet count if the new payload reports more.

        The comparison is made against the value already stored in
        changable_attributes, so the tracked count never decreases. The
        original compared against the construction-time count, which let a
        later, smaller report overwrite a larger one.

        :return: None
        """
        current = self.changable_attributes['num_retweet']
        self.changable_attributes['num_retweet'] = max(current,
                                                       self.get_number_retweets(tweet_object))

    def _updateListUserRetweet(self, tweet_object):
        """Append (user id, date) of the received post to 'list_user_retweet'.

        :return: None
        """
        self.changable_attributes['list_user_retweet'] += [(self.get_user_retweet(tweet_object),
                                                            self.get_date_retweet(tweet_object))]

In [3]:
class TweetCollection(object):
    """Subset of a tweet collection whose hashtags contain a given string.

    Attributes:
    hashtag: the string searched for
    collection: list of matching tweet objects, in input order, without repeats
    """

    def __init__(self, hashtag, collection_totale):

        self.hashtag = hashtag
        self.collection = self.collezione(collection_totale)

    def collezione(self, collection_totale):
        """Return the tweets that have at least one hashtag containing self.hashtag.

        NOTE(review): the comparison is a substring test (``in``), so 'pd' also
        selects a tweet tagged 'pdnetwork' -- confirm this is intended.

        :return: list of tweet objects
        """

        matching = []
        for candidate in collection_totale:
            tags = candidate.__dict__['list_hashtags']
            has_tag = any(self.hashtag in tag for tag in tags)
            # membership is only checked for candidates that matched
            if has_tag and candidate not in matching:
                matching.append(candidate)

        return matching

In [4]:
class Hashtag(object):
    """Tweets and users associated with one hashtag.

    Attributes:
    hashtag: the hashtag string
    lista_tweet: pair (matching tweet objects, their 'id_tweet' values)
    lista_user: entries collected by get_list_users
    """

    def __init__(self,
                 hashtag,
                 tweet_collection):
        """Build the tweet and user lists for `hashtag` out of `tweet_collection`."""

        self.hashtag = hashtag
        self.lista_tweet = self.get_list_tweet(tweet_collection)
        self.lista_user = self.get_list_users()

    def get_list_tweet(self, tweet_collection):
        """Return the tweets selected by TweetCollection together with their ids.

        :return: tuple (list of tweet objects, list of their 'id_tweet')
        """

        selected = vars(TweetCollection(self.hashtag, tweet_collection))['collection']
        # one id per selected tweet, in the same order
        ids = [vars(tweet)['id_tweet'] for tweet in selected]

        return selected, ids

    def get_list_users(self):
        """Return the author id and the retweet entries of every selected tweet.

        NOTE(review): authors are added as bare ids, while the entries taken from
        'list_user_retweet' are added unchanged (Tweet stores them as
        (user id, date) pairs), so the result mixes both shapes -- confirm that
        callers expect this.

        :return: list
        """

        collected = []
        for tweet in self.lista_tweet[0]:
            attrs = vars(tweet)
            collected.append(attrs['user_tweet_id'])
            collected.extend(attrs['changable_attributes']['list_user_retweet'])

        return collected

In [5]:
"""Simulazione dell'arrivo dei tweet"""
list_id_tweet = []
list_hashtags = []
collection_totale = []
edges_weight = defaultdict(int)
# id_tweet -> position of that tweet in collection_totale (and list_id_tweet).
# Replaces the linear `in` / `.index` scans of list_id_tweet, which made the
# whole loop quadratic in the number of distinct tweets.
position_by_id = {}

for tweet in data:

    object_tweet = Tweet(tweet, vocabolario_lexicon, vocabolario_index_twitter)
    id_tweet = object_tweet.id_tweet
    list_hashtag = object_tweet.list_hashtags
    idx_object = position_by_id.get(id_tweet)

    if idx_object is None:
        # First time this original tweet is seen: store it and count every
        # unordered pair of its hashtags as one co-occurrence.
        position_by_id[id_tweet] = len(collection_totale)
        list_id_tweet.append(id_tweet)
        collection_totale.append(object_tweet)
        list_hashtags.extend(list_hashtag)
        for edge in itertools.combinations(list_hashtag, 2):
            edges_weight[tuple(sorted(edge))] += 1

    else:
        # Already stored: the new payload only updates the stored object, which
        # is mutated in place (no write-back into the list is needed).
        object_to_mod = collection_totale[idx_object]
        object_to_mod._updateNumberRetweet(tweet)
        object_to_mod._updateListUserRetweet(tweet)

# NOTE(review): GraphHashtag is defined in a later cell of this notebook; that
# cell has to be executed before this line can run.
Graph = GraphHashtag(edges_weight, collection_totale, False)

# Hashtag

In [11]:
# Dashboard data for the tweets selected by the hashtag 'maremma'.
obj_hashtag = Hashtag('maremma', collection_totale)
hashtag = vars(obj_hashtag)

# 'lista_tweet' holds the pair (tweet objects, tweet ids); only the objects are
# used from here on.
lista_tweet = hashtag['lista_tweet'][0]
num_tweet = len(lista_tweet)

# Chart payloads, each computed from the same list of tweets.
top_retweet = get_top_retweet(lista_tweet)
list_vector_pie = sentiment_percentage(lista_tweet)
list_unici_utenti = unique_cumulative_users(lista_tweet)
stream_neg = stream_tweet(lista_tweet, sentimento='negative')
stream_pos = stream_tweet(lista_tweet, sentimento='positive')
lista_diz_hash = co_occurrences(get_list_hashtags(lista_tweet))

# Topic

In [24]:
class GraphHashtag(object):
    """Weighted hashtag co-occurrence graph and its partition into topics.

    Attributes:
    with_jaccard: whether edge weights are normalised (see _defineEdges)
    G: networkx Graph whose nodes are hashtags
    clusters: mapping KMeans label -> list of hashtags with that label
    tweet_clusters: mapping cluster name -> list of tweets of that cluster
    """

    def __init__(self, edge_weights, tweet_collection, with_jaccard):
        self.with_jaccard = with_jaccard
        self.G = self.create_graph(edge_weights, tweet_collection)
        self.clusters = self.topic_content()
        self.tweet_clusters = self.tweet_cluster(tweet_collection)

    def create_graph(self, edge_weights, tweet_collection):
        """Return the graph of hashtags.

        :return: networkx Graph built from the edges of _defineEdges
        """

        G = nx.Graph()
        G.add_weighted_edges_from(self._defineEdges(edge_weights,
                                                    tweet_collection,
                                                    self.with_jaccard))

        return G

    def topic_content(self):
        """Return the hashtags of every cluster.

        :return: defaultdict mapping KMeans label -> list of hashtags
        """

        partitions = self._graphPartitioning()

        # labels_ is aligned with the node order of the graph
        cluster_list_hashtag = defaultdict(list)
        for hash_, class_ in zip(self.G.nodes(), partitions.labels_):
            cluster_list_hashtag[class_].append(hash_)

        return cluster_list_hashtag

    def tweet_cluster(self, tweet_collection):
        """Return the list of tweets per topic.

        A tweet belongs to every cluster that contains one of its hashtags;
        tweets without any hashtag in the graph are left out.

        :return: defaultdict mapping cluster name -> list of tweets
        """

        cluster_name = self._renameClusters(tweet_collection)
        hash_cluster = {hash_: cluster
                        for cluster, list_hash in self.clusters.items()
                        for hash_ in list_hash}

        tweet_cluster = {}
        for tweet in tweet_collection:
            list_clusters = [hash_cluster[h]
                             for h in tweet.__dict__['list_hashtags']
                             if h in self.G.nodes()]
            tweet_cluster[tweet] = list(set(list_clusters))

        # Tweets that do not belong to any cluster are skipped: their list of
        # clusters is empty, so the inner loop never runs for them.
        cluster_tweet = defaultdict(list)
        for tweet, list_cluster in tweet_cluster.items():
            for cluster in list_cluster:
                cluster_tweet[cluster_name[cluster]].append(tweet)

        return cluster_tweet

    @staticmethod
    def _selectNumComponents(variance_ratio, threshold, max_components):
        """Return how many leading components are needed to exceed `threshold`.

        np.argmax gives the zero-based index of the first component at which the
        cumulative ratio exceeds the threshold, so the count is that index + 1.
        The result is kept within [1, max_components].

        :return: int
        """

        reached = np.cumsum(variance_ratio) > threshold
        if not reached.any():
            return max(1, max_components)

        needed = int(np.argmax(reached)) + 1
        return max(1, min(needed, max_components))

    def _adjacencyMatrixReduction(self):
        """Return the adjacency matrix projected onto its principal components.

        The number of components is the smallest one whose cumulative explained
        variance ratio exceeds 0.9. The original used the zero-based index of
        that component as the count, i.e. one component too few (and zero
        components when the first one alone was enough).

        :return: array returned by PCA.fit_transform
        """

        # to_numpy_matrix no longer exists in networkx 3; fall back to it only
        # when to_numpy_array is not available.
        to_array = getattr(nx, 'to_numpy_array', None) or nx.to_numpy_matrix
        X = np.asarray(to_array(self.G))

        variance = PCA().fit(X).explained_variance_ratio_
        # the 'arpack' solver needs n_components strictly below min(X.shape)
        num_components = self._selectNumComponents(variance, .9, min(X.shape) - 1)

        pca = PCA(n_components=num_components, svd_solver='arpack')
        return pca.fit_transform(X)

    def _graphPartitioning(self, max_k=50):
        """Cluster the reduced matrix with KMeans, choosing k at the elbow.

        k runs from 1 up to max_k - 1, but never above the number of rows of
        the reduced matrix, since KMeans cannot fit more clusters than samples.

        :return: fitted KMeans model
        :raises ValueError: when KneeLocator finds no knee
        """

        X = self._adjacencyMatrixReduction()
        n_samples = X.shape[0]
        K = range(1, min(max_k, n_samples + 1))

        distortions = []
        for k in K:
            kmeanModel = KMeans(n_clusters=k).fit(X)
            # mean distance of every row to its closest centre
            closest = np.min(cdist(X, kmeanModel.cluster_centers_, 'euclidean'),
                             axis=1)
            distortions.append(sum(closest) / n_samples)

        number_partitions = KneeLocator(list(K),
                                        distortions,
                                        invert=False,
                                        direction='decreasing')
        if number_partitions.knee is None:
            raise ValueError('no knee found in the distortion curve; '
                             'cannot choose the number of clusters')

        return KMeans(n_clusters=number_partitions.knee).fit(X)

    def _countTweets(self, hashtag, tweet_collection):
        """Return the number of tweet ids that Hashtag collects for `hashtag`.

        'lista_tweet' is the pair (tweet objects, tweet ids); element 1 is the
        list of ids.

        :return: int
        """

        return len(Hashtag(hashtag, tweet_collection).__dict__['lista_tweet'][1])

    def _defineEdges(self, edge_weights, tweet_collection, jaccard=True, min_weight=4):
        """Return the list of weighted edges.

        Keyword Arguments
        :param edge_weights: mapping (hashtag, hashtag) -> co-occurrence count
        :param tweet_collection: tweets used to count occurrences per hashtag
        :param jaccard: when True the weight is
                        count / (n_1 + n_2 - count), i.e. intersection over
                        union; the original divided by n_1 + n_2, which counts
                        the intersection twice. When False the raw count is
                        used.
        :param min_weight: only pairs with a count strictly above this value
                           are kept

        :return: list of tuples (hashtag, hashtag, weight)
        """

        edge_weights = {key: w for key, w in edge_weights.items() if w > min_weight}

        # TODO: move this counting into the ingestion of the single tweet to
        # lighten the computation (note carried over from the original code).
        tweets_per_hashtag = {}

        def occurrences(hashtag):
            # every hashtag is counted once, however many edges it appears in
            if hashtag not in tweets_per_hashtag:
                tweets_per_hashtag[hashtag] = self._countTweets(hashtag,
                                                                tweet_collection)
            return tweets_per_hashtag[hashtag]

        list_weighted_edges = []
        for (h_1, h_2), weight in edge_weights.items():
            if jaccard:
                union = occurrences(h_1) + occurrences(h_2) - weight
                list_weighted_edges.append((h_1, h_2, weight / union))
            else:
                list_weighted_edges.append((h_1, h_2, weight))

        return list_weighted_edges

    def _defineNodes(self, edge_weights, tweet_collection):
        """Get the set of hashtags that appear in at least one kept edge.

        :return: set of hashtags
        """

        edges = self._defineEdges(edge_weights, tweet_collection,
                                  jaccard=self.with_jaccard)
        list_nodes = set()
        for h_1, h_2, _ in edges:
            list_nodes.update((h_1, h_2))

        return list_nodes

    def _renameClusters(self, tweet_collection):
        """Map every cluster to its hashtag with the most tweets.

        The original measured len() of the (tweets, ids) pair, which is always
        2, so the first hashtag of every cluster was chosen regardless of its
        frequency. Ties still resolve to the first hashtag in the list.

        :return: dict mapping cluster label -> hashtag
        """

        name_cluster = {}
        for cluster, list_hash in self.clusters.items():
            list_occ = [self._countTweets(h, tweet_collection) for h in list_hash]
            name_cluster[cluster] = list_hash[int(np.argmax(list_occ))]

        return name_cluster

In [25]:
class Topic(object):
    """Tweets that belong to one named topic.

    Attributes:
    topic: name of the topic
    tweet_topic: entry of the cluster mapping stored under that name
                 (on an instance this value replaces the method of the same name)
    """

    def __init__(self, tweet_clusters, topic):
        self.topic = topic
        self.tweet_topic = self.tweet_topic(tweet_clusters)

    def tweet_topic(self, tweet_clusters):
        """Look up this topic in the cluster mapping.

        :return: tweet_clusters[self.topic]
        """

        topic_name = self.topic
        tweets_of_topic = tweet_clusters[topic_name]
        return tweets_of_topic

In [28]:
# Dashboard data for the tweets of the topic named 'renzi'.
topic = Topic(vars(Graph)['tweet_clusters'], 'renzi')

lista_tweet = vars(topic)['tweet_topic']
num_tweet = len(lista_tweet)

# Chart payloads, each computed from the same list of tweets.
top_retweet = get_top_retweet(lista_tweet)
list_vector_pie = sentiment_percentage(lista_tweet)
list_unici_utenti = unique_cumulative_users(lista_tweet)
stream_neg = stream_tweet(lista_tweet, sentimento='negative')
stream_pos = stream_tweet(lista_tweet, sentimento='positive')
lista_diz_hash = co_occurrences(get_list_hashtags(lista_tweet))

In [6]:
def get_top_retweet(lista_tweet):
    """Return chart entries for the (at most) 10 most retweeted tweets.

    The retweet count is the larger of the construction-time 'num_retweet' and
    the tracked value in changable_attributes['num_retweet'], when the latter is
    present. The original ranked by 'num_retweet' alone and therefore ignored
    every update recorded after the tweet was first seen.

    :param lista_tweet: tweet objects exposing 'num_retweet', 'tweet_text',
                        'user_info' and a text_cleaning method
    :return: list of dicts with the keys 'x' (rank from 1), 'y' (retweet
             count) and 'label' (cleaned text, author name, followers)
    """

    list_num_retweet = []
    for tweet in lista_tweet:
        tweet_attr = tweet.__dict__
        initial_count = tweet_attr['num_retweet']
        tracked = tweet_attr.get('changable_attributes', {})
        num_retweet = max(initial_count, tracked.get('num_retweet', initial_count))
        text_tweet = tweet.text_cleaning(tweet_attr['tweet_text'])
        list_num_retweet.append((num_retweet, text_tweet, tweet_attr['user_info']))

    # sorted() is stable, so tweets with equal counts keep their input order
    sort_retweet = sorted(list_num_retweet, key=itemgetter(0), reverse=True)[:10]

    top_10_retweet = []
    for rank, (count, text, user_info) in enumerate(sort_retweet, start=1):
        label = text + '\n' + 'Autore: ' + user_info['name'] + '\n' \
                + 'Followers: ' + str(user_info['followers_count'])

        top_10_retweet.append({'x': rank, 'y': count, 'label': label})

    return top_10_retweet
    

def sentiment_percentage(lista_tweet):
    """Return the percentage of positive and negative tweets.

    An empty list yields 0.0 for both entries (the original raised
    ZeroDivisionError).

    :param lista_tweet: tweet objects exposing a 'sentiment' attribute
    :return: list of two dicts: 'x' 1 with the positive percentage and
             'x' 2 with the negative percentage, both rounded to one decimal
    """

    sentiment_tweet = [tweet.__dict__['sentiment'] for tweet in lista_tweet]
    count_sentiment = collections.Counter(sentiment_tweet)
    total = len(sentiment_tweet)

    def percentage(label):
        # share of `label` among all tweets, guarded against an empty input
        if total == 0:
            return 0.0
        return round(count_sentiment[label] / total * 100, 1)

    percentuali_sentiment = [{'x': 1, 'y': percentage('positive')},
                             {'x': 2, 'y': percentage('negative')}]

    return percentuali_sentiment

In [7]:
def manipulate_date(lista_date):
    """Return the dates as strings with minutes and seconds set to zero.

    Every date is parsed with pd.to_datetime and converted to its string form;
    the characters holding minutes and seconds (positions 13 to 18) are then
    replaced by ':00:00', while anything from position 19 on is kept.

    :param lista_date: dates accepted by pd.to_datetime
    :return: list of strings, one per input date
    """

    hourly = []
    for stamp in map(str, pd.to_datetime(lista_date)):
        day, hour, suffix = stamp[:10], stamp[10:13], stamp[19:]
        hourly.append(day + hour + ':00:00' + suffix)

    return hourly

def unique_cumulative_users(lista_tweet):
    """Return the running total of distinct retweeting users per hour.

    Only the (user, date) entries stored in
    changable_attributes['list_user_retweet'] are considered. A user is counted
    in the hour of the first entry found for them after sorting by hour.

    :param lista_tweet: tweet objects exposing 'changable_attributes'
    :return: list of dicts with 'x' (hour as string) and 'y' (cumulative count)
    """

    retweets = []
    for tweet in lista_tweet:
        retweets.extend(vars(tweet)['changable_attributes']['list_user_retweet'])

    users = [user for user, _ in retweets]
    hours = manipulate_date([date for _, date in retweets])

    frame = pd.DataFrame()
    frame['Time'] = hours
    frame['user'] = users
    frame['counter'] = [1] * len(retweets)

    frame = frame.sort_values(by='Time')
    # keep each user's first row only, then count the new users of every hour
    first_seen = frame.drop_duplicates('user', keep='first')
    new_per_hour = first_seen.groupby('Time')['counter'].sum()
    running_total = new_per_hour.cumsum()

    return [{'x': str(hour), 'y': int(running_total.loc[hour])}
            for hour in running_total.index]


In [8]:
def stream_tweet(lista_tweet, sentimento='negative'):
    """Return the hourly count of tweets with the given sentiment.

    The counts are read from the 'freq' column by name. The original indexed
    each grouped row with ``[0]``, which relies on the positional fallback of
    label-based Series indexing that pandas has deprecated.

    :param lista_tweet: tweet objects exposing 'sentiment' and 'data_retweet'
    :param sentimento: sentiment value to select
    :return: list of dicts with 'a' (hour as string) and 'b' (number of tweets)
    """

    list_date = [tweet.__dict__['data_retweet']
                 for tweet in lista_tweet
                 if tweet.__dict__['sentiment'] == sentimento]

    # manipulate_date returns strings, so they are parsed again to group on
    # timestamps rather than on text
    ts = pd.to_datetime(manipulate_date(list_date))
    df = pd.DataFrame()
    df['Time'] = ts
    df['freq'] = [1] * len(ts)

    per_hour = df.groupby('Time')['freq'].sum()

    return [{'a': str(hour), 'b': int(count)} for hour, count in per_hour.items()]

In [9]:
def get_list_hashtags(lista_tweet):
    """Return the 2nd to 11th most frequent hashtags of the tweets.

    The most frequent hashtag is dropped from the result.
    NOTE(review): presumably the dropped entry is meant to be the hashtag the
    tweets were selected by -- confirm, since nothing here checks which one it is.

    :param lista_tweet: tweet objects exposing 'list_hashtags'
    :return: list of (hashtag, count) tuples, most frequent first
    """

    frequencies = collections.Counter()
    for tweet in lista_tweet:
        frequencies.update(vars(tweet)['list_hashtags'])

    ranked = frequencies.most_common(11)
    return ranked[1:]

In [10]:
def co_occurrences(lista_hashtag):
    """Turn (hashtag, count) pairs into chart entries.

    :param lista_hashtag: sequence whose items hold the hashtag at index 0 and
                          its count at index 1
    :return: list of dicts with 'x' (position from 1), 'y' (count) and
             'label' (hashtag prefixed with '#')
    """

    return [{'x': position, 'y': entry[1], 'label': '#' + entry[0]}
            for position, entry in enumerate(lista_hashtag, start=1)]

In [701]:
# Display the co-occurrence chart entries for the tweets of the current topic.
co_occurrences(get_list_hashtags(vars(topic)['tweet_topic']))

[{'label': '#renzi', 'x': 1, 'y': 65},
 {'label': '#pd', 'x': 2, 'y': 60},
 {'label': '#elezionipolitiche2018', 'x': 3, 'y': 6},
 {'label': '#m5s', 'x': 4, 'y': 6},
 {'label': '#elezioni', 'x': 5, 'y': 6},
 {'label': '#firenze', 'x': 6, 'y': 6},
 {'label': '#maratonamentana', 'x': 7, 'y': 4},
 {'label': '#calenda', 'x': 8, 'y': 4},
 {'label': '#5stelle', 'x': 9, 'y': 4},
 {'label': '#grosseto', 'x': 10, 'y': 4}]

In [679]:
# Build the hashtag graph with raw co-occurrence counts as edge weights.
Graph = GraphHashtag(edges_weight, collection_totale, with_jaccard=False)

In [681]:
# Display the tweets assigned to the cluster named 'renzi'.
vars(Graph)['tweet_clusters']['renzi']

[<__main__.Tweet at 0x117347ba8>,
 <__main__.Tweet at 0x116342c18>,
 <__main__.Tweet at 0x116342fd0>,
 <__main__.Tweet at 0x116342d30>,
 <__main__.Tweet at 0x11631e6a0>,
 <__main__.Tweet at 0x11631e518>,
 <__main__.Tweet at 0x11631e208>,
 <__main__.Tweet at 0x11631e390>,
 <__main__.Tweet at 0x11631ebe0>,
 <__main__.Tweet at 0x11631ee10>,
 <__main__.Tweet at 0x11631ea20>,
 <__main__.Tweet at 0x116341518>,
 <__main__.Tweet at 0x1163414a8>,
 <__main__.Tweet at 0x116341278>,
 <__main__.Tweet at 0x116341c50>,
 <__main__.Tweet at 0x116341438>,
 <__main__.Tweet at 0x116341be0>,
 <__main__.Tweet at 0x116340780>,
 <__main__.Tweet at 0x116340710>,
 <__main__.Tweet at 0x116340be0>,
 <__main__.Tweet at 0x116340208>,
 <__main__.Tweet at 0x121f68748>,
 <__main__.Tweet at 0x1163405f8>,
 <__main__.Tweet at 0x116317f98>,
 <__main__.Tweet at 0x116317828>,
 <__main__.Tweet at 0x116317c50>,
 <__main__.Tweet at 0x116317eb8>,
 <__main__.Tweet at 0x116317fd0>,
 <__main__.Tweet at 0x1166139e8>,
 <__main__.Twe

In [659]:
# Mapping cluster label -> hashtags, as computed by GraphHashtag.topic_content.
clusters = vars(Graph)['clusters']

In [None]:
# NOTE(review): this cell has no execution count and calls names (edges,
# graph_hashtags, dimensionality_reduction, clustering, create_cluster,
# tweet_in_class, assign_tweet, dict_list_hashtag, dict_hashtag, hashtags_dict,
# list_tweet, count_hashtags) that are not defined in the visible part of this
# notebook; it cannot run as it stands.
tuples_weights = edges(dict_list_hashtag, occurrences=False, jaccard=True)
G = graph_hashtags(tuples_weights)

# Assign id
id_hash = {i: j for i, j in enumerate(list_hashtags)}
hash_id = {j: i for i, j in id_hash.items()}

dimensionality_reduct, num_components = dimensionality_reduction(G)
clusters = clustering(dimensionality_reduct)
class_hash, class_num_hash = create_cluster(G, clusters)

set_tweets_class, set_hash_class = tweet_in_class(class_hash, class_num_hash, dict_hashtag, hashtags_dict)
class_of_tweets, dict_tweet_prop_class, tweet_belongs_to = assign_tweet(list_tweet, hashtags_dict, class_num_hash,
                                                                        class_hash)

### To get the ids of the tweets of a topic, topic name and topic number must be matched
output = []
for cluster, list_hash in class_hash.items():
    dictionary = {}
    dictionary['topic'] = int(cluster)
    dict_ha = {i: count_hashtags[i] for i in list_hash}
    # hashtags of the cluster with their counts, most frequent first
    dictionary['hashtags'] = [(i, count_hashtags[i]) for i in sorted(dict_ha, key=dict_ha.get, reverse=True)]
    dictionary['number_tweets'] = len(set(class_of_tweets[cluster]))

    output += [dictionary]

dict_topic_hash = defaultdict(list)
for i in output:
    dict_topic_hash[i['topic']] += [j for j in i['hashtags']]

# a topic is named after its most frequent hashtag
name_topic = {i: j[0][0] for i, j in dict_topic_hash.items()}
print (name_topic)
topic_nome = {j:i for i,j in name_topic.items()}

with open('web-ui/src/data/name_topic.js', 'w') as outfile:
    # The array literal is written in one piece so that the closing bracket is
    # always present. The original wrote it only when the loop index reached
    # len(name_topic) - 1, which never happens when two topics share a name
    # (topic_nome then has fewer entries than name_topic) or when there are no
    # topics at all.
    quoted_names = ["'" + t + "'" for t in topic_nome]
    outfile.write('export default [' + ',\n'.join(quoted_names) + ']')

In [None]:
# Elbow plot: distortion for every tested k, with a dashed line at the knee.
# NOTE(review): plt, K, distortions and kn are not defined in the visible part of
# this notebook (this cell has no execution count).
plt.plot(K, distortions, 'bx-')
plt.title('The Elbow Method showing the optimal k')
plt.xlabel('k')
plt.ylabel('Distortion')
# the dashed line spans the y-limits of the curve plotted above
plt.vlines(kn.knee, *plt.ylim(), linestyles='dashed')