In [1]:
import os
import re
import json
import operator
import matplotlib.pyplot as plt
import warnings
import gensim
import numpy as np
import pyLDAvis.gensim
import pandas as pd
import numpy as np
import random
import nltk
import dill
nltk.download('stopwords')

from tqdm import tqdm_notebook as tqdm
from gensim.models import CoherenceModel, LsiModel, LdaModel, HdpModel
from gensim.models import LdaMulticore
from gensim.corpora import Dictionary
from gensim.utils import lemmatize
from nltk.corpus import stopwords
from pprint import pprint
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from scipy.spatial.distance import cdist
from nltk.stem.snowball import SnowballStemmer

%matplotlib inline

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/giorgiaramponi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Suppress every warning from this point on: no category, module or message
# filter is given, so the 'ignore' action applies to all of them.
# NOTE(review): this also hides deprecation warnings emitted by the imported
# libraries — confirm that is intended.
warnings.filterwarnings('ignore')

In [4]:
# Preprocessing of the tweets
def _italian_stemmer():
    """Return the Italian Snowball stemmer, building it on first use only."""
    stemmer = getattr(_italian_stemmer, "cached", None)
    if stemmer is None:
        # The stemmer does not depend on the text being processed, so it is
        # created once and reused instead of being rebuilt on every call.
        stemmer = SnowballStemmer("italian", ignore_stopwords=True)
        _italian_stemmer.cached = stemmer
    return stemmer


def process_texts(input_texts):
    """Turn a raw string of tweets into a list of stemmed tokens.

    Steps, in order:
      1. strip URLs (anything matching ``http\\S+``);
      2. tokenize with ``gensim.utils.simple_tokenize``;
      3. lower-case every token;
      4. drop tokens contained in the module-level ``stops`` collection;
      5. stem the remaining tokens with the Italian Snowball stemmer.

    Parameters
    ----------
    input_texts : str
        The text to process (e.g. all tweets of one user concatenated).

    Returns
    -------
    list
        The stemmed tokens, in their original order.

    Notes
    -----
    Relies on the global ``stops``, which is not defined in this cell and must
    exist before the function is called.
    """
    without_urls = re.sub(r"http\S+", "", input_texts)
    lowered = (token.lower() for token in gensim.utils.simple_tokenize(without_urls))
    stemmer = _italian_stemmer()
    # Stop-word filtering happens on the lower-cased token, before stemming.
    return [stemmer.stem(token) for token in lowered if token not in stops]

In [5]:
# Returns 2 dictionaries: 
# - users-texts
# - users-domain 
def load_text(path, domain, min_random_tokens=50):
    """Load a tweets JSON file and build the per-user dictionaries for a domain.

    The JSON document is read as a two-level mapping whose innermost values
    are tweet records exposing a ``'screen_name'`` and a ``'text'`` entry.
    All tweets of the same user are concatenated (each followed by a single
    space) and the result is passed through ``process_texts``.

    Parameters
    ----------
    path : str
        Path of the JSON file to read.
    domain : str
        Domain label assigned to every user found in the file.
    min_random_tokens : int, optional
        For "random" domains only, users whose processed text has fewer than
        this many tokens are discarded. Defaults to 50.

    Returns
    -------
    tuple of (dict, dict)
        ``(users -> processed tokens, users -> domain)``. Both dictionaries
        hold exactly the same keys.
    """
    # The context manager closes the file even if json.load raises.
    with open(path, 'r') as json_file:
        values = json.load(json_file)

    # Gather the tweets per user first and join once, instead of growing a
    # string with repeated concatenation.
    tweets_by_user = {}
    for group in values.values():
        for tweet in group.values():
            tweets_by_user.setdefault(tweet['screen_name'], []).append(tweet['text'])

    # The domain is the same for every user, so the check is done once.
    # NOTE(review): "random*" is a regular expression, not a glob: it matches
    # any domain starting with "rando" (followed by zero or more "m").
    is_random_domain = re.match("random*", domain) is not None

    dictionary = {}
    domain_dict = {}
    # .items() works on both Python 2 and Python 3 (iteritems() is Python 2 only).
    for user, tweets in tweets_by_user.items():
        tokens = process_texts(' '.join(tweets) + ' ')
        if is_random_domain and len(tokens) < min_random_tokens:
            # Random-domain users with too little text are left out.
            continue
        dictionary[user] = tokens
        domain_dict[user] = domain

    return dictionary, domain_dict

In [6]:
# - Loads the users-texts dictionary of the domain passed (as d)
# - Updates the global dictionary with all users-texts (domain+random)
# - Updates the global dictionary with all users-domain
# Returns the specific domain dictionary and the list of train users
# (every loaded user is a train user; no test users are set aside here)
def domain_setup(d, tweets_dir="/Users/giorgiaramponi/Dropbox/gio/ske/tweets/"):
    """Load one domain's tweets and register its users in the global dictionaries.

    Parameters
    ----------
    d : str
        Domain name; the file read is ``<tweets_dir>/<d>_tweets.json``.
    tweets_dir : str, optional
        Directory containing the ``*_tweets.json`` files. Defaults to the
        location originally hard-coded in this notebook; pass another
        directory to run the notebook on a different machine.

    Returns
    -------
    tuple of (dict, list)
        The users -> processed tokens dictionary of this domain, and the list
        of its users (all of them are treated as train users).

    Notes
    -----
    Side effects: updates the globals ``users_domain_dict`` and
    ``users_texts_dict``, which must already exist when this is called.
    """
    domain_path = os.path.abspath(os.path.join(tweets_dir, d + "_tweets.json"))

    # Loads the user-texts dictionary and the users-domain dictionary
    domain_specific_dictionary, domain_by_user = load_text(domain_path, d)

    users_domain_dict.update(domain_by_user)
    users_texts_dict.update(domain_specific_dictionary)

    # In this case, all users are train users. list() guarantees an actual
    # list on Python 3 as well, where keys() returns a view.
    train_users = list(domain_specific_dictionary.keys())

    return domain_specific_dictionary, train_users

In [7]:
# Builds subset dictionary from existing dictionary and list of users
def build_dictionary(dictionary, train_u):
    """Return a new dict holding only the entries of `dictionary` keyed by `train_u`.

    Every user in `train_u` must be a key of `dictionary`; a missing user
    raises KeyError. The input dictionary is not modified.
    """
    return {user: dictionary[user] for user in train_u}

In [30]:
def _plot_metric(x, values, label, title, ylabel, filename):
    """Plot one evaluation metric against the number of topics, save it and show it.

    Shared implementation of the graph_* functions below: draws `values`
    against `x` on the current pyplot figure, writes the figure to `filename`
    (relative to the working directory) and then displays it.
    """
    plt.plot(x, values, label=label)
    plt.suptitle(title)
    plt.xlabel("Number of topics")
    plt.ylabel(ylabel)
    plt.legend(loc='best')
    # Save before show(): the figure is written to disk first, then displayed.
    plt.savefig(filename)
    plt.show()


def graph_cv(x, c_v):
    """Plot the c_v coherence scores `c_v` against the topic counts `x`.

    The figure is saved as "Coherence_c_v.png" and displayed.
    """
    _plot_metric(x, c_v, "c_v", "LDA model evaluation: c_v",
                 "Coherence score", "Coherence_c_v.png")


def graph_uci(x, c_uci):
    """Plot the c_uci coherence scores `c_uci` against the topic counts `x`.

    The figure is saved as "Coherence_c_uci.png" and displayed.
    """
    _plot_metric(x, c_uci, "c_uci", "LDA model evaluation: c_uci",
                 "Coherence score", "Coherence_c_uci.png")


def graph_umass(x, u_mass):
    """Plot the u_mass coherence scores `u_mass` against the topic counts `x`.

    The figure is saved as "Coherence_u_mass.png" and displayed.
    """
    _plot_metric(x, u_mass, "u_mass", "LDA model evaluation: u_mass",
                 "Coherence score", "Coherence_u_mass.png")


def graph_perplexity(x, perplexity):
    """Plot the values in `perplexity` against the topic counts `x`.

    The curve is labelled "log_perplexity"; the figure is saved as
    "Perplexity.png" and displayed.
    """
    _plot_metric(x, perplexity, "log_perplexity", "LDA model evaluation: perplexity",
                 "Perplexity", "Perplexity.png")

    

In [32]:
# Evaluates how the coherence (and the log-perplexity) changes according to the number of topics used for the LDA
# Input: limit = exclusive upper bound on the number of topics (models are trained for 1 .. limit-1 topics)
def evaluate_num_topics(dictionary, corpus, texts, limit, passes, iterations, random_state):
    """Train one LDA model per topic count in 1 .. limit-1 and score each of them.

    For every model the c_v, c_uci and u_mass coherence and the value of
    `log_perplexity(corpus)` are collected; the four series are then plotted
    with the graph_* helpers.

    Returns the tuple (models, c_v, c_uci, u_mass, perplexity), where each
    element is a list with one entry per topic count, in increasing order.
    """
    topic_counts = range(1, limit)
    scores = {'c_v': [], 'c_uci': [], 'u_mass': []}
    perplexity = []
    lm_list = []

    for n_topics in tqdm(topic_counts):
        model = LdaModel(corpus=corpus, num_topics=n_topics, id2word=dictionary,
                         passes=passes, iterations=iterations, random_state=random_state)
        lm_list.append(model)

        # Build the three coherence evaluators first, then query them.
        evaluators = {}
        for measure in ('c_v', 'u_mass', 'c_uci'):
            evaluators[measure] = CoherenceModel(model=model, texts=texts,
                                                 dictionary=dictionary, coherence=measure)
        for measure in ('c_v', 'c_uci', 'u_mass'):
            scores[measure].append(evaluators[measure].get_coherence())

        perplexity.append(model.log_perplexity(corpus))

    # Show graph
    graph_cv(topic_counts, scores['c_v'])
    graph_uci(topic_counts, scores['c_uci'])
    graph_umass(topic_counts, scores['u_mass'])
    graph_perplexity(topic_counts, perplexity)

    return lm_list, scores['c_v'], scores['c_uci'], scores['u_mass'], perplexity

In [42]:
def print_topic_scores(num_top, lda_model, user_dict):
    """Print, for every user, the topics the model assigns to them, best score first.

    Parameters
    ----------
    num_top : int
        Number of topics of the model; only used in the printed header.
    lda_model
        Model queried as ``lda_model[bow_vector]`` (yielding (topic index,
        score) pairs) and through ``print_topic(index, 5)``.
    user_dict : dict
        Mapping user -> list of tokens.

    Notes
    -----
    Relies on the globals ``users_domain_dict`` (user -> domain) and
    ``train_dictionary`` (provides ``doc2bow``), which must exist beforehand.
    """
    print("Number of topics: "+str(num_top))
    # .items() works on both Python 2 and Python 3 (iteritems() is Python 2 only).
    for user, tokens in user_dict.items():
        print("User: "+user+ " from "+str(users_domain_dict[user]))
        bow_vector = train_dictionary.doc2bow(tokens)
        # Negating the score sorts in descending order while keeping the
        # model's original order among equal scores.
        ranked = sorted(lda_model[bow_vector], key=lambda pair: -1*pair[1])
        for index, score in ranked:
            print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))
        print("------------------------")

In [43]:
def create_feature_vectors(num_top, lda_model, user_dict):
    """Build one topic-score vector per user.

    Parameters
    ----------
    num_top : int
        Length of each feature vector (number of topics of the model).
    lda_model
        Model queried as ``lda_model[bow_vector]``, yielding
        (topic index, score) pairs.
    user_dict : dict
        Mapping user -> list of tokens.

    Returns
    -------
    list of (user, list)
        For every user, a vector of length `num_top` whose position ``i``
        holds the score of topic ``i``; topics not returned by the model for
        that user stay at 0.

    Notes
    -----
    Relies on the global ``train_dictionary`` (provides ``doc2bow``), which
    must exist beforehand.
    """
    feature_vectors = []
    # .items() works on both Python 2 and Python 3 (iteritems() is Python 2 only).
    for user, tokens in user_dict.items():
        bow_vector = train_dictionary.doc2bow(tokens)
        topic_scores = [0] * num_top
        for topic_index, score in lda_model[bow_vector]:
            topic_scores[topic_index] = score
        feature_vectors.append((user, topic_scores))

    return feature_vectors