<!--NOTEBOOK_INFORMATION-->
<img id="r-1060983" data-claire-element-id="1061343" src="http://www.siteduzero.com/favicon.ico" alt="Image utilisateur">
    <p>
        **<font color='#D2691E'size="6">Tags recommendation (6/7)</font>**.
    </p>

<p>
    This notebook's goal is to compare the performances of the 3 unsupervised approaches (NMF, LDA and LDA+Word2Vec) used in this project.
</p>
<p>
    It is called in the notebook "6_Workflows_iteration.ipynb" in order to loop over it's hyperparameters
</p>

<p>
    <center>
        **<font color='	#D2691E'size="6">PLAN</font>**
    </center>
</p>

<p>
        **<font color='#D2691E'size="4">0) Libraries, functions and datasources import</font>**
</p>
<p>
        **<font color='#D2691E'size="4">I) Starting the program</font>**
</p>
<p>
        **<font color='#D2691E'size="4">II) Evaluation & Benchmark of the algorithms performances</font>**
</p>

In [1]:
print("Library, functions and datasources import")

Library, functions and datasources import


In [2]:
import pandas as pd
import pickle
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from collections import Counter

In [3]:
from context import dir_path, datasources_path, enrichment_path, pickles_path, temp_files_path

In [4]:
from functions import aggregate_col_in_list

In [5]:
df_model = pickle.load(open(pickles_path+"df_model_unsupervised.p", "rb" ))

In [6]:
L_train_indexes = pickle.load(open(pickles_path+"L_train_indexes.p", "rb" ))
L_test_indexes = pickle.load(open(pickles_path+"L_test_indexes.p", "rb" ))

In [7]:
L_frequent_tags = pickle.load(open(pickles_path+"L_frequent_tags.p", "rb" ))

In [8]:
df_train = df_model[df_model['id'].isin(L_train_indexes)]
df_test = df_model[df_model['id'].isin(L_test_indexes)]

In [9]:
df_train_nmf = df_model[df_model['id'].isin(L_train_indexes)]
df_test_nmf = df_model[df_model['id'].isin(L_test_indexes)]

df_train_lda = df_model[df_model['id'].isin(L_train_indexes)]
df_test_lda = df_model[df_model['id'].isin(L_test_indexes)]

In [10]:
L_corpus = list(df_train_nmf['corpus'])

In [201]:
execution_params_workflows_iteration = pickle.load(open(pickles_path+"execution_params_workflows_iteration.p", "rb" ))
RUN = execution_params_workflows_iteration['RUN']
N_COMPONENTS = execution_params_workflows_iteration['N_COMPONENTS']
TOPICS_THRESHOLD = execution_params_workflows_iteration['TOPICS_THRESHOLD']
NEIGHBORS = execution_params_workflows_iteration['NEIGHBORS']
QUANTILE_THRESHOLD = execution_params_workflows_iteration['QUANTILE_THRESHOLD']

<p>
        **<font color='#D2691E'size="4">I) Starting the program</font>**
</p>

In [202]:
print("\n\n\nStarting run %s with params :\nN_COMPONENTS = %s \nTHRESHOLD = %s\nNEIGHBORS = %s\nQUANTILE_THRESHOLD = %s"%(RUN,N_COMPONENTS,TOPICS_THRESHOLD,NEIGHBORS,QUANTILE_THRESHOLD))




Starting run 1 with params :
N_COMPONENTS = 10 
THRESHOLD = 10
NEIGHBORS = 5
QUANTILE_THRESHOLD = 0.8


In [13]:
print("implementing NMF")

implementing NMF


In [14]:
from sklearn.decomposition import NMF

In [15]:
N_FEATURES = 1000
N_TOP_WORDS = 20

In [16]:
# Use count (raw term count) features for NMF.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=10,max_features=N_FEATURES)
start_time = time.time()
tfidf = tfidf_vectorizer.fit_transform(L_corpus)
print("--- %s seconds ---" % (time.time() - start_time))

Extracting tf-idf features for NMF...
--- 0.11967873573303223 seconds ---


In [17]:
print("Fitting the NMF model")
start_time = time.time()
nmf = NMF(n_components=N_COMPONENTS, random_state=0,alpha=.1, l1_ratio=.5).fit(tfidf)
print("--- %s seconds ---" % (time.time() - start_time))

print("\nTopics in NMF model (Frobenius norm):")
vectorizer_feature_names = tfidf_vectorizer.get_feature_names()

Fitting the NMF model
--- 0.1795194149017334 seconds ---

Topics in NMF model (Frobenius norm):


In [18]:
df_train_nmf_words_space_schema = pd.DataFrame(columns = vectorizer_feature_names)
df_train_nmf_words_space_schema
print("train_words_space_schema built!")

train_words_space_schema built!


In [19]:
print("implementing LDA")

implementing LDA


In [20]:
from sklearn.decomposition import LatentDirichletAllocation

In [21]:
print("Extracting count features for LDA...")
count_vectorizer = CountVectorizer(max_df=0.95, min_df=10, max_features=N_FEATURES)
start_time = time.time()
count = count_vectorizer.fit_transform(L_corpus)
print("--- %s seconds ---" % (time.time() - start_time))

Extracting count features for LDA...
--- 0.12566018104553223 seconds ---


In [22]:
print("Fitting the LDA model")
lda = LatentDirichletAllocation(n_components=N_COMPONENTS, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
start_time = time.time()
lda.fit(count)

print("--- %s seconds ---" % (time.time() - start_time))

Fitting the LDA model
--- 4.556853294372559 seconds ---


In [23]:
results_nmf = nmf.transform(tfidf)
results_lda = lda.transform(count)

In [24]:
L_main_topics_nmf = []
L_main_topics_lda = []
for sub_result_nmf,sub_result_lda in zip(results_nmf,results_lda):
    L_main_topics_nmf.append(np.argmax(sub_result_nmf))
    L_main_topics_lda.append(np.argmax(sub_result_lda))

In [25]:
df_corpus_topics_nmf = pd.DataFrame(results_nmf)
df_corpus_topics_lda = pd.DataFrame(results_lda)

df_corpus_topics_nmf['id'] = list(df_train_nmf['id'])
df_corpus_topics_lda['id'] = list(df_train_lda['id'])

df_corpus_topics_nmf['main_topic'] = L_main_topics_nmf
df_corpus_topics_lda['main_topic'] = L_main_topics_lda

df_corpus_topics_nmf['corpus'] = L_corpus
df_corpus_topics_lda['corpus'] = L_corpus

In [26]:
L_topics = []
for topic in range(N_COMPONENTS):
    L_topics.append(topic)

In [27]:
print("Topics associated to words management")

Topics associated to words management


In [28]:
df_topics_words_nmf = pd.DataFrame(nmf.components_, columns=vectorizer_feature_names)
df_topics_words_lda = pd.DataFrame(lda.components_, columns=vectorizer_feature_names)

In [29]:
print("words/topics normalization")
for column in df_topics_words_nmf.columns:
    column_sum = np.sum(df_topics_words_nmf[column])
    df_topics_words_nmf[column] = df_topics_words_nmf[column].apply(lambda x : x/column_sum)
    column_sum = np.sum(df_topics_words_lda[column])
    df_topics_words_lda[column] = df_topics_words_lda[column].apply(lambda x : x/column_sum)

words/topics normalization


In [30]:
print("Words associated to Topics management")

Words associated to Topics management


In [31]:
df_words_topics_nmf = df_topics_words_nmf.T
df_words_topics_lda = df_topics_words_lda.T

In [32]:
L_matching_tags = [word for word in df_words_topics_nmf.index if word in L_frequent_tags]
L_missing_tags = [word for word in L_frequent_tags if not word in L_matching_tags]

print("The models has %s tags coordinates in the topics space | %s tags are missing"%(len(L_matching_tags),len(L_missing_tags)))

The models has 119 tags coordinates in the topics space | 146 tags are missing


In [33]:
print("Tags associated to topics management")

Tags associated to topics management


In [34]:
print("Developping the train dataframe")
#We want to denormalize our entry dataframe thanks to the 'tag_list' column :
#We specify the excution params for  the python script responsible for the multiprocessing : 
execution_params = {'N_PROCS' : 6,
                    'dataframe':df_train[['tag_list','id']],
                   'column_to_develop':'tag_list'}
#We dump our execution params :
pickle.dump(execution_params,open(temp_files_path+"execution_params.p", "wb"))
#And we run the script responsible for the stopwords elimination on different process :  
%run -i multiprocessing_develop.py


#We load the result of the multiprocessing task : 
developped_dataframe = pickle.load(open(temp_files_path+"developped_dataframe.p", "rb" ))

#We clean the temp files dir :
import os
from functions import class_my_files
dict_extensions = class_my_files(temp_files_path)
pickles_temp_files  = dict_extensions['p']
for file_name in pickles_temp_files:
    os.remove(temp_files_path+file_name)

developped_dataframe.sort_values(by='id', inplace=True)
developped_dataframe.rename({'tag_list':'tag'}, axis=1, inplace=True)
print("train dataframe developped")

Developping the train dataframe
Multiprocessing
Creating pool with 6 processes

df_append
df_append
df_append
df_append
df_append
df_append
--- 4.623292446136475 seconds ---
the result file is available at : 
 temp_files_path+"developped_dataframe.p"
train dataframe developped


In [35]:
print("Join between the developped dataframe and the NMF and LDA corpus/topics distributions dataframes")

Join between the developped dataframe and the NMF and LDA corpus/topics distributions dataframes


In [36]:
#Now the dataframe is denormalized on the 'tag' column, we can compute the average topic distribution for each tag :
global_df_corpus_topics_nmf = pd.merge(developped_dataframe,df_corpus_topics_nmf,how='left',on='id')
global_df_corpus_topics_lda = pd.merge(developped_dataframe,df_corpus_topics_lda,how='left',on='id')

In [37]:
print("Aggregation")

Aggregation


In [38]:
#We first parameter the list  of columns we need for the aggregation :
L_tags_topics = L_topics.copy()
L_tags_topics.append('tag')

In [39]:
#And we aggregate the topics distributions by the dataframe's tags
df_tags_topics_nmf = global_df_corpus_topics_nmf[L_tags_topics].groupby('tag').mean()
df_tags_topics_lda = global_df_corpus_topics_lda[L_tags_topics].groupby('tag').mean()

In [40]:
print("words/topics normalization")
for tag in df_tags_topics_nmf.index:
    row_sum = np.sum(df_tags_topics_nmf.loc[tag,])
    df_tags_topics_nmf.loc[tag,] = df_tags_topics_nmf.loc[tag,]/row_sum
    row_sum = np.sum(df_tags_topics_lda.loc[tag,])
    df_tags_topics_lda.loc[tag,] = df_tags_topics_lda.loc[tag,]/row_sum

words/topics normalization


In [41]:
#Finally, the dataframe mapping all tags to their topics distribution is :
df_tags_topics_nmf = df_tags_topics_nmf.loc[L_missing_tags,].append(df_words_topics_nmf.loc[L_matching_tags,])
df_tags_topics_lda = df_tags_topics_lda.loc[L_missing_tags,].append(df_words_topics_lda.loc[L_matching_tags,])

In [42]:
#And we can now append the missing tags in the dataframe which maps all words to their topics : 
df_words_topics_nmf = df_words_topics_nmf.append(df_tags_topics_nmf.loc[L_missing_tags,])
df_words_topics_lda = df_words_topics_lda.append(df_tags_topics_lda.loc[L_missing_tags,])

In [43]:
print("Building the NMF word/tags similatities matrices")

Building the NMF word/tags similatities matrices


In [44]:
#With this two dataframes, we can compute the similarities between tags and words (with tags) in the topic's space:
#Because this space is sparse, we will choose manhattan distances
X_tags_topics_nmf = df_tags_topics_nmf.values
X_word_topics_nmf = df_words_topics_nmf.values
from sklearn.metrics.pairwise import manhattan_distances

df_nmf_tags_similarity = pd.DataFrame(manhattan_distances(X_tags_topics_nmf),columns=df_tags_topics_nmf.index, index=df_tags_topics_nmf.index)
df_nmf_words_similarity = pd.DataFrame(manhattan_distances(X_word_topics_nmf),columns=df_words_topics_nmf.index, index=df_words_topics_nmf.index)

In [45]:
print("Building the LDA word/tags similatities matrices")

Building the LDA word/tags similatities matrices


In [46]:
#With this two dataframes, we can compute the similarities between tags and words (with tags) in the topic's space:
X_tags_topics = df_tags_topics_lda.values
X_word_topics = df_tags_topics_lda.values
from sklearn.metrics.pairwise import cosine_similarity

df_lda_tags_similarity = pd.DataFrame(cosine_similarity(X_tags_topics),columns=df_tags_topics_lda.index, index=df_tags_topics_lda.index)
df_lda_words_similarity = pd.DataFrame(cosine_similarity(X_word_topics),columns=df_tags_topics_lda.index, index=df_tags_topics_lda.index)

In [49]:
print("Topics management : building the topics/tags and topics/words dict for NMF and LDA")

Topics management : building the topics/tags and topics/words dict for NMF and LDA


In [50]:
dict_topics_tags_nmf = {}
dict_topics_words_nmf = {}
dict_topics_tags_lda = {}
dict_topics_words_lda = {}

for topic in L_topics:
    L_heavy_tags = list(df_tags_topics_nmf[df_tags_topics_nmf[topic]>=TOPICS_THRESHOLD].index)
    dict_topics_tags_nmf[topic] = L_heavy_tags
    L_heavy_tags = list(df_tags_topics_lda[df_tags_topics_lda[topic]>=TOPICS_THRESHOLD].index)
    dict_topics_tags_lda[topic] = L_heavy_tags
    
    L_heavy_words = list(df_words_topics_nmf[df_words_topics_nmf[topic]>=TOPICS_THRESHOLD].index)
    dict_topics_words_nmf[topic] = L_heavy_words
    L_heavy_words = list(df_words_topics_lda[df_words_topics_lda[topic]>=TOPICS_THRESHOLD].index)
    dict_topics_words_lda[topic] = L_heavy_words

In [51]:
print("Word2Vec Implementation")

Word2Vec Implementation


In [52]:
df_train_word2vec = df_train_lda.copy()

In [53]:
start_time = time.time()
from gensim.models import word2vec
from nltk import word_tokenize

# tokenize sentences in corpus
corpus = df_train_word2vec['corpus'].copy()
tokenized_corpus = corpus.apply(word_tokenize)


# Set values for various parameters
feature_size = 100    # Word vector dimensionality  
window_context = 30          # Context window size                                                                                    
min_word_count = 1   # Minimum word count                        
sample = 1e-3   # Downsample setting for frequent words

w2v_model = word2vec.Word2Vec(tokenized_corpus, size=feature_size,
                              window=window_context, min_count=min_word_count,
                              sample=sample, iter=100,
                              workers=6)
print("--- %s seconds ---" % (time.time() - start_time))



--- 11.555360555648804 seconds ---


In [54]:
vectors = w2v_model.wv

In [55]:
L_tags_word2vec = [tag for tag in L_frequent_tags if tag in vectors]

In [56]:
L_word2vec_words = [word for word in vectors.vocab.keys()]

In [57]:
print("Building the Word2Vec word/tags similatities matrices")
start_time = time.time()

Building the Word2Vec word/tags similatities matrices


In [58]:
df_word2vec_words_coordinates = pd.DataFrame(columns = [i for i in range(0,100)])
for word in L_word2vec_words:
    df_word_coordinates = pd.DataFrame(vectors[word]).T
    df_word_coordinates.index = [word]
    df_word2vec_words_coordinates = df_word2vec_words_coordinates.append(df_word_coordinates)

In [59]:
df_word2vec_tags_coordinates = df_word2vec_words_coordinates.loc[L_tags_word2vec]

In [60]:
X_tags_word2vec = df_word2vec_tags_coordinates.values
X_words_word2vec = df_word2vec_words_coordinates.values

df_word2vec_tags_similarity = pd.DataFrame(cosine_similarity(X_tags_word2vec),\
                                           columns=df_word2vec_tags_coordinates.index,\
                                           index=df_word2vec_tags_coordinates.index)

df_word2vec_words_similarity = pd.DataFrame(cosine_similarity(X_words_word2vec),\
                                            columns=df_word2vec_words_coordinates.index,\
                                            index=df_word2vec_words_coordinates.index)
print("--- %s seconds ---" % (time.time() - start_time))

--- 4.142046689987183 seconds ---


In [61]:
print("Word2Vec moodel training on the pure corpus of tag lists")
start_time = time.time()

Word2Vec moodel training on the pure corpus of tag lists


In [62]:
tags_corpus = df_train[['tag_list']].copy()

In [63]:
print("Managing the tag list for the train")
dict_tags_replacement = pickle.load(open(pickles_path+"dict_tags_replacement.p", "rb" ))
reverse_dict_tags_replacement = {v: k for k, v in dict_tags_replacement.items()}

tags_corpus['raw_tag_list'] = tags_corpus['tag_list'].apply(lambda x : [reverse_dict_tags_replacement[tag] for tag in x])

for col in ['raw_tag_list','tag_list']:
    tags_corpus[col] = tags_corpus[col].astype(str)
tags_corpus['tag_list'] = tags_corpus['tag_list']+tags_corpus['raw_tag_list']
tags_corpus.drop('raw_tag_list', axis=1, inplace=True)

dict_replacement = {'[-\. ]':' ','[cC] {0,1}#':'csharp','[\[\]\',]':' ',' +':' '}
for pattern, replacement in dict_replacement.items():
    tags_corpus['tag_list'] = tags_corpus['tag_list'].str.replace(pattern,replacement)

tags_corpus['tag_list'] = tags_corpus['tag_list'].apply(word_tokenize)
tokenized_tags_corpus = list(tags_corpus['tag_list'])

Managing the tag list for the train


In [64]:
print("Starting the pure tags Word2Vec model")
feature_size = 50    # Word vector dimensionality  
window_context = 30          # Context window size                                                                                    
min_word_count = 1   # Minimum word count                        
sample = 1e-3   # Downsample setting for frequent words

w2v_pure_tags_model = word2vec.Word2Vec(tokenized_tags_corpus, size=feature_size,
                              window=window_context, min_count=min_word_count,
                              sample=sample, iter=100,
                              workers=6)
pure_tags_vectors = w2v_pure_tags_model.wv

Starting the pure tags Word2Vec model


In [65]:
print("Starting the pure tags similarity matrix")
L_word2vec_pure_tags_words = [word for word in pure_tags_vectors.vocab.keys()]

df_word2vec_pure_tags_coordinates = pd.DataFrame(columns = [i for i in range(0,50)])
for word in L_word2vec_pure_tags_words:
    df_word_coordinates = pd.DataFrame(pure_tags_vectors[word]).T
    df_word_coordinates.index = [word]
    df_word2vec_pure_tags_coordinates = df_word2vec_pure_tags_coordinates.append(df_word_coordinates)

X_pure_tags_word2vec = df_word2vec_pure_tags_coordinates.values

df_word2vec_pure_tags_similarity = pd.DataFrame(cosine_similarity(X_pure_tags_word2vec),\
                                           columns=df_word2vec_pure_tags_coordinates.index,\
                                           index=df_word2vec_pure_tags_coordinates.index)
df_word2vec_pure_tags_similarity

Starting the pure tags similarity matrix


Unnamed: 0,csharp,winforms,net,datetime,math,linq,webservices,net35,web,services,...,projectmanagement,project,management,django,xcode,ssis,gcc,googlechrome,google,chrome
csharp,1.000000,0.566398,0.770303,0.093344,-0.227306,0.500244,0.313661,0.472975,0.245507,0.292474,...,0.064704,0.057824,0.069184,-0.179043,-0.431871,-0.097085,0.067722,-0.157474,-0.162377,-0.174793
winforms,0.566398,1.000000,0.446778,-0.008520,-0.147843,0.092387,0.138505,0.387025,0.049098,0.189758,...,0.089723,0.087352,0.083498,-0.228200,-0.236443,-0.011572,0.172127,-0.142357,-0.146068,-0.151426
net,0.770303,0.446778,1.000000,-0.051234,-0.302267,0.401373,0.408789,0.592367,0.280396,0.423699,...,-0.020307,-0.023723,-0.014385,-0.196501,-0.431221,-0.121647,-0.070277,-0.149643,-0.140268,-0.163680
datetime,0.093344,-0.008520,-0.051234,1.000000,0.194286,0.228790,-0.085031,-0.151194,0.169922,0.058921,...,-0.222443,-0.230598,-0.222648,0.089277,-0.119481,0.280952,-0.133267,-0.235564,-0.236106,-0.248325
math,-0.227306,-0.147843,-0.302267,0.194286,1.000000,0.000425,-0.096500,-0.316790,-0.126464,-0.099828,...,0.151793,0.147870,0.139799,0.044212,0.172691,0.021820,0.069797,0.170354,0.174301,0.183933
linq,0.500244,0.092387,0.401373,0.228790,0.000425,1.000000,0.222441,0.498243,0.186103,0.308208,...,0.002323,0.000239,0.004829,-0.134909,-0.158620,0.095513,-0.152909,-0.260557,-0.277694,-0.279071
webservices,0.313661,0.138505,0.408789,-0.085031,-0.096500,0.222441,1.000000,0.235499,0.527316,0.614818,...,-0.010690,-0.025874,-0.005767,-0.107428,-0.388371,-0.153298,-0.042258,-0.068114,-0.059132,-0.080950
net35,0.472975,0.387025,0.592367,-0.151194,-0.316790,0.498243,0.235499,1.000000,0.142621,0.207576,...,0.013630,0.000043,0.014790,-0.299642,-0.221236,-0.232364,-0.007241,-0.148847,-0.143669,-0.163694
web,0.245507,0.049098,0.280396,0.169922,-0.126464,0.186103,0.527316,0.142621,1.000000,0.643264,...,-0.199889,-0.214108,-0.194130,-0.035843,-0.375200,-0.111232,0.027515,0.062164,0.059372,0.043919
services,0.292474,0.189758,0.423699,0.058921,-0.099828,0.308208,0.614818,0.207576,0.643264,1.000000,...,0.015992,0.001959,0.023857,-0.147855,-0.316838,0.055615,0.005984,-0.047056,-0.057971,-0.065465


In [66]:
print("--- %s seconds ---" % (time.time() - start_time))

--- 2.1539130210876465 seconds ---


In [67]:
print("Prediction on the test data : Building the test datasets")

Prediction on the test data : Building the test datasets


In [68]:
L_test_corpus = list(df_test['corpus'])

In [69]:
print("Extracting count features for NMF...")
test_tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=10, max_features=N_FEATURES)

start_time = time.time()
test_tfidf = test_tfidf_vectorizer.fit_transform(L_test_corpus)
test_vectorizer_feature_names = test_tfidf_vectorizer.get_feature_names()
test_tfidf_array = test_tfidf.toarray()
df_all_test_words_nmf = pd.DataFrame(test_tfidf_array, columns=test_vectorizer_feature_names)
print("--- %s seconds ---" % (time.time() - start_time))

Extracting count features for NMF...
--- 0.05285215377807617 seconds ---


In [70]:
print("Extracting count features for lda...")
test_tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=10, max_features=N_FEATURES)

start_time = time.time()
test_tfidf = test_tfidf_vectorizer.fit_transform(L_test_corpus)
test_vectorizer_feature_names = test_tfidf_vectorizer.get_feature_names()
test_tfidf_array = test_tfidf.toarray()
df_all_test_words_lda = pd.DataFrame(test_tfidf_array, columns=test_vectorizer_feature_names)
print("--- %s seconds ---" % (time.time() - start_time))

Extracting count features for lda...
--- 0.0498659610748291 seconds ---


In [71]:
#We get back the train columns schema :
df_final_test_words = df_train_nmf_words_space_schema.copy()

In [72]:
#The original shape is :
test_words_dim = df_all_test_words_nmf.shape[1]
print("Original test space words dimension is %s"%test_words_dim)
#But we have to filter the data onto the train space perimeter
df_all_test_words_nmf = df_all_test_words_nmf[[col for col in df_all_test_words_nmf.columns if col in list(df_final_test_words.columns)]]
df_all_test_words_lda = df_all_test_words_lda[[col for col in df_all_test_words_lda.columns if col in list(df_final_test_words.columns)]]
test_words_dim = df_all_test_words_nmf.shape[1]
print("After filtering on the train columns perimeter, test space words dimension is %s"%test_words_dim)

Original test space words dimension is 875
After filtering on the train columns perimeter, test space words dimension is 795


In [73]:
#Finally, the test data looks like : 
df_final_test_words_nmf = df_final_test_words.append(df_all_test_words_nmf)
df_final_test_words_lda = df_final_test_words.append(df_all_test_words_lda)
df_final_test_words_nmf.fillna(0, inplace=True)
df_final_test_words_lda.fillna(0, inplace=True)

In [74]:
test_matrix_nmf = df_final_test_words_nmf.values
test_matrix_lda = df_final_test_words_lda.values

In [75]:
print("Applying NMF and LDA on the test data")
test_results_nmf = nmf.transform(test_matrix_nmf)
test_results_lda = lda.transform(test_matrix_lda)

Applying NMF and LDA on the test data


In [76]:
print("Extracting the main topics")
L_test_main_topics_nmf = []
L_test_main_topics_lda = []
for sub_result_nmf, sub_result_lda in zip(test_results_nmf, test_results_lda):
    L_test_main_topics_nmf.append(np.argmax(sub_result_nmf))
    L_test_main_topics_lda.append(np.argmax(sub_result_lda))

Extracting the main topics


In [77]:
df_test_nmf_corpus_topics = pd.DataFrame(test_results_nmf)
df_test_lda_corpus_topics = pd.DataFrame(test_results_lda)

df_test_nmf_corpus_topics['id'] = list(df_test_nmf['id'])
df_test_lda_corpus_topics['id'] = list(df_test_lda['id'])

df_test_nmf_corpus_topics['main_topic'] = L_test_main_topics_nmf
df_test_lda_corpus_topics['main_topic'] = L_test_main_topics_lda

In [78]:
df_test_nmf = pd.merge(df_test_nmf,df_test_nmf_corpus_topics,how='left',on='id')
df_test_lda = pd.merge(df_test_lda,df_test_lda_corpus_topics,how='left',on='id')

In [79]:
print("Tags generation on the test perimeter")

Tags generation on the test perimeter


In [80]:
df_test_nmf['nmf_tags'] = df_test_nmf['main_topic'].apply(lambda x : dict_topics_tags_nmf[x])
df_test_lda['lda_tags'] = df_test_lda['main_topic'].apply(lambda x : dict_topics_tags_lda[x])

In [81]:
print("Generating nmf recommended tags")
#We specify the excution params for  the python script responsible for the multiprocessing : 
execution_params = {'N_PROCS' : 6,
                    'list_of_corpus':list(df_test_nmf['corpus']),
                    'nmf_word_sim_matrix':df_nmf_words_similarity,
                    'nmf_tag_sim_matrix':df_nmf_tags_similarity,
                    'existing_tags':L_frequent_tags,
                    'NEIGHBORS':NEIGHBORS,
                    'recommendation_filter':list(df_test_nmf['nmf_tags']),
                    'QUANTILE_THRESHOLD':QUANTILE_THRESHOLD,
                   }
#We dump our execution params :
pickle.dump(execution_params,open(temp_files_path+"execution_params.p", "wb"))
#And we run the script responsible for the tags generation on different process :  
%run -i multiprocessing_generate_tags_nmf.py


#Now we can load the result of the multiprocessing task : 
L_recommended_tags_nmf = pickle.load(open(temp_files_path+"L_recommended_tags.p", "rb" ))



#And we can clean the temp files dir :
import os
from functions import class_my_files
dict_extensions = class_my_files(temp_files_path)
pickles_temp_files  = dict_extensions['p']
for file_name in pickles_temp_files:
    os.remove(temp_files_path+file_name)

#And we assign the result of the multiprocessing :
df_test_nmf['recommendation_nmf'] = L_recommended_tags_nmf

Generating nmf recommended tags
Multiprocessing
Creating pool with 6 processes

TASK ready
results ready
final_results ready
appending sub lists :
success
--- 8.86453890800476 seconds ---
the result file is available at : 
 temp_files_path+"L_recommended_tags.p"


In [82]:
print("Generating lda recommended tags")
#We specify the excution params for  the python script responsible for the multiprocessing : 
execution_params = {'N_PROCS' : 6,
                    'list_of_corpus':list(df_test_lda['corpus']),
                    'lda_word_sim_matrix':df_lda_words_similarity,
                    'lda_tag_sim_matrix':df_lda_tags_similarity,
                    'existing_tags':L_frequent_tags,
                    'NEIGHBORS':NEIGHBORS,
                    'recommendation_filter':list(df_test_lda['lda_tags']),
                    'QUANTILE_THRESHOLD':QUANTILE_THRESHOLD,
                   }
#We dump our execution params :
pickle.dump(execution_params,open(temp_files_path+"execution_params.p", "wb"))
#And we run the script responsible for the tags generation on different process :  
%run -i multiprocessing_generate_tags_lda.py


#Now we can load the result of the multiprocessing task : 
L_recommended_tags_lda = pickle.load(open(temp_files_path+"L_recommended_tags.p", "rb" ))



#And we can clean the temp files dir :
import os
from functions import class_my_files
dict_extensions = class_my_files(temp_files_path)
pickles_temp_files  = dict_extensions['p']
for file_name in pickles_temp_files:
    os.remove(temp_files_path+file_name)

#And we assign the result of the multiprocessing :
df_test_lda['recommendation_lda'] = L_recommended_tags_lda

Generating lda recommended tags
OK
OK
Multiprocessing
Creating pool with 6 processes

TASK ready
results ready
final_results ready
appending sub lists :
success
--- 2.619110584259033 seconds ---
the result file is available at : 
 temp_files_path+"L_recommended_tags.p"


In [104]:
print("Generating lda+vord2vec recommended tags")

df_test_lda_word2vec = df_test_lda.copy()

#We specify the excution params for  the python script responsible for the multiprocessing : 
execution_params = {'N_PROCS' : 6,
                    'list_of_corpus':list(df_test_lda_word2vec['corpus']),
                    'lda_word_sim_matrix':df_lda_words_similarity,
                    'lda_tag_sim_matrix':df_lda_tags_similarity,
                    
                    'w2v_word_sim_matrix':df_word2vec_words_similarity,
                    'w2v_tag_sim_matrix':df_word2vec_tags_similarity,
                    'w2v_pure_tag_sim_matrix':df_word2vec_pure_tags_similarity,
                    
                    'existing_tags':L_frequent_tags,
                    'NEIGHBORS':NEIGHBORS,
                    'recommendation_filter':list(df_test_lda_word2vec['lda_tags']),
                    'QUANTILE_THRESHOLD':QUANTILE_THRESHOLD,
                   }
#We dump our execution params :
pickle.dump(execution_params,open(temp_files_path+"execution_params.p", "wb"))
#And we run the script responsible for the tags generation on different process :  
%run -i multiprocessing_generate_tags_lda_word2vec.py


#Now we can load the result of the multiprocessing task : 
L_recommended_tags_lda_word2vec = pickle.load(open(temp_files_path+"L_recommended_tags.p", "rb" ))



#And we can clean the temp files dir :
import os
from functions import class_my_files
dict_extensions = class_my_files(temp_files_path)
pickles_temp_files  = dict_extensions['p']
for file_name in pickles_temp_files:
    os.remove(temp_files_path+file_name)

#And we assign the result of the multiprocessing :
df_test_lda_word2vec['recommendation_lda_w2v'] = L_recommended_tags_lda_word2vec

Generating lda+vord2vec recommended tags
OK
OK
Multiprocessing
Creating pool with 6 processes

TASK ready
results ready
final_results ready
appending sub lists :
success
--- 33.764545917510986 seconds ---
the result file is available at : 
 temp_files_path+"L_recommended_tags.p"


In [105]:
print("Evaluation on the test perimeter")

Evaluation on the test perimeter


In [164]:
df_test_nmf['n_tags_recommended_nmf'] = df_test_nmf.apply(lambda x : len(x['recommendation_nmf']),axis=1)
df_test_lda['n_tags_recommended_lda'] = df_test_lda.apply(lambda x : len(x['recommendation_lda']),axis=1)
df_test_lda_word2vec['n_tags_recommended_lda_w2v'] = df_test_lda_word2vec.apply(lambda x : len(x['recommendation_lda_w2v']),axis=1)

df_test_nmf['matching_tags_test_nmf'] = df_test_nmf.apply(lambda x : set(x['recommendation_nmf']).intersection(set(x['tag_list'])), axis=1)
df_test_lda['matching_tags_test_lda'] = df_test_lda.apply(lambda x : set(x['recommendation_lda']).intersection(set(x['tag_list'])), axis=1)
df_test_lda_word2vec['matching_tags_test_lda_w2v'] = df_test_lda_word2vec.apply(lambda x : set(x['recommendation_lda_w2v']).intersection(set(x['tag_list'])), axis=1)

df_test_nmf['matching_score_test_nmf'] = df_test_nmf.apply(lambda x : len(x['matching_tags_test_nmf'])/x['n_tags'], axis=1)
df_test_lda['matching_score_test_lda'] = df_test_lda.apply(lambda x : len(x['matching_tags_test_lda'])/x['n_tags'], axis=1)
df_test_lda_word2vec['matching_score_test_lda_w2v'] = df_test_lda_word2vec.apply(lambda x : len(x['matching_tags_test_lda_w2v'])/x['n_tags'], axis=1)

In [171]:
df_evaluation = df_test.copy()
df_evaluation = df_evaluation[['id','n_tags','matching_score']]
df_evaluation = pd.merge(df_evaluation,df_test_nmf[['id','n_tags_recommended_nmf','matching_score_test_nmf']]\
                         ,how='left'\
                         ,on='id')

df_evaluation = pd.merge(df_evaluation,df_test_lda[['id','n_tags_recommended_lda','matching_score_test_lda']]\
                         ,how='left'\
                         ,on='id')

df_evaluation = pd.merge(df_evaluation,df_test_lda_word2vec[['id','n_tags_recommended_lda_w2v','matching_score_test_lda_w2v']]\
                         ,how='left'\
                         ,on='id')
df_evaluation

Unnamed: 0,id,n_tags,matching_score,n_tags_recommended_nmf,matching_score_test_nmf,n_tags_recommended_lda,matching_score_test_lda,n_tags_recommended_lda_w2v,matching_score_test_lda_w2v
0,1,3,0.333333,3,0.333333,5,0.666667,6,0.666667
1,4,3,0.000000,6,0.000000,7,0.333333,6,0.000000
2,14,3,0.666667,9,0.666667,11,0.666667,16,0.666667
3,44,2,0.500000,8,0.500000,7,0.500000,6,0.500000
4,45,1,1.000000,27,1.000000,13,1.000000,11,1.000000
5,47,3,1.000000,10,1.000000,9,1.000000,11,1.000000
6,52,2,0.000000,3,0.000000,6,0.000000,4,0.000000
7,55,2,0.500000,12,0.500000,10,1.000000,10,0.500000
8,59,1,1.000000,4,1.000000,6,1.000000,8,1.000000
9,65,3,0.000000,7,0.000000,4,0.000000,9,0.666667


<p>
        **<font color='#D2691E'size="4">II) Evaluation & Benchmark of the algorithms performances</font>**
</p>

In [172]:
import math

In [189]:
df_evaluation['nmf_tags_number_accuracy'] = df_evaluation.apply(lambda x:
                                                                math.exp((x['n_tags']-x['n_tags_recommended_nmf'])/x['n_tags_recommended_nmf'])
                                                                if x['n_tags']< x['n_tags_recommended_nmf']
                                                                else
                                                                math.exp((x['n_tags_recommended_nmf']-x['n_tags'])/x['n_tags'])
                                                                ,axis=1)

df_evaluation['lda_tags_number_accuracy'] = df_evaluation.apply(lambda x:
                                                                math.exp((x['n_tags']-x['n_tags_recommended_lda'])/x['n_tags_recommended_lda'])
                                                                if x['n_tags']< x['n_tags_recommended_lda']
                                                                else
                                                                math.exp((x['n_tags_recommended_lda']-x['n_tags'])/x['n_tags'])
                                                                ,axis=1)

df_evaluation['lda_w2v_tags_number_accuracy'] = df_evaluation.apply(lambda x:
                                                                math.exp((x['n_tags']-x['n_tags_recommended_lda_w2v'])/x['n_tags_recommended_lda_w2v'])
                                                                if x['n_tags']< x['n_tags_recommended_lda_w2v']
                                                                else
                                                                math.exp((x['n_tags_recommended_lda_w2v']-x['n_tags'])/x['n_tags'])
                                                                ,axis=1)

In [190]:
orig_avg_accuracy = np.mean(df_evaluation['matching_score'])
nmf_avg_accuracy = np.mean(df_evaluation['matching_score_test_nmf'])
lda_avg_accuracy = np.mean(df_evaluation['matching_score_test_lda'])
lda_w2v_avg_accuracy = np.mean(df_evaluation['matching_score_test_lda_w2v'])

In [191]:
nmf_avg_tags_number_accuracy = np.mean(df_evaluation['nmf_tags_number_accuracy'])
lda_avg_tags_number_accuracy = np.mean(df_evaluation['lda_tags_number_accuracy'])
lda_w2v_avg_tags_number_accuracy = np.mean(df_evaluation['lda_w2v_tags_number_accuracy'])

In [192]:
nmf_global_accuracy = (2*nmf_avg_accuracy+nmf_avg_tags_number_accuracy)/3
lda_global_accuracy = (2*lda_avg_accuracy+lda_avg_tags_number_accuracy)/3
lda_global_avg_accuracy = (2*lda_w2v_avg_accuracy+lda_w2v_avg_tags_number_accuracy)/3

In [193]:
print("orig_avg_accuracy : %s"%orig_avg_accuracy)
print("nmf_avg_accuracy : %s"%nmf_avg_accuracy)
print("lda_avg_accuracy : %s"%lda_avg_accuracy)
print("lda_w2v_avg_accuracy : %s"%lda_w2v_avg_accuracy)

print()
print("nmf_avg_tags_number_accuracy : %s"%nmf_avg_tags_number_accuracy)
print("lda_avg_tags_number_accuracy : %s"%lda_avg_tags_number_accuracy)
print("lda_w2v_avg_tags_number_accuracy : %s"%lda_w2v_avg_tags_number_accuracy)

print()
print("nmf_global_accuracy : %s"%nmf_global_accuracy)
print("lda_global_accuracy : %s"%lda_global_accuracy)
print("lda_global_avg_accuracy : %s"%lda_global_avg_accuracy)

orig_avg_accuracy : 0.53213836478
nmf_avg_accuracy : 0.593610062893
lda_avg_accuracy : 0.566503144654
lda_w2v_avg_accuracy : 0.66841509434

nmf_tags_number_accuracy : 0.528474936899
lda_tags_number_accuracy : 0.517045074032
lda_w2v_tags_number_accuracy : 0.504521133694

nmf_global_accuracy : 0.571898354229
lda_global_accuracy : 0.550017121114
lda_global_avg_accuracy : 0.613783774124


In [194]:
orig_med_accuracy = np.mean(df_evaluation['matching_score'])
nmf_med_accuracy = np.mean(df_evaluation['matching_score_test_nmf'])
lda_med_accuracy = np.mean(df_evaluation['matching_score_test_lda'])
lda_w2v_med_accuracy = np.mean(df_evaluation['matching_score_test_lda_w2v'])

In [196]:
orig_avg_tags_number = np.mean(df_evaluation['n_tags'])
nmf_avg_tags_number = np.mean(df_evaluation['n_tags_recommended_nmf'])
lda_avg_tags_number = np.mean(df_evaluation['n_tags_recommended_lda'])
lda_w2v_avg_tags_number = np.mean(df_evaluation['n_tags_recommended_lda_w2v'])

In [216]:
orig_med_tags_number = np.median(df_evaluation['n_tags'])
nmf_med_tags_number = np.median(df_evaluation['n_tags_recommended_nmf'])
lda_med_tags_number = np.median(df_evaluation['n_tags_recommended_lda'])
lda_w2v_med_tags_number = np.median(df_evaluation['n_tags_recommended_lda_w2v'])

In [None]:
RUN = execution_params_workflows_iteration['RUN']
N_COMPONENTS = execution_params_workflows_iteration['N_COMPONENTS']
TOPICS_THRESHOLD = execution_params_workflows_iteration['TOPICS_THRESHOLD']
NEIGHBORS = execution_params_workflows_iteration['NEIGHBORS']
QUANTILE_THRESHOLD = execution_params_workflows_iteration['QUANTILE_THRESHOLD']

In [217]:
dict_results = {'RUN':RUN,
                'N_COMPONENTS':N_COMPONENTS,
                'TOPICS_THRESHOLD':TOPICS_THRESHOLD,
                'NEIGHBORS':NEIGHBORS,
                'QUANTILE_THRESHOLD':QUANTILE_THRESHOLD,
                'orig_avg_accuracy':orig_avg_accuracy,
                'nmf_avg_accuracy':nmf_avg_accuracy,
                'lda_avg_accuracy':lda_avg_accuracy,
                'lda_w2v_avg_accuracy':lda_w2v_avg_accuracy,
                'nmf_avg_tags_number_accuracy':nmf_avg_tags_number_accuracy,
                'lda_avg_tags_number_accuracy':lda_avg_tags_number_accuracy,
                'lda_w2v_avg_tags_number_accuracy':lda_w2v_avg_tags_number_accuracy,
                'nmf_global_accuracy':nmf_global_accuracy,
                'lda_global_accuracy':lda_global_accuracy,
                'lda_global_avg_accuracy':lda_global_avg_accuracy,
                'orig_med_accuracy':orig_med_accuracy,
                'nmf_med_accuracy':nmf_med_accuracy,
                'lda_med_accuracy':lda_med_accuracy,
                'lda_w2v_med_accuracy':lda_w2v_med_accuracy,
                'orig_avg_tags_number':orig_avg_tags_number,
                'nmf_avg_tags_number':nmf_avg_tags_number,
                'lda_avg_tags_number':lda_avg_tags_number,
                'lda_w2v_avg_tags_number':lda_w2v_avg_tags_number,
                'orig_med_tags_number':orig_med_tags_number,
                'nmf_med_tags_number':nmf_med_tags_number,
                'lda_med_tags_number':lda_med_tags_number,
                'lda_w2v_med_tags_number':lda_w2v_med_tags_number
               }

In [218]:
results_columns = \
['RUN','N_COMPONENTS','TOPICS_THRESHOLD','NEIGHBORS','QUANTILE_THRESHOLD','orig_avg_accuracy','nmf_avg_accuracy',\
 'lda_avg_accuracy','lda_w2v_avg_accuracy','nmf_avg_tags_number_accuracy','lda_avg_tags_number_accuracy',\
 'lda_w2v_avg_tags_number_accuracy','nmf_global_accuracy','lda_global_accuracy','lda_global_avg_accuracy',\
 'orig_med_accuracy','nmf_med_accuracy','lda_med_accuracy','lda_w2v_med_accuracy','orig_avg_tags_number',\
 'nmf_avg_tags_number','lda_avg_tags_number','lda_w2v_avg_tags_number','orig_med_tags_number',\
 'nmf_med_tags_number','lda_med_tags_number','lda_w2v_med_tags_number']

In [219]:
results_df = pd.DataFrame(dict_results,columns = results_columns,index=[RUN])
results_df

Unnamed: 0,orig_avg_accuracy,nmf_avg_accuracy,lda_avg_accuracy,lda_w2v_avg_accuracy,nmf_avg_tags_number_accuracy,lda_avg_tags_number_accuracy,lda_w2v_avg_tags_number_accuracy,nmf_global_accuracy,lda_global_accuracy,lda_global_avg_accuracy,...,lda_med_accuracy,lda_w2v_med_accuracy,orig_avg_tags_number,nmf_avg_tags_number,lda_avg_tags_number,lda_w2v_tags_number,orig_med_tags_number,nmf_med_tags_number,lda_med_tags_number,lda_w2v_tags_number.1
1,0.532138,0.59361,0.566503,0.668415,0.528475,0.517045,0.504521,0.571898,0.550017,0.613784,...,0.566503,0.668415,2.119245,7.790189,7.038491,8.0,2.0,7.0,7.0,8.0


In [220]:
pickle.dump(results_df,open(pickles_path+"1_workflows_iteration_results_df.p", "wb"))