In [4]:

from scipy.sparse.linalg import svds
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF, TruncatedSVD
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from gensim.utils import simple_preprocess

import nltk
import pandas as pd
import matplotlib.pyplot as plt
import importlib
import os

# Remember the current working directory so it can be restored at the end of this block.
current_dir = os.getcwd()
# Move one directory up before importing the project-local `textmining` package
# (presumably the package lives in the parent directory — confirm against the repo layout).
%cd ..
import textmining.util_functions as uf
import textmining.text_miner 
import textmining.topic_modeler as tm

# Re-execute the project modules so that a re-run of this cell sees their current source.
importlib.reload(textmining.text_miner)
importlib.reload(textmining.topic_modeler)
importlib.reload(uf)
# Switch back to the original directory; the relative data paths used later are resolved from it.
os.chdir(current_dir)


import seaborn as sns
sns.reset_defaults()
sns.set(
    rc={'figure.figsize':(7,5)}, 
    style="white" # nicer layout
)
%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt



C:\Users\elba_ro\Documents\dlr_projects\repository-synergy\notebooks


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\elba_ro\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# NMF based topic modelling - with tfidf

In [5]:
# Load the processed dataset, using the `file_id` column as the index.
data = pd.read_csv(
    '../../data/processed/final_repo_english_whatwhy.csv',
    index_col='file_id',
)
# Display the available column names.
data.columns

Index(['local_readme_file', 'url_readme', 'created_at', 'language', 'name',
       'owner_id', 'updated_at', 'url', 'watchers', 'members_number',
       'repo_labels', 'all_languages', 'topics', 'description', 'members_ids',
       'content_text_w_o_tags', 'section_id', 'content_clean'],
      dtype='object')

In [None]:
# Corpus-specific tokens passed to the topic modeler as additional stop words.
# Changes versus the previous list:
#   * duplicate entries ('abstr_image', 'abstr_number') are listed only once;
#   * 'abstr_mailto' is added, since every other placeholder is written with an
#     underscore and the space-separated 'abstr mailto' looks like a typo.
# NOTE(review): the original 'abstr mailto' entry is kept until the exact placeholder
# emitted by the preprocessing step has been confirmed; drop it afterwards.
stop_words_extension = [
    'abstr_hyperlink',
    'https',
    'abstr_number',
    'abstr_image',
    'abstr_mailto',
    'abstr mailto',
    'abstr_code_section',
    'http',
    'www',
    'please',
]
nmf_modeler = tm.topic_modeler(
    data['content_clean'].values.tolist(),
    stopwords_extension=stop_words_extension,
)


## Plotting coherence for each number of topics K

In [None]:
# Topic counts to evaluate: start, start + step, ... up to (but not including) limit.
start, step, limit = 40, 1, 146
#nmf_model_list=None

In [None]:
# Run the coherence search over the configured topic-count range
# (model type 'nmf' on a 'tfidf' corpus, as selected by the keyword arguments).
nmf_model_list, coherence_vals = nmf_modeler.compute_coherence_values(
    limit,
    start=start,
    step=step,
    model_type='nmf',
    corpus_type='tfidf',
)



In [None]:
# Report the topic count held by the modeler and persist the coherence value
# recorded for every evaluated topic count.
n = nmf_modeler.num_topics
print('best model has {} topics'.format(n))

coherence_vals_df = pd.DataFrame(
    {
        'Number of Topics': range(start, limit, step),
        'Coherence Value': coherence_vals,
    }
)
# The file name carries the first and last evaluated topic count.
coherence_vals_df.to_csv(
    '../../data/feature_extraction/nmf/nmf_coherence_values_{}_{}.csv'.format(start, limit - 1)
)

In [None]:
def save_model_info(modeler, output_dir="../../data/feature_extraction/nmf", prefix="nmf"):
    """Request the three per-model tables from ``modeler`` and return them.

    Each table is requested with a ``save_path`` of the form
    ``<output_dir>/<prefix>_<table>_<num_topics>.csv``; the modeler is expected
    to write the table to that path (behaviour of the modeler itself is not
    visible here — confirm in ``textmining.topic_modeler``).

    Parameters
    ----------
    modeler : object
        Must expose ``num_topics`` and the methods ``get_doc_dominant_topic``,
        ``get_topics_terms`` and ``get_doc_topic_matrix``, each accepting a
        ``save_path`` keyword argument.
    output_dir : str, optional
        Directory part of every save path. The default reproduces the
        previously hard-coded location.
    prefix : str, optional
        File-name prefix. The default reproduces the previously hard-coded
        ``nmf`` prefix.

    Returns
    -------
    tuple
        ``(readable_document_topics, topic_words, document_topics)`` exactly as
        returned by the three modeler methods, in that order.
    """
    n = modeler.num_topics

    def csv_path(table_name):
        # '/' is used on purpose (not os.path.join) so the default paths stay
        # identical to the ones used before the parameters were introduced.
        return "{}/{}_{}_{}.csv".format(output_dir, prefix, table_name, n)

    # The prints act as simple progress markers between the three exports.
    print('get_doc_dominant_topic')
    readable_document_topics_df = modeler.get_doc_dominant_topic(
        save_path=csv_path("readable_document_topics"))

    print('get_topics_terms')
    topic_words_df = modeler.get_topics_terms(save_path=csv_path("topic_words"))

    print('get_doc_topic_matrix')
    document_topics_df = modeler.get_doc_topic_matrix(save_path=csv_path("document_topics"))

    return readable_document_topics_df, topic_words_df, document_topics_df

In [None]:
# Export the model tables and keep the three returned objects for the cells below.
nmf_readable_document_topics_df, nmf_topic_words_df, nmf_document_topics_df = save_model_info(nmf_modeler)

### Plotting the topic distribution

In [None]:
# Count the rows per `Dominant_Topic` value and draw the counts as a bar chart.
topic_dstr = nmf_modeler.get_topic_distr()
dominant_topic_counts = topic_dstr['Dominant_Topic'].value_counts()
dominant_topic_counts.plot.bar()


In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: the chained in-place form triggers a chained-assignment
# FutureWarning in pandas 2.x and no longer updates `topic_dstr` once
# Copy-on-Write is active. Re-running this cell is also harmless.
topic_dstr['Topic_Keywords'] = topic_dstr['Topic_Keywords'].fillna('No Topic')

In [None]:
# Write the topic distribution to CSV; `n` is the topic count assigned from
# `nmf_modeler.num_topics` in an earlier cell, so that cell has to run first.
topic_dstr.to_csv('../../data/feature_extraction/nmf/nmf_topics_distribution_{}.csv'.format(n))

#### Dominant topic of each document

In [None]:
#doc_dominant_topic_df.reset_index().to_csv('../../data/feature_extraction/nmf_document_topics_25.csv')
# Preview the first 10 rows of the table returned by get_doc_dominant_topic.
nmf_readable_document_topics_df.head(10)

#### Wordclouds for each topic

In [None]:
from nltk.corpus import stopwords
from matplotlib import pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import matplotlib.colors as mcolors


# Request 35 topics with 10 words each in unformatted form; the word-cloud helper
# below reads element [1] of every entry as (word, weight) pairs.
# NOTE(review): 35 is hard-coded while the modeler exposes its own `num_topics` —
# confirm that the two agree for the model selected by the coherence search.
topics = nmf_modeler.model.show_topics(num_topics=35, num_words=10, formatted=False)



def show_word_cloud(topics, limit=10, stopwords_extension=None, topic_index=range(1, 11)):
    """Draw a two-column grid of word clouds, one panel per topic.

    Parameters
    ----------
    topics : sequence
        Entries whose element ``[1]`` is an iterable of ``(word, weight)``
        pairs (presumably the unformatted ``show_topics`` output — confirm).
    limit : int
        Maximum number of topics to draw. Odd values are supported; the unused
        last panel of the grid is left blank.
    stopwords_extension : list of str, optional
        Extra stop words appended to NLTK's English list.
        NOTE(review): the WordCloud documentation states that ``stopwords`` is
        ignored by ``generate_from_frequencies``, so this probably has no
        visible effect — kept for interface compatibility.
    topic_index : sequence
        Labels used in the panel titles, aligned with ``topics``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``. Nothing is drawn when there is
        no topic to display.
    """
    # 1. Wordcloud of Top N words in each topic
    palette = list(mcolors.TABLEAU_COLORS.values())  # more colors: 'mcolors.XKCD_COLORS'

    stop_words = stopwords.words('english')
    stop_words.extend(stopwords_extension or [])

    # Never create more panels than there are topics or labels; this avoids the
    # IndexError raised when fewer topics than grid cells were supplied.
    n_panels = min(limit, len(topics), len(topic_index))
    if n_panels <= 0:
        return

    # Round the row count up so that an odd number of topics keeps its last topic.
    n_rows = (n_panels + 1) // 2
    fig, axes = plt.subplots(n_rows, 2, figsize=(10, 10), sharex='all', sharey='all',
                             squeeze=False)

    for i, ax in enumerate(axes.flatten()):
        ax.axis('off')
        if i >= n_panels:
            # Spare cell of an odd-sized grid: leave it empty.
            continue

        # Cycle through the palette so more than len(palette) panels cannot overflow it.
        panel_color = palette[i % len(palette)]
        cloud = WordCloud(stopwords=stop_words,
                          background_color='white',
                          width=2500,
                          height=1800,
                          max_words=10,
                          colormap='tab10',
                          # Bind the colour now instead of relying on the loop variable.
                          color_func=lambda *args, _color=panel_color, **kwargs: _color,
                          prefer_horizontal=1.0)
        cloud.generate_from_frequencies(dict(topics[i][1]), max_font_size=300)
        ax.imshow(cloud)
        ax.set_title('Topic ' + str(topic_index[i]), fontdict=dict(size=16))

    fig.subplots_adjust(wspace=0, hspace=0)
    fig.tight_layout()
    plt.show()

In [None]:

# Word clouds for the first ten entries of `topics`, labelled 0-9.
show_word_cloud(topics[:10],  limit=10, stopwords_extension=stop_words_extension, topic_index=range(0,10))

In [None]:
# Word clouds for entries 10-19 of `topics`, labelled 10-19.
show_word_cloud(topics[10:20],  limit=10, stopwords_extension=stop_words_extension, topic_index=range(10,20))

In [None]:
# Word clouds for entries 20-29 of `topics`, labelled 20-29.
show_word_cloud(topics[20:30],  limit=10, stopwords_extension=stop_words_extension, topic_index=range(20,30))

In [None]:
# Word clouds for entries 30-34 of `topics`, labelled 30-34 (five topics, limit=5).
show_word_cloud(topics[30:35],  limit=5, stopwords_extension=stop_words_extension, topic_index=range(30,35))

In [None]:
# NOTE(review): neither `topics_tfid` nor `stop` is assigned in the visible part of this
# notebook; unless they are defined elsewhere this cell raises NameError on a fresh
# kernel (Restart & Run All) — confirm whether it is a leftover that should be removed.
show_word_cloud(topics_tfid[:10],  limit=10, stopwords_extension=stop, topic_index=range(0,10))

### 1. Extract TF-IDF matrix for all documents

In [None]:
# Extend NLTK's English stop words with corpus-specific tokens, then build the
# TF-IDF matrix from the `content_text_w_o_tags` column using the project tokenizer.
stopwords_extension = ['abstr_hyperlink', 'https', 'abstr_number', 'abstr_image', 'http', '_', 'www']
stop_words = stopwords.words('english') + stopwords_extension
vectoriser = TfidfVectorizer(
    stop_words=stop_words,
    tokenizer=uf.tokenize,
)
tfidf_model = vectoriser.fit_transform(data['content_text_w_o_tags'])

### 2. Topic extraction

**Estimating the number of topics**

The number of components / topics can be estimated using singular value decomposition (SVD).
The explained variance ratio gives a weight to each component. Low values indicate that one cannot gain much more information when the number of components increases.

In [None]:
# Partial SVD of the TF-IDF matrix: compute 250 singular values (`s`) together with
# the corresponding left (`u`) and right (`vt`) singular vectors.
u, s, vt = svds(tfidf_model, k = 250)

In [None]:
from kneed import KneeLocator


In [None]:
# Plot the singular values in decreasing order and locate the elbow of the curve.
fig, ax = plt.subplots()
# Sort explicitly (largest first) so the curve is decreasing regardless of the
# order in which `svds` returned the values; a plain reversal silently depends on
# that order.
y = sorted(s, reverse=True)
x = list(range(1, len(y) + 1))
kn = KneeLocator(x, y, curve='convex', direction='decreasing')
print(kn.elbow)

# Use the axes object throughout instead of mixing it with the pyplot state machine.
ax.plot(x, y)
ax.set_xlabel('singular value index')
ax.set_ylabel('value')
ax.set_title("Singular values")

The number of topics will be fixed to 35 since with more components there is not much more variance in the data.

**NMF modelling**

In [None]:
# Factorise the TF-IDF matrix into 35 components using NNDSVD initialisation.
nmf_model = NMF(n_components=35, init="nndsvd")
# Result of fit_transform: the transformed documents (one row per document).
document_topic_mat = nmf_model.fit_transform(tfidf_model)
# Fitted components: one row per component, one column per vocabulary term.
topic_word_mat = nmf_model.components_

In [None]:
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer `get_feature_names_out()` and fall back to the old method on older
# installations; convert to a list so the helper receives the same type as before.
if hasattr(vectoriser, 'get_feature_names_out'):
    feature_names = list(vectoriser.get_feature_names_out())
else:
    feature_names = list(vectoriser.get_feature_names())
uf.print_top_words(nmf_model, feature_names, 10)

In [None]:
#pd.DataFrame(document_topic_mat).to_csv("../../data/feature_extraction/nmf_document_topics_50.csv")

In [None]:
#pd.DataFrame(topic_word_mat, columns=vectoriser.get_feature_names()).to_csv("../../data/feature_extraction/nmf_topic_words_50.csv")