In [1]:
import time

# work with directories
import os 
from pathlib import Path 

# datetime
import datetime as dt

# work with data
import numpy as np
import pandas as pd
# import pandas_datareader.data as web # not needed for this code

# visualize data
import matplotlib.pyplot as plt

# create bag of word vector
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# gensim
import pyLDAvis.gensim_models
from gensim.models import LdaMulticore, TfidfModel
from gensim.corpora import Dictionary
import gensim._matutils
import multiprocessing
import IPython

In [2]:
# This file was generated in '3_Final_Data_Preprocessing.ipynb'.
#merged_df = pd.read_pickle(('4. clean_corpus/clean_corpus_2015_PublicFirms.pkl'))
merged_df = pd.read_csv("data/CHD.csv")

In [3]:
# Sort the dataframe by the 'issue_date' column
merged_df['issue_date'] = pd.to_datetime(merged_df['issue_date'])
merged_df = merged_df.sort_values('issue_date').reset_index(drop=True)
merged_df.head()

Unnamed: 0,text,patent_id,ticker,numpat_total,issue_date,sector,clean_corpus
0,FIELD OF THE INVENTION\r\n\r\nThe present inve...,7665418,CHD,395,2010-02-23,Consumer Staples,field invent present invent relat absorb compo...
1,FIELD OF THE INVENTION\r\n\r\nThe present inve...,7723629,CHD,395,2010-05-25,Consumer Staples,field invent present invent relat field toothb...
2,RELATED APPLICATIONS AND PATENTS\r\n\r\nThe pr...,7763454,CHD,395,2010-07-27,Consumer Staples,relat applic patent present invent relat u.s. ...
3,FIELD OF THE INVENTION\r\n\r\nThe present inve...,7772578,CHD,395,2010-08-10,Consumer Staples,field invent present invent relat diagnost tes...
4,FIELD OF THE INVENTION\r\n\r\nThe invention re...,7776618,CHD,395,2010-08-17,Consumer Staples,field invent invent relat diagnost assay analy...


In [4]:
# we select only text data from clean_corpus column and use patent_id as the index for each patent information
corpus = merged_df.set_index('patent_id')['clean_corpus']
# corpus = corpus.iloc[:1000]
corpus.iloc[:5]

patent_id
7665418    field invent present invent relat absorb compo...
7723629    field invent present invent relat field toothb...
7763454    relat applic patent present invent relat u.s. ...
7772578    field invent present invent relat diagnost tes...
7776618    field invent invent relat diagnost assay analy...
Name: clean_corpus, dtype: object

In [5]:
# remove rows with no patent data information
corpus.dropna(axis=0, inplace=True)
corpus.iloc[:5]

patent_id
7665418    field invent present invent relat absorb compo...
7723629    field invent present invent relat field toothb...
7763454    relat applic patent present invent relat u.s. ...
7772578    field invent present invent relat diagnost tes...
7776618    field invent invent relat diagnost assay analy...
Name: clean_corpus, dtype: object

In [None]:
# create tfidf vector of the clean corpus
# tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=1000, stop_words='english')
# tfidf_vector = tfidf_vectorizer.fit_transform(corpus)
# tfidf_vector

## TOPIC MODELLING
The data that we collect in our count or TFIDF matrix does not include the complexity of the natural language which has many latent (hidden) dimensions. Societal norms, goal of communication, discourse and many other complexities form and influence natural language. Thus, even words and documents can be understood as simply a representation of these processes.

Finding these latent factors is what matrix factorization does, i.e. represent the documents and terms as separate entities connected through latent dimensions. Thus, the latent dimensions can be a two or three dimensional space encoding the same information as the documents in the corpus. The patterns that emerge can even tell us what each of those latent dimensions may represent. Some popular matrix factorization methods include:

1. Singular Value Decomposition (SVD)

2. Nonnegative Matrix Factorization (NMF)

3. t-Distributed Dtochastic Neighbor Embedding (t-SNE)

The term topic model covers a whole class of generative probabilistic models with Latent Dirichlet Allocation (LDA) being the most well known variation. While the count and TFIDF matrix provides us with information on individual words and/or phrases, they do not capture the larger context of the documents. The topic modelling technique tries to capture some of these high level overview of the themes, issues or concerns being discussed in the corpus.

That LDA is a generative model means that it understands words in the document as being generated from some distribution. So, each word is conditioned on a topic (rather than previous words). Therefore, first it choses a topic and then selects a word based on that topic. This is not how documents are generated in real life but this is how LDA models them. It is a useful abstraction because if the documents were indeed generated as LDA models it, then we have uncovered the latent dimensions. The best interpretation of each topic can be made by the distribution over words in that topic. The top most associated words for a given topic can help us provide a descriptive label to that topic. Thus, an expertise in the topic is also important to best identify the labels.

If we expect every documents to have all of the topics included meaning a uniform distribution then the value of hyperparameter alpha can be set to 1 or more. If we suspect that there are only a few topics per document then we can set alpha to be smaller than 1. Similarly, if we think that words are very specific to a particular topic, we use small beta value (0.01 or less) and if we think they are quite general then we use values up to 1. The lower the value the more peaked will be the underlying distribution. Methods such as Gibbs sampling or expectation maximization can be used to find better paraters that fit the model to the data.

In [None]:
from sklearn.decomposition import LatentDirichletAllocation as LDA

In [None]:
#https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html

# ALPHA HYPERPARAMETER
# doc_topic_prior: float, default=None
# Prior of document topic distribution theta. If the value is None, defaults to 1 / n_components. In [1], this is called alpha.

# ETA HYPERPARAMETER
# topic_word_prior: float, default=None
# Prior of topic word distribution beta. If the value is None, defaults to 1 / n_components. In [1], this is called eta.


In [None]:
# higher alpha assumes documents to be made up of more topics and 
# results in more specific topic distribution per document.
# with high eta, topics are assumed to be made up of most of the words and 
# results in a more specific word distribution per topic.

# Tweak the two parameters below
number_topics = 30 # K parameter
number_words = 10

# Create and fit the LDA model
#lda_tfidf = LDA(n_components=number_topics, n_jobs=-1, doc_topic_prior=0.01/number_topics)
lda_tfidf = LDA(n_components=number_topics, n_jobs=-1, doc_topic_prior=0.1) # Low alpha
#lda_tfidf = LDA(n_components=number_topics, n_jobs=-1, doc_topic_prior=1)
#lda_tfidf = LDA(n_components=number_topics, n_jobs=-1, doc_topic_prior=5) # High alpha

lda_tfidf_fit = lda_tfidf.fit(tfidf_vector)
lda_tfidf_fit

In [None]:
def topics(lda_model, vectorizer, n_top_words):

    words = vectorizer.get_feature_names_out()
    for topic_idx, topic in enumerate(lda_model.components_):
        print("\nTopic #%d:" % topic_idx)
        print(" ".join([words[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))

In [None]:
# Print the topics found by the LDA model
print("LDA Topics found from TFIDF vectors: (top 20 words per topic)")
topics(lda_tfidf, tfidf_vectorizer, number_words)

## VISUALIZATION

In [16]:
# https://anaconda.org/conda-forge/pyldavis
#conda install -c conda-forge pyldavis
from gensim.corpora import Dictionary
import pyLDAvis
import pyLDAvis.gensim_models
import IPython
from gensim.models import LdaModel


In [None]:
gensim_corpus = [doc.split() for doc in corpus.to_list()]
dictionary = Dictionary(gensim_corpus)
ldacorpus = [dictionary.doc2bow(doc) for doc in gensim_corpus]

In [None]:
# Train the LDA model
num_topics = 30
lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary)

In [None]:
# enable visualization on notebook
pyLDAvis.enable_notebook()

tfidfmodel = TfidfModel(ldacorpus, num_topics=10, id2word=dictionary)

# prepare visualization by providing data and info
LDAvis_prepared = pyLDAvis.gensim_models.prepare(tfidfmodel, ldacorpus, dictionary)#, mds='tsne')


# show visualization
pyLDAvis.show(LDAvis_prepared)

# save visualization as html file
# pyLDAvis.save_html(LDAvis_prepared, 'LDAvis_tfidf__' + str(number_topics) +'.html')

In [None]:
# load and display visualization on notebook
# TFIDF VECTOR BASED
IPython.display.HTML(filename= 'LDAvis_tfidf__' + str(number_topics) +'.html')

## LDA GENSIM IMPLEMENTATION

Test for one company

In [6]:
corpus

patent_id
7665418     field invent present invent relat absorb compo...
7723629     field invent present invent relat field toothb...
7763454     relat applic patent present invent relat u.s. ...
7772578     field invent present invent relat diagnost tes...
7776618     field invent invent relat diagnost assay analy...
                                  ...                        
9939385     field present invent relat field analyt detect...
9949916     field invent invent direct non-irrit person lu...
9970923     field invent invent relat diagnost assay analy...
10001449    background field invent present invent relat d...
10101342    background invent field invent present invent ...
Name: clean_corpus, Length: 70, dtype: object

In [7]:
gensim_corpus = [doc.split() for doc in corpus.to_list()]
gensim_corpus[:1]

[['field',
  'invent',
  'present',
  'invent',
  'relat',
  'absorb',
  'composit',
  'anim',
  'dross',
  'method',
  'use',
  'particular',
  'present',
  'invent',
  'relat',
  'litter',
  'display',
  'color',
  'speckl',
  'without',
  'color',
  'transfer',
  'background',
  'invent',
  'befor',
  'advent',
  'litter',
  'pet',
  'owner',
  'releg',
  'pet',
  'outsid',
  'home',
  'lack',
  'area',
  'take',
  'care',
  'pet',
  'excrement',
  'litter',
  'allow',
  'pet',
  'take',
  'care',
  'wast',
  'function',
  'live',
  'insid',
  'home',
  'house-broken',
  'anim',
  'cat',
  'train',
  'habit',
  'urin',
  'defec',
  'special',
  'provid',
  'litter',
  'box',
  'similar',
  'untrain',
  'cage',
  'anim',
  'guinea',
  'pig',
  'urin',
  'defec',
  'floor',
  'cage',
  'often',
  'approxim',
  'floor',
  'area',
  'cage',
  'consequ',
  'pet',
  'owner',
  'homeown',
  'veterinarian',
  'laboratori',
  'personnel',
  'ad',
  'absorb',
  'materi',
  'litter',
  'box',


In [8]:
# create dictionary from corpus
dictionary = Dictionary(gensim_corpus)
# dictionary.filter_extremes(no_below=100, no_above=0.1)
dictionary

<gensim.corpora.dictionary.Dictionary at 0x23e995b8a30>

In [9]:
ldacorpus = [dictionary.doc2bow(doc) for doc in gensim_corpus]
ldacorpus[:3]

[[(0, 1),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 2),
  (21, 42),
  (22, 1),
  (23, 1),
  (24, 2),
  (25, 6),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 1),
  (31, 2),
  (32, 2),
  (33, 1),
  (34, 1),
  (35, 1),
  (36, 1),
  (37, 2),
  (38, 15),
  (39, 1),
  (40, 1),
  (41, 1),
  (42, 2),
  (43, 2),
  (44, 1),
  (45, 1),
  (46, 1),
  (47, 1),
  (48, 1),
  (49, 1),
  (50, 1),
  (51, 2),
  (52, 1),
  (53, 1),
  (54, 1),
  (55, 2),
  (56, 1),
  (57, 9),
  (58, 1),
  (59, 9),
  (60, 1),
  (61, 2),
  (62, 1),
  (63, 5),
  (64, 1),
  (65, 2),
  (66, 2),
  (67, 1),
  (68, 1),
  (69, 2),
  (70, 3),
  (71, 1),
  (72, 1),
  (73, 1),
  (74, 1),
  (75, 13),
  (76, 1),
  (77, 3),
  (78, 1),
  (79, 9),
  (80, 3),
  (81, 1),
  (82, 6),
  (83, 2),
  (84, 2),
  (85, 3),
  (86, 1),
  (87, 2),
  (88, 1),
  (89, 1),
  (90, 8),
  (91,

In [10]:
dictionary[100] # 100 unique id or index given to word liquid in the dictionary

'current'

In [11]:
# tfidf model
tfidfmodel = TfidfModel(ldacorpus)
tfidfmodel

<gensim.models.tfidfmodel.TfidfModel at 0x23eaf2aa5b0>

In [12]:
# transform to the format required by gensim lda function
tfidfmodel_corpus = tfidfmodel[ldacorpus]
tfidfmodel_corpus

<gensim.interfaces.TransformedCorpus at 0x23eaf4e4160>

In [13]:
# LDA model
number_topics = 20

# find chunksize to make about 200 updates
lda_gensim = LdaMulticore(tfidfmodel_corpus, 
                          id2word=dictionary,
                          num_topics=number_topics,
                          workers=min(4, multiprocessing.cpu_count()-1), # number of cores
                          passes=10, # training passes
                          chunksize=200, # no. of documents in each training chuck
                         )
lda_gensim

<gensim.models.ldamulticore.LdaMulticore at 0x23e995c64f0>

In [14]:
# Visualization

pyLDAvis.enable_notebook()

LDAvis_prepared = pyLDAvis.gensim_models.prepare(lda_gensim, ldacorpus, dictionary)#, mds='tsne')

pyLDAvis.save_html(LDAvis_prepared, 'LDAvis_tfidf__' + str(number_topics) +'.html')


In [17]:
IPython.display.HTML(filename= 'LDAvis_tfidf__' + str(number_topics) +'.html')

In [18]:
# Get the theta parameters for each patent
theta_array = []

for document in tfidfmodel_corpus:
    doc_topics = lda_gensim.get_document_topics(document, minimum_probability=0.0)
    topic_weights = [(topic_id, weight) for topic_id, weight in doc_topics]
    theta_array.append(topic_weights)


In [19]:
# Cosine similarity

'''
This code calculates the cosine similarity between each document and the previous 5, 10 and 20 documents, using the topics' weights of each doc.

The similarity ranges from -1 to 1 and larger numbers mean more similar.

Sources:

Paper: we calculate backward_cosine as the average cosine similarity 
between a focal patent and all patents filed in the five years before the 
focal patent.

https://medium.com/@dudsdu/topic-modelling-for-finding-similar-contracts-df00b3aea8b2

'''

# Initialize an empty dataframe to store the results
result_df = pd.DataFrame(columns=['avg_similarity_5', 'avg_similarity_10', 'avg_similarity_20'])

# Iterate over all the documents
for document_index in range(70):
    previous_documents = [i for i in range(document_index)]

    similarities = []

    for prev_index in previous_documents:
        similarity = gensim.matutils.cossim(theta_array[document_index], theta_array[prev_index])
        similarities.append(similarity)

    avg_similarity_5 = np.mean(similarities[:5])
    avg_similarity_10 = np.mean(similarities[:10])
    avg_similarity_20 = np.mean(similarities[:20])

    # Append the results to the dataframe
    result_df = result_df.append({'avg_similarity_5': avg_similarity_5,
                                  'avg_similarity_10': avg_similarity_10,
                                  'avg_similarity_20': avg_similarity_20},
                                 ignore_index=True)

result_df

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  result_df = result_df.append({'avg_similarity_5': avg_similarity_5,
  result_df = result_df.append({'avg_similarity_5': avg_similarity_5,
  result_df = result_df.append({'avg_similarity_5': avg_similarity_5,
  result_df = result_df.append({'avg_similarity_5': avg_similarity_5,
  result_df = result_df.append({'avg_similarity_5': avg_similarity_5,
  result_df = result_df.append({'avg_similarity_5': avg_similarity_5,
  result_df = result_df.append({'avg_similarity_5': avg_similarity_5,
  result_df = result_df.append({'avg_similarity_5': avg_similarity_5,
  result_df = result_df.append({'avg_similarity_5': avg_similarity_5,
  result_df = result_df.append({'avg_similarity_5': avg_similarity_5,
  result_df = result_df.append({'avg_similarity_5': avg_similarity_5,
  result_df = result_df.append({'avg_similarity_5': avg_similarity_5,
  result_df = result_df.append({'avg_similarity_5': avg_similarity_5,
  

Unnamed: 0,avg_similarity_5,avg_similarity_10,avg_similarity_20
0,,,
1,0.032421,0.032421,0.032421
2,0.113592,0.113592,0.113592
3,0.268313,0.268313,0.268313
4,0.206417,0.206417,0.206417
...,...,...,...
65,0.307446,0.352547,0.212911
66,0.098911,0.136296,0.157410
67,0.439814,0.293230,0.210477
68,0.308625,0.221798,0.203259


In [20]:
# Concatenate original df with new variables

new_df = pd.concat([merged_df,result_df], axis=1)
new_df 

Unnamed: 0,text,patent_id,ticker,numpat_total,issue_date,sector,clean_corpus,avg_similarity_5,avg_similarity_10,avg_similarity_20
0,FIELD OF THE INVENTION\r\n\r\nThe present inve...,7665418,CHD,395,2010-02-23,Consumer Staples,field invent present invent relat absorb compo...,,,
1,FIELD OF THE INVENTION\r\n\r\nThe present inve...,7723629,CHD,395,2010-05-25,Consumer Staples,field invent present invent relat field toothb...,0.032421,0.032421,0.032421
2,RELATED APPLICATIONS AND PATENTS\r\n\r\nThe pr...,7763454,CHD,395,2010-07-27,Consumer Staples,relat applic patent present invent relat u.s. ...,0.113592,0.113592,0.113592
3,FIELD OF THE INVENTION\r\n\r\nThe present inve...,7772578,CHD,395,2010-08-10,Consumer Staples,field invent present invent relat diagnost tes...,0.268313,0.268313,0.268313
4,FIELD OF THE INVENTION\r\n\r\nThe invention re...,7776618,CHD,395,2010-08-17,Consumer Staples,field invent invent relat diagnost assay analy...,0.206417,0.206417,0.206417
...,...,...,...,...,...,...,...,...,...,...
65,FIELD\r\n\r\nThe present invention relates to ...,9939385,CHD,395,2018-04-10,Consumer Staples,field present invent relat field analyt detect...,0.307446,0.352547,0.212911
66,FIELD OF THE INVENTION\r\n\r\nThis invention i...,9949916,CHD,395,2018-04-24,Consumer Staples,field invent invent direct non-irrit person lu...,0.098911,0.136296,0.157410
67,FIELD OF THE INVENTION\r\n\r\nThe invention re...,9970923,CHD,395,2018-05-15,Consumer Staples,field invent invent relat diagnost assay analy...,0.439814,0.293230,0.210477
68,BACKGROUND\r\n\r\nField of the Invention\r\n\r...,10001449,CHD,395,2018-06-19,Consumer Staples,background field invent present invent relat d...,0.308625,0.221798,0.203259


# Optimal number of topics

In [None]:
from gensim.models import CoherenceModel

In [None]:
# initialize list to save coherence values and lda model
coherence_values = []
lda_models = []

# for each value of n
for n in range(5, 21):

    # train the lda model with n topics
    model = LdaMulticore(tfidfmodel_corpus, 
                         id2word=dictionary,
                         num_topics=n,
                         workers=min(8, multiprocessing.cpu_count()-1), # number of cores
                         passes=10, # training passes
                         chunksize=200, # no. of documents in each training chuck
                         )
    # add the model to the corresponding initialized list
    lda_models.append(model)
    
    # create coherence model with the above trained lda model
    coherencemodel = CoherenceModel(model=model, texts=gensim_corpus, dictionary=dictionary, coherence='c_v')
    
    # add the value for n and the coherence to the corresponding initialized list
    coherence_val = coherencemodel.get_coherence()
    print(n, coherence_val)
    coherence_values.append( (n, coherence_val) )

In [None]:
plt.style.use('ggplot')
plt.plot(*zip(*coherence_values))
plt.title('Coherence Values for LDA models with Different Number of Topics\n')
plt.xlabel('Number of Topics')
plt.ylabel('Coherence')
plt.show()