In [1]:
from collections import defaultdict
from gensim import corpora
from gensim.parsing.preprocessing import remove_stopwords
import numpy as np
import os
import pandas as pd

In [2]:
#Create a document, which is in list this case
row_doc = ["Analysts had forecast that “Cyberpunk” would sell anywhere from 18 million to 25 million copies in its first month.",
           "Cyberpunk is the biggest and most expensive project yet for the more than two-decade-old videogame company.",
           "It is unusual for game publishers to suggest players seek refunds for their big-budget releases."
           "Several other prominent developers postponed games this year, citing work-from-home challenges caused by the health crisis as a key factor.",
           "The company has only one other major franchise, The Witcher, and its last major release is now five years old."
]
print(row_doc)

#While in str-like objects in lsi data processing, it would be more convenient to use pd.Series data 
#to call relevenat processing function as shwon in follwoing codes

['Analysts had forecast that “Cyberpunk” would sell anywhere from 18 million to 25 million copies in its first month.',
 'Cyberpunk is the biggest and most expensive project yet for the more than two-decade-old videogame company.',
 'It is unusual for game publishers to suggest players seek refunds for their big-budget releases.Several other prominent developers postponed games this year, citing work-from-home challenges caused by the health crisis as a key factor.',
 'The company has only one other major franchise, The Witcher, and its last major release is now five years old.']

In [6]:
print(type(row_doc[0]))

<class 'str'>


Data Preprocessing:
Some approaches

In [90]:
#1. Process the row document
#We consider series object to make it easily processible:
row_doc_ser = pd.Series(row_doc)

#1a. Approach 1: Step by step 
#replace "," with "" so that no comma points ever in the documents
row_doc_ser.replace(",", "")

#Convert string element in the df/series in to lower cases
doc_lower = []
for i in range(len(row_doc_ser)):
    a = row_doc_ser[i].lower()
    doc_lower.append(a)
#print(doc_lower)

doc_vector = []
for k in range(len(doc_lower)):
    doc_vector.append(remove_stopwords(doc_lower[k]))
#print(doc_vector)

doc_vector_split = []
for m in range(len(doc_vector)):
    doc_vector_split.append(doc_vector[m].split())
    
print(row_doc_ser[0])
print(doc_vector_split[0])
print('--------------------------------------------------------')
print(doc_vector_split)

Analysts had forecast that “Cyberpunk” would sell anywhere from 18 million to 25 million copies in its first month.
['analysts', 'forecast', '“cyberpunk”', 'sell', '18', 'million', '25', 'million', 'copies', 'month.']
--------------------------------------------------------
[['analysts', 'forecast', '“cyberpunk”', 'sell', '18', 'million', '25', 'million', 'copies', 'month.'], ['cyberpunk', 'biggest', 'expensive', 'project', 'two-decade-old', 'videogame', 'company.'], ['unusual', 'game', 'publishers', 'suggest', 'players', 'seek', 'refunds', 'big-budget', 'releases.several', 'prominent', 'developers', 'postponed', 'games', 'year,', 'citing', 'work-from-home', 'challenges', 'caused', 'health', 'crisis', 'key', 'factor.'], ['company', 'major', 'franchise,', 'witcher,', 'major', 'release', 'years', 'old.']]


In [99]:
#1b. Approach 2
#Due to repeated operations on each unique string element in the doc, the codes can be shortened in a single function
#Then apply the defined function at one time to obtain final cleansed results

#Define a function for cleansing one single element/document 
def processing_document(element):
    processed_element = remove_stopwords(element).lower().replace(",", "")
    processed_element = processed_element.split()
    return processed_element

doc_vector_split = []
for element in row_doc_ser:
    a = processing_document(element)
    doc_vector_split.append(a)

#print(row_doc_ser[0])
#print(doc_vector_split[0])
print(doc_vector_split)

[['analysts', 'forecast', '“cyberpunk”', 'sell', '18', 'million', '25', 'million', 'copies', 'month.'], ['cyberpunk', 'biggest', 'expensive', 'project', 'two-decade-old', 'videogame', 'company.'], ['it', 'unusual', 'game', 'publishers', 'suggest', 'players', 'seek', 'refunds', 'big-budget', 'releases.several', 'prominent', 'developers', 'postponed', 'games', 'year', 'citing', 'work-from-home', 'challenges', 'caused', 'health', 'crisis', 'key', 'factor.'], ['the', 'company', 'major', 'franchise', 'the', 'witcher', 'major', 'release', 'years', 'old.']]


In [93]:
#Approach 3: Simplified the process of creating final vectors
def process_document(document):

    #Remove stopwords, convert to lower case and remove "?" character
    cleaned_document = remove_stopwords(document.lower()).replace("?","")  
    return cleaned_document.split()
    #print(cleaned_document)
    #print(type(cleaned_document))
    #print(len(cleaned_document))

#Create a document vector (in form of list)
doc_vectors=[process_document(document)
             for document in row_doc]

#print(row_doc[0])
#print(doc_vectors[0])
print(doc_vectors)

[['analysts', 'forecast', '“cyberpunk”', 'sell', '18', 'million', '25', 'million', 'copies', 'month.'], ['cyberpunk', 'biggest', 'expensive', 'project', 'two-decade-old', 'videogame', 'company.'], ['unusual', 'game', 'publishers', 'suggest', 'players', 'seek', 'refunds', 'big-budget', 'releases.several', 'prominent', 'developers', 'postponed', 'games', 'year,', 'citing', 'work-from-home', 'challenges', 'caused', 'health', 'crisis', 'key', 'factor.'], ['company', 'major', 'franchise,', 'witcher,', 'major', 'release', 'years', 'old.']]


Convert into corpora dictionary

In [102]:
corpora_dict = corpora.Dictionary(doc_vector_split)
print(corpora_dict)

print('Inspect words and corresponding id')
print(corpora_dict.token2id)

Dictionary(47 unique tokens: ['18', '25', 'analysts', 'copies', 'forecast']...)
Inspect words and corresponding id
{'18': 0, '25': 1, 'analysts': 2, 'copies': 3, 'forecast': 4, 'million': 5, 'month.': 6, 'sell': 7, '“cyberpunk”': 8, 'biggest': 9, 'company.': 10, 'cyberpunk': 11, 'expensive': 12, 'project': 13, 'two-decade-old': 14, 'videogame': 15, 'big-budget': 16, 'caused': 17, 'challenges': 18, 'citing': 19, 'crisis': 20, 'developers': 21, 'factor.': 22, 'game': 23, 'games': 24, 'health': 25, 'it': 26, 'key': 27, 'players': 28, 'postponed': 29, 'prominent': 30, 'publishers': 31, 'refunds': 32, 'releases.several': 33, 'seek': 34, 'suggest': 35, 'unusual': 36, 'work-from-home': 37, 'year': 38, 'company': 39, 'franchise': 40, 'major': 41, 'old.': 42, 'release': 43, 'the': 44, 'witcher': 45, 'years': 46}


In [107]:
#Create corpus
corpus = []
for element in doc_vector_split:
    a = corpora_dict.doc2bow(element)
    corpus.append(a)
    
#print(corpora)
print(doc_vector_split[0]) 
print(corpus[0])
#The former digit shows the indexed position the words occur in the element(document); the latter, the total amount they occur

['analysts', 'forecast', '“cyberpunk”', 'sell', '18', 'million', '25', 'million', 'copies', 'month.']
[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 2), (6, 1), (7, 1), (8, 1)]


Build LSI model

In [141]:
from gensim import models, similarities
from gensim.similarities import Similarity

#Model
lsi_model = models.LsiModel(corpus, id2word = corpora_dict)

#Similarity index: Cosine similarity in this case
#sim_index = Similarity('-Similarity Index', corpus = corpus, num_features = len(corpora_dict))
sim_index =  similarities.MatrixSimilarity(corpus)
print(sim_index[lsi_model[corpus]])
print(sim_index)

'''
for ind in sim_index:
    print(ind)
'''

[[ 0.28867513  0.          0.          0.        ]
 [-0.28867513  0.          0.          0.        ]
 [ 0.28867513  0.          0.          0.        ]
 [-0.28867513  0.          0.          0.        ]]
MatrixSimilarity<4 docs, 47 features>


'\nfor ind in sim_index:\n    print(ind)\n'

In [118]:
print(lsi_model[corpus])
print(sim_index)
print(type(sim_index))

<gensim.interfaces.TransformedCorpus object at 0x0000023660F30A48>
MatrixSimilarity<4 docs, 47 features>
<class 'gensim.similarities.docsim.MatrixSimilarity'>


Testing the new data

In [142]:
new_data = "Cyberpunk is the most amazing game ever in this year"#"Joe Biden is the projected winner of 2020 Presidential Election."

#Process the new data
new_data = new_data.split()
print(new_data)
new_corpus = [corpora_dict.doc2bow(new_data)]

#Create an LSI Representation
vec_lsi = lsi_model[new_corpus]  

print(sim_index[new_corpus])

['Cyberpunk', 'is', 'the', 'most', 'amazing', 'game', 'ever', 'in', 'this', 'year']
[[0.         0.         0.2407717  0.30860668]]


In [143]:
#sort an array in reverse order and get indexes
matches=np.argsort(sim_index)[::-1] 
print("Sorted Document index :", matches)

print("\n", "-"*60, "\n")

for i in matches:
    print(sim_index[i], " -> ", row_doc[i])

print("\n", "-"*60, "\n")

Sorted Document index : [[0 1 2 3]
 [0 1 3 2]
 [0 2 3 1]
 [1 2 3 0]]

 ------------------------------------------------------------ 



ValueError: shapes (4,47) and (4,) not aligned: 47 (dim 1) != 4 (dim 0)

In [23]:
for element in row_doc_df[0]:
    print(element)

Analysts had forecast that “Cyberpunk” would sell anywhere from 18 million to 25 million copies in its first month.
Cyberpunk is the biggest and most expensive project yet for the more than two-decade-old videogame company.
It is unusual for game publishers to suggest players seek refunds for their big-budget releases.Several other prominent developers postponed games this year, citing work-from-home challenges caused by the health crisis as a key factor.
The company has only one other major franchise, The Witcher, and its last major release is now five years old.


In [44]:
for element in row_doc_ser:
    print(element)

Analysts had forecast that “Cyberpunk” would sell anywhere from 18 million to 25 million copies in its first month.
Cyberpunk is the biggest and most expensive project yet for the more than two-decade-old videogame company.
It is unusual for game publishers to suggest players seek refunds for their big-budget releases.Several other prominent developers postponed games this year, citing work-from-home challenges caused by the health crisis as a key factor.
The company has only one other major franchise, The Witcher, and its last major release is now five years old.
