In [1]:
import re
import spacy
from gensim import corpora
# importing stop words
from spacy.lang.en.stop_words import STOP_WORDS
from nltk.corpus import stopwords

In [2]:
# loading tokenizer for english language
spacy_nlp_eng = spacy.load('en_core_web_sm')
spacy_nlp_esp = spacy.load('es_core_news_md')
# create list of punctuations and stopwords
stop_words = spacy.lang.en.stop_words.STOP_WORDS
spanish_stopwords = set(stopwords.words('spanish'))

In [3]:
def tokenizer(sentence, lang): 
    # remove distracting single quotes
    sentence = re.sub('\'','',sentence)
    # remove digits and words containing digits
    sentence = re.sub('\w*\d\w*','',sentence)
    # replace extra spaces with single space
    sentence = re.sub(' +',' ',sentence)
    # remove unwanted lines starting from special characters
    sentence = re.sub(r'\n: \'\'.*','',sentence)
    sentence = re.sub(r'\n!.*','',sentence)
    sentence = re.sub(r'^:\'\'.*','',sentence)    
    # remove non-breaking new line characters
    sentence = re.sub(r'\n',' ',sentence)    
    # remove punctuations
    sentence = re.sub(r'[^\w\s]',' ',sentence)
    
    # creating token object
    if lang == "eng":
        tokens = spacy_nlp_eng(sentence)
    elif lang == "esp":
        tokens = spacy_nlp_esp(sentence)
    # lower, strip and lemmatize
    tokens = [word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in tokens]    
    # remove stopwords, and exclude words less than 2 characters
    tokens = [word for word in tokens if word not in stop_words and len(word) > 2]
    
    return tokens

In [4]:
text_eng = "The track features lead singer Lauren Mayberry recalling horrific statements made to her by men — He said, You need to be fed, but keep an eye on your waistline — over looming synthesizers. The track was made in quarantine, while Mayberry and Martin Doherty were in Los Angeles and Iain Cook was in Glasgow, Scotland."

In [5]:
text_eng

'The track features lead singer Lauren Mayberry recalling horrific statements made to her by men — He said, You need to be fed, but keep an eye on your waistline — over looming synthesizers. The track was made in quarantine, while Mayberry and Martin Doherty were in Los Angeles and Iain Cook was in Glasgow, Scotland.'

In [6]:
text_esp = "El trío de pop electrónico de Glasgow formado por Lauren Mayberry, Lan Cook y Martin Doherty estrena su flamante nuevo single He Said, She Said. Con este reivindicativo himno synth pop, CHVRCHES regresan a los ávidos oídos de sus fans, retomando su actividad tras casi dos años de silencio."

In [7]:
text_esp

'El trío de pop electrónico de Glasgow formado por Lauren Mayberry, Lan Cook y Martin Doherty estrena su flamante nuevo single He Said, She Said. Con este reivindicativo himno synth pop, CHVRCHES regresan a los ávidos oídos de sus fans, retomando su actividad tras casi dos años de silencio.'

In [8]:
tokens_eng = tokenizer(text_eng, lang = "eng")
tokens_eng

['track',
 'feature',
 'lead',
 'singer',
 'lauren',
 'mayberry',
 'recall',
 'horrific',
 'statement',
 'man',
 'need',
 'feed',
 'eye',
 'waistline',
 'loom',
 'synthesizer',
 'track',
 'quarantine',
 'mayberry',
 'martin',
 'doherty',
 'los',
 'angeles',
 'iain',
 'cook',
 'glasgow',
 'scotland']

In [9]:
tokens_esp = tokenizer(text_esp, lang = "esp")
tokens_esp

['triar',
 'pop',
 'electrónico',
 'glasgow',
 'formar',
 'por',
 'lauren',
 'mayberry',
 'lan',
 'cook',
 'martin',
 'doherty',
 'estrenar',
 'flamante',
 'nuevo',
 'singlar',
 'said',
 'said',
 'con',
 'este',
 'reivindicativo',
 'himno',
 'synth',
 'pop',
 'chvrches',
 'regresar',
 'ávido',
 'oír',
 'fan',
 'retomar',
 'actividad',
 'tras',
 'casi',
 'año',
 'silenciar']

In [10]:
def generate_corpus(tokens, save_dict = False):
    # create dictionary
    dictionary = corpora.Dictionary()
    # corpus creation 
    corpus = [dictionary.doc2bow(doc, allow_update=True) for doc in tokens]
    # calcalating words frequency
    word_frequency = [[(dictionary[id], count) for id, count in doc] for doc in corpus]

    # saving dictionary and corpus
    if save_dict == True:
        dictionary.save('dictionary.dict')  
        corpora.MmCorpus.serialize('corpus.mm', corpus)

    return dictionary, corpus, word_frequency

In [11]:
dictionary_eng, corpus_eng, word_frequency_eng = generate_corpus([tokens_eng])

In [12]:
dictionary_esp, corpus_esp, word_frequency_esp = generate_corpus([tokens_esp])

In [13]:
import gensim
from gensim.similarities import MatrixSimilarity

In [14]:
def create_models(dictionary, corpus):
    # load tfidf model
    tfidf_model = gensim.models.TfidfModel(corpus, id2word=dictionary)
    # load lsi model and extract corpus to be used in matrixsimilarity function
    lsi_model = gensim.models.LsiModel(tfidf_model[corpus], id2word=dictionary, num_topics=300)
    gensim.corpora.MmCorpus.serialize('lsi_model.mm',lsi_model[tfidf_model[corpus]])  
    lsi_corpus = gensim.corpora.MmCorpus('lsi_model.mm')
    # similarity indexes
    index = MatrixSimilarity(lsi_corpus, num_features = lsi_corpus.num_terms)

    return tfidf_model, lsi_model, index 

In [15]:
def search_similar_words(search_term, dictionary, tfidf_model, lsi_model, index):
    # transform search terms to tokens and create the corpus
    query_bow = [dictionary.doc2bow(cleansing.tokenizer(search_term))]
    # apply tfidf model to the corpus
    query_tfidf = tfidf_model[query_bow]
    # apply lsi model to the tfidf
    query_lsi = lsi_model[query_tfidf]
    # number of elements with the maximum relevance
    index.num_best = 5
    
    return index[query_lsi]

In [16]:
tfidf_model_eng, lsi_model_eng, index_eng = create_models(dictionary_eng, corpus_eng)

  rel_spectrum = np.abs(1.0 - np.cumsum(s / np.sum(s)))
  small = 1 + len(np.where(rel_spectrum > min(discard, 1.0 / k))[0])


ValueError: cannot index a corpus with zero features (you must specify either `num_features` or a non-empty corpus in the constructor)