<h1>Document Similarity using LSI</h1>

In [1]:
def get_musicians(url):
    """Scrape a Wikipedia list page for links to musician articles.

    Every ``<li>`` element that has no ``id`` attribute is inspected, and the
    first ``<a>`` inside it is taken as the candidate link.

    Args:
        url: Address of the list page to download.

    Returns:
        A list of ``(name, article_url)`` tuples. ``name`` is the anchor text
        and ``article_url`` is ``"https://en.wikipedia.org"`` followed by the
        anchor's site-relative ``/wiki/...`` path.

    Raises:
        Whatever ``requests.get`` raises when the page cannot be downloaded;
        the download itself is not guarded.
    """
    from bs4 import BeautifulSoup
    import requests
    page_soup = BeautifulSoup(requests.get(url).content, 'lxml')
    all_musicians = []
    for tag in page_soup.find_all('li'):
        # List items that carry an id are skipped (presumably page chrome
        # rather than list entries -- confirm against the page markup).
        if tag.get('id'):
            continue

        anchor = tag.find('a')
        if anchor is None:
            # Plain-text list item: nothing to follow.
            continue

        link = anchor.get('href')
        # Only site-relative article paths are kept. An absolute or foreign
        # link that merely contains "/wiki/" would produce a malformed URL
        # once the host prefix is prepended below.
        if not link or not link.startswith('/wiki/'):
            continue

        if non_musician_finder(link):
            all_musicians.append((anchor.get_text(), "https://en.wikipedia.org" + link))
    return all_musicians

def non_musician_finder(link):
    """Report whether ``link`` is free of the housekeeping marker words.

    The check is a plain, case-sensitive substring search: a link is rejected
    as soon as any marker word occurs anywhere inside it.

    Args:
        link: The href string to inspect.

    Returns:
        True when none of the marker words occur in ``link``, False otherwise.
    """
    markers = ('Category', 'Template', 'Portal', 'List', 'File',
               'Special', 'Main', 'Help', 'User')
    return not any(marker in link for marker in markers)

In [2]:
# Sample list page for trying get_musicians by hand; the call itself is
# left commented out.
url = "https://en.wikipedia.org/wiki/List_of_alternative_country_musicians"
#get_musicians(url)

In [3]:
def get_musician_text(url, timeout=30):
    """Download a page and return the text of all its ``<p>`` elements.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up, so that a
            single unresponsive host cannot stall a long scraping loop.

    Returns:
        The text of every paragraph concatenated without a separator, or
        ``None`` when the page could not be downloaded or parsed (including
        a non-success HTTP status).
    """
    from bs4 import BeautifulSoup
    import requests
    try:
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx responses as failures so the body of an error page is
        # never returned as if it were the article text.
        response.raise_for_status()
        page_soup = BeautifulSoup(response.content, 'lxml')
        return ''.join(p_tag.get_text() for p_tag in page_soup.find_all('p'))
    except Exception:
        # Best effort: callers skip pages that yield None. Catching Exception
        # rather than using a bare except lets KeyboardInterrupt/SystemExit
        # propagate, so a long-running scrape can still be interrupted.
        return None


In [4]:
# Sample article page for trying get_musician_text by hand; the call itself
# is left commented out.
url = "https://en.wikipedia.org/wiki/Jim_Morrison"
#get_musician_text(url)

In [5]:
def get_all_musicians(genre_list):
    """Collect the musicians from the Wikipedia list page of each genre.

    Args:
        genre_list: Iterable of page-name suffixes; each one is appended to
            ``https://en.wikipedia.org/wiki/List_of_`` to form the page URL.

    Returns:
        One flat list holding the results of ``get_musicians`` for every
        genre, in the order the genres were given.
    """
    list_page_prefix = 'https://en.wikipedia.org/wiki/List_of_'
    return [
        musician
        for genre in genre_list
        for musician in get_musicians(list_page_prefix + genre)
    ]

In [6]:
# Page-name suffixes; get_all_musicians appends each one to
# "https://en.wikipedia.org/wiki/List_of_" to form the list-page URL.
# NOTE(review): 'bluegrass_musicians#G' carries a '#G' fragment; presumably
# the whole page is still downloaded -- confirm the fragment is intended.
genre_list = ['bluegrass_musicians#G','British_blues_musicians','country_blues_musicians','emo_artists','alternative_country_musicians']
all_musicians = get_all_musicians(genre_list)
# Show the first five (name, url) pairs.
all_musicians[:5]

[('Tom Adams', 'https://en.wikipedia.org/wiki/Tom_Adams_(bluegrass_musician)'),
 ('Eddie Adcock', 'https://en.wikipedia.org/wiki/Eddie_Adcock'),
 ('David "Stringbean" Akeman',
  'https://en.wikipedia.org/wiki/David_%22Stringbean%22_Akeman'),
 ('Red Allen', 'https://en.wikipedia.org/wiki/Red_Allen_(bluegrass)'),
 ('Darol Anger', 'https://en.wikipedia.org/wiki/Darol_Anger')]

In [7]:
def get_all_musician_docs(all_musicians):
    """Fetch the article text for each musician, dropping failed downloads.

    Args:
        all_musicians: Iterable of sequences whose first item is the
            musician's name and whose second item is the article URL.

    Returns:
        A ``(musician_names, musician_texts)`` tuple of two parallel lists.
        A musician appears in both lists only when ``get_musician_text``
        returned something other than ``None`` for its URL.
    """
    musician_names = []
    musician_texts = []
    for musician in all_musicians:
        name = musician[0]
        url = musician[1]

        text = get_musician_text(url)
        # None signals a failed download. An empty string is still a valid
        # (if empty) document and is kept so both lists stay aligned.
        if text is not None:
            musician_names.append(name)
            musician_texts.append(text)
    return musician_names, musician_texts
        

In [8]:
# Fetch the article text for every scraped musician. Entries whose text came
# back as None are dropped, so the two lists stay index-aligned.
reference_names,reference_docs = get_all_musician_docs(all_musicians)

In [9]:
#Code for LSI model goes here
from gensim.similarities.docsim import Similarity
from gensim import corpora, models, similarities
from gensim.parsing.preprocessing import STOPWORDS

# Tokenise each reference document: lowercase, split on whitespace, and keep
# only tokens that are not stopwords and are purely alphanumeric. A word with
# punctuation attached (e.g. "music,") fails isalnum() and is dropped.
texts = [[word for word in document.lower().split()
        if word not in STOPWORDS and word.isalnum()]
        for document in reference_docs]

# Mapping between each distinct token and an integer word id.
dictionary = corpora.Dictionary(texts)
# Bag-of-words form of each text: a list of (word_id, frequency) pairs.
corpus = [dictionary.doc2bow(text) for text in texts]

# LSI model fitted on the bag-of-words corpus with 5 topics.
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=5)

In [10]:
# Scrape the musicians that will be compared against the reference corpus.
# NOTE(review): this rebinds `all_musicians`, which previously held the
# reference musicians; re-running the reference-docs cell after this one
# would therefore fetch these artists instead.
musician_genre_list = ['acid_rock_artists']
all_musicians = get_all_musicians(musician_genre_list)
musician_names,musician_docs = get_all_musician_docs(all_musicians)

In [11]:
import pandas as pd

# Build the similarity index over the LSI-projected reference corpus once.
# It does not depend on the query document, so rebuilding it on every loop
# iteration only repeats the same work.
index_fun = similarities.MatrixSimilarity(lsi[corpus])

table_data = list()
for index,musician in enumerate(musician_docs):
    # Project the query document into the LSI space. Tokens missing from
    # `dictionary` are ignored by doc2bow.
    vec_bow = dictionary.doc2bow(musician.lower().split())
    vec_lsi = lsi[vec_bow]

    # Similarity of the query against every reference document, ordered from
    # most to least similar as (reference_index, similarity) pairs.
    sims = index_fun[vec_lsi]
    sims = sorted(enumerate(sims), key=lambda item: -item[1])

    most_similar_musician = sims[0][0]
    table_data.append((musician_names[index],reference_names[most_similar_musician]))

# One row per query musician, paired with its closest reference musician.
df = pd.DataFrame(table_data, columns=['Musician', 'Most Similar Musician'])

In [12]:
# Set pandas' display limits to None so the table below is rendered in full
# (every row and column, untruncated cell text). These options stay in
# effect for later cells as well.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)
df

Unnamed: 0,Musician,Most Similar Musician
0,The 13th Floor Elevators,Fragile Rock
1,Alice Cooper,The Spill Canvas
2,The Amboy Dukes,Joan of Arc
3,Amon Düül,Water Liars
4,Big Brother and the Holding Company,The Pretty Things
5,Black Sabbath,The Anniversary
6,Blue Cheer,Jeff Beck Group
7,Blues Magoos,Pink Anderson
8,The Charlatans,Drive Like Jehu
9,Count Five,Tommy Ramone
