<a href="https://colab.research.google.com/github/Elbereth-Elentari/Learning_machine/blob/master/Library_search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [68]:
import pandas as pd
import spacy
from spacy_langdetect import LanguageDetector
import pl_core_news_sm
import os
import random
import string
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium .webdriver import ChromeOptions
import re

In [55]:
def get_conditions():
    term = input('What would you like to read about?\n')
    term = term.replace(' ', '+')
    min_year = int(input('What is the oldest book you are interested in?\n'))
    min_length = 100
    max_length = 700
    max_books = int(input('How many books do you want?\n'))
    return term, min_year, min_length, max_length, max_books

In [45]:
def next_or_break(driver):
    try:
        next_button = driver.find_element_by_link_text('Następne>')
        next_button.click()
        return True
    except:
        return 'no next'

In [46]:
class Book:
    interesting_books = set()
    all_authors = set()
    authors_to_scrape = set()

    def __init__(self):
        self.title = ''
        self.author = ''
        self.publisher = ''
        self.year = 0
        self.pages = 0
        self.WD_signature = ''
        self.storage = ''

    def get_book_attributes(self, record):
        title = record.find_element_by_class_name('title').text
        self.title = re.sub(r' / .+', '', title)

        try:
            self.author = record.find_element_by_class_name('author').text
        except:
            pass

        if 'BUW Magazyn' in record.text:
            self.storage = 'magazyn'

        infos = record.find_elements_by_tag_name('tr')
        for info in infos:
            if 'Klasyfikacja WD' in info.text:
                self.WD_signature = info.find_element_by_tag_name('a').text
            elif 'Adres wyd.' in info.text:
                publisher_candidates = info.find_elements_by_tag_name('span')
                for publisher in publisher_candidates:
                    if publisher.get_attribute('class') != 'highlight':
                        publisher_with_colon = re.search(r'.+ : ?(.+),?.*(\d{4})', publisher.text)
                        if publisher_with_colon:
                            publisher = publisher_with_colon
                        else:
                            publisher = re.search(r'.+? (.+),?.*(\d{4})', publisher.text)
                        self.publisher = publisher.group(1)
                        self.year = int(publisher.group(2))
            elif 'Opis fiz.' in info.text:
                pages = info.find_element_by_tag_name('span').text
                pages = re.sub(r'\[.+?\]', '', pages)
                self.pages = int(re.search(r'\d+', pages).group(0))

    def check_quality(self, record):
        if self.year >= min_year and self.pages >= min_length and self.pages <= max_length:
            Book.interesting_books.add(self)
            if len(self.author) > 0 and self.author in Book.all_authors:
                author = record.find_element_by_class_name('author')
                Book.authors_to_scrape.add(author.get_attribute('href'))
            elif len(self.author) > 0:
                Book.all_authors.add(self.author)

In [75]:
def get_books(driver, expand_set):
    records = driver.find_element_by_class_name('records').find_elements_by_tag_name('li')
    for record in records:
        too_old = 'not too old'

        if ('BUW Wolny Dostęp' in record.text or 'BUW Magazyn' in record.text) and 'Adres wyd.' in record.text and 'Opis fiz.' in record.text:
            book = Book()
            book.get_book_attributes(record)
            if expand_set:
                book.check_quality(record)
            if book.year < min_year and book.year > 0:
                too_old = 'too old'
                break
    return too_old

In [48]:
def create_link_set(driver):
    links = set()
    while True:
        tags_from_page = driver.find_element_by_tag_name('tbody').find_elements_by_tag_name('a')
        for tag in tags_from_page:
            try:
                links.add(tag.get_attribute('href'))
            except:
                pass
        if next_or_break(driver) == 'no next': break
    return links

In [87]:
def get_books_from_links(link_set, expand_set=True):
    counter = 1
    for link in link_set:
        print('Progress:', counter/len(link_set))
        driver.get(link)
        select = Select(driver.find_element_by_id('search_sort'))
        select.select_by_value('5')
        too_old = get_books(driver, expand_set)
        while too_old == 'not too old':
            if next_or_break(driver) == 'no next': break
            too_old = get_books(driver, expand_set)
        counter += 1
    return True

In [88]:
term, min_year, min_length, max_length, max_books = get_conditions()

options = ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
prefs = {"profile.managed_default_content_settings.images": 2}
options.add_experimental_option("prefs", prefs)

driver = webdriver.Chrome('chromedriver', options=options)
driver.get(f'https://chamo.buw.uw.edu.pl/heading/search?match_1=MUST&field_1=heading&term_1={term}&facet_heading_type=subject&sort=heading')

print('Creating initial link set')
links = create_link_set(driver)
print(f'Scraping the {len(links)} initial links')
get_books_from_links(links)
print(f'Scraping {len(Book.authors_to_scrape)} interesting authors')
get_books_from_links(Book.authors_to_scrape, expand_set=False)
driver.close()

interesting_books = [{'title':book.title, 'author':book.author, 'publisher':book.publisher, 'year':book.year, 'pages':book.pages, 'WD_signature':book.WD_signature, 'storage':book.storage} for book in Book.interesting_books]

reading_list = pd.DataFrame(columns=['title','author', 'WD_signature', 'storage', 'publisher', 'year', 'pages'], data=interesting_books)
reading_list = deduplicate_books(reading_list)

output_file = f'/content/drive/My Drive/Library_search/data/{term}_reading_list.tsv'
reading_list[:min(len(reading_list), max_books)].to_csv(output_file, index=False, sep='\t')

What would you like to read about?
spowiedź
What is the oldest book you are interested in?
1990
How many books do you want?
500
Creating initial link set
Scraping the 41 initial links
Progress: 0.024390243902439025
Progress: 0.04878048780487805
Progress: 0.07317073170731707
Progress: 0.0975609756097561
Progress: 0.12195121951219512
Progress: 0.14634146341463414
Progress: 0.17073170731707318
Progress: 0.1951219512195122
Progress: 0.21951219512195122
Progress: 0.24390243902439024
Progress: 0.2682926829268293
Progress: 0.2926829268292683
Progress: 0.3170731707317073
Progress: 0.34146341463414637
Progress: 0.36585365853658536
Progress: 0.3902439024390244
Progress: 0.4146341463414634
Progress: 0.43902439024390244
Progress: 0.4634146341463415
Progress: 0.4878048780487805
Progress: 0.5121951219512195
Progress: 0.5365853658536586
Progress: 0.5609756097560976
Progress: 0.5853658536585366
Progress: 0.6097560975609756
Progress: 0.6341463414634146
Progress: 0.6585365853658537
Progress: 0.682926829

AttributeError: ignored

In [None]:
def merge_tsvs()
  cat = pd.DataFrame()
  data_path = '/content/drive/My Drive/Library_search/data/'
  for file in os.listdir(data_path):
      if file.endswith('.tsv'):
          print('Processing', file)
          c = pd.read_csv(data_path + file, sep='\t')
          cat = cat.append(c, ignore_index=True)
  cat.to_csv(data_path + 'Library_catalogue.tsv', sep='\t', index=False)
  return cat

In [22]:
def detect_language(df):
    nlp = spacy.load('en_core_web_sm')
    nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)

    language = []
    for index, doc in enumerate(nlp.pipe(df['title'].values, batch_size=1000)):
        if index % 100 == 0:
            print('Progress:', index/len(df['title']))
        if doc.is_parsed:
            if doc._.language['language'] == 'en':
                language.append('en')
            else:
                language.append('pl')
        else:
            language.append('')

    df['language'] = language
    for language in df.language.unique():
      sample = random.sample(list(df[df['language'] == language].index), 20)
      print(f'{language} sample')
      for row in df[df.index.isin(sample)].itertuples():
          print(row.title)
    return df

In [23]:
def deduplicate_books(df):
    df.drop_duplicates(inplace=True)
    df.sort_values(by='year', ascending=False, inplace=True)
    df.drop_duplicates(subset='title', inplace=True)
    return df

In [78]:
def preprocess(file):
    cat = pd.read_csv(file, sep='\t')
    cat = deduplicate_books(cat)
    cat = detect_language(cat)
    preprocessed_cat = pd.DataFrame()

    for lang in ['pl', 'en']:
        print(f'Preprocessing {lang}')
        if lang == 'en':
            nlp = spacy.load('en_core_web_sm')
        elif lang == 'pl':
            nlp = pl_core_news_sm.load()

        df = cat[cat['language'] == lang]
        stopwords = nlp.Defaults.stop_words

        preprocessed = []
        for index, doc in enumerate(nlp.pipe(df['title'].values, batch_size=200)):
            if index % 100 == 0:
                print('Progress:', index/len(df))
            if doc.is_parsed:
                tokens = [token.lemma_.lower() for token in doc if (token.text.lower() not in stopwords and token.text not in string.punctuation)]
                if len(tokens) > 0:
                    preprocessed.append(tokens)
                else:
                    preprocessed.append('preprocessing_fail')
            else:
                print('Preprocessing failed')
                preprocessed.append('preprocessing_fail')

        df['tokens'] = preprocessed
        df = df[df['tokens'] != 'preprocessing_fail']
        preprocessed_cat = preprocessed_cat.append(df)
    return preprocessed_cat

In [None]:
preprocessed_cat = preprocess('/content/drive/My Drive/Library_search/data/Library_catalogue.tsv')
preprocessed_cat.to_csv(f'/content/drive/My Drive/Library_search/data/Preprocessed_catalogue.tsv', index=False, sep='\t')

In [None]:
tag = preprocess(f'/content/drive/My Drive/Library_search/data/{term}_reading_list.tsv')
tag_en = tag[tag['language'] == 'en']
tag_en['tokens'] = tag_en['tokens'].apply(lambda x: x+[term])
en = preprocessed_cat[preprocessed_cat['language'] == 'en']
for index, row in tag_en.iterrows():
    en.drop(en[en['title'] == row['title']].index, inplace=True)

In [None]:
nlp = spacy.load('en_core_web_sm')
topic = ' '.join([' '.join(tokens) for tokens in tag_en['tokens']])
topic = nlp(topic)

In [None]:
similarity = []
for index, doc in enumerate(nlp.pipe(en['title'].values, batch_size=200)):
    if index % 100 == 0:
        print('Progress:', index/len(en))
    if doc.is_parsed:
        similarity.append(topic.similarity(doc))
    else:
        print('Similarity failed')
        preprocessed.append('similarity_fail')

In [None]:
en['similarity'] = similarity
en.sort_values(by='similarity', inplace=True, ascending=False)
for row in en[:5].itertuples():
    print(row.title, row.similarity)