In [3]:
# Script to read books from https://www.libreriainternacional.com/Libros

import requests
from requests.auth import HTTPProxyAuth
import re

import pandas as pd
import json
import random

import traceback

import time

from bs4 import BeautifulSoup

from pathlib import Path
import concurrent.futures

import datetime

# Position in `proxylist` of the proxy that the next get_proxy() call hands out.
current_proxy = 0

def get_proxy():
    """Return the next proxy of `proxylist` in round-robin order.

    Returns:
        tuple: ``(proxies, auth)`` where ``proxies`` is a dict with the single
        key ``"http"`` mapped to ``"<ip>:<port_http>"`` and ``auth`` is an
        ``HTTPProxyAuth`` built from the row's ``login`` / ``password``.

    NOTE(review): only the "http" scheme is mapped while the endpoints used in
    this notebook are https:// URLs -- confirm the requests really go through
    the proxy.
    """
    global current_proxy
    row, current_proxy = current_proxy, (current_proxy + 1) % len(proxylist)

    host = proxylist['ip'][row]
    port = str(proxylist['port_http'][row])
    credentials = HTTPProxyAuth(proxylist['login'][row], proxylist['password'][row])

    return {"http": host + ':' + port}, credentials

proxylist = pd.read_csv('proxylist.csv', sep='\t')

In [None]:
# Download the whole catalogue listing, one page of PAGE_SIZE records per call,
# until the endpoint answers with an empty list or an HTTP error.
all_books = pd.DataFrame()
endpoint = "https://www.libreriainternacional.com/Libros/GetBooks"
skip = 0

PAGE_SIZE = 12  # value sent as "take"; also the step applied to "skip"
CATALOGUE_CSV = 'datasets/libreriainternacional.csv'

pages = []          # one DataFrame per downloaded page, concatenated once at the end
book_count = 0
crawl_failed = False

try:
    while True:
        body = {
            "AlphaFilterId": -1,
            "LanguageId": -1,
            "authorId": -1,
            "editorialId": -1,
            "skip": skip,
            "subthemeId": -1,
            "take": PAGE_SIZE,
            "themeId": -1
        }

        proxies, auth = get_proxy()
        res = requests.post(endpoint, json=body, proxies=proxies, auth=auth)

        if not res.ok:
            print('HTTP error:', res.status_code, res.content)
            break

        res_json = res.json()
        # Exit loop if no record found
        if len(res_json) == 0:
            break

        df = pd.json_normalize(res_json)
        pages.append(df)
        book_count += len(df)
        print('Total book number:', book_count)

        skip += PAGE_SIZE
        time.sleep(1)
except Exception as e:
    crawl_failed = True
    print('Exception', e)
    traceback.print_exc()
finally:
    # Persist whatever was collected, also when the crawl failed or was interrupted.
    if pages:
        all_books = pd.concat(pages, ignore_index=True)
    all_books.to_csv(CATALOGUE_CSV)

if not crawl_failed:
    print(len(all_books), 'BOOKS SAVED')


In [167]:
def isbn_check(isbn):
    """Normalise an ISBN to 13 digits with a freshly computed check digit.

    Args:
        isbn: ISBN as ``str``, ``int`` or ``float`` (subclasses such as numpy
            scalars included). Dashes and surrounding whitespace are ignored.
            ``None`` is treated as "no ISBN".

    Returns:
        str: the 13-digit ISBN, or ``''`` when the input is neither a 13-digit
        number nor a 10-character value whose first nine characters are digits.
        A 10-character input is prefixed with ``978``; its own check character
        is discarded. The check digit of the result is always recomputed, so a
        wrong check digit in the input is silently corrected.
    """
    if isbn is None:
        return ''
    if isinstance(isbn, float):
        # NaN/inf render as 'nan'/'inf' and are rejected by the digit test below.
        isbn = '{:.0f}'.format(isbn)
    elif not isinstance(isbn, str):
        isbn = str(isbn)

    # Remove dashes
    isbn = isbn.replace('-', '').strip()

    if len(isbn) == 10 and re.fullmatch(r'[0-9]+', isbn[0:9]) is not None:
        # The last digit is a placeholder; it is replaced by the computed check digit.
        isbn = '978' + isbn[0:9] + '0'

    if len(isbn) != 13 or re.fullmatch(r'[0-9]+', isbn) is None:
        return ''

    # ISBN-13 check digit: weights alternate 1, 3 over the first twelve digits.
    weighted_sum = sum(int(ch) * (3 if i % 2 else 1) for i, ch in enumerate(isbn[0:12]))
    check = (10 - weighted_sum % 10) % 10

    return isbn[0:12] + str(check)
    
proxylist = pd.read_csv('proxylist.csv', sep='\t')
# Row of `proxylist` that the next get_proxy() call will use.
current_proxy = 0

def get_proxy():
    """Hand out the proxies of `proxylist` one after another, wrapping around.

    Returns:
        tuple: ``(proxies, auth)``. ``proxies`` maps ``"http"`` to
        ``"<ip>:<port_http>"`` of the selected row; ``auth`` is the
        ``HTTPProxyAuth`` made from that row's ``login`` and ``password``.

    NOTE(review): no "https" entry is produced although the scraped URLs are
    https:// -- confirm that this is intended.
    """
    global current_proxy
    selected = current_proxy
    current_proxy = (selected + 1) % len(proxylist)

    address = proxylist['ip'][selected] + ':' + str(proxylist['port_http'][selected])
    login = proxylist['login'][selected]
    password = proxylist['password'][selected]

    return {"http": address}, HTTPProxyAuth(login, password)

In [58]:
# Scrape the detail page of every book collected from www.libreriainternacional.com
from bs4 import BeautifulSoup
from bs4 import NavigableString, Tag

def text_or_tag(e):
    """Return ``e.get_text()`` when `e` is a bs4 ``Tag``; otherwise return `e` unchanged."""
    return e.get_text() if isinstance(e, Tag) else e
        

DETAILS_CSV = 'datasets/libreriainternacional-details.csv'

# Output column -> text of the <span> label that precedes the value on the page.
FIELD_LABELS = {
    'author': 'Autor:',
    'publisher': 'Editorial:',
    'genre': 'Tema(s):',
    'year': 'Fecha de publicación:',
    'isbn': 'Código:',
    'language': 'Idioma:',
}


def _labelled_value(details, label):
    """Return the stripped text following the <span> whose text is `label`, or ''."""
    label_el = details.find('span', string=label)
    if label_el is None or label_el.next_sibling is None:
        return ''
    # The sibling is normally a text node; text_or_tag() also copes with an element.
    return text_or_tag(label_el.next_sibling).strip()


records = []  # one dict per scraped book; turned into a DataFrame in one go

for i in range(len(all_books)):
    endpoint = 'https://www.libreriainternacional.com/Product/Detail/%s' % all_books['id'][i]

    proxies, auth = get_proxy()
    res = requests.get(endpoint, proxies=proxies, auth=auth)

    if not res.ok:
        print('HTTP error:', res.status_code, res.content)
        continue

    soup = BeautifulSoup(res.content, 'html.parser')

    title, description = '', ''
    fields = dict.fromkeys(FIELD_LABELS, '')

    details = soup.find('div', class_='product-detail-box')
    if details is not None:
        title_el = details.find('h1', class_='product-title')
        if title_el is not None:
            title = title_el.text.strip()
        for column, label in FIELD_LABELS.items():
            fields[column] = _labelled_value(details, label)

    description_el = soup.find('div', class_='article-container style-1')
    if description_el is not None:
        # The description is everything that follows the "Autor: ..." heading.
        head_el = description_el.find('h4', string=re.compile(r'^Autor:.*$'))
        if head_el is not None:
            description = ''.join([text_or_tag(e) for e in head_el.next_siblings]).strip()

    records.append({'id': all_books['id'][i],
                    'image': 'https://www.libreriainternacional.com' + all_books['image'][i],
                    'author': fields['author'],
                    'publisher': fields['publisher'],
                    'genre': fields['genre'],
                    'year': fields['year'],
                    'title': title,
                    'isbn': fields['isbn'],
                    'language': fields['language'],
                    'description': description})
    time.sleep(0.2)

    # Checkpoint every 100 catalogue entries so an interrupted run keeps its results.
    if (i+1) % 100 == 0:
        print(i+1, 'books collected')
        pd.DataFrame(records).to_csv(DETAILS_CSV)

books = pd.DataFrame(records)
print(len(all_books), 'books collected')
books.to_csv(DETAILS_CSV)

books.head()

100 books collected
200 books collected
300 books collected
400 books collected
500 books collected
600 books collected
700 books collected
800 books collected
900 books collected
1000 books collected
1100 books collected
1200 books collected
1300 books collected
1400 books collected
1500 books collected
1600 books collected
1700 books collected
1800 books collected
1900 books collected
2000 books collected
2100 books collected
2200 books collected
2300 books collected
2400 books collected
2500 books collected
2600 books collected
2700 books collected
2800 books collected
2900 books collected
3000 books collected
3100 books collected
3200 books collected
3300 books collected
3400 books collected
3500 books collected
3600 books collected
3700 books collected
3800 books collected
3900 books collected
4000 books collected
4100 books collected
4200 books collected
4300 books collected
4400 books collected
4500 books collected
4600 books collected
4700 books collected
4800 books collected
4

Unnamed: 0,id,image,author,publisher,genre,year,title,isbn,language,description
0,181101,https://www.libreriainternacional.com/img/prod...,,EDAF,Espiritualidad,1900,LIBRO TIBETANO DE LOS MUERTOS,9788441401761,Español,"El Bardo-Thödol se atribuye a Padmasambhava, e..."
1,180445,https://www.libreriainternacional.com/img/prod...,,LO SCARABEO,Tarots,2014,TAROT PAMELA RIDER WAITE (EX175),9788883959110,,22 Arcanos Mayores y 56 Arcanos Menores con in...
2,178487,https://www.libreriainternacional.com/img/prod...,,INNER TRADITIONS,Tarots,2011,ORACULO MAYA,9781594773921,Español,Un instrumento práctico para aprovechar el pot...
3,178344,https://www.libreriainternacional.com/img/prod...,,GAIA EDICIONES,Nueva era,2012,CONSTELAR LA ENFERMEDAD DESDE LAS COMPRESIONES...,9788484454090,Español,Brigitte Champetier de Ribes abre un camino in...
4,176370,https://www.libreriainternacional.com/img/prod...,,HERDER,Diccionarios,1900,PROGRAMM EJERCICIOS ALEMÁN PARA HISPANOPARLANTES,9788425418594,Alemán / Español,Programm Ejercicios/Soluciones completa y desa...


In [1]:
# Topics of casadellibro.com: genre display name -> topic URL path.
# The scraping cells append the path to "https://www.casadellibro.com" and take
# the second-to-last path segment (e.g. "novela-negra") as the dataset file suffix.
# Entries that are commented out are not part of `genres`, so every loop over
# the dict skips them; uncomment an entry to include that topic.
genres = {
#'Arte': "/libros/arte/101000000",  
#'Autoayuda y Espiritualidad': "/libros/autoayuda-y-espiritualidad/102000000", 
#'Ciencias Humanas': "/libros/ciencias-humanas/104000000", 
#'Ciencias Políticas y Sociales': "/libros/ciencias-politicas-y-sociales/105000000",
#'Ciencias': "/libros/ciencias/103000000",
#'Cocina': "/libros/cocina/106000000",
#'Cómics manga infantil y juvenil': "/libros/comics-manga-infantil-y-juvenil/412000000",
#'Cómics': "/libros/comics/411000000", 
#'Deportes y juegos': "/libros/deportes-y-juegos/108000000",
#'Derecho': "/libros/derecho/109000000",
#'Economía': "/libros/economia/110000000", 
#'Empresa': "/libros/empresa/111000000", 
#'Filología': "/libros/filologia/112000000",
#'Fotografía': "/libros/fotografia/113000000",
#'Guías de viaje': "/libros/guias-de-viaje/114000000",
#'Historia': "/libros/historia/115000000", 
#'Idiomas': "/libros/idiomas/116000000", 
#'Infantil': "/libros/infantil/117000000",
#'Informática': "/libros/informatica/118000000",
#'Ingeniería': "/libros/ingenieria/119000000", 
#'Juvenil': "/libros/juvenil/117001014", 
#'Libros de Texto y Formación': "/libros/libros-de-texto-y-formacion/132000000", 
#'Libros latinoamericanos': "/libros/libros-latinoamericanos/417000000", 
#'Literatura': "/libros/literatura/121000000", 
#'Manualidades': "/libros/manualidades/122000000",
#'Medicina': "/libros/medicina/123000000", 
#'Música': "/libros/musica/124000000", 
'Narrativa histórica': "/libros/narrativa-historica/125000000", 
'Novela contemporánea': "/libros/novela-contemporanea/128000000", 
'Novela negra': "/libros/novela-negra/126000000", 
'Oposiciones': "/libros/oposiciones/129000000",
'Psicología y Pedagogía': "/libros/psicologia-y-pedagogia/130000000", 
'Romántica y erótica': "/libros/romantica-y-erotica/416000000", 
'Salud y Dietas': "/libros/salud-y-dietas/131000000" 
}


In [161]:
# Script to read the book listings of https://www.casadellibro.com, topic by topic
from pathlib import Path
import concurrent.futures

def scrap_page(endpoint):
    """Download one casadellibro.com listing page and extract its books.

    Args:
        endpoint: full URL of the listing page.

    Returns:
        tuple: ``(res, books)``. ``res`` is the ``requests`` response. ``books``
        is a DataFrame with the columns ``link``, ``image``, ``authors``
        (joined with ';'), ``title``, ``isbn`` and ``description`` -- one row
        per book found, an empty DataFrame when none is found -- or ``None``
        when the response status is not OK (including 404).
    """
    proxies, auth = get_proxy()
    res = requests.get(endpoint, proxies=proxies, auth=auth)

    if not res.ok:
        return res, None

    soup = BeautifulSoup(res.content, 'html.parser')

    # A book is a <div> whose class list is exactly ['row'] and that has an `index` attribute.
    books_els = soup.find_all(
        lambda tag: tag.name == 'div' and tag.get('class') == ['row'] and tag.get('index') is not None)

    records = []
    for b in books_els:
        authors, link, isbn, image, title, description = [], '', '', '', '', ''

        title_el = b.find('a', class_='title')
        if title_el is not None:
            title = title_el.get_text().strip()
            href = title_el.get('href')
            if href:
                link = 'https://www.casadellibro.com' + href

        authors = [a.get_text().strip() for a in b.find_all('a', class_='author')]

        image_el = b.find('img')
        if image_el is not None:
            image = image_el.get('data-src', '')
            # The image file is named after the ISBN; "defecto*" is the placeholder picture.
            isbn = Path(image).stem
            if isbn.startswith('defecto'):
                image = ''
                isbn = ''

        description_el = b.find('div', class_='short')
        if description_el is not None:
            description = description_el.get_text().strip()
            # Replace characters that cannot be encoded as UTF-8 so the CSV export cannot fail.
            description = description.encode('utf-8', 'replace').decode('utf-8')

        records.append({'link': link,
                        'image': image,
                        'authors': ';'.join(authors),
                        'title': title,
                        'isbn': isbn,
                        'description': description})

    return res, pd.DataFrame(records)
    

def scrap_topic(g):
    """Scrape every listing page of one genre and store the books as CSV.

    The run resumes after the pages already present in
    ``datasets/casadellibro_<topic>.csv`` (60 books per page are assumed for
    the resume position) and starts from scratch when the file does not exist.
    Scraping stops at the first 404 or after 10 consecutive failed pages.
    Failed pages are retried in up to three rounds; the pages that are still
    missing afterwards are stored in the module-level dict ``missing_pages[g]``
    (created when absent) for ``scrap_missing``.

    Args:
        g: key of `genres` naming the topic to scrape.
    """
    topic = genres[g]
    topic_name = topic.split('/')[-2]
    csv_path = 'datasets/casadellibro_%s.csv' % topic_name
    print(topic_name, ':', '>>>>>>>>>>>>>>>>>>>>>>>> START SCRAPPING: %s' % g)

    if Path(csv_path).exists():
        books = pd.read_csv(csv_path)
    else:
        books = pd.DataFrame()
    i = len(books) // 60 + 1

    # Pages are gathered here and merged into `books` only at checkpoints, which
    # avoids copying the whole (large) frame for every page.
    frames = [books]
    errors = 0
    missing = []

    while errors < 10:
        endpoint = "https://www.casadellibro.com%s/p%d" % (topic, i)
        res, rows = scrap_page(endpoint)

        if res.ok:
            errors = 0
            frames.append(rows)
        elif res.status_code == 404:
            print(topic_name, ':', 'TOPIC completed')
            break
        else:
            print(topic_name, ':', 'HTTP error [%d] on page %s' % (res.status_code, endpoint))
            errors += 1
            missing.append(i)

        if i % 500 == 0:
            books = pd.concat(frames, ignore_index=True)
            frames = [books]
            print(len(books), 'books collected')
            books.to_csv(csv_path)

        i += 1
        time.sleep(1.1)

    books = pd.concat(frames, ignore_index=True)
    frames = [books]
    print(topic_name, ':', len(books), 'books collected')
    books.to_csv(csv_path)
    print(topic_name, ':', 'Missing pages:', missing)

    for _ in range(3):
        # Quit loop if no missing pages
        if len(missing) == 0:
            break

        retry, missing = missing, []
        for page in retry:
            endpoint = "https://www.casadellibro.com%s/p%d" % (topic, page)
            res, rows = scrap_page(endpoint)
            if res.ok:
                frames.append(rows)
            elif res.status_code == 404:
                # Remaining pages of this round are abandoned.
                break
            else:
                missing.append(page)

    # Publish the pages that are really still missing.
    globals().setdefault('missing_pages', {})[g] = missing

    books = pd.concat(frames, ignore_index=True)
    print(topic_name, ':', len(books), 'books collected')
    books.to_csv(csv_path)
    print(topic_name, ':', 'Missing pages:', missing)

    return


def scrap_missing(g):
    """Retry the pages listed in ``missing_pages[g]`` and add them to the topic CSV.

    Up to three rounds are made. After every round ``missing_pages[g]`` is
    replaced by the pages that failed in that round; a 404 ends the round and
    abandons the pages not yet tried in it.

    Args:
        g: key of `genres` (and of `missing_pages`) naming the topic.
    """
    topic = genres[g]
    topic_name = topic.split('/')[-2]
    csv_path = 'datasets/casadellibro_%s.csv' % topic_name
    print(topic_name, ':', '>>>>>>>>>>>>>>>>>>>>>>>> MISSING PAGES: %s' % g)
    print(topic_name, ':', 'Missing pages:', missing_pages[g])

    books = pd.read_csv(csv_path)
    frames = [books]  # existing books followed by every recovered page

    for _ in range(3):
        # Quit loop if no missing pages
        if len(missing_pages[g]) == 0:
            break

        still_missing = []
        for page in missing_pages[g]:
            endpoint = "https://www.casadellibro.com%s/p%d" % (topic, page)
            res, rows = scrap_page(endpoint)
            if res.ok:
                frames.append(rows)
            elif res.status_code == 404:
                break
            else:
                still_missing.append(page)

        missing_pages[g] = still_missing

    books = pd.concat(frames, ignore_index=True)
    print(topic_name, ':', len(books), 'books collected')
    books.to_csv(csv_path)
    print(topic_name, ':', 'Missing pages:', missing_pages[g])

    return
  
# Registry of the pages that could not be scraped, keyed by genre. It is created
# only when absent so that re-running this cell keeps the result of earlier runs.
try:
    missing_pages
except NameError:
    missing_pages = {}

# Multi-threaded: one task per genre. The futures are inspected so that an
# exception raised inside a worker is reported instead of being lost.
with concurrent.futures.ThreadPoolExecutor(max_workers=6) as executor:
    futures = {executor.submit(scrap_topic, g): g for g in genres}
    for future in concurrent.futures.as_completed(futures):
        error = future.exception()
        if error is not None:
            print(futures[future], ':', 'FAILED:', repr(error))

# Optional second pass over the pages that are still missing:
#with concurrent.futures.ThreadPoolExecutor(max_workers=6) as executor:
#    executor.map(scrap_missing, [page for page in missing_pages if len(missing_pages[page]) > 0])
#print('<<<<<<<<<<<<<<< MISSING DONE', missing_pages)

#for m in missing_pages:
#    print('\'%s\': [%s],' % (m, ','.join([str(i) for i in missing_pages[m]])))

novela-contemporanea : >>>>>>>>>>>>>>>>>>>>>>>> START SCRAPPING: Novela contemporánea


  self._target(*self._args, **self._kwargs)


660054 books collected
689953 books collected
novela-contemporanea : TOPIC completed
novela-contemporanea : 712049 books collected
novela-contemporanea : Missing pages: []
novela-contemporanea : 712049 books collected
novela-contemporanea : Missing pages: []


In [163]:
# Consolidate the per-topic casadellibro.com CSV files into one frame.
topic_frames = []

for g in genres:
    topic = genres[g]
    topic_name = topic.split('/')[-2]
    books = pd.read_csv('datasets/casadellibro_%s.csv' % topic_name, usecols = ['link','image','authors','title','isbn','description'])
    print('Read %s = %d books' % (g, len(books)))

    topic_frames.append(books)

# One concatenation instead of re-copying the growing frame for every topic.
if topic_frames:
    all_books = pd.concat(topic_frames, ignore_index=True, sort=False)
else:
    all_books = pd.DataFrame()

print(len(all_books))

Read Arte = 87707 books
Read Autoayuda y Espiritualidad = 36387 books
Read Ciencias Humanas = 168693 books
Read Ciencias Políticas y Sociales = 103238 books
Read Ciencias = 19303 books
Read Cocina = 18306 books
Read Cómics manga infantil y juvenil = 9852 books
Read Cómics = 38518 books
Read Deportes y juegos = 28794 books


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Read Derecho = 85679 books
Read Economía = 42142 books
Read Empresa = 45850 books
Read Filología = 154907 books
Read Fotografía = 12326 books
Read Guías de viaje = 41133 books


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Read Historia = 411001 books
Read Idiomas = 69217 books
Read Infantil = 175599 books
Read Informática = 29364 books
Read Ingeniería = 25705 books
Read Juvenil = 55809 books
Read Libros de Texto y Formación = 114991 books
Read Libros latinoamericanos = 18 books


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Read Literatura = 243657 books
Read Manualidades = 6148 books
Read Medicina = 41288 books
Read Música = 20212 books
Read Narrativa histórica = 136174 books
Read Novela contemporánea = 712049 books
Read Novela negra = 26959 books
Read Oposiciones = 28308 books
Read Psicología y Pedagogía = 83475 books
Read Romántica y erótica = 9179 books
Read Salud y Dietas = 15624 books
3097612


In [3]:
def isbn_check(isbn, link):
    """Normalise an ISBN to 13 digits, falling back to the ISBN embedded in `link`.

    Args:
        isbn: ISBN as ``str``, ``int`` or ``float`` (subclasses such as numpy
            scalars included). ``None`` and NaN count as missing. A string
            ending in ``'.0'`` (a float that went through CSV) loses that suffix.
        link: book URL. When `isbn` is missing, the second-to-last path
            segment of the link is used, provided it starts with ``978``.

    Returns:
        str: the 13-digit ISBN with a recomputed check digit, or ``''`` when no
        13-digit number (or 10-character value with nine leading digits, which
        is prefixed with ``978``) can be obtained.
    """
    if isbn is None:
        isbn = ''
    elif isinstance(isbn, float):
        # NaN is the only float that differs from itself.
        isbn = '' if isbn != isbn else '{:.0f}'.format(isbn)
    elif not isinstance(isbn, str):
        isbn = str(isbn)

    if isbn[-2:] == '.0':
        isbn = isbn[:-2]

    # Restore ISBN from link
    if isbn == '':
        segments = link.split('/') if isinstance(link, str) else []
        # Links look like ".../<slug>/<isbn>/<id>"; shorter links carry no ISBN.
        candidate = segments[-2] if len(segments) >= 2 else ''
        isbn = candidate if candidate.startswith('978') else ''

    # Remove dashes
    isbn = isbn.replace('-', '').strip()

    if len(isbn) == 10 and re.fullmatch(r'[0-9]+', isbn[0:9]) is not None:
        # The last digit is a placeholder; it is replaced by the computed check digit.
        isbn = '978' + isbn[0:9] + '0'

    if len(isbn) != 13 or re.fullmatch(r'[0-9]+', isbn) is None:
        return ''

    # ISBN-13 check digit: weights alternate 1, 3 over the first twelve digits.
    weighted_sum = sum(int(ch) * (3 if i % 2 else 1) for i, ch in enumerate(isbn[0:12]))
    check = (10 - weighted_sum % 10) % 10

    return isbn[0:12] + str(check)

# Normalise the ISBNs of every topic file, drop the records without a valid ISBN
# and the duplicated ISBNs, and write the result back to the same file.
total_count = 0

for g in genres:
    topic = genres[g]
    topic_name = topic.split('/')[-2]
    csv_path = 'datasets/casadellibro_%s.csv' % topic_name
    b = pd.read_csv(csv_path, usecols = ['link','image','authors','title','isbn','description'])

    total = len(b)

    b = b.fillna(value='')
    # Plain iteration instead of a row-wise apply: also well-defined for an empty file.
    b['isbn'] = [isbn_check(isbn, link) for isbn, link in zip(b['isbn'], b['link'])]

    b = b[b['isbn'] != '']
    b = b.drop_duplicates('isbn')

    # An empty file has no meaningful percentage; report 0 instead of dividing by zero.
    kept_percent = 100 * len(b) // total if total else 0
    print(g, 'complete unique records', len(b), kept_percent, '%')

    total_count += len(b)

    b.to_csv(csv_path)

print('TOTAL BOOKS:', total_count)

Arte complete unique records 74971 85 %
Autoayuda y Espiritualidad complete unique records 34966 96 %
Ciencias Humanas complete unique records 155081 91 %
Ciencias Políticas y Sociales complete unique records 95482 92 %
Ciencias complete unique records 19126 99 %
Cocina complete unique records 16492 90 %
Cómics manga infantil y juvenil complete unique records 9025 91 %
Cómics complete unique records 30957 80 %
Deportes y juegos complete unique records 24091 83 %
Derecho complete unique records 74971 87 %
Economía complete unique records 38416 91 %
Empresa complete unique records 41994 91 %
Filología complete unique records 139202 89 %
Fotografía complete unique records 10953 88 %
Guías de viaje complete unique records 34143 83 %
Historia complete unique records 366560 89 %
Idiomas complete unique records 58160 84 %
Infantil complete unique records 153487 87 %
Informática complete unique records 24701 84 %
Ingeniería complete unique records 23129 89 %
Juvenil complete unique records 543

In [None]:
import datetime
from pathlib import Path
import concurrent.futures

def _meta_content(soup, prop):
    """Return the ``content`` of the ``<meta property=prop>`` element, or '' when absent."""
    meta = soup.find('meta', {'property': prop})
    if meta is None:
        return ''
    return meta.get('content', '')


def _labelled_div_text(soup, label):
    """Return the stripped text of the first <div> after the <div> whose text is `label`, or ''."""
    anchor = soup.find('div', string=label)
    if anchor is None:
        return ''
    value_div = anchor.find_next('div')
    if value_div is None:
        return ''
    return value_div.get_text().strip()


def _genre_and_subgenre(soup):
    """Return ``(genre, subgenre)`` read from the links after the "Escribe tu opinión" anchor."""
    anchor = soup.find('a', string='Escribe tu opinión')
    if anchor is None:
        return '', ''
    # The genre links are located in the second <div> that follows the anchor.
    container = anchor.find_next('div')
    if container is not None:
        container = container.find_next('div')
    if container is None:
        return '', ''
    genre_a = container.find('a')
    if genre_a is None:
        return '', ''
    subgenre_a = genre_a.find_next('a')
    subgenre = '' if subgenre_a is None else subgenre_a.get_text().strip()
    return genre_a.get_text().strip(), subgenre


def scrap_details(endpoint):
    """Scrape one casadellibro.com book page into the module-level `books` list.

    On success a dict with the keys ``genre``, ``subgenre``, ``authors``,
    ``link``, ``title``, ``isbn``, ``image``, ``year``, ``publisher``,
    ``language``, ``description`` and ``pages`` is appended to `books` and the
    module-level counter `count` is incremented. On an HTTP error a dict with
    ``code`` and ``link`` is appended to the module-level `errors` list.
    Fields that cannot be found on the page are stored as ``''``.

    Args:
        endpoint: URL of the book page.

    Returns:
        The ``requests`` response.
    """
    global count
    proxies, auth = get_proxy()
    res = requests.get(endpoint, proxies=proxies, auth=auth)

    if not res.ok:
        errors.append({'code': res.status_code,
                       'link': endpoint})
        print('\rRequest error %d for [%s]\n' % (res.status_code, endpoint), end="")
        return res

    soup = BeautifulSoup(res.content, 'html.parser')

    genre, subgenre = _genre_and_subgenre(soup)

    title = ''
    title_el = soup.find('title')
    if title_el is not None:
        # Keep the part of the <title> text that precedes the first '|'.
        title = title_el.get_text().strip().split('|')[0]

    author = _meta_content(soup, 'book:author')
    publisher = _labelled_div_text(soup, 'Editorial:')
    language = _labelled_div_text(soup, 'Idioma:')
    pages = _labelled_div_text(soup, 'Nº de páginas:')

    # Prefer the <meta> values and fall back to the labelled <div>s.
    isbn = _meta_content(soup, 'book:isbn') or _labelled_div_text(soup, 'ISBN:')
    year = _meta_content(soup, 'book:release_date')[:4] or _labelled_div_text(soup, 'Año de edición:')

    description = ''
    description_div = soup.find('div', class_ = 'resume-body')
    if description_div is not None:
        description = description_div.get_text().strip()
        # Replace characters that cannot be encoded as UTF-8 so the CSV export cannot fail.
        description = description.encode('utf-8', 'replace').decode('utf-8')

    image = ''
    image_el = soup.find('img', class_ = 'product-image')
    if image_el is not None:
        image = image_el.get('src', '')
        # "defecto*" is the placeholder picture: treat it as "no image".
        if Path(image).stem.startswith('defecto'):
            image = ''

    books.append({'genre': genre,
                  'subgenre': subgenre,
                  'authors': author,
                  'link': endpoint,
                  'title': title,
                  'isbn': isbn,
                  'image': image,
                  'year': year,
                  'publisher': publisher,
                  'language': language,
                  'description': description,
                  'pages': pages})

    # NOTE(review): `count` is updated from several worker threads without a lock,
    # so the displayed progress may be slightly off.
    print("\rBooks scanned: %d   " % count, end="")
    count += 1

    return res

    
# Links handed to the thread pool between two checkpoints of the CSV files.
BATCH_SIZE = 10000

for g in genres:
    topic = genres[g]
    topic_name = topic.split('/')[-2]

    print('\r%s %s\n' % (datetime.datetime.now().strftime('%H:%M:%S'), g), end="")

    df_links = pd.read_csv('datasets/casadellibro_%s.csv' % topic_name, usecols = ['link'])
    links = df_links['link'].tolist()
    print('\r %d books to scan\n' % len(links), end="")

    # Module-level collectors that scrap_details() fills for the current genre.
    books = []
    errors = []
    count = 0

    # range() yields no batch for an empty topic and no trailing empty batch
    # when the number of links is an exact multiple of BATCH_SIZE.
    for start in range(0, len(links), BATCH_SIZE):
        batch = links[start:start + BATCH_SIZE]

        # Leaving the `with` block waits for every task of the batch.
        with concurrent.futures.ThreadPoolExecutor(max_workers=25) as executor:
            futures = {executor.submit(scrap_details, link): link for link in batch}

        # Record pages whose scraping raised, instead of losing them silently.
        for future, link in futures.items():
            failure = future.exception()
            if failure is not None:
                errors.append({'code': type(failure).__name__, 'link': link})
                print('\rRequest failed (%r) for [%s]\n' % (failure, link), end="")

        time.sleep(5)

        print('\r %d details collected. %d %% errors\n' % (len(books), 100 * len(errors) // len(links)), end="")

        # Checkpoint: rewrite both files with everything collected so far.
        df_books = pd.DataFrame(books)
        df_errors = pd.DataFrame(errors)

        df_books.to_csv('datasets/details_casadellibro_%s.csv' % topic_name)
        df_errors.to_csv('datasets/errors_casadellibro_%s.csv' % topic_name)

03:27:25 Narrativa histórica
 127169 books to scan
Request error 410 for [https://www.casadellibro.com/libro-ramses-t1-le-fils-de-la-lumiere-le-temple-des-milions-dannees--la-bataille-de-kadesh/9782221087886/618035]
Request error 410 for [https://www.casadellibro.com/libro-ramses-v-sous-l-acacia-d-occident/9782266073394/619724]
Request error 410 for [https://www.casadellibro.com/libro-ramses-tiv-la-dame-d-abou-simbel/9782266073387/608007]
Request error 410 for [https://www.casadellibro.com/libro-le-grand-voyage-les-enfants-de-la-terre-4-1/9782266122153/844066]
Request error 410 for [https://www.casadellibro.com/libro-le-lit-d-alienor-ii/9782266126885/911538]
Request error 410 for [https://www.casadellibro.com/libro-ospiritual-selfies/9781640037779/9703270]
Request error 410 for [https://www.casadellibro.com/libro-dubliners/9781420961324/9715667]
Request error 410 for [https://www.casadellibro.com/libro-what-is-a-healthy-church/9781940009346/9714322]
Request error 410 for [https://www.c

Request error 410 for [https://www.casadellibro.com/libro-du-bois-dont-on-fait-les-pipes/9782265071513/803978]
Request error 410 for [https://www.casadellibro.com/libro-le-pharaon-noir/9782266106689/811417]
Request error 410 for [https://www.casadellibro.com/libro-la-neveu-damerique/9782864242222/604779]
Request error 410 for [https://www.casadellibro.com/libro-napoleon-le-chant-du-depart/9782266080552/739082]
Request error 410 for [https://www.casadellibro.com/libro-contes-damour-de-folie-et-de-mort/9782864243618/742763]
 9969 details collected. 0 % errors
Request error 410 for [https://www.casadellibro.com/libro-jacques-brel-j-attends-la-nuit/9782862748429/767055]
Request error 410 for [https://www.casadellibro.com/libro-le-quai-de-wigan/9782264030344/745670]
Request error 410 for [https://www.casadellibro.com/libro-la-louve-du-noirmont/9782266100335/745518]
Request error 410 for [https://www.casadellibro.com/libro-les-ailes-du-corbeau/9782264032997/775000]
Request error 410 for [htt

Request error 403 for [https://www.casadellibro.com/libro-correspondence-of-king-james-vi-of-scotland-with-sir-robert-cecil-and-others-in-england-during-the-reign-of-queen-elizabeth-with-an-appendix-containing-papers-illustrative-of-transactions-between-king-james-and-robert-earl-of-essex-principally-pub-fo/9789353807856/10016924]
 179911 details collected. 0 % errors
Request error 404 for [https://www.casadellibro.com/libro-/9789523572683/10112651]
Request error 404 for [https://www.casadellibro.com/libro-/9789523572690/10112652]
Request error 500 for [https://www.casadellibro.com/libro-the-prairie-schooner/9781010944447/10115187]
Request error 410 for [https://www.casadellibro.com/libro-xu-dong-vat---tan-truyen/9781726422277/7347195]
 189907 details collected. 0 % errors
 199907 details collected. 0 % errors
 209907 details collected. 0 % errors
 219906 details collected. 0 % errors
 229906 details collected. 0 % errors
 239906 details collected. 0 % errors
 249906 details collected.

Books scanned: 553959   

In [15]:
# Merge the request errors saved by an earlier run with the `errors` collected
# in the current session and store them as one file.
df_books = pd.read_csv('datasets/errors_casadellibro_literatura-1.csv', usecols = ['code','link'])
df_add = pd.DataFrame.from_dict(errors)

df_books = pd.concat([df_books, df_add], ignore_index=True)

df_books.to_csv('datasets/errors_casadellibro_literatura.csv')
