In [132]:
#import all libraries
import pandas as pd
import requests
import bs4
import re
import logging
import concurrent.futures
import threading
from time import sleep

In [133]:
def get_links():
    urls = list(pd.read_csv('books_url.csv')['link'])
    return urls

In [134]:
def get_response(input_url):
    headers = {
        'User-Agent': 'My User Agent 1.0',
        "Accept-Language": "en-US,en;q=0.5"
    }
    response = requests.get(input_url, headers=headers)
    return response

In [135]:
def get_soup(input_url):
    headers = {
        'User-Agent': 'My User Agent 1.0',
        "Accept-Language": "en-US,en;q=0.5"
    }
    response = requests.get(input_url, headers=headers)
    if response.status_code != 200:
        print("Error in getting link")
        print("response code is : ", response.status_code)
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    return soup

In [136]:
def get_fa_title(soup):
    title = soup.select('.product-name strong')[0]
    return title.text


In [137]:
def get_en_title(soup):
    title = soup.select('.product-name-englishname')[0]
    return title.text

In [138]:
def get_price(soup):
    price = soup.select('.price-broken , .col-md-7 .price:nth-child(1)')[0].text
    return int(price.replace(',', ''))

In [139]:
def get_discount(soup):
    try:
        discount_price = int(soup.select('.col-md-12+ .clearfix .price-special')[0].text.replace(',', ''))
        discount_price = ((get_price(soup) - discount_price) / get_price(soup)) * 100
    except Exception:
        discount_price = 0
        logging.exception("This book has no discount!")
    return discount_price

In [140]:
def get_score(soup):
    soup = soup.find('div', {'class': 'col-md-7'}).find('li', {'class': 'pull-left'}).find('div',
                                                                                           {'class': 'my-rating'})
    soup_str = str(soup)

    match = re.search(r'data-rating="(\d+\.\d+)"', soup_str)
    if match:
        data_rating = match.group(1)
        return data_rating


In [141]:
def get_publisher(soup):
    try:
        publisher_tag = soup.select('div.prodoct-attribute-items:nth-child(1) > a')[0]
        publisher_link=publisher_tag.get('href')
        publisher_id=publisher_link.split('/')[2].split('-')[0]
        publisher_name=publisher_tag.text.strip()
    except Exception:
        publisher_link=-1
        publisher_id=-1
        publisher_name=-1
    return {'id':publisher_id,'name':publisher_name,'link':publisher_link}

In [142]:
def get_author(soup):
    authors_list=[]          
    try:
        authors_a_tag = soup.select('.prodoct-attribute-items+ .prodoct-attribute-items > a')
        if(len(authors_a_tag))==0:
                return authors_list
        for author_a_tag in authors_a_tag:
                author_link=author_a_tag.get('href')
                author_id=author_link.split('/')[2].split('-')[0]
                author_name=author_a_tag.text.strip()
                authors_list.append({'id':author_id,'name':author_name,'link':author_link})
    except Exception:
        author = -1
    return authors_list

In [143]:
def is_author_available(soup):
    try:
        existence = soup.select('.pull-left+ li span')[0].text
    except:
        existence = None
        logging.exception("This book has no author!")
    return existence


In [144]:
def get_book_attribute(soup):
    rows = soup.find('table', {'class': 'product-table'}).findAll('td')
    code = -1
    isbn = -1
    size = -1
    pages = -1
    per_cal = -1
    ad_cal = -1
    material = -1
    series = -1
    send_time = -1
    code_flag = 0
    isbn_flag = 0
    size_flag = 0
    pages_flag = 0
    per_cal_flag = 0
    ad_cal_flag = 0
    material_flag = 0
    series_flag = 0
    send_time_flag = 0

    for row in rows:
        text = row.text.strip()
        if code_flag == 1:
            code = int(text)
            code_flag = 0
        elif isbn_flag == 1:
            isbn = text
            isbn = re.sub('[^0-9-]', '', isbn)
            isbn_flag = 0
        elif size_flag == 1:
            size = text
            size_flag = 0
        elif pages_flag == 1:
            pages = int(text)
            pages_flag = 0
        elif per_cal_flag == 1:
            per_cal = int(text)
            per_cal_flag = 0
        elif ad_cal_flag == 1:
            ad_cal = int(text)
            ad_cal_flag = 0
        elif material_flag == 1:
            material = text
            material_flag = 0
        elif series_flag == 1:
            series = int(text)
            series_flag = 0
        elif send_time_flag == 1:
            send_time = text
            send_time_flag = 0

        if 'کد کتاب' in text:
            code_flag = 1
        elif 'شابک' in text:
            isbn_flag = 1
        elif 'قطع' in text:
            size_flag = 1
        elif 'تعداد صفحه' in text:
            pages_flag = 1
        elif 'سال انتشار شمسی' in text:
            per_cal_flag = 1
        elif 'سال انتشار میلادی' in text:
            ad_cal_flag = 1
        elif 'نوع جلد' in text:
            material_flag = 1
        elif 'سری چاپ' in text:
            series_flag = 1
        elif 'زودترین زمان ارسال' in text:
            send_time_flag = 1

    return [code, isbn, size, pages, per_cal, ad_cal, material, series, send_time]

In [145]:
def get_summary(soup):
    summary = soup.select('.product-description')[0].text.strip()
    return summary

In [146]:
def get_tags(soup):
    tags = soup.select('.product-tags-item')
    tags_list = []
    for tag in tags:
        tags_list += [tag.text.strip()]
    return tags_list

In [147]:
def get_book_detail(book_soup, site_index):
    book_fa_title = get_fa_title(book_soup)
    book_en_title = get_en_title(book_soup)
    book_price = get_price(book_soup)
    book_discount_percent = get_discount(book_soup)
    book_score = get_score(book_soup)
    book_publisher = get_publisher(book_soup)
    book_author = get_author(book_soup)
    book_author_presence = is_author_available(book_soup)

    [book_code, book_Isbn, book_size, book_pages, book_publication_per_date, book_publication_ad_date,
     book_cover_material, book_print_series, book_earliest_send_time] = get_book_attribute(book_soup)

    book_data = [site_index, int(book_code), book_Isbn, book_fa_title, book_en_title, book_price,
                 int(book_discount_percent),
                 book_score, book_publisher, book_author,
                 int(book_pages), int(book_publication_per_date), int(book_publication_ad_date), book_size,
                 book_cover_material,
                 int(book_print_series), book_earliest_send_time, book_author_presence]
    return book_data

In [148]:
def get_book_site_summary(book_soup, site_index):
    try:
        book_summary = get_summary(book_soup)
    except Exception:
        book_summary = None
        logging.exception("This book has no summary!")
    return [site_index, book_summary]

In [149]:
def get_book_site_tags(book_soup, site_index):
    book_tags = get_tags(book_soup)
    book_tags_list = []
    for tag in book_tags:
        book_tags_list += [[site_index, tag]]
    return book_tags_list

In [150]:
def get_site_awards(soup, site_index):
    awards_list = []
    awards = soup.select('book_soup, site_index')

    for award in awards:
        print(award.text)
        awards += [award.text]

In [151]:
def get_req_list(list, req_count):
    if len(list) >= req_count:
        request_list = list[:req_count].copy()
    else:
        request_list = list.copy()
    return request_list

In [152]:
def scrape(site_soup):
    try:
        with lock:
            global site_index
            site_summary_data_list.append(get_book_site_summary(site_soup, site_index))
            site_tags_data_list.extend(get_book_site_tags(site_soup, site_index))
            site_page_books = site_soup.select('.clearfix .clearfix .row')
            for book_index in range(0, len(site_page_books), 2):
                data = get_book_detail(site_page_books[book_index], site_index)
                books_data_list.append(data)
            site_index += 1
    except Exception:
        logging.exception("An error occurred")

In [153]:
def fast_scrape(link):
    try:
        site_soup = get_soup(link)
        with lock:
            global site_index
            site_summary_data_list.append(get_book_site_summary(site_soup, site_index))
            site_tags_data_list.extend(get_book_site_tags(site_soup, site_index))
            site_page_books = site_soup.select('.clearfix .clearfix .row')
            for book_index in range(0, len(site_page_books), 2):
                data = get_book_detail(site_page_books[book_index], site_index)
                writers_data_list.extend(data[9])     #9th index is the writer column which is a list of writers
                publishers_data_list.append(data[8])  #8th column is dict of publisher
                data[8]=data[8]['id']                 #convert 8th column from dict to the publisher's id
                writers_list_of_dict=data[9]          #writes list which is a list of dictionary
                data.pop(9)                           #remove 9th column from data(9th column was writers)
                for w_id in writers_list_of_dict:
                     books_writers_data_list.append({'book_id':data[1],'writer_id':w_id['id']})
                books_data_list.append(data)
            site_index += 1
    except Exception:
        logging.exception("An error occurred")


<h1>Detailed Scraper</h1>

In [None]:
links = get_links()[:200] + ['https://www.iranketab.ir/book/270-gone-with-the-wind']

page_response = []
books_data_list = []
site_tags_data_list = []
site_summary_data_list = []

site_index = 1
sleep_time = 0.5
max_threads = 20
book_count_request = 20  #number of requests per time

lock = threading.Lock()
book_urls = links.copy()

while len(book_urls):
    sleep(sleep_time)  #sleep so that the site does not ban us
    request_list = get_req_list(book_urls, book_count_request)  #list of book's urls we want to send request 
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor:
        future_list = executor.map(get_response, request_list)
        for future in future_list:
            try:
                data = future
                page_response.append(data)
            except Exception as exc:
                continue
        for item in page_response:
            if item.status_code == 200:
                page_url = item.url
                if page_url in request_list:
                    page_soup = bs4.BeautifulSoup(item.content, 'html.parser')
                    scrape(page_soup)
                    book_urls.remove(page_url)


ERROR:root:This book has no summary!
Traceback (most recent call last):
  File "C:\Users\Asus\AppData\Local\Temp\ipykernel_32944\1156613770.py", line 3, in get_book_site_summary
    book_summary = get_summary(book_soup)
  File "C:\Users\Asus\AppData\Local\Temp\ipykernel_32944\3952702039.py", line 2, in get_summary
    summary = soup.select('.product-description')[0].text.strip()
IndexError: list index out of range
ERROR:root:This book has no summary!
Traceback (most recent call last):
  File "C:\Users\Asus\AppData\Local\Temp\ipykernel_32944\1156613770.py", line 3, in get_book_site_summary
    book_summary = get_summary(book_soup)
  File "C:\Users\Asus\AppData\Local\Temp\ipykernel_32944\3952702039.py", line 2, in get_summary
    summary = soup.select('.product-description')[0].text.strip()
IndexError: list index out of range
ERROR:root:This book has no summary!
Traceback (most recent call last):
  File "C:\Users\Asus\AppData\Local\Temp\ipykernel_32944\1156613770.py", line 3, in get_book

KeyboardInterrupt: 

<h1>Fast Scraper</h1>

In [175]:
links = get_links()[:200] #+ ['https://www.iranketab.ir/book/270-gone-with-the-wind']

books_data_list = []
site_tags_data_list = []
site_summary_data_list = []
writers_data_list=[]
publishers_data_list=[]
books_writers_data_list=[]
price_history_data_list=[]

site_index = 1
max_threads = 20

lock = threading.Lock()

with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor:
    executor.map(fast_scrape, links)


ERROR:root:This book has no discount!
Traceback (most recent call last):
  File "C:\Users\Asus\AppData\Local\Temp\ipykernel_32944\1188541366.py", line 3, in get_discount
    discount_price = int(soup.select('.col-md-12+ .clearfix .price-special')[0].text.replace(',', ''))
IndexError: list index out of range
ERROR:root:This book has no summary!
Traceback (most recent call last):
  File "C:\Users\Asus\AppData\Local\Temp\ipykernel_32944\1156613770.py", line 3, in get_book_site_summary
    book_summary = get_summary(book_soup)
  File "C:\Users\Asus\AppData\Local\Temp\ipykernel_32944\3952702039.py", line 2, in get_summary
    summary = soup.select('.product-description')[0].text.strip()
IndexError: list index out of range
ERROR:root:This book has no summary!
Traceback (most recent call last):
  File "C:\Users\Asus\AppData\Local\Temp\ipykernel_32944\1156613770.py", line 3, in get_book_site_summary
    book_summary = get_summary(book_soup)
  File "C:\Users\Asus\AppData\Local\Temp\ipykernel_32

<h1>Check Completnes</h1>

In [None]:
if len(book_urls) == 0:
    print('All links scraped!')
else:
    print('Something wrong happened')

<h1>Make Dataframes</h1>

In [176]:
tableOfData = pd.DataFrame(books_data_list,
                           columns=['site_index', 'code', 'Isbn', 'fa_title', 'en_title', 'price', 'discount', 'score',
                                    'publisher_id', 'pages', 'publication_per_date', 'publication_ad_date',
                                    'size', 'cover_material', 'print_series', 'earliest_send_time', 'presence'])
tableOfData

Unnamed: 0,site_index,code,Isbn,fa_title,en_title,price,discount,score,publisher_id,pages,publication_per_date,publication_ad_date,size,cover_material,print_series,earliest_send_time,presence
0,1,536,978-600-8812-16-6,کتاب دختری با کت آبی,Girl in the Blue Coat,155000,15,4.03,61,256,1402,2016,رقعی,شومیز,11,6 مهر,موجود
1,2,-1,978-9648014327,دفتر یادداشت ترکیبی,Notebook,175000,0,3.39,1773,-1,-1,-1,پالتویی,-1,-1,---,تمام شد ، اما میاریمش 😏
2,3,116421,978-6009313884,کتاب تانیا,Tanya,75000,20,3.62,1671,159,1396,1939,جیبی,شومیز,1,6 مهر,موجود
3,4,107977,978-9641721932,کتاب قاصدک ها در هوا ایستاده اند,Dandelions are standing in the air,69000,20,3.18,1122,123,1401,-1,رقعی,شومیز,1,6 مهر,موجود
4,5,45906,978-9647357593,کتاب بیست سال تکاپوی اسلامی شیعی در ایران,Twenty years of Shiite Islamic endeavor in Iran,55000,15,3.79,1373,588,1397,-1,وزیری,شومیز,1,5 مهر,موجود
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,196,40085,978-9648564952,کتاب مصاحبه,Interview,40000,15,3.98,85,96,1399,-1,رقعی,شومیز,4,4 مهر,موجود
211,197,34264,978-6008267355,کتاب جهان باکتری ها,The Surprising World of Bacteria with Max Axi...,35000,20,3.3,2193,32,1395,2010,وزیری,شومیز,2,4 مهر,موجود
212,198,15719,978-6229606506,کتاب زندگانی من و روزگار سخت,My Life and Hard Times,35000,15,3.54,1142,116,1398,1933,رقعی,شومیز,1,4 مهر,موجود
213,198,79410,978-6220108443,روزهای دشوار زندگی من,My Life and Hard Times,60000,0,3.01,33,130,1401,1933,رقعی,شومیز,1,4 مهر,موجود


In [167]:
file_path = "bookData.csv"
tableOfData.to_csv(file_path, index=False, encoding='utf-8')

In [168]:
tableOfSummaryData = pd.DataFrame(site_summary_data_list, columns=['site_index', 'summary'])
tableOfSummaryData

Unnamed: 0,site_index,summary
0,1,دفتر يادداشت تركيبي پنگوئن:خط دار،بي خط،نقطه ا...
1,2,سیاست رضاشاه، اسلام زدایی، محو ارزش های شیعی و...
2,3,نمایش‌نامه «تانیا» داستان زندگی زنی جوان است ک...
3,4,شاعر گرانمایه جناب آقای رضا اسماعیلی که از چهر...
4,5,
...,...,...
195,194,هشدار!خب بله، این کتاب بسیار بسیار بسیاااااااا...
196,195,
197,196,با ابر دانشمند مکس آکسیوم، دیدار کنید او توانا...
198,197,در زمانی که بسیاری از پسران در بحران هستند ، ی...


In [169]:
file_path = "BookSummaryData.csv"
tableOfData.to_csv(file_path, index=False, encoding='utf-8')

In [170]:
tableOfSiteTagsData = pd.DataFrame(site_tags_data_list, columns=['site_index', 'tag'])
tableOfSiteTagsData

Unnamed: 0,site_index,tag
0,1,دفتر یادداشت
1,1,دفتر یادداشت نقطه ای
2,1,دفتر یادداشت خط دار
3,1,دفتر یادداشت بی خط
4,2,ادبیات واقع گرایانه
...,...,...
1053,198,داستان کمدی (طنز)
1054,198,خود زندگی نامه
1055,198,ادبیات واقع گرایانه
1056,198,دهه 1930 میلادی


In [171]:
file_path = "bookTagsData.csv"
tableOfData.to_csv(file_path, index=False, encoding='utf-8')

In [172]:
table_of_publisher=pd.DataFrame(publishers_data_list).drop_duplicates(subset=['id','name','link'])
table_of_publisher.to_csv('./publisher.csv',index=False,encoding='utf-8')
table_of_publisher

Unnamed: 0,id,name,link
0,1773,کارگاه فیلم و گرافیک سپاس,/publisher/1773-%da%a9%d8%a7%d8%b1%da%af%d8%a7...
1,1373,مرکز اسناد انقلاب,/publisher/1373-%d9%85%d8%b1%da%a9%d8%b2-%d8%a...
2,1671,پژواک فرزان,/publisher/1671-%d9%be%da%98%d9%88%d8%a7%da%a9...
3,90,اطلاعات,/publisher/90-%d8%a7%d8%b7%d9%84%d8%a7%d8%b9%d...
4,81,علمی و فرهنگی,/publisher/81-%d8%b9%d9%84%d9%85%db%8c-%d9%88-...
...,...,...,...
207,48,ققنوس,/publisher/48-%d9%82%d9%82%d9%86%d9%88%d8%b3
209,85,نیلا,/publisher/85-%d9%86%db%8c%d9%84%d8%a7
210,2193,آوای روزان,/publisher/2193-%d8%a2%d9%88%d8%a7%db%8c-%d8%b...
211,53,کتاب کوله پشتی,/publisher/53-%da%a9%d8%aa%d8%a7%d8%a8-%da%a9%...


In [173]:
table_of_writer_page=pd.DataFrame(writers_data_list).drop_duplicates(subset=['id','name','link'])
table_of_writer_page.to_csv('./writer_page.csv',index=False,encoding='utf-8')
table_of_writer_page

Unnamed: 0,id,name,link
0,26087,روح الله حسینیان,/profile/26087-%d8%b1%d9%88%d8%ad-%d8%a7%d9%84...
1,68029,آلکسی آربوزوف,/profile/68029-aleksei-arbuzov
2,23928,رضا اسماعیلی,/profile/23928-%d8%b1%d8%b6%d8%a7-%d8%a7%d8%b3...
3,6220,مارتین وادل,/profile/6220-martin-waddell
4,704,مونیکا هسی,/profile/704-monica-hesse
...,...,...,...
202,9801,بی جی نواک,/profile/9801-b-j-novak
203,22975,محمد رحمانیان,/profile/22975-%d9%85%d8%ad%d9%85%d8%af-%d8%b1...
204,19773,انیسکا بسکاپ,/profile/19773-agnieszka-biskup
205,22621,مایکل سی ریچرت,/profile/22621-michael-c-reichert


In [174]:
table_of_writer=pd.DataFrame(books_writers_data_list).drop_duplicates(subset=['book_id','writer_id'])
table_of_writer.to_csv('./writer.csv',index=False,encoding='utf-8')
table_of_writer

Unnamed: 0,book_id,writer_id
0,45906,26087
1,116421,68029
2,41857,23928
3,49138,6220
4,536,704
...,...,...
204,34264,19773
205,39335,22621
206,15719,9113
207,79410,9113
