In [2]:
#import all libraries
import pandas as pd
import requests
import bs4
import re
import logging
import concurrent.futures
import threading
import math
from time import sleep

In [3]:
def get_links():
    """Return the values of the ``link`` column of ``books_url.csv`` as a list.

    The CSV is read from the current working directory.
    """
    url_table = pd.read_csv('books_url.csv')
    return url_table['link'].tolist()

In [4]:
def get_response(input_url, timeout=30):
    """Send a GET request to ``input_url`` and return the ``requests`` response.

    Args:
        input_url: URL to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which would block a
            worker thread forever on a stalled connection.

    Returns:
        The ``requests.Response`` object; the status code is not checked here.

    Raises:
        requests.exceptions.RequestException: on connection errors or timeout.
    """
    headers = {
        'User-Agent': 'My User Agent 1.0',
        "Accept-Language": "en-US,en;q=0.5"
    }
    return requests.get(input_url, headers=headers, timeout=timeout)

In [5]:
def get_soup(input_url):
    """Download ``input_url`` and return its HTML parsed with BeautifulSoup.

    The request itself is delegated to ``get_response`` so that the headers
    are defined in one place only. A non-200 status is reported on stdout,
    but the response body is still parsed and returned.
    """
    response = get_response(input_url)
    if response.status_code != 200:
        print("Error in getting link")
        print("response code is : ", response.status_code)
    return bs4.BeautifulSoup(response.text, 'html.parser')

In [6]:
def get_fa_title(soup):
    """Return the text of the first ``.product-name strong`` element.

    Per the function name this is the book's Persian ("fa") title.
    Raises ``IndexError`` when no element matches.
    """
    matches = soup.select('.product-name strong')
    first_match = matches[0]
    return first_match.text


In [7]:
def get_en_title(soup):
    """Return the text of the first ``.product-name-englishname`` element.

    Raises ``IndexError`` when the page has no such element.
    """
    return soup.select('.product-name-englishname')[0].text

In [8]:
def get_price(soup):
    """Return the price shown on the page as an ``int``.

    The first element matching the price selector is used and thousands
    separators (commas) are removed before conversion. Raises ``IndexError``
    if nothing matches and ``ValueError`` if the text is not a number.
    """
    price_nodes = soup.select('.price-broken , .col-md-7 .price:nth-child(1)')
    raw_price = price_nodes[0].text
    without_separators = raw_price.replace(',', '')
    return int(without_separators)

In [9]:
def get_discount(soup):
    """Return the discount of the book as a percentage of its list price.

    The discounted price is read from the ``.price-special`` element and
    compared with ``get_price(soup)``.

    Returns:
        The discount percentage (a float), or ``0`` when the page shows no
        discounted price or the discount cannot be computed.
    """
    try:
        special_nodes = soup.select('.col-md-12+ .clearfix .price-special')
        if not special_nodes:
            # A book without a discount is a normal case, not an error, so it
            # is recorded without an ERROR-level traceback.
            logging.info("This book has no discount!")
            return 0
        special_price = int(special_nodes[0].text.replace(',', ''))
        # Look the list price up once instead of re-parsing the page twice.
        list_price = get_price(soup)
        return ((list_price - special_price) / list_price) * 100
    except Exception:
        # Unparsable prices or a zero list price end up here.
        logging.exception("Failed to compute the discount of this book!")
        return 0

In [10]:
def get_score(soup):
    """Return the value of the ``data-rating`` attribute as a string.

    The rating element is located as ``div.col-md-7 > li.pull-left >
    div.my-rating`` and the attribute is extracted from its markup.

    Returns:
        The rating text (for example ``"3.87"`` or ``"4"``), or ``None``
        when the rating element or the attribute is missing.
    """
    try:
        rating_node = (
            soup.find('div', {'class': 'col-md-7'})
            .find('li', {'class': 'pull-left'})
            .find('div', {'class': 'my-rating'})
        )
    except AttributeError:
        # One of the enclosing elements is absent, so find() was called on None.
        logging.warning("This book has no score!")
        return None
    if rating_node is None:
        return None

    # The fractional part is optional so whole-number ratings are accepted too.
    match = re.search(r'data-rating="(\d+(?:\.\d+)?)"', str(rating_node))
    if match:
        return match.group(1)
    return None

In [11]:
def get_publisher(soup):
    """Return the publisher name shown in the first attribute item of the page.

    Returns:
        The stripped publisher text, or ``None`` (after logging the
        exception) when the element cannot be found.
    """
    try:
        # Only the name is needed. The former, unused link lookup was removed
        # because its failure discarded an otherwise valid publisher name.
        publisher = soup.select('.prodoct-attribute-items:nth-child(1) a .prodoct-attribute-item')[0].text.strip()
    except Exception:
        publisher = None
        logging.exception("This book has no publisher!")
    return publisher

In [12]:
def get_author(soup):
    """Return the stripped author text, or ``None`` if it cannot be read.

    Any failure while locating the element is logged with its traceback.
    """
    selector = '.prodoct-attribute-items+ .prodoct-attribute-items a .prodoct-attribute-item'
    try:
        return soup.select(selector)[0].text.strip()
    except Exception:
        logging.exception("This book has no author!")
        return None

In [13]:
def get_precense(soup):
    """Return the text of the first ``.pull-left+ li span`` element.

    Per the function name and the ``presence`` column it feeds, this is the
    availability status of the book.

    Returns:
        The element text, or ``None`` (after logging the exception) when the
        element cannot be found.
    """
    try:
        existence = soup.select('.pull-left+ li span')[0].text
    except Exception:
        # "except Exception" rather than a bare except, so KeyboardInterrupt
        # and SystemExit are not swallowed.
        existence = None
        logging.exception("This book has no presence status!")
    return existence
    

In [14]:
def get_book_attribute(soup):
    """Read the book attributes from the ``product-table`` table of the page.

    The table cells are scanned in order. When a cell contains one of the
    known labels, the text of the *next* cell is stored as that attribute.

    Returns:
        A list ``[code, isbn, size, pages, per_cal, ad_cal, material, series,
        send_time]``. An attribute that is absent from the page, or whose
        numeric value cannot be parsed, is reported as ``-1``.
    """
    # (label text, converter) pairs. The position in this tuple is the
    # position in the returned list, and labels are matched in this order.
    fields = (
        ('کد کتاب', int),                                     # book code
        ('شابک', lambda text: re.sub('[^0-9-]', '', text)),   # ISBN: keep digits and hyphens
        ('قطع', str),                                         # size / format
        ('تعداد صفحه', int),                                  # page count
        ('سال انتشار شمسی', int),                             # publication year, Solar Hijri
        ('سال انتشار میلادی', int),                           # publication year, Gregorian
        ('نوع جلد', str),                                     # cover type
        ('سری چاپ', int),                                     # print series
        ('زودترین زمان ارسال', str),                          # earliest send time
    )
    values = [-1] * len(fields)

    table = soup.find('table', {'class': 'product-table'})
    if table is None:
        logging.warning("This book has no attribute table!")
        return values

    pending = None  # index of the field whose value is expected in the next cell
    for cell in table.find_all('td'):
        text = cell.text.strip()
        if pending is not None:
            label, convert = fields[pending]
            try:
                values[pending] = convert(text)
            except ValueError:
                # Keep the -1 sentinel instead of failing the whole book.
                logging.warning("Could not parse %r as the value of %r", text, label)
            pending = None
        # The current cell may itself announce the next attribute.
        for index, (label, _) in enumerate(fields):
            if label in text:
                pending = index
                break

    return values

In [15]:
def get_summary(soup):
    """Return the stripped text of the first ``.product-description`` element.

    Raises ``IndexError`` when the page has no such element.
    """
    description_nodes = soup.select('.product-description')
    return description_nodes[0].text.strip()

In [16]:
def get_tags(soup):
    """Return the stripped text of every ``.product-tags-item`` element.

    The result is an empty list when the page has no tags.
    """
    return [tag_node.text.strip() for tag_node in soup.select('.product-tags-item')]

In [17]:
def get_book_detail(book_soup, site_index):
    """Collect every scraped field of one book into a single row.

    Args:
        book_soup: Parsed HTML fragment of the book.
        site_index: Index of the page the book was found on.

    Returns:
        A list in this order: site index, code, ISBN, Persian title, English
        title, price, discount percent, score, publisher, author, pages,
        Persian-calendar publication year, Gregorian publication year, size,
        cover material, print series, earliest send time, presence.

    Raises:
        Whatever the mandatory getters (titles, price, score, attributes)
        raise when their element is missing.
    """
    book_fa_title = get_fa_title(book_soup)
    book_en_title = get_en_title(book_soup)
    book_price = get_price(book_soup)
    book_discount_percent = get_discount(book_soup)
    book_score = get_score(book_soup)
    book_publisher = get_publisher(book_soup)
    book_author = get_author(book_soup)
    book_precense = get_precense(book_soup)

    (book_code, book_Isbn, book_size, book_pages, book_publication_per_date,
     book_publication_ad_date, book_cover_material, book_print_series,
     book_earliest_send_time) = get_book_attribute(book_soup)

    return [
        site_index,
        int(book_code),
        book_Isbn,
        book_fa_title,
        book_en_title,
        book_price,
        # round() instead of int(): the percentage is a float, and truncation
        # turns e.g. 28.999999999999996 (a 29% discount) into 28.
        round(book_discount_percent),
        book_score,
        book_publisher,
        book_author,
        int(book_pages),
        int(book_publication_per_date),
        int(book_publication_ad_date),
        book_size,
        book_cover_material,
        int(book_print_series),
        book_earliest_send_time,
        book_precense,
    ]

In [18]:
 def get_book_site_summary(book_soup, site_index):
     """Return ``[site_index, summary]`` for a page.

     The summary is ``None`` when ``get_summary`` raises; the exception is
     logged with its traceback in that case.
     """
     summary_text = None
     try:
         summary_text = get_summary(book_soup)
     except Exception:
         logging.exception("This book has no summary!")
     return [site_index, summary_text]

In [19]:
 def get_book_site_tags(book_soup, site_index):
     """Return one ``[site_index, tag]`` pair per tag found by ``get_tags``."""
     return [[site_index, tag_text] for tag_text in get_tags(book_soup)]

In [20]:
def get_site_awards(soup, site_index):
    """Return one ``[site_index, award_text]`` pair per matched award element.

    The rows have the same shape as those built by ``get_book_site_tags``.

    NOTE(review): the selector below looks like a placeholder (it matches
    elements named ``book_soup`` or ``site_index``) and presumably has to be
    replaced with the real CSS selector of the awards section - confirm
    against the page markup.
    """
    awards = soup.select('book_soup, site_index')
    # Build a separate result list: the former loop appended to the very list
    # it was iterating over and never returned anything.
    return [[site_index, award.text] for award in awards]

In [21]:
def get_req_list(list, req_count):
    """Return a new list holding at most the first ``req_count`` items of ``list``.

    The input is never modified; a slice that asks for more items than are
    available simply yields a copy of everything.
    """
    return list[:req_count].copy()

In [None]:
# links = get_links()
# site_index = 1
# books_data_list = []
# site_summary_data_list = []
# site_tags_data_list = []
# lock = threading.Lock()
# 
# def process_link(link):
#     try:
#         site_soup = get_soup(link)
#         with lock:
#             global site_index
#             site_summary_data_list.append(get_book_site_summary(site_soup, site_index))
#             site_tags_data_list.extend(get_book_site_tags(site_soup, site_index))
#             site_page_books = site_soup.select('.clearfix .clearfix .row')
#             for book_index in range(0, len(site_page_books), 2):
#                 data = get_book_detail(site_page_books[book_index], site_index)
#                 books_data_list.append(data)
#             site_index += 1
#     except Exception:
#         logging.exception("An error occurred")
# 
# with concurrent.futures.ThreadPoolExecutor(max_workers=15) as executor:
#     executor.map(process_link, links)


In [22]:
# Sample run: the first ten URLs from the CSV plus one fixed book page.
links = [*get_links()[:10], 'https://www.iranketab.ir/book/270-gone-with-the-wind']
site_index = 1  # index given to the next processed page
# Result accumulators filled by process_link.
books_data_list, site_summary_data_list, site_tags_data_list = [], [], []
lock = threading.Lock()  # guards the accumulators and site_index


def process_link(site_soup):
    """Parse one downloaded page and add its rows to the global result lists.

    Every row of the page is built first and the global lists are only
    updated once all parsers have succeeded. A page that fails half-way
    therefore leaves no orphaned summary/tag rows behind, and ``site_index``
    is never shared by two different pages.

    Args:
        site_soup: Parsed HTML of the page.

    Any exception is logged and swallowed; the page then contributes nothing.
    """
    global site_index
    try:
        with lock:
            current_index = site_index
            summary_row = get_book_site_summary(site_soup, current_index)
            tag_rows = get_book_site_tags(site_soup, current_index)
            site_page_books = site_soup.select('.clearfix .clearfix .row')
            # Only every second matched row is parsed, as in the original
            # loop (range step 2).
            book_rows = [
                get_book_detail(site_page_books[book_index], current_index)
                for book_index in range(0, len(site_page_books), 2)
            ]

            # Commit: nothing below can fail.
            site_summary_data_list.append(summary_row)
            site_tags_data_list.extend(tag_rows)
            books_data_list.extend(book_rows)
            site_index += 1
    except Exception:
        logging.exception("An error occurred")


# Download the pages in batches and parse each one. A URL is removed from
# book_urls as soon as its page came back with status 200 and was processed.
book_urls = links.copy()
page_response = []  # responses of the most recent batch
book_count_request = 20  # number of requests per batch
sleep_time = 1  # seconds to wait before each batch
max_stalled_rounds = 3  # stop after this many consecutive batches without progress
stalled_rounds = 0
while book_urls and stalled_rounds < max_stalled_rounds:
    sleep(sleep_time)  # sleep so that the site does not ban us
    request_list = get_req_list(book_urls, book_count_request)  # URLs requested in this batch
    page_response.clear()
    with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
        # submit() keeps each future next to the URL it was created for, so a
        # response is matched to its request even after a redirect, and a
        # failed request can be handled on its own.
        pending = [(url, executor.submit(get_response, url)) for url in request_list]
    # Leaving the "with" block waits for every request of the batch.

    scraped_in_round = 0
    for page_url, future in pending:
        try:
            item = future.result()
        except Exception:
            logging.exception("Request failed for %s", page_url)
            continue
        page_response.append(item)
        if item.status_code != 200:
            logging.warning("Got status %s for %s", item.status_code, page_url)
            continue
        page_soup = bs4.BeautifulSoup(item.content, 'html.parser')
        process_link(page_soup)
        book_urls.remove(page_url)
        scraped_in_round += 1

    # URLs that keep failing stay in book_urls; the counter stops the loop
    # instead of retrying them forever.
    stalled_rounds = 0 if scraped_in_round else stalled_rounds + 1
   

11


ERROR:root:This book has no summary!
Traceback (most recent call last):
  File "C:\Users\raeim\AppData\Local\Temp\ipykernel_5524\2869572784.py", line 3, in get_book_site_summary
    book_summary = get_summary(book_soup)
  File "C:\Users\raeim\AppData\Local\Temp\ipykernel_5524\1354406443.py", line 2, in get_summary
    summary = soup.select('.product-description')[0].text.strip()
IndexError: list index out of range
ERROR:root:This book has no summary!
Traceback (most recent call last):
  File "C:\Users\raeim\AppData\Local\Temp\ipykernel_5524\2869572784.py", line 3, in get_book_site_summary
    book_summary = get_summary(book_soup)
  File "C:\Users\raeim\AppData\Local\Temp\ipykernel_5524\1354406443.py", line 2, in get_summary
    summary = soup.select('.product-description')[0].text.strip()
IndexError: list index out of range
ERROR:root:This book has no summary!
Traceback (most recent call last):
  File "C:\Users\raeim\AppData\Local\Temp\ipykernel_5524\2869572784.py", line 3, in get_book

In [23]:
# book_urls only keeps the URLs that were never removed by the download loop.
print('Something wrong happened' if len(book_urls) != 0 else 'All links scraped!')

All links scraped!


In [25]:
# One row per book; the column order mirrors the list built by get_book_detail.
tableOfData = pd.DataFrame(
    data=books_data_list,
    columns=[
        'site_index', 'code', 'Isbn',
        'fa_title', 'en_title',
        'price', 'discount', 'score',
        'publisher', 'author',
        'pages', 'publication_per_date', 'publication_ad_date',
        'size', 'cover_material', 'print_series',
        'earliest_send_time', 'presence',
    ],
)
tableOfData

Unnamed: 0,site_index,code,Isbn,fa_title,en_title,price,discount,score,publisher,author,pages,publication_per_date,publication_ad_date,size,cover_material,print_series,earliest_send_time,presence
0,1,70554,978-9647553261,کتاب کلیات پزشکی سنتی چینی و طب سوزنی,Pezeshki Sonnati,300000,15,3.87,المعی,حسن رضوانی,672,1402,-1,وزیری,شومیز,4,4 مهر,موجود
1,2,51563,978-9641940289,کتاب بانوان عمارت میسالونگی,The Ladies of Missalonghi,30000,15,3.17,روشنگران و مطالعات زنان,کالین مک کالو,184,1388,1987,رقعی,شومیز,1,4 مهر,موجود
2,3,49138,978-9644458071,کتاب گوزن شاخدار فایده اش چیه؟,What Use Is A Moose?,70000,15,3.58,علمی و فرهنگی,مارتین وادل,32,1400,1996,وزیری,شومیز,5,4 مهر,موجود
3,4,116421,978-6009313884,کتاب تانیا,Tanya,75000,20,3.62,پژواک فرزان,آلکسی آربوزوف,159,1396,1939,جیبی,شومیز,1,6 مهر,موجود
4,5,76016,978-9642683864,کتاب دانه های روشنایی,Flakes of light,9000,30,3.67,اشک,عباس مهرپویا,32,1388,-1,خشتی,شومیز,1,4 مهر,موجود
5,6,73891,978-9640669687,کتاب رازهای ناگفته ی بازاریابی,Marketing,20000,15,3.67,نیما عربشاهی,جو ویتالی,96,1384,-1,خشتی,شومیز,1,6 مهر,موجود
6,7,41857,978-9644239441,کتاب سلام بر عاشورا,Salam bar Ashoora,5000,15,3.63,اطلاعات,رضا اسماعیلی,94,1394,-1,رقعی,شومیز,2,6 مهر,موجود
7,8,107977,978-9641721932,کتاب قاصدک ها در هوا ایستاده اند,Dandelions are standing in the air,69000,20,3.18,دنیای نو,مهدی حسینی (مهد),123,1401,-1,رقعی,شومیز,1,6 مهر,موجود
8,9,27986,978-9643911140,کتاب سقف تالار آیینه عمارت نارنجستان شیراز,Saghf-e Talar-e Ayieneh,1600,0,3.35,کانون پرورش فکری کودکان و نوجوان,هادی سیف,28,1390,-1,رحلی,شومیز,2,---,به زودی 🙄
9,10,64447,978-6229736944,کتاب فناوری در سال 2021,"The Year in Tech, 2021",41500,0,3.01,راه پرداخت,هاروارد بیزینس ریویو,116,1399,2020,رقعی,شومیز,1,---,تمام شد ، اما میاریمش 😏


In [None]:
# Persist the book table as UTF-8 CSV, without the DataFrame index column.
file_path = "bookData.csv"
tableOfData.to_csv(path_or_buf=file_path, encoding='utf-8', index=False)

In [26]:
# One row per processed page: its index and its summary text.
tableOfSummaryData = pd.DataFrame(
    data=site_summary_data_list,
    columns=['site_index', 'summary'],
)
tableOfSummaryData

Unnamed: 0,site_index,summary
0,1,طب چینی یا سوزنی یکی از دو رشته درمانی طبی دنی...
1,2,
2,3,
3,4,نمایش‌نامه «تانیا» داستان زندگی زنی جوان است ک...
4,5,
5,6,این کتابچه متضمن چهل و پنج راهکار و نکته در با...
6,7,شاعر گرانمایه جناب آقای رضا اسماعیلی که از چهر...
7,8,
8,9,"کتاب حاضر، از مجموعه ی ""چرا ندیدیم؟""، معرفی دق..."
9,10,امروزه هنگامی که کســب‌وکارها بــه فناوری فکر ...


In [None]:
# Persist the summary table as UTF-8 CSV, without the DataFrame index column.
# The summary table is written here; previously the book table (tableOfData)
# was exported under this file name by mistake.
file_path = "BookSummaryData.csv"
tableOfSummaryData.to_csv(file_path, index=False, encoding='utf-8')

In [27]:
# One row per (page index, tag) pair.
tableOfSiteTagsData = pd.DataFrame(
    data=site_tags_data_list,
    columns=['site_index', 'tag'],
)
tableOfSiteTagsData

Unnamed: 0,site_index,tag
0,1,پزشکی
1,1,ادبیات ایران
2,1,کتاب مصور
3,1,بهداشت
4,2,ادبیات استرالیا
...,...,...
66,11,فهرست برترین رمان های تاریخی
67,11,برترین آثار داستانی با شخصیت اصلی زن
68,11,برترین آثار تبدیل شده به فیلم و سریال
69,11,فهرست برترین رمان های عاشقانه


In [None]:
# Persist the tags table as UTF-8 CSV, without the DataFrame index column.
# The tags table is written here; previously the book table (tableOfData)
# was exported under this file name by mistake.
file_path = "bookTagsData.csv"
tableOfSiteTagsData.to_csv(file_path, index=False, encoding='utf-8')