In [1]:
#import all libraries
import pandas as pd
import requests
import bs4
import re
import logging
import concurrent.futures
import threading
from time import sleep
from datetime import datetime

In [2]:
def get_links():
    urls = list(pd.read_csv('books_url.csv')['link'])
    return urls

In [3]:
def get_response(input_url):
    headers = {
        'User-Agent': 'My User Agent 1.0',
        "Accept-Language": "en-US,en;q=0.5"
    }
    response = requests.get(input_url, headers=headers)
    return response

In [4]:
def get_soup(input_url):
    headers = {
        'User-Agent': 'My User Agent 1.0',
        "Accept-Language": "en-US,en;q=0.5"
    }
    response = requests.get(input_url, headers=headers)
    if response.status_code != 200:
        print("Error in getting link")
        print("response code is : ", response.status_code)
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    return soup

In [5]:
def get_fa_title(soup):
    title = soup.select('.product-name strong')[0]
    return title.text


In [6]:
def get_en_title(soup):
    title = soup.select('.product-name-englishname')[0]
    return title.text

In [7]:
def get_price(soup):
    price = soup.select('.price-broken , .col-md-7 .price:nth-child(1)')[0].text
    return int(price.replace(',', ''))

In [8]:
def get_discount(soup):
    try:
        discount_price = int(soup.select('.col-md-12+ .clearfix .price-special')[0].text.replace(',', ''))
        discount_price = ((get_price(soup) - discount_price) / get_price(soup)) * 100
    except Exception:
        discount_price = 0
        logging.exception("This book has no discount!")
    return discount_price

In [9]:
def get_score(soup):
    soup = soup.find('div', {'class': 'col-md-7'}).find('li', {'class': 'pull-left'}).find('div',
                                                                                           {'class': 'my-rating'})
    soup_str = str(soup)

    match = re.search(r'data-rating="(\d+\.\d+)"', soup_str)
    if match:
        data_rating = match.group(1)
        return data_rating


In [10]:
def get_publisher(soup):
    try:
        publisher_tag = soup.select('div.prodoct-attribute-items:nth-child(1) > a')[0]
        publisher_link=publisher_tag.get('href')
        publisher_id=publisher_link.split('/')[2].split('-')[0]
        publisher_name=publisher_tag.text.strip()
    except Exception:
        publisher_link=-1
        publisher_id=-1
        publisher_name=-1
    return {'id':publisher_id,'name':publisher_name,'link':publisher_link}

In [11]:
def get_author(soup):
    authors_list=[]          
    try:
        authors_a_tag = soup.select('.prodoct-attribute-items+ .prodoct-attribute-items > a')
        if(len(authors_a_tag))==0:
                return authors_list
        for author_a_tag in authors_a_tag:
                author_link=author_a_tag.get('href')
                author_id=author_link.split('/')[2].split('-')[0]
                author_name=author_a_tag.text.strip()
                authors_list.append({'id':author_id,'name':author_name,'link':author_link})
    except Exception:
         authors_list=[]         
    return authors_list

In [12]:
def is_author_available(soup):
    try:
        existence = soup.select('.pull-left+ li span')[0].text
    except:
        existence = None
        logging.exception("This book has no author!")
    return existence


In [13]:
def get_book_attribute(soup):
    rows = soup.find('table', {'class': 'product-table'}).findAll('td')
    code = -1
    isbn = -1
    size = -1
    pages = -1
    per_cal = -1
    ad_cal = -1
    material = -1
    series = -1
    send_time = -1
    language = 'فارسی'
    translators=[]
    paper_type=-1
    code_flag = 0
    isbn_flag = 0
    size_flag = 0
    pages_flag = 0
    per_cal_flag = 0
    ad_cal_flag = 0
    material_flag = 0
    language_flag = 0
    series_flag = 0
    send_time_flag = 0
    translators_flag=0
    paper_type_flag=0
    
    for row in rows:
        text = row.text.strip()
        if code_flag == 1:
            code = int(text)
            code_flag = 0
        elif isbn_flag == 1:
            isbn = text
            isbn = re.sub('[^0-9-]', '', isbn)
            isbn_flag = 0
        elif size_flag == 1:
            size = text
            size_flag = 0
        elif pages_flag == 1:
            pages = int(text)
            pages_flag = 0
        elif per_cal_flag == 1:
            per_cal = int(text)
            per_cal_flag = 0
        elif ad_cal_flag == 1:
            ad_cal = int(text)
            ad_cal_flag = 0
        elif material_flag == 1:
            material = text
            material_flag = 0
        elif language_flag == 1:
            language = text
            language_flag = 0
        elif series_flag == 1:
            series = int(text)
            series_flag = 0
        elif send_time_flag == 1:
            send_time = text
            send_time_flag = 0
        elif paper_type_flag == 1:
            paper_type = text
            paper_type_flag = 0
        elif translators_flag == 1:
            translators_a_tag = row.select('a')
            translators_flag=0
            for a_tag in translators_a_tag:
                translators.append({'id':a_tag.get('href').split('/')[2].split('-')[0],'name':a_tag.text.strip(),'link':a_tag.get('href')})


        if 'کد کتاب' in text:
            code_flag = 1
        elif 'شابک' in text:
            isbn_flag = 1
        elif 'قطع' in text:
            size_flag = 1
        elif 'تعداد صفحه' in text:
            pages_flag = 1
        elif 'سال انتشار شمسی' in text:
            per_cal_flag = 1
        elif 'سال انتشار میلادی' in text:
            ad_cal_flag = 1
        elif 'نوع جلد' in text:
            material_flag = 1
        elif 'زبان کتاب' in text:
            language_flag = 1
        elif 'سری چاپ' in text:
            series_flag = 1
        elif 'زودترین زمان ارسال' in text:
            send_time_flag = 1
        elif 'مترجم' in text:
            translators_flag = 1
        elif   'نوع کاغذ' in text:
            paper_type_flag=1


    return [code, isbn, size, pages, per_cal, ad_cal, material, series,language, send_time,translators,paper_type]

In [14]:
def get_summary(soup):
    summary = soup.select('.product-description')[0].text.strip()
    return summary

In [15]:
def get_tags(soup):
    tags = soup.select('.product-tags-item')
    tags_list = []
    for tag in tags:
        tags_list += [tag.text.strip()]
    return tags_list

In [16]:
def get_book_detail(book_soup, site_index):
    book_fa_title = get_fa_title(book_soup)
    book_en_title = get_en_title(book_soup)
    book_price = get_price(book_soup)
    book_discount_percent = get_discount(book_soup)
    book_score = get_score(book_soup)
    book_publisher = get_publisher(book_soup)
    book_author = get_author(book_soup)
    book_author_presence = is_author_available(book_soup)

    [book_code, book_Isbn, book_size, book_pages, book_publication_per_date, book_publication_ad_date,
     book_cover_material, book_print_series,book_language, book_earliest_send_time,book_translators,paper_type] = get_book_attribute(book_soup)
    price_history_data_list.append({'book_id':int(book_code),'price':book_price,'discount':int(book_discount_percent),'date':str(datetime.today())})
  
    publishers_data_list.append(book_publisher)  #8th column is dict of publisher
    

    #writer
    writer_page_data_list.extend(book_author)     #9th index is the writer column which is a list of writers
    writers_list_of_dict=book_author.copy()        #writes list which is a list of dictionary
    if len(writers_list_of_dict)!=0:
        for w_id in writers_list_of_dict:
            books_writers_data_list.append({'book_id':int(book_code),'writer_id':w_id['id']})

    #translator

    translator_page_data_list.extend(book_translators)
    translators_list_of_dict=book_translators.copy()        #writes list which is a list of dictionary
    if len(translators_list_of_dict)!=0:
        for t_id in translators_list_of_dict:
            books_translators_data_list.append({'book_id':int(book_code),'translator_id':t_id['id']})
    book_publisher=book_publisher['id']               #convert 8th column from dict to the publisher's id
    book_data = [site_index, int(book_code), book_Isbn, book_fa_title, book_en_title,
                 book_score, book_publisher,
                 int(book_pages), int(book_publication_per_date), int(book_publication_ad_date), book_size,
                 book_cover_material,
                  int(book_print_series),book_language, book_earliest_send_time, book_author_presence,paper_type]
    return book_data

In [17]:
def get_book_site_veneration(soup, site_index):
    div = soup.find('div', attrs = {'class':'col-md-6 col-xs-12'})
    ven_lst = list()
    try:
        english_bars = div.find_all('div', attrs = {'class':'english-bar ltr'})
        persian_bars = div.find_all('div', attrs = {'class':'persian-bar'})
        prise_writers = div.find_all('div', attrs = {'class':'prise-writer ltr'})
        
        n = len(english_bars)
        for i in range(n):
            english_quote = english_bars[i].text.strip()
            persian_quote = persian_bars[i].text.strip()
            prise_writer = prise_writers[i].text.strip()

            ven_dic = {'site_index':site_index,
                       'English_Quote': english_quote,
                       'Persian_Quote': persian_quote,
                       'Prise_Writer': prise_writer}
            ven_lst.append(ven_dic)
        return(ven_lst)
    except:
        return ven_lst

In [18]:
def get_book_site_summary(book_soup, site_index):
    try:
        book_summary = get_summary(book_soup)
    except Exception:
        book_summary = None
        logging.exception("This book has no summary!")
    return [site_index, book_summary]

In [19]:
def get_book_site_tags(book_soup, site_index):
    book_tags = get_tags(book_soup)
    book_tags_list = []
    for tag in book_tags:
        book_tags_list += [[site_index, tag]]
    return book_tags_list

In [20]:
def get_book_site_awards(book_soup, site_index):
    awards=[]
    n=len(book_soup.select('.product-features h4'))
    for i in range(0,n):
        awards.append({'site_index':site_index,'award':book_soup.select('.product-features h4')[i].text})
    return awards

In [21]:
def get_req_list(list, req_count):
    if len(list) >= req_count:
        request_list = list[:req_count].copy()
    else:
        request_list = list.copy()
    return request_list

In [22]:
def scrape(site_soup):
    try:
        with lock:
            global site_index
            site_summary_data_list.append(get_book_site_summary(site_soup, site_index))
            site_tags_data_list.extend(get_book_site_tags(site_soup, site_index))
            site_page_books = site_soup.select('.clearfix .clearfix .row')
            for book_index in range(0, len(site_page_books), 2):
                data = get_book_detail(site_page_books[book_index], site_index)
                books_data_list.append(data)
            site_index += 1
    except Exception:
        logging.exception("An error occurred")

In [23]:
def fast_scrape(link):
    try:
        site_soup = get_soup(link)
        with lock:
            global site_index
            site_summary_data_list.append(get_book_site_summary(site_soup, site_index))
            site_tags_data_list.extend(get_book_site_tags(site_soup, site_index))
            book_veneration_data_list.extend(get_book_site_veneration(site_soup,site_index))
            site_award_data_list.extend(get_book_site_awards(site_soup,site_index))
            site_page_books = site_soup.select('.clearfix .clearfix .row')
            for book_index in range(0, len(site_page_books), 2):
                data = get_book_detail(site_page_books[book_index], site_index)
                # price_history_data_list.append({'book_id':data[1],'price':data[5],'discount':data[6],'date':str(datetime.today())})
                # writer_page_data_list.extend(data[9])     #9th index is the writer column which is a list of writers
                # publishers_data_list.append(data[8])  #8th column is dict of publisher
                # data[8]=data[8]['id']                 #convert 8th column from dict to the publisher's id
                # writers_list_of_dict=data[9]          #writes list which is a list of dictionary
                # data.pop(9)                           #remove 9th column from data(9th column was writers)
                # data.pop(5)                           #remove 5th column which is price column
                # data.pop(5)                           #remove 5th column which is discount after deleting price
                # for w_id in writers_list_of_dict:
                #      books_writers_data_list.append({'book_id':data[1],'writer_id':w_id['id']})
                books_data_list.append(data)
            site_index += 1
    except Exception:
        logging.exception("An error occurred")


<h1>Detailed Scraper</h1>

In [24]:
# links = get_links()[:200] + ['https://www.iranketab.ir/book/270-gone-with-the-wind']

# page_response = []
# books_data_list = []
# site_tags_data_list = []
# site_summary_data_list = []

# site_index = 1
# sleep_time = 0.5
# max_threads = 20
# book_count_request = 20  #number of requests per time

# lock = threading.Lock()
# book_urls = links.copy()

# while len(book_urls):
#     sleep(sleep_time)  #sleep so that the site does not ban us
#     request_list = get_req_list(book_urls, book_count_request)  #list of book's urls we want to send request 
#     with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor:
#         future_list = executor.map(get_response, request_list)
#         for future in future_list:
#             try:
#                 data = future
#                 page_response.append(data)
#             except Exception as exc:
#                 continue
#         for item in page_response:
#             if item.status_code == 200:
#                 page_url = item.url
#                 if page_url in request_list:
#                     page_soup = bs4.BeautifulSoup(item.content, 'html.parser')
#                     scrape(page_soup)
#                     book_urls.remove(page_url)


<h1>Fast Scraper</h1>

In [25]:
links = get_links()[:100]
#+ ['https://www.iranketab.ir/book/270-gone-with-the-wind']

books_data_list = []
site_tags_data_list = []
site_summary_data_list = []
site_award_data_list=[]

writer_page_data_list=[]
translator_page_data_list=[]
publishers_data_list=[]
price_history_data_list=[]
book_veneration_data_list=[]
#middle tables
books_writers_data_list=[]
books_translators_data_list=[]


site_index = 1
max_threads = 20

lock = threading.Lock()

with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor:
    executor.map(fast_scrape, links)


ERROR:root:This book has no summary!
Traceback (most recent call last):
  File "C:\Users\Asus\AppData\Local\Temp\ipykernel_20976\1156613770.py", line 3, in get_book_site_summary
    book_summary = get_summary(book_soup)
  File "C:\Users\Asus\AppData\Local\Temp\ipykernel_20976\3952702039.py", line 2, in get_summary
    summary = soup.select('.product-description')[0].text.strip()
IndexError: list index out of range
ERROR:root:This book has no discount!
Traceback (most recent call last):
  File "C:\Users\Asus\AppData\Local\Temp\ipykernel_20976\1188541366.py", line 3, in get_discount
    discount_price = int(soup.select('.col-md-12+ .clearfix .price-special')[0].text.replace(',', ''))
IndexError: list index out of range
ERROR:root:This book has no discount!
Traceback (most recent call last):
  File "C:\Users\Asus\AppData\Local\Temp\ipykernel_20976\1188541366.py", line 3, in get_discount
    discount_price = int(soup.select('.col-md-12+ .clearfix .price-special')[0].text.replace(',', ''))


<h1>Check Completnes</h1>

In [26]:
# if len(book_urls) == 0:
#     print('All links scraped!')
# else:
#     print('Something wrong happened')

<h1>Make Dataframes</h1>

In [27]:
tableOfData = pd.DataFrame(books_data_list,
                           columns=['site_index', 'code', 'Isbn', 'fa_title', 'en_title', 'score',
                                    'publisher_id', 'pages', 'publication_per_date', 'publication_ad_date',
                                    'size', 'cover_material', 'print_series','language' ,'earliest_send_time', 'presence','paper_type'])
tableOfData=tableOfData[tableOfData['code']!=-1]

In [28]:
tableOfData

Unnamed: 0,site_index,code,Isbn,fa_title,en_title,score,publisher_id,pages,publication_per_date,publication_ad_date,size,cover_material,print_series,language,earliest_send_time,presence,paper_type
0,1,14109,978-6004054591,کتاب نامه هایی به اولگا,Letters to Olga,3.37,31,402,1402,1983,رقعی,جلد سخت,2,فارسی,5 مهر,موجود,-1
1,2,19941,978-6006935157,کتاب نوربرت خرگردن,Norberto Nucagorda,3.86,11,56,1398,1987,رقعی,شومیز,2,فارسی,8 مهر,موجود,-1
2,3,87800,978-6222086718,کتاب گزیده طنز شهاب ترشیزی,Shahab Torshizi,3.33,1327,85,1401,-1,رقعی,شومیز,1,فارسی,5 مهر,موجود,-1
3,4,105323,978-6005861280,کتاب من یک احمق هستم شما چطور…؟!,"I'm an idiot, how about you...?!",3.04,1259,144,1402,-1,رقعی,شومیز,3,فارسی,8 مهر,موجود,-1
4,5,25671,978-9643124694,کتاب مدیریت منابع انسانی,Human resources management,3.8,66,496,1398,-1,وزیری,شومیز,13,فارسی,---,به زودی 🙄,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,95,64191,978-6227592146,کتاب شیوه بازاریابی محتوایی یک صفحه ای,The One-Page Content Marketing Blueprint,3.42,1630,260,1400,2019,رقعی,شومیز,1,فارسی,---,تمام شد ، اما میاریمش 😏,-1
107,96,16402,978-9642249930,کتاب مکانیک کوانتومی,Quantum Mechanics,3.21,9,84,1398,2017,رقعی,شومیز,4,فارسی,5 مهر,موجود,-1
108,97,101751,978-6226374446,کتاب رویا و رویا,Dream and dream,3.92,1210,305,1401,-1,رقعی,شومیز,1,فارسی,5 مهر,موجود,-1
109,98,89058,978-9648021974,کتاب هنر تزئین شیرینی و شکلات,Decoration,3.32,1336,60,1390,-1,خشتی,شومیز,2,فارسی,8 مهر,موجود,-1


In [29]:
table_of_cover_type=pd.DataFrame(tableOfData['cover_material'].drop_duplicates())
table_of_cover_type=table_of_cover_type[table_of_cover_type['cover_material']!=-1].reset_index(drop=True)
table_of_cover_type.to_csv('./cover_type.csv',encoding='utf-8')
table_of_cover_type

Unnamed: 0,cover_material
0,جلد سخت
1,شومیز
2,سلفونی
3,جلد نرم
4,زرکوب


In [30]:
table_of_format=pd.DataFrame(tableOfData['size'].drop_duplicates())
table_of_format=table_of_format[table_of_format['size']!=-1].reset_index(drop=True)
table_of_format.to_csv('./format.csv',encoding='utf-8')
table_of_format

Unnamed: 0,size
0,رقعی
1,وزیری
2,جیبی
3,رحلی
4,خشتی
5,پالتویی


In [31]:
def convert_size_to_int(size):
    try:
     return table_of_format.index[table_of_format['size']==str(size)].to_list()[0]
    except:
        return -1

tableOfData['size']=tableOfData['size'].apply(convert_size_to_int)

In [32]:
def convert_cover_type_to_int(material):
    try:
     return table_of_cover_type.index[table_of_cover_type['cover_material']==str(material)].to_list()[0]
    except:
        return -1

tableOfData['cover_material']=tableOfData['cover_material'].apply(convert_cover_type_to_int)

In [33]:

tableOfData.to_csv("bookData.csv", index=False, encoding='utf-8')

In [34]:
tableOfData

Unnamed: 0,site_index,code,Isbn,fa_title,en_title,score,publisher_id,pages,publication_per_date,publication_ad_date,size,cover_material,print_series,language,earliest_send_time,presence,paper_type
0,1,14109,978-6004054591,کتاب نامه هایی به اولگا,Letters to Olga,3.37,31,402,1402,1983,0,0,2,فارسی,5 مهر,موجود,-1
1,2,19941,978-6006935157,کتاب نوربرت خرگردن,Norberto Nucagorda,3.86,11,56,1398,1987,0,1,2,فارسی,8 مهر,موجود,-1
2,3,87800,978-6222086718,کتاب گزیده طنز شهاب ترشیزی,Shahab Torshizi,3.33,1327,85,1401,-1,0,1,1,فارسی,5 مهر,موجود,-1
3,4,105323,978-6005861280,کتاب من یک احمق هستم شما چطور…؟!,"I'm an idiot, how about you...?!",3.04,1259,144,1402,-1,0,1,3,فارسی,8 مهر,موجود,-1
4,5,25671,978-9643124694,کتاب مدیریت منابع انسانی,Human resources management,3.8,66,496,1398,-1,1,1,13,فارسی,---,به زودی 🙄,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,95,64191,978-6227592146,کتاب شیوه بازاریابی محتوایی یک صفحه ای,The One-Page Content Marketing Blueprint,3.42,1630,260,1400,2019,0,1,1,فارسی,---,تمام شد ، اما میاریمش 😏,-1
107,96,16402,978-9642249930,کتاب مکانیک کوانتومی,Quantum Mechanics,3.21,9,84,1398,2017,0,1,4,فارسی,5 مهر,موجود,-1
108,97,101751,978-6226374446,کتاب رویا و رویا,Dream and dream,3.92,1210,305,1401,-1,0,1,1,فارسی,5 مهر,موجود,-1
109,98,89058,978-9648021974,کتاب هنر تزئین شیرینی و شکلات,Decoration,3.32,1336,60,1390,-1,4,1,2,فارسی,8 مهر,موجود,-1


In [35]:
tableOfSummaryData = pd.DataFrame(site_summary_data_list, columns=['site_index', 'summary']).drop_duplicates(subset=['site_index','summary'])
tableOfSummaryData=tableOfSummaryData[tableOfSummaryData.notnull().all(axis=1)]
tableOfSummaryData

Unnamed: 0,site_index,summary
0,1,این که من اوقاتم را در این جا و دقیقا در این ج...
1,2,در دشت های پهناور آفریقا کرگدنی به نام نوربرت ...
3,4,این کتاب به بیان مشکلات گوناگون جامعه ما می پر...
4,5,توجه به نیروی انسانی طی سال های اخیر بخش مهمی ...
5,6,ناپلئون، بیل گیت، جورج دابلیو بوش، اسامه بن‌لا...
...,...,...
92,92,این کتاب توضیح می دهد که چگونه فرهنگ خود را با...
94,94,هیچ رابطه‌ی زناشویی و به طور کل هیچ ارتباطی هی...
95,95,بازاریابی محتوا در واقع نوعی از بازاریابی درون...
96,96,مکانیک کوانتومی (به انگلیسی: Quantum mechanics...


In [36]:
tableOfSummaryData.to_csv( "BookSummaryData.csv", index=False, encoding='utf-8')

In [37]:
tableOfSiteTagsData = pd.DataFrame(site_tags_data_list, columns=['site_index', 'tag'])\
    .drop_duplicates(subset=['site_index','tag'])
tableOfSiteTagsData

Unnamed: 0,site_index,tag
0,1,دهه 1980 میلادی
1,1,ادبیات چک
2,1,سیاسی
3,1,تاریخی
4,1,جایزه ی فرانتس کافکا
...,...,...
493,99,ادبیات کلاسیک
494,99,دهه 1950 میلادی
495,99,فلسفی
496,99,دینی و مذهبی


In [38]:
table_of_tag=pd.DataFrame(tableOfSiteTagsData['tag']).drop_duplicates(subset=['tag']).reset_index(drop=True)
table_of_tag.to_csv('./tag.csv',encoding='utf-8')
table_of_tag

Unnamed: 0,tag
0,دهه 1980 میلادی
1,ادبیات چک
2,سیاسی
3,تاریخی
4,جایزه ی فرانتس کافکا
...,...
141,ادبیات هند
142,بازاریابی
143,تجربه و هنر زندگی
144,دهه 1950 میلادی


In [39]:
def convert_tag_to_int(tag):
    try:
     return table_of_tag.index[table_of_tag['tag']==str(tag)].to_list()[0]
    except:
        return -1

tableOfSiteTagsData['tag']=tableOfSiteTagsData['tag'].apply(convert_tag_to_int)

In [40]:
tableOfSiteTagsData

Unnamed: 0,site_index,tag
0,1,0
1,1,1
2,1,2
3,1,3
4,1,4
...,...,...
493,99,53
494,99,144
495,99,22
496,99,37


In [41]:
tableOfSiteTagsData.to_csv('bookTagsData.csv',index=False,encoding='utf-8')

In [42]:
table_of_publisher=pd.DataFrame(publishers_data_list).drop_duplicates(subset=['id','name','link'])
table_of_publisher.to_csv('./publisher.csv',index=False,encoding='utf-8')
table_of_publisher

Unnamed: 0,id,name,link
0,31,نشر ثالث,/publisher/31-%d9%86%d8%b4%d8%b1-%d8%ab%d8%a7%...
1,11,افق,/publisher/11-%d8%a7%d9%81%d9%82
2,1327,نیستان,/publisher/1327-%d9%86%db%8c%d8%b3%d8%aa%d8%a7...
3,1259,چابک اندیش,/publisher/1259-%da%86%d8%a7%d8%a8%da%a9-%d8%a...
4,66,نشر نی,/publisher/66-%d9%86%d8%b4%d8%b1-%d9%86%db%8c
...,...,...,...
104,1391,بدیهه,/publisher/1391-%d8%a8%d8%af%db%8c%d9%87%d9%87
105,2368,ما و شما,/publisher/2368-%d9%85%d8%a7-%d9%88-%d8%b4%d9%...
106,1630,ادیبان روز,/publisher/1630-%d8%a7%d8%af%db%8c%d8%a8%d8%a7...
108,1210,افکار,/publisher/1210-%d8%a7%d9%81%da%a9%d8%a7%d8%b1


In [43]:
books_writers_data_list=list(filter(bool, books_writers_data_list))
table_of_writer=pd.DataFrame(books_writers_data_list).drop_duplicates(subset=['book_id','writer_id'])
#drop rows which both book_id and writer_id is -1
table_of_writer=table_of_writer[(table_of_writer['book_id']!=-1) & (table_of_writer['writer_id']!=-1)]
table_of_writer.to_csv('./writer.csv',index=False,encoding='utf-8')
table_of_writer

Unnamed: 0,book_id,writer_id
0,14109,7713
1,19941,11484
2,87800,4690
3,87800,5185
4,105323,17747
...,...,...
108,64191,35166
109,16402,9541
110,101751,7784
111,89058,48125


In [44]:
table_of_writer_page=pd.DataFrame(writer_page_data_list).drop_duplicates(subset=['id','name','link'])
table_of_writer_page.to_csv('./writer_page.csv',index=False,encoding='utf-8')
table_of_writer_page

Unnamed: 0,id,name,link
0,7713,واتسلاف هاول,/profile/7713-v%c3%a1clav-havel
1,11484,میشائیل انده,/profile/11484-michael-ende
2,4690,ابوالفضل زرویی نصرآباد,/profile/4690-%d8%a7%d8%a8%d9%88%d8%a7%d9%84%d...
3,5185,نسیم عرب امیری,/profile/5185-nasim-arab-amiri
4,17747,حسن چابک,/profile/17747-hasan-chabok
...,...,...,...
108,35166,پرافول شارما,/profile/35166-prafull-sharma
109,9541,جیم الخلیلی,/profile/9541-jim-al-khalili
110,7784,اصغر الهی,/profile/7784-asghar-elahi
111,48125,ایرینا ویکتورنا استپانوا,/profile/48125-irina-viktorovna-stepanova


In [45]:
table_of_translator=pd.DataFrame(books_translators_data_list).drop_duplicates(subset=['book_id','translator_id'])
table_of_translator=table_of_translator[(table_of_translator['book_id']!=-1) & (table_of_translator['translator_id']!=-1)]
table_of_translator.to_csv('./translator.csv',index=False,encoding='utf-8')
table_of_translator


Unnamed: 0,book_id,translator_id
0,14109,569
1,19941,2634
2,76229,32803
3,25277,14441
4,36714,4702
5,67445,24867
6,105144,30543
7,115076,33130
8,25342,43843
9,14025,5746


In [46]:
table_of_translator_page=pd.DataFrame(translator_page_data_list).drop_duplicates(subset=['id','name','link'])
table_of_translator_page.to_csv('translator_page.csv',index=False,encoding='utf-8')
table_of_translator_page

Unnamed: 0,id,name,link
0,569,فروغ پوریاوری,/profile/569-fourough-pouryavari
1,2634,کتایون سلطانی,/profile/2634-katayoun-soltani
2,32803,ابراهیم توبه یانی,/profile/32803-%d8%a7%d8%a8%d8%b1%d8%a7%d9%87%...
3,14441,مهرداد تویسرکانی,/profile/14441-mehrdad-tuyserkani
4,4702,مرتضی ثاقب فر,/profile/4702-morteza-sagheb-far
5,24867,الهه حجازی,/profile/24867-%d8%a7%d9%84%d9%87%d9%87-%d8%ad...
6,30543,فاطمه رحیمی,/profile/30543-%d9%81%d8%a7%d8%b7%d9%85%d9%87-...
7,33130,فاریا جنیدی,/profile/33130-%d9%81%d8%a7%d8%b1%db%8c%d8%a7-...
8,43843,مصطفی امیری,/profile/43843-mostafa-amiri
9,5746,تورج یاراحمدی,/profile/5746-%d8%aa%d9%88%d8%b1%d8%ac-%db%8c%...


In [47]:
table_of_price_history=pd.DataFrame(price_history_data_list).drop_duplicates(subset=['book_id','price','discount','date'])
table_of_price_history=table_of_price_history[table_of_price_history.book_id!=-1]
table_of_price_history.to_csv('./price-history.csv',index=False,encoding='utf-8')
table_of_price_history

Unnamed: 0,book_id,price,discount,date
0,14109,350000,20,2023-09-25 14:48:38.765855
1,19941,80000,25,2023-09-25 14:48:39.757057
2,87800,36000,25,2023-09-25 14:48:40.198631
3,105323,69000,25,2023-09-25 14:48:40.423872
4,25671,78000,0,2023-09-25 14:48:40.574690
...,...,...,...,...
106,64191,90000,0,2023-09-25 14:49:00.951394
107,16402,95000,25,2023-09-25 14:49:01.030431
108,101751,160000,25,2023-09-25 14:49:01.106442
109,89058,48000,25,2023-09-25 14:49:01.184855


In [48]:
book_veneration_data_list=list(filter(bool, book_veneration_data_list))
table_of_book_veneration=pd.DataFrame(book_veneration_data_list).drop_duplicates(subset=['site_index','English_Quote','Persian_Quote','Prise_Writer'])
table_of_book_veneration.to_csv('./book_veneration.csv',index=False,encoding='utf-8')
table_of_book_veneration

Unnamed: 0,site_index,English_Quote,Persian_Quote,Prise_Writer
0,23,One of Faulkner’s comic masterpieces.,از شاهکارهای کمیک فاکنر,barnes and noble
1,61,Convincing and compelling.,باورپذیر و مهیج.,School Library Journal
2,61,"A highly imaginative, absolutely terrific firs...",یک رمان نخست فوق العاده خیال پردازانه و شگرف.,Barnes & Noble
3,61,"An exciting, clever read.",داستانی هیجان انگیز و هوشمندانه.,Booktopia


In [49]:
table_of_award=pd.DataFrame(site_award_data_list).drop_duplicates(subset=['site_index','award'])
table_of_award.to_csv('./award.csv',index=False,encoding='utf-8')
table_of_award

Unnamed: 0,site_index,award
0,23,برنده ی جایزه ی پولیتزر سال ۱۹۶۳
1,47,برنده جایزه پولیتزر
2,47,برنده جایزه ی نمایشنامه ی حلقه ی منتقدین نیو...
3,47,برنده جایزه Tony سال 1987
4,61,نامزد جایزه کتاب ناشر مستقل سال 1999
5,61,برنده جایزه خواننده گرند کنیون سال 1998
6,61,نامزد مدال خوانندگان جوان کالیفرنیا سال 1998
7,61,برنده جایزه سکویا اوکلاهما سال 1998
