In [854]:
#import all libraries
import pandas as pd
import requests
import bs4
import re
import logging
import concurrent.futures
import threading
from time import sleep
from datetime import datetime

In [855]:
def get_links():
    urls = list(pd.read_csv('books_url.csv')['link'])
    return urls

In [856]:
def get_response(input_url):
    headers = {
        'User-Agent': 'My User Agent 1.0',
        "Accept-Language": "en-US,en;q=0.5"
    }
    response = requests.get(input_url, headers=headers)
    return response

In [857]:
def get_soup(input_url):
    headers = {
        'User-Agent': 'My User Agent 1.0',
        "Accept-Language": "en-US,en;q=0.5"
    }
    response = requests.get(input_url, headers=headers)
    if response.status_code != 200:
        print("Error in getting link")
        print("response code is : ", response.status_code)
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    return soup

In [858]:
def get_fa_title(soup):
    title = soup.select('.product-name strong')[0]
    return title.text


In [859]:
def get_en_title(soup):
    title = soup.select('.product-name-englishname')[0]
    return title.text

In [860]:
def get_price(soup):
    price = soup.select('.price-broken , .col-md-7 .price:nth-child(1)')[0].text
    return int(price.replace(',', ''))

In [861]:
def get_discount(soup):
    try:
        discount_price = int(soup.select('.col-md-12+ .clearfix .price-special')[0].text.replace(',', ''))
        discount_price = ((get_price(soup) - discount_price) / get_price(soup)) * 100
    except Exception:
        discount_price = 0
        logging.exception("This book has no discount!")
    return discount_price

In [862]:
def get_score(soup):
    soup = soup.find('div', {'class': 'col-md-7'}).find('li', {'class': 'pull-left'}).find('div',
                                                                                           {'class': 'my-rating'})
    soup_str = str(soup)

    match = re.search(r'data-rating="(\d+\.\d+)"', soup_str)
    if match:
        data_rating = match.group(1)
        return data_rating


In [863]:
def get_publisher(soup):
    try:
        publisher_tag = soup.select('div.prodoct-attribute-items:nth-child(1) > a')[0]
        publisher_link=publisher_tag.get('href')
        publisher_id=publisher_link.split('/')[2].split('-')[0]
        publisher_name=publisher_tag.text.strip()
    except Exception:
        publisher_link=-1
        publisher_id=-1
        publisher_name=-1
    return {'id':publisher_id,'name':publisher_name,'link':publisher_link}

In [864]:
def get_author(soup):
    authors_list=[]          
    try:
        authors_a_tag = soup.select('.prodoct-attribute-items+ .prodoct-attribute-items > a')
        if(len(authors_a_tag))==0:
                return authors_list
        for author_a_tag in authors_a_tag:
                author_link=author_a_tag.get('href')
                author_id=author_link.split('/')[2].split('-')[0]
                author_name=author_a_tag.text.strip()
                authors_list.append({'id':author_id,'name':author_name,'link':author_link})
    except Exception:
         authors_list=[]         
    return authors_list

In [865]:
def is_author_available(soup):
    try:
        existence = soup.select('.pull-left+ li span')[0].text
    except:
        existence = None
        logging.exception("This book has no author!")
    return existence


In [866]:
def get_book_attribute(soup):
    rows = soup.find('table', {'class': 'product-table'}).findAll('td')
    code = -1
    isbn = -1
    size = -1
    pages = -1
    per_cal = -1
    ad_cal = -1
    material = -1
    series = -1
    send_time = -1
    language = 'فارسی'
    translators=[]
    paper_type=-1
    code_flag = 0
    isbn_flag = 0
    size_flag = 0
    pages_flag = 0
    per_cal_flag = 0
    ad_cal_flag = 0
    material_flag = 0
    language_flag = 0
    series_flag = 0
    send_time_flag = 0
    translators_flag=0
    paper_type_flag=0
    
    for row in rows:
        text = row.text.strip()
        if code_flag == 1:
            code = int(text)
            code_flag = 0
        elif isbn_flag == 1:
            isbn = text
            isbn = re.sub('[^0-9-]', '', isbn)
            isbn_flag = 0
        elif size_flag == 1:
            size = text
            size_flag = 0
        elif pages_flag == 1:
            pages = int(text)
            pages_flag = 0
        elif per_cal_flag == 1:
            per_cal = int(text)
            per_cal_flag = 0
        elif ad_cal_flag == 1:
            ad_cal = int(text)
            ad_cal_flag = 0
        elif material_flag == 1:
            material = text
            material_flag = 0
        elif language_flag == 1:
            language = text
            language_flag = 0
        elif series_flag == 1:
            series = int(text)
            series_flag = 0
        elif send_time_flag == 1:
            send_time = text
            send_time_flag = 0
        elif paper_type_flag == 1:
            paper_type = text
            paper_type_flag = 0
        elif translators_flag == 1:
            translators_a_tag = row.select('a')
            translators_flag=0
            for a_tag in translators_a_tag:
                translators.append({'id':a_tag.get('href').split('/')[2].split('-')[0],'name':a_tag.text.strip(),'link':a_tag.get('href')})


        if 'کد کتاب' in text:
            code_flag = 1
        elif 'شابک' in text:
            isbn_flag = 1
        elif 'قطع' in text:
            size_flag = 1
        elif 'تعداد صفحه' in text:
            pages_flag = 1
        elif 'سال انتشار شمسی' in text:
            per_cal_flag = 1
        elif 'سال انتشار میلادی' in text:
            ad_cal_flag = 1
        elif 'نوع جلد' in text:
            material_flag = 1
        elif 'زبان کتاب' in text:
            language_flag = 1
        elif 'سری چاپ' in text:
            series_flag = 1
        elif 'زودترین زمان ارسال' in text:
            send_time_flag = 1
        elif 'مترجم' in text:
            translators_flag = 1
        elif   'نوع کاغذ' in text:
            paper_type_flag=1


    return [code, isbn, size, pages, per_cal, ad_cal, material, series,language, send_time,translators,paper_type]

In [867]:
def get_summary(soup):
    summary = soup.select('.product-description')[0].text.strip()
    return summary

In [868]:
def get_tags(soup):
    tags = soup.select('.product-tags-item')
    tags_list = []
    for tag in tags:
        tags_list += [tag.text.strip()]
    return tags_list

In [869]:
def get_book_detail(book_soup, site_index):
    book_fa_title = get_fa_title(book_soup)
    book_en_title = get_en_title(book_soup)
    book_price = get_price(book_soup)
    book_discount_percent = get_discount(book_soup)
    book_score = get_score(book_soup)
    book_publisher = get_publisher(book_soup)
    book_author = get_author(book_soup)
    book_author_presence = is_author_available(book_soup)

    [book_code, book_Isbn, book_size, book_pages, book_publication_per_date, book_publication_ad_date,
     book_cover_material, book_print_series,book_language, book_earliest_send_time,book_translators,paper_type] = get_book_attribute(book_soup)
    print(book_size)
    price_history_data_list.append({'book_id':int(book_code),'price':book_price,'discount':int(book_discount_percent),'date':str(datetime.today())})
  
    publishers_data_list.append(book_publisher)  #8th column is dict of publisher
    

    #writer
    writer_page_data_list.extend(book_author)     #9th index is the writer column which is a list of writers
    writers_list_of_dict=book_author.copy()        #writes list which is a list of dictionary
    if len(writers_list_of_dict)!=0:
        for w_id in writers_list_of_dict:
            books_writers_data_list.append({'book_id':int(book_code),'writer_id':w_id['id']})

    #translator

    translator_page_data_list.extend(book_translators)
    translators_list_of_dict=book_translators.copy()        #writes list which is a list of dictionary
    if len(translators_list_of_dict)!=0:
        for t_id in translators_list_of_dict:
            books_translators_data_list.append({'book_id':int(book_code),'translator_id':t_id['id']})
    book_publisher=book_publisher['id']               #convert 8th column from dict to the publisher's id
    book_data = [site_index, int(book_code), book_Isbn, book_fa_title, book_en_title,
                 book_score, book_publisher,
                 int(book_pages), int(book_publication_per_date), int(book_publication_ad_date), book_size,
                 book_cover_material,
                  int(book_print_series),book_language, book_earliest_send_time, book_author_presence,paper_type]
    return book_data

In [870]:
def get_book_site_veneration(soup, site_index):
    div = soup.find('div', attrs = {'class':'col-md-6 col-xs-12'})
    ven_lst = list()
    try:
        english_bars = div.find_all('div', attrs = {'class':'english-bar ltr'})
        persian_bars = div.find_all('div', attrs = {'class':'persian-bar'})
        prise_writers = div.find_all('div', attrs = {'class':'prise-writer ltr'})
        
        n = len(english_bars)
        for i in range(n):
            english_quote = english_bars[i].text.strip()
            persian_quote = persian_bars[i].text.strip()
            prise_writer = prise_writers[i].text.strip()

            ven_dic = {'site_index':site_index,
                       'English_Quote': english_quote,
                       'Persian_Quote': persian_quote,
                       'Prise_Writer': prise_writer}
            ven_lst.append(ven_dic)
        return(ven_lst)
    except:
        return ven_lst

In [871]:
def get_book_site_summary(book_soup, site_index):
    try:
        book_summary = get_summary(book_soup)
    except Exception:
        book_summary = None
        logging.exception("This book has no summary!")
    return [site_index, book_summary]

In [872]:
def get_book_site_tags(book_soup, site_index):
    book_tags = get_tags(book_soup)
    book_tags_list = []
    for tag in book_tags:
        book_tags_list += [[site_index, tag]]
    return book_tags_list

In [873]:
def get_book_site_awards(book_soup, site_index):
    awards=[]
    n=len(book_soup.select('.product-features h4'))
    for i in range(0,n):
        awards.append({'site_index':site_index,'award':book_soup.select('.product-features h4')[i].text})
    return awards

In [874]:
def get_req_list(list, req_count):
    if len(list) >= req_count:
        request_list = list[:req_count].copy()
    else:
        request_list = list.copy()
    return request_list

In [875]:
def scrape(site_soup):
    try:
        with lock:
            global site_index
            site_summary_data_list.append(get_book_site_summary(site_soup, site_index))
            site_tags_data_list.extend(get_book_site_tags(site_soup, site_index))
            site_page_books = site_soup.select('.clearfix .clearfix .row')
            for book_index in range(0, len(site_page_books), 2):
                data = get_book_detail(site_page_books[book_index], site_index)
                books_data_list.append(data)
            site_index += 1
    except Exception:
        logging.exception("An error occurred")

In [876]:
def fast_scrape(link):
    try:
        site_soup = get_soup(link)
        with lock:
            global site_index
            site_summary_data_list.append(get_book_site_summary(site_soup, site_index))
            site_tags_data_list.extend(get_book_site_tags(site_soup, site_index))
            book_veneration_data_list.extend(get_book_site_veneration(site_soup,site_index))
            site_award_data_list.extend(get_book_site_awards(site_soup,site_index))
            site_page_books = site_soup.select('.clearfix .clearfix .row')
            for book_index in range(0, len(site_page_books), 2):
                data = get_book_detail(site_page_books[book_index], site_index)
                # price_history_data_list.append({'book_id':data[1],'price':data[5],'discount':data[6],'date':str(datetime.today())})
                # writer_page_data_list.extend(data[9])     #9th index is the writer column which is a list of writers
                # publishers_data_list.append(data[8])  #8th column is dict of publisher
                # data[8]=data[8]['id']                 #convert 8th column from dict to the publisher's id
                # writers_list_of_dict=data[9]          #writes list which is a list of dictionary
                # data.pop(9)                           #remove 9th column from data(9th column was writers)
                # data.pop(5)                           #remove 5th column which is price column
                # data.pop(5)                           #remove 5th column which is discount after deleting price
                # for w_id in writers_list_of_dict:
                #      books_writers_data_list.append({'book_id':data[1],'writer_id':w_id['id']})
                books_data_list.append(data)
            site_index += 1
    except Exception:
        logging.exception("An error occurred")


<h1>Detailed Scraper</h1>

In [877]:
# links = get_links()[:200] + ['https://www.iranketab.ir/book/270-gone-with-the-wind']

# page_response = []
# books_data_list = []
# site_tags_data_list = []
# site_summary_data_list = []

# site_index = 1
# sleep_time = 0.5
# max_threads = 20
# book_count_request = 20  #number of requests per time

# lock = threading.Lock()
# book_urls = links.copy()

# while len(book_urls):
#     sleep(sleep_time)  #sleep so that the site does not ban us
#     request_list = get_req_list(book_urls, book_count_request)  #list of book's urls we want to send request 
#     with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor:
#         future_list = executor.map(get_response, request_list)
#         for future in future_list:
#             try:
#                 data = future
#                 page_response.append(data)
#             except Exception as exc:
#                 continue
#         for item in page_response:
#             if item.status_code == 200:
#                 page_url = item.url
#                 if page_url in request_list:
#                     page_soup = bs4.BeautifulSoup(item.content, 'html.parser')
#                     scrape(page_soup)
#                     book_urls.remove(page_url)


<h1>Fast Scraper</h1>

In [878]:
links = get_links()[:0] +['https://www.iranketab.ir/book/1017-the-compound-effect']
#+ ['https://www.iranketab.ir/book/270-gone-with-the-wind']

books_data_list = []
site_tags_data_list = []
site_summary_data_list = []
site_award_data_list=[]

writer_page_data_list=[]
translator_page_data_list=[]
publishers_data_list=[]
price_history_data_list=[]
book_veneration_data_list=[]
#middle tables
books_writers_data_list=[]
books_translators_data_list=[]


site_index = 1
max_threads = 20

lock = threading.Lock()

with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor:
    executor.map(fast_scrape, links)


ERROR:root:This book has no discount!
Traceback (most recent call last):
  File "C:\Users\Asus\AppData\Local\Temp\ipykernel_11760\1188541366.py", line 3, in get_discount
    discount_price = int(soup.select('.col-md-12+ .clearfix .price-special')[0].text.replace(',', ''))
IndexError: list index out of range
ERROR:root:This book has no discount!
Traceback (most recent call last):
  File "C:\Users\Asus\AppData\Local\Temp\ipykernel_11760\1188541366.py", line 3, in get_discount
    discount_price = int(soup.select('.col-md-12+ .clearfix .price-special')[0].text.replace(',', ''))
IndexError: list index out of range
ERROR:root:This book has no discount!
Traceback (most recent call last):
  File "C:\Users\Asus\AppData\Local\Temp\ipykernel_11760\1188541366.py", line 3, in get_discount
    discount_price = int(soup.select('.col-md-12+ .clearfix .price-special')[0].text.replace(',', ''))
IndexError: list index out of range
ERROR:root:This book has no discount!
Traceback (most recent call last):


رقعی
رقعی
رقعی
رقعی
رقعی
رقعی
رقعی
رقعی
رقعی
رقعی
رقعی
رقعی
رقعی
رقعی
خشتی
رقعی
رقعی


ERROR:root:This book has no discount!
Traceback (most recent call last):
  File "C:\Users\Asus\AppData\Local\Temp\ipykernel_11760\1188541366.py", line 3, in get_discount
    discount_price = int(soup.select('.col-md-12+ .clearfix .price-special')[0].text.replace(',', ''))
IndexError: list index out of range
ERROR:root:This book has no discount!
Traceback (most recent call last):
  File "C:\Users\Asus\AppData\Local\Temp\ipykernel_11760\1188541366.py", line 3, in get_discount
    discount_price = int(soup.select('.col-md-12+ .clearfix .price-special')[0].text.replace(',', ''))
IndexError: list index out of range
ERROR:root:This book has no discount!
Traceback (most recent call last):
  File "C:\Users\Asus\AppData\Local\Temp\ipykernel_11760\1188541366.py", line 3, in get_discount
    discount_price = int(soup.select('.col-md-12+ .clearfix .price-special')[0].text.replace(',', ''))
IndexError: list index out of range
ERROR:root:This book has no discount!
Traceback (most recent call last):


رقعی
رقعی
رقعی
رقعی
رقعی
رقعی
رقعی
رقعی
رقعی
رقعی
رقعی


<h1>Check Completnes</h1>

In [879]:
# if len(book_urls) == 0:
#     print('All links scraped!')
# else:
#     print('Something wrong happened')

<h1>Make Dataframes</h1>

In [880]:
tableOfData = pd.DataFrame(books_data_list,
                           columns=['site_index', 'code', 'Isbn', 'fa_title', 'en_title', 'score',
                                    'publisher_id', 'pages', 'publication_per_date', 'publication_ad_date',
                                    'size', 'cover_material', 'print_series','language' ,'earliest_send_time', 'presence','paper_type'])
tableOfData=tableOfData[tableOfData['code']!=-1]

In [881]:
tableOfData

Unnamed: 0,site_index,code,Isbn,fa_title,en_title,score,publisher_id,pages,publication_per_date,publication_ad_date,size,cover_material,print_series,language,earliest_send_time,presence,paper_type
0,1,1017,978-600-97778-1-5,کتاب اثر مرکب,The Compound Effect,3.642857,1299,237,1402,2010,رقعی,شومیز,فارسی,72,5 مهر,موجود,-1
1,1,27315,978-9642366842,اثر مرکب,The Compound Effect,3.5,1153,248,1402,2010,رقعی,شومیز,فارسی,152,5 مهر,موجود,-1
2,1,64482,978-6007133590,اثر مرکب,The Compound Effect,3.38,1207,224,1401,2010,رقعی,شومیز,فارسی,12,5 مهر,موجود,-1
3,1,4384,9786007228869,اثر مرکب,The Compound Effect,3.88,1246,256,1400,2010,رقعی,شومیز,فارسی,10,5 مهر,موجود,-1
4,1,4379,9786009489640,اثر مرکب,The Compound effect,3.3,1267,251,1402,2010,رقعی,شومیز,فارسی,36,5 مهر,موجود,-1
5,1,98216,978-6008593782,اثر مرکب,The Compound Effect,,1306,254,1401,2010,رقعی,شومیز,فارسی,8,8 مهر,موجود,-1
6,1,2404,978-9646851702,اثر مرکب,The Compound Effect,3.35,1442,254,1401,2010,رقعی,شومیز,فارسی,5,5 مهر,موجود,-1
7,1,22545,978-6226573498,اثر مرکب,The Compound Effect,3.55,61,168,1402,2010,رقعی,شومیز,فارسی,25,5 مهر,موجود,-1
8,1,92579,978-6009896141,اثر مرکب,The Compound Effect,3.99,1869,144,1401,2010,رقعی,شومیز,فارسی,10,5 مهر,موجود,-1
9,1,112068,978-6006629537,اثر مرکب,The Compound Effect,3.69,3156,175,1401,2010,رقعی,شومیز,فارسی,2,8 مهر,موجود,-1


In [882]:
table_of_cover_type=pd.DataFrame(tableOfData['cover_material'].drop_duplicates())
table_of_cover_type=table_of_cover_type[table_of_cover_type['cover_material']!=-1].reset_index(drop=True)
table_of_cover_type.to_csv('./cover_type.csv',encoding='utf-8')
table_of_cover_type

Unnamed: 0,cover_material
0,شومیز


In [883]:
table_of_format=pd.DataFrame(tableOfData['size'].drop_duplicates())
table_of_format=table_of_format[table_of_format['size']!=-1].reset_index(drop=True)
table_of_format.to_csv('./format.csv',encoding='utf-8')
table_of_format

Unnamed: 0,size
0,رقعی
1,خشتی


In [884]:
def convert_size_to_int(size):
    try:
     return table_of_format.index[table_of_format['size']==str(size)].to_list()[0]
    except:
        return -1

tableOfData['size']=tableOfData['size'].apply(convert_size_to_int)

In [885]:
def convert_cover_type_to_int(material):
    try:
     return table_of_cover_type.index[table_of_cover_type['cover_material']==str(material)].to_list()[0]
    except:
        return -1

tableOfData['cover_material']=tableOfData['cover_material'].apply(convert_cover_type_to_int)

In [886]:

tableOfData.to_csv("bookData.csv", index=False, encoding='utf-8')

In [887]:
tableOfData

Unnamed: 0,site_index,code,Isbn,fa_title,en_title,score,publisher_id,pages,publication_per_date,publication_ad_date,size,cover_material,print_series,language,earliest_send_time,presence,paper_type
0,1,1017,978-600-97778-1-5,کتاب اثر مرکب,The Compound Effect,3.642857,1299,237,1402,2010,0,0,فارسی,72,5 مهر,موجود,-1
1,1,27315,978-9642366842,اثر مرکب,The Compound Effect,3.5,1153,248,1402,2010,0,0,فارسی,152,5 مهر,موجود,-1
2,1,64482,978-6007133590,اثر مرکب,The Compound Effect,3.38,1207,224,1401,2010,0,0,فارسی,12,5 مهر,موجود,-1
3,1,4384,9786007228869,اثر مرکب,The Compound Effect,3.88,1246,256,1400,2010,0,0,فارسی,10,5 مهر,موجود,-1
4,1,4379,9786009489640,اثر مرکب,The Compound effect,3.3,1267,251,1402,2010,0,0,فارسی,36,5 مهر,موجود,-1
5,1,98216,978-6008593782,اثر مرکب,The Compound Effect,,1306,254,1401,2010,0,0,فارسی,8,8 مهر,موجود,-1
6,1,2404,978-9646851702,اثر مرکب,The Compound Effect,3.35,1442,254,1401,2010,0,0,فارسی,5,5 مهر,موجود,-1
7,1,22545,978-6226573498,اثر مرکب,The Compound Effect,3.55,61,168,1402,2010,0,0,فارسی,25,5 مهر,موجود,-1
8,1,92579,978-6009896141,اثر مرکب,The Compound Effect,3.99,1869,144,1401,2010,0,0,فارسی,10,5 مهر,موجود,-1
9,1,112068,978-6006629537,اثر مرکب,The Compound Effect,3.69,3156,175,1401,2010,0,0,فارسی,2,8 مهر,موجود,-1


In [888]:
tableOfSummaryData = pd.DataFrame(site_summary_data_list, columns=['site_index', 'summary']).drop_duplicates(subset=['site_index','summary'])
tableOfSummaryData=tableOfSummaryData[tableOfSummaryData.notnull().all(axis=1)]
tableOfSummaryData

Unnamed: 0,site_index,summary
0,1,کتاب اثر مرکب، اثری نوشته ی دارن هاردی است که ...


In [889]:
tableOfSummaryData.to_csv( "BookSummaryData.csv", index=False, encoding='utf-8')

In [890]:
tableOfSiteTagsData = pd.DataFrame(site_tags_data_list, columns=['site_index', 'tag'])\
    .drop_duplicates(subset=['site_index','tag'])
tableOfSiteTagsData

Unnamed: 0,site_index,tag
0,1,ادبیات آمریکا
1,1,ادبیات معاصر
2,1,دهه 2010 میلادی
3,1,پرفروش های ایران کتاب
4,1,روانشناسی
5,1,پرفروش ترین کتاب ها
6,1,تجارت و کسب و کار
7,1,خودپروری
8,1,کتاب هایی که زندگی تان را دگرگون خواهند کرد


In [891]:
table_of_tag=pd.DataFrame(tableOfSiteTagsData['tag']).drop_duplicates(subset=['tag']).reset_index(drop=True)
table_of_tag.to_csv('./tag.csv',encoding='utf-8')
table_of_tag

Unnamed: 0,tag
0,ادبیات آمریکا
1,ادبیات معاصر
2,دهه 2010 میلادی
3,پرفروش های ایران کتاب
4,روانشناسی
5,پرفروش ترین کتاب ها
6,تجارت و کسب و کار
7,خودپروری
8,کتاب هایی که زندگی تان را دگرگون خواهند کرد


In [892]:
def convert_tag_to_int(tag):
    try:
     return table_of_tag.index[table_of_tag['tag']==str(tag)].to_list()[0]
    except:
        return -1

tableOfSiteTagsData['tag']=tableOfSiteTagsData['tag'].apply(convert_tag_to_int)

In [893]:
tableOfSiteTagsData

Unnamed: 0,site_index,tag
0,1,0
1,1,1
2,1,2
3,1,3
4,1,4
5,1,5
6,1,6
7,1,7
8,1,8


In [894]:
tableOfSiteTagsData.to_csv('bookTagsData.csv',index=False,encoding='utf-8')

In [895]:
table_of_publisher=pd.DataFrame(publishers_data_list).drop_duplicates(subset=['id','name','link'])
table_of_publisher.to_csv('./publisher.csv',index=False,encoding='utf-8')
table_of_publisher

Unnamed: 0,id,name,link
0,1299,نگاه نوین,/publisher/1299-%d9%86%da%af%d8%a7%d9%87-%d9%8...
1,1153,نسل نواندیش,/publisher/1153-%d9%86%d8%b3%d9%84-%d9%86%d9%8...
2,1207,آرایان,/publisher/1207-%d8%a2%d8%b1%d8%a7%db%8c%d8%a7...
3,1246,آموخته,/publisher/1246-%d8%a2%d9%85%d9%88%d8%ae%d8%aa...
4,1267,کتیبه پارسی,/publisher/1267-%da%a9%d8%aa%db%8c%d8%a8%d9%87...
5,1306,پندار تابان,/publisher/1306-%d9%be%d9%86%d8%af%d8%a7%d8%b1...
6,1442,سما,/publisher/1442-%d8%b3%d9%85%d8%a7
7,61,میلکان,/publisher/61-%d9%85%db%8c%d9%84%da%a9%d8%a7%d...
8,1869,ندای معاصر (زرین کلک),/publisher/1869-%d9%86%d8%af%d8%a7%db%8c-%d9%8...
9,3156,تیموری,/publisher/3156-%d8%aa%db%8c%d9%85%d9%88%d8%b1...


In [896]:
books_writers_data_list=list(filter(bool, books_writers_data_list))
table_of_writer=pd.DataFrame(books_writers_data_list).drop_duplicates(subset=['book_id','writer_id'])
#drop rows which both book_id and writer_id is -1
table_of_writer=table_of_writer[(table_of_writer['book_id']!=-1) & (table_of_writer['writer_id']!=-1)]
table_of_writer.to_csv('./writer.csv',index=False,encoding='utf-8')
table_of_writer

Unnamed: 0,book_id,writer_id
0,1017,2245
1,27315,2245
2,64482,2245
3,4384,2245
4,4379,2245
5,98216,2245
6,2404,2245
7,22545,2245
8,92579,2245
9,112068,2245


In [897]:
table_of_writer_page=pd.DataFrame(writer_page_data_list).drop_duplicates(subset=['id','name','link'])
table_of_writer_page.to_csv('./writer_page.csv',index=False,encoding='utf-8')
table_of_writer_page

Unnamed: 0,id,name,link
0,2245,دارن هاردی,/profile/2245-darren-hardy


In [898]:
table_of_translator=pd.DataFrame(books_translators_data_list).drop_duplicates(subset=['book_id','translator_id'])
table_of_translator=table_of_translator[(table_of_translator['book_id']!=-1) & (table_of_translator['translator_id']!=-1)]
table_of_translator.to_csv('./translator.csv',index=False,encoding='utf-8')
table_of_translator


Unnamed: 0,book_id,translator_id
0,1017,1043
1,1017,1044
2,64482,2511
3,4384,4294
4,4379,3158
5,98216,1015
6,2404,6987
7,92579,49397
8,112068,66278
9,60400,33228


In [899]:
table_of_translator_page=pd.DataFrame(translator_page_data_list).drop_duplicates(subset=['id','name','link'])
table_of_translator_page.to_csv('translator_page.csv',index=False,encoding='utf-8')
table_of_translator_page

Unnamed: 0,id,name,link
0,1043,لطیف احمدپور,/profile/1043-latif-ahmadpour
1,1044,میلاد حیدری,/profile/1044-milad-heydari
2,2511,گیتی شهیدی,/profile/2511-giti-shahidi
3,4294,علیرضا خاکساران,/profile/4294-%d8%b9%d9%84%db%8c%d8%b1%d8%b6%d...
4,3158,افسانه درویشی,/profile/3158-%d8%a7%d9%81%d8%b3%d8%a7%d9%86%d...
5,1015,فرخ بافنده,/profile/1015-farokh-bafandeh
6,6987,شهرام ظریف,/profile/6987-%d8%b4%d9%87%d8%b1%d8%a7%d9%85-%...
7,49397,سعیده کافی,/profile/49397-%d8%b3%d8%b9%db%8c%d8%af%d9%87-...
8,66278,مریم مسیبی,/profile/66278-%d9%85%d8%b1%db%8c%d9%85-%d9%85...
9,33228,زهرا ایجادی,/profile/33228-%d8%b2%d9%87%d8%b1%d8%a7-%d8%a7...


In [900]:
table_of_price_history=pd.DataFrame(price_history_data_list).drop_duplicates(subset=['book_id','price','discount','date'])
table_of_price_history=table_of_price_history[table_of_price_history.book_id!=-1]
table_of_price_history.to_csv('./price-history.csv',index=False,encoding='utf-8')
table_of_price_history

Unnamed: 0,book_id,price,discount,date
0,1017,140000,5,2023-09-25 11:47:57.857447
1,27315,169900,0,2023-09-25 11:47:57.869765
2,64482,125000,0,2023-09-25 11:47:57.881447
3,4384,158000,0,2023-09-25 11:47:57.892482
4,4379,105000,0,2023-09-25 11:47:57.903481
5,98216,264500,0,2023-09-25 11:47:57.914480
6,2404,128000,0,2023-09-25 11:47:57.924385
7,22545,165000,0,2023-09-25 11:47:57.935385
8,92579,170000,0,2023-09-25 11:47:57.945449
9,112068,89000,0,2023-09-25 11:47:57.955388


In [901]:
book_veneration_data_list=list(filter(bool, book_veneration_data_list))
table_of_book_veneration=pd.DataFrame(book_veneration_data_list).drop_duplicates(subset=['site_index','English_Quote','Persian_Quote','Prise_Writer'])
table_of_book_veneration.to_csv('./book_veneration.csv',index=False,encoding='utf-8')
table_of_book_veneration

Unnamed: 0,site_index,English_Quote,Persian_Quote,Prise_Writer
0,1,A real program that can change your life and m...,برنامه ای واقعی که می تواند زندگی تان را تغییر...,"David Bach, Author"
1,1,A treasure chest of ideas for achieving great ...,گنجینه ای از ایده ها و تفکرات برای رسیدن به مو...,"Brian Tracy, Author"
2,1,It shows us how to make lasting changes by pay...,این اثر نشان می دهد که چگونه از طریق توجه کردن...,Blinkist


In [902]:
table_of_award=pd.DataFrame(site_award_data_list).drop_duplicates(subset=['site_index','award'])
table_of_award.to_csv('./award.csv',index=False,encoding='utf-8')
table_of_award

Unnamed: 0,site_index,award
0,1,جزو پرفروش ترین کتاب ها و رمان های ایران
1,1,جزو فهرست کتاب هایی که زندگی تان را دگرگون خ...
