In [9]:
import requests
import re
from bs4 import BeautifulSoup
import pandas as pd
import csv
import cv2 # importing OpenCV
import numpy as np # importing Numpy in order to work with arrays
import matplotlib.pyplot as plt # importing matplotlib.pyplot to display arrays directly

### Parsing

In [None]:
def create_csv(books):
    """Append book records to ``books.csv`` in the current working directory.

    The header row is written only when the file is still empty, so calling
    this function several times (once per scraped site) yields a single
    header followed by all records.

    Parameters
    ----------
    books : iterable of dict
        One mapping per book. Keys are normally the entries of ``field_names``.
        The alternative key names emitted by ``add_book1`` (``book_name``,
        ``genres``, ``publisher``, ``published_year``, ``languages``,
        ``publisher_country``) are accepted as well and written to the
        matching column.

    Raises
    ------
    ValueError
        Raised by ``csv.DictWriter`` when a record contains a key that is
        neither a column name nor one of the known aliases.
    """
    field_names =  ['name', 'isbn', 'author', 'genre', 'publishing_house', 'publish_year', 'lang', 'publish_country']
    # Key names used by add_book1 -> column names used in the CSV file.
    key_aliases = {
        'book_name': 'name',
        'genres': 'genre',
        'publisher': 'publishing_house',
        'published_year': 'publish_year',
        'languages': 'lang',
        'publisher_country': 'publish_country',
    }

    with open('books.csv', 'a', encoding='utf-8', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=field_names)
        # In append mode the stream starts at the end of the file, so a
        # position of 0 means the file is new/empty and still needs a header.
        if csvfile.tell() == 0:
            writer.writeheader()
        for row in books:
            writer.writerow({key_aliases.get(key, key): value for key, value in row.items()})
        print(".csv file created!")

In [2]:
def get_request(URL, timeout=30):
    """Download ``URL`` and return the page parsed with BeautifulSoup.

    Parameters
    ----------
    URL : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests.get`` can block indefinitely on a stalled connection.

    Returns
    -------
    BeautifulSoup or None
        The parsed document when the server answers with status 200;
        ``None`` when the request fails or any other status is returned
        (a message is printed in both cases).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
    }

    try:
        response = requests.get(URL, headers=headers, timeout=timeout)
    except requests.RequestException as error:
        # Connection errors and timeouts are reported like a bad status so a
        # single unreachable page does not abort a long crawl.
        print(f'Request failed: {error}')
        return None

    if response.status_code != 200:
        print(f'Request failed with status code: {response.status_code}')
        return None

    return BeautifulSoup(response.text, 'html.parser')

In [3]:
def get_links(data, num):
    """Collect product-page URLs from one parsed catalogue listing page.

    Parameters
    ----------
    data : BeautifulSoup
        Parsed listing page.
    num : int
        Page layout selector.
        ``0`` - items are ``div.good-list-item`` inside ``div.good-list``;
        the relative ``href`` values are prefixed with ``https://m.flip.kz``.
        ``1`` - items are ``li`` inside ``ol.products``; the ``href`` of each
        ``a.product-item-link`` is used as is (this notebook passes
        marwin.kz listing pages with this value).

    Returns
    -------
    list of str
        The links found on the page. Empty when the listing container is
        missing, when no item carries a usable link, or when ``num`` is
        neither 0 nor 1.
    """
    if num == 0:
        container = data.find('div', attrs={'class': 'good-list'})
        items = container.find_all('div', attrs={'class': 'good-list-item'}) if container is not None else []
        anchor_class, prefix = 'pic l-h-250', 'https://m.flip.kz'
    elif num == 1:
        container = data.find('ol', attrs={'class': 'products list items product-items'})
        items = container.find_all('li') if container is not None else []
        anchor_class, prefix = 'product-item-link', ''
    else:
        return []

    links = []
    for item in items:
        a_tag = item.find('a', attrs={'class': anchor_class})
        # Skip items without an anchor or without an href attribute.
        href = a_tag.get('href') if a_tag is not None else None
        if href:
            links.append(prefix + href)
    return links

In [4]:
def add_book1(soup):
    """Extract one book record from a parsed product page (flip.kz layout).

    Two page variants are handled: one where title and author live in
    ``table#prod`` and one where they live in ``div#produce-parts-title``.
    ISBN, publisher and release date are read from the label/value rows of
    ``div.accordion``.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed product page.

    Returns
    -------
    dict
        Keys ``isbn``, ``book_name``, ``author``, ``genres``, ``publisher``,
        ``published_year``, ``languages``, ``publisher_country``. Fields that
        cannot be found keep their defaults; genre, language and country are
        fixed values. Note that these key names differ from the ones
        produced by ``add_book2`` (``name``, ``genre``, ...).
    """
    book = {'isbn': '',
            'book_name': '',
            'author': '',
            'genres':'Литература',
            'publisher':'',
            'published_year':'',
            'languages':'Казахский',
            'publisher_country':'Казахстан'}

    table = soup.find('table', attrs={'id': 'prod'})
    # The alternative title block is only consulted when the table is absent.
    title_block = None
    if table is None:
        title_block = soup.find('div', attrs={'id': 'produce-parts-title'})

    # book_name
    name = None
    if table is not None:
        name = table.find('span', attrs={'itemprop': 'name'})
    elif title_block is not None:
        name = title_block.find('h1', attrs={'itemprop': 'name'})
    if name is not None:
        book['book_name'] = name.text

    # author: the name is the text of the first link inside the author element
    author_box = None
    if table is not None:
        author_box = table.find('p', attrs={'style': 'margin-bottom: 0px;'})
    elif title_block is not None:
        author_box = title_block.find('div', attrs={'class': 'table-cell p-r-8'})
    author = author_box.find('a') if author_box is not None else None
    if author is not None:
        book['author'] = author.text

    # Label/value rows: first cell is the label, second cell the value.
    labelled_fields = {
        'ISBN': 'isbn',
        'Издательство': 'publisher',
        'Дата выхода': 'published_year',
    }
    accordion = soup.find('div', attrs={'class': 'accordion'})
    rows = accordion.find_all('div', attrs={'class': 'row'}) if accordion is not None else []
    for row in rows:
        cells = row.find_all('div', attrs={'class': 'cell'})
        if len(cells) < 2:
            # Rows without a value cell carry nothing to record.
            continue
        field = labelled_fields.get(cells[0].text)
        if field is not None:
            book[field] = cells[1].text

    return book

In [5]:
def _attribute_text(container, label, link='ignore'):
    """Return the text of the ``td`` whose ``data-th`` attribute equals ``label``.

    ``link`` selects how an ``<a>`` inside the cell is treated:
    ``'ignore'``  - return the cell's own text;
    ``'require'`` - return the link text, or ``''`` when the cell has no link;
    ``'prefer'``  - return the link text, falling back to the cell's text.
    An empty string is returned when the cell itself is absent.
    """
    cell = container.find('td', attrs={'data-th': label})
    if cell is None:
        return ''
    if link == 'ignore':
        return cell.text
    anchor = cell.find('a')
    if anchor is not None:
        return anchor.text
    return cell.text if link == 'prefer' else ''


def add_book2(soup):
    """Extract one book record from a parsed product page (attribute-table layout).

    The title is read from ``h1.page-title > span.base``; every other field
    comes from the ``td`` cells of the "additional attributes" table, looked
    up by their ``data-th`` label.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed product page.

    Returns
    -------
    dict
        Keys ``name``, ``isbn``, ``author``, ``genre``, ``publishing_house``,
        ``publish_year``, ``lang``, ``publish_country``. A field that cannot
        be found is an empty string.
    """
    book = {'name': '', 'isbn': '', 'author': '', 'genre':'', 'publishing_house':'', 'publish_year':'', 'lang':'', 'publish_country':''}

    heading = soup.find('h1', attrs={'class': 'page-title'})
    title = heading.find('span', attrs={'class': 'base'}) if heading is not None else None
    if title is not None:
        book['name'] = title.text

    table = soup.find('table', attrs={'class': 'data table additional-attributes'})
    if table is None:
        return book

    # The markup may not contain an explicit <tbody>; search the table itself then.
    body = table.find('tbody')
    if body is None:
        body = table

    book['isbn'] = _attribute_text(body, 'ISBN')
    # Author and publisher are only recorded when the cell wraps them in a link.
    book['author'] = _attribute_text(body, 'Автор', link='require')
    book['genre'] = _attribute_text(body, 'Жанр')
    book['publishing_house'] = _attribute_text(body, 'Издательство', link='require')
    book['publish_year'] = _attribute_text(body, 'Год издания')
    book['lang'] = _attribute_text(body, 'Язык')
    # Use the link text when there is one, otherwise the plain cell text.
    book['publish_country'] = _attribute_text(body, 'Страна производитель', link='prefer')
    return book

In [6]:
def get_books(num):
    """Download and parse every product page listed in ``links[num]``.

    ``links`` is the module-level list filled by the crawl cells. Pages that
    ``get_request`` could not fetch are skipped. ``num == 0`` parses with
    ``add_book1``, ``num == 1`` with ``add_book2``.

    Returns the list of parsed book dictionaries.
    """
    collected = []
    for url in links[num]:
        page = get_request(url)
        if not page:
            continue
        if num == 0:
            collected.append(add_book1(page))
        if num == 1:
            collected.append(add_book2(page))
    return collected

In [7]:
# Walk the flip.kz catalogue listing pages and gather product links in links[0].
# NOTE(review): range() stops before last_page_number, so the page with that
# number is not requested - confirm this is intended.
last_page_number = 28
links = [[], []]
for i in range(1, last_page_number):
    URL = f"https://www.flip.kz/catalog?subsection=1&filter-show=1&filter-a5051=18&page={i}"
    data = get_request(URL)
    if not data:
        print("Error")
        continue
    links[0].extend(get_links(data, 0))

In [None]:
# Walk the marwin.kz book listing pages and gather product links in links[1].
# NOTE(review): range() stops before last_page_number, so the page with that
# number is not requested - confirm this is intended.
last_page_number = 335
for i in range(1, last_page_number):
    URL = f"https://www.marwin.kz/books/?p={i}&stock=3"
    data = get_request(URL)
    if not data:
        print("Error")
        continue
    links[1].extend(get_links(data, 1))

In [8]:
# Fetch and parse every product page collected in links[0]
# (one HTTP request per link, so this cell can run for a long time).
books_0 = get_books(0)
# Show the size and a short preview instead of dumping the whole list.
print(f"{len(books_0)} books scraped")
books_0[:5]

KeyboardInterrupt: ignored

In [None]:
# Append the records returned by get_books(0) to books.csv.
create_csv(books_0)

In [None]:
# Fetch and parse every product page collected in links[1]
# (one HTTP request per link, so this cell can run for a long time).
books_1 = get_books(1)
# Show the size and a short preview instead of dumping the whole list.
print(f"{len(books_1)} books scraped")
books_1[:5]

In [None]:
# Append the records returned by get_books(1) to books.csv.
create_csv(books_1)

### Cleaning

In [None]:
# Load the file written by create_csv and preview the first rows.
df = pd.read_csv("books.csv")
df.head()

In [None]:
# Column dtypes and non-null counts, to see which fields need cleaning.
df.info()

In [None]:
# books.csv carries the column names written by create_csv, while the analysis
# cells use a second set of names. Translate once, up front; rename() ignores
# labels that are absent, so re-running this cell is harmless.
df = df.rename(columns={
    'genre': 'genres',
    'publishing_house': 'publisher',
    'publish_year': 'published_year',
    'lang': 'languages',
    'publish_country': 'publisher_country',
})

# Coerce the year first: values that are not numbers become NaN and are
# dropped together with the other incomplete rows below.
df['published_year'] = pd.to_numeric(df['published_year'], errors='coerce')

# Drop incomplete rows *before* casting to str: astype('str') turns NaN into
# the literal text 'nan', after which dropna() can no longer detect it.
columns_to_check = ['isbn', 'publisher', 'published_year', 'publisher_country']
df = df.dropna(subset=columns_to_check).copy()
df['published_year'] = df['published_year'].astype('int')

# Normalise every text column the same way: cast to str, trim both ends.
text_columns = ['name', 'isbn', 'author', 'genres', 'publisher', 'languages', 'publisher_country']
for column in text_columns:
    df[column] = df[column].astype('str').str.strip()

# Language names are compared verbatim later on, so remove inner spaces too.
df['languages'] = df['languages'].str.replace(' ', '')

### Exploratory Data Analysis (EDA)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the frame.
statistical_summary = df.describe()
statistical_summary

In [None]:
# Mean, median, standard deviation and quantiles are only defined for numeric
# columns. The frame also holds text columns, and pandas 2.x no longer skips
# those silently, so compute these statistics on the numeric part only.
numeric_df = df.select_dtypes(include='number')

# Mean
mean_value = numeric_df.mean()

# Median
median_value = numeric_df.median()

# Mode (Note: Mode can be a Series as there may be multiple modes)
# Computed on the full frame: a mode exists for text columns as well.
mode_value = df.mode().iloc[0]

# Standard Deviation
std_deviation = numeric_df.std()

# Quartiles
quartiles = numeric_df.quantile([0.25, 0.5, 0.75])

# Display individual statistics
print("Mean:\n", mean_value)
print("\nMedian:\n", median_value)
print("\nMode:\n", mode_value)
print("\nStandard Deviation:\n", std_deviation)
print("\nQuartiles:\n", quartiles)

In [None]:
fig, ax = plt.subplots()

# Number of books per publication year (groupby sorts the years ascending).
publisher_year_count = df.groupby(['published_year']).size()

line = ax.plot(publisher_year_count)

# A figure should be readable on its own when the notebook is skimmed.
ax.set_xlabel('Publication year')
ax.set_ylabel('Numbers of books')
ax.set_title('Books per publication year')

plt.show()

In [None]:
fig, ax = plt.subplots()

# Five most frequent genres, in ascending order of frequency.
genres_count = df.groupby(['genres']).size().sort_values().tail(5)

# Tick labels: one word per line so long names do not overlap.
genres = ['\n'.join(name.split(' ')) for name in genres_count.index]
counts = genres_count.tolist()
# Legend entries must be in the same order as the bars; reversing them
# attaches each name to the wrong colour.
bar_labels = list(genres_count.index)
# Five distinct colours so that no two bars share a legend colour.
bar_colors = ['tab:red', 'tab:blue', 'tab:orange', 'tab:green', 'tab:purple']

ax.bar(genres, counts, label=bar_labels, color=bar_colors[:len(counts)])

ax.set_ylabel('Numbers of books')
ax.set_title('Genres')
ax.legend(title='Genres')

plt.show()

In [None]:
fig, ax = plt.subplots()

# Three most frequent languages, in ascending order of frequency.
languages_count = df.groupby(['languages']).size().sort_values().tail(3)

# Tick labels: one word per line so long names do not overlap.
languages = ['\n'.join(name.split(' ')) for name in languages_count.index]
counts = languages_count.tolist()
# Legend entries must be in the same order as the bars; reversing them
# attaches each name to the wrong colour.
bar_labels = list(languages_count.index)
# Distinct colours so that no two bars share a legend colour.
bar_colors = ['tab:red', 'tab:blue', 'tab:orange', 'tab:green', 'tab:purple']

ax.bar(languages, counts, label=bar_labels, color=bar_colors[:len(counts)])

ax.set_ylabel('Numbers of books')
ax.set_title('Languages')
ax.legend(title='Languages')

plt.show()

In [None]:
fig, ax = plt.subplots()

# Five most frequent publishers, in ascending order of frequency.
publisher_count = df.groupby(['publisher']).size().sort_values().tail(5)

# Tick labels: one word per line so long names do not overlap.
publisher = ['\n'.join(name.strip().split(' ')) for name in publisher_count.index]
counts = publisher_count.tolist()
# Legend entries must be in the same order as the bars; reversing them
# attaches each name to the wrong colour.
bar_labels = list(publisher_count.index)
# Five distinct colours so that no two bars share a legend colour.
bar_colors = ['tab:red', 'tab:blue', 'tab:orange', 'tab:green', 'tab:purple']

ax.bar(publisher, counts, label=bar_labels, color=bar_colors[:len(counts)])

ax.set_ylabel('Numbers of books')
ax.set_title('Publisher')
ax.legend(title='Publisher')

plt.show()

In [None]:
# Keep only the rows whose language is exactly 'Казахский'.
kz_df = df.loc[df['languages'].eq('Казахский')]
kz_df

In [None]:
fig, ax = plt.subplots()

# Number of Kazakh-language books per publication year.
publisher_year_count_kz = kz_df.groupby(['published_year']).size()

line = ax.plot(publisher_year_count_kz)

# A figure should be readable on its own when the notebook is skimmed.
ax.set_xlabel('Publication year')
ax.set_ylabel('Numbers of books')
ax.set_title('Kazakh-language books per publication year')

plt.show()

In [None]:
fig, ax = plt.subplots()

# Number of books per publication year, ordered by count.
year_count = df.groupby(['published_year']).size().sort_values()

years = list(year_count.index)
counts = year_count.tolist()
# The palette is cycled over the bars; the colours carry no meaning here,
# so no per-bar labels or legend are attached.
bar_colors = ['tab:red', 'tab:blue', 'tab:orange', 'tab:green', 'tab:red']

ax.bar(years, counts, color=bar_colors)

ax.set_xlabel('Publication year')
ax.set_ylabel('Numbers of books')
# The chart groups by publication year, not by publisher.
ax.set_title('Books per publication year')

plt.show()