# Description
We parse reviews from the site **sravni.ru**

1. Collect the URLs of individual reviews from the listing pages at https://www.sravni.ru/banki/otzyvy/?page=num, where **num** is the page number

2. Save the collected links in **csv** format, fields: id, link, bank, rating

3. Load the file with links to the review pages

4. Open the page with reviews, parse the data

5. Save data in **csv** format

# Import

In [1]:
import time
import re
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Mount Google Drive at /content/drive; the output directory configured in the
# settings cell (DIR) lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Settings

In [3]:
# Pages for parsing; every listing page holds 20 reviews.
PAGES_NUM = 1500

# Download step: how many pages make up one part.
STEP_PARTS = 500

# Which part of the data to download in this run (one part = STEP_PARTS = 500 pages).
PART = 1

# Listing URL without the page number, e.g. 'https://www.sravni.ru/banki/otzyvy/?page=1'.
URL = 'https://www.sravni.ru/banki/otzyvy/?page='

# Folder on the mounted Google Drive where the resulting CSV files are written.
DIR = '/content/drive/MyDrive/Colab Notebooks/DAN/feedback_tracking/files'

# Parsing

In [4]:
# Crawl the listing pages of the current part and collect one row
# [id, link, bank, rating] per review card into data_list.
time_start = time.time()
data_list = []
# Part k covers pages (k-1)*STEP_PARTS + 1 .. k*STEP_PARTS (pages are 1-based).
part_start = (PART - 1)*STEP_PARTS + 1
# range() already excludes its upper bound; an extra "+ 1" here would fetch
# STEP_PARTS + 1 pages and re-download the first page of the next part.
part_end = part_start + STEP_PARTS

for n in range(part_start, part_end):
  # A timeout keeps a single stalled connection from hanging the whole crawl.
  r = requests.get(URL + str(n), timeout=30)
  soup = BeautifulSoup(r.text, 'html.parser')

  # Iterate over the review cards on the page and the available data
  items = soup.find_all("div", {"class": "sc-1fxln1u-15 dYgtcQ"})
  for item in items:
    # Cards without a rating are skipped.
    rating_node = item.find_all("span", {"class": "sc-1eq8x10-0 fIgnKU"})
    if not rating_node:
      continue
    rating = rating_node[0].string

    # Cards without a review link are skipped the same way instead of
    # raising IndexError / TypeError in the middle of a long crawl.
    link_node = item.find("span", {"class": "sc-1fxln1u-27 cPAIqy"})
    anchor = link_node.a if link_node is not None else None
    if anchor is None or not anchor.get('href'):
      continue
    link = anchor['href']

    # Links look like /bank/<bank>/otzyvy/<id>, so splitting on '/' gives
    # ['', 'bank', <bank>, 'otzyvy', <id>].
    link_list = link.split('/')
    if len(link_list) < 5:
      continue
    bank = link_list[2]
    review_id = link_list[4]  # not named `id`, to avoid shadowing the builtin
    data_list.append([review_id, link, bank, rating])

  # Progress report every 10th page with the elapsed time since the start.
  time_delta = round(time.time() - time_start)
  if n % 10 == 0:
    print(f'Page num: {n}, time: {time_delta} s.')

Page num: 10, time: 20 s.
Page num: 20, time: 36 s.
Page num: 30, time: 52 s.
Page num: 40, time: 72 s.
Page num: 50, time: 95 s.
Page num: 60, time: 118 s.
Page num: 70, time: 138 s.
Page num: 80, time: 156 s.
Page num: 90, time: 175 s.
Page num: 100, time: 193 s.
Page num: 110, time: 215 s.
Page num: 120, time: 235 s.
Page num: 130, time: 260 s.
Page num: 140, time: 279 s.
Page num: 150, time: 298 s.
Page num: 160, time: 314 s.
Page num: 170, time: 336 s.
Page num: 180, time: 353 s.
Page num: 190, time: 374 s.
Page num: 200, time: 391 s.
Page num: 210, time: 411 s.
Page num: 220, time: 431 s.
Page num: 230, time: 447 s.
Page num: 240, time: 471 s.
Page num: 250, time: 494 s.
Page num: 260, time: 512 s.
Page num: 270, time: 534 s.
Page num: 280, time: 554 s.
Page num: 290, time: 571 s.
Page num: 300, time: 588 s.
Page num: 310, time: 613 s.
Page num: 320, time: 630 s.
Page num: 330, time: 649 s.
Page num: 340, time: 673 s.
Page num: 350, time: 693 s.
Page num: 360, time: 711 s.
Page n

# Save dataset

In [5]:
# Build the links table. Passing the column names to the constructor keeps this
# cell working (empty frame with the right header) even when nothing was
# scraped; assigning df.columns afterwards raises a length-mismatch ValueError
# on an empty frame.
df = pd.DataFrame(data_list, columns=['id', 'link', 'bank', 'rating'])

# One CSV per part, so runs with different PART values do not overwrite each other.
file_path = f'{DIR}/links_{PART}.csv'
# NOTE: the DataFrame index is written as an unnamed first column (pandas default).
df.to_csv(file_path)

In [6]:
# Size of the collected links table as (rows, columns).
df.shape

(5770, 4)

In [7]:
# Preview the first five collected rows (head() returns 5 rows by default).
df.head()

Unnamed: 0,id,link,bank,rating
0,536617,/bank/tochka/otzyvy/536617,tochka,5
1,536578,/bank/gazprombank/otzyvy/536578,gazprombank,5
2,536577,/bank/gazprombank/otzyvy/536577,gazprombank,5
3,536567,/bank/gazprombank/otzyvy/536567,gazprombank,5
4,536564,/bank/gazprombank/otzyvy/536564,gazprombank,5
