# Description
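Parses bank review pages from sravni.ru. Links are read from the links.csv file; each run processes one batch of links and saves the result to its own data_<n>.csv file.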

# Import

In [31]:
import time
import re
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [32]:
# Mount Google Drive in the Colab runtime; DIR in the settings cell points
# to a folder under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Settings

In [33]:
# Index of the first row of the links table to process in this run.
START_IDX = 0
# Maximum number of links processed per run; START_IDX / BATCH also gives
# the number used in the output file name (data_<n>.csv).
BATCH = 1000 
# Folder that holds links.csv and receives the data_<n>.csv output files.
DIR = '/content/drive/MyDrive/Colab Notebooks/DAN/feedback_tracking/files'

# Load links data

In [34]:
# Table of review links; the parsing loop reads its 'id', 'link', 'bank'
# and 'rating' columns.
df = pd.read_csv(f'{DIR}/links.csv')

In [35]:
# (rows, columns) of the loaded links table.
df.shape

(19524, 4)

In [36]:
# Preview the first rows of the links table.
df.head()

Unnamed: 0,id,link,bank,rating
0,536617,/bank/tochka/otzyvy/536617,tochka,5
1,536578,/bank/gazprombank/otzyvy/536578,gazprombank,5
2,536577,/bank/gazprombank/otzyvy/536577,gazprombank,5
3,536567,/bank/gazprombank/otzyvy/536567,gazprombank,5
4,536564,/bank/gazprombank/otzyvy/536564,gazprombank,5


# Parsing

In [37]:
# Download and parse one batch of review pages: rows START_IDX .. START_IDX+BATCH-1
# of the links table (clipped to the table length). Each successfully parsed page
# adds one [id, text, bank, rating] row to `data_list`; rows that could not be
# fetched or parsed are reported and their indices collected in `failed_idx`
# instead of aborting the whole batch.
BASE_URL = 'https://sravni.ru'
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled connection hangs the run
REVIEW_CLASS = "sc-lpzn2g-9 bHspFi"   # container <div> of the review
TITLE_CLASS = "sc-lpzn2g-11 ljWdwX"   # <h1> with the review title
BODY_CLASS = "sc-lpzn2g-13 eCTrNq"    # <div> with the review text

time_start = time.time()
data_list = []
failed_idx = []  # row indices that were skipped (request error or unexpected markup)

for i in range(START_IDX, min(START_IDX + BATCH, df.shape[0])):
  link = BASE_URL + df.loc[i, 'link']

  try:
    r = requests.get(link, timeout=REQUEST_TIMEOUT)
    r.raise_for_status()  # do not parse 4xx/5xx error pages as reviews
  except requests.RequestException as err:
    print(f'Comment idx: {i}, request failed: {err}')
    failed_idx.append(i)
  else:
    soup = BeautifulSoup(r.text, 'html.parser')
    # find() returns None when the element is missing, so a page with a
    # different layout is skipped rather than raising IndexError.
    item = soup.find("div", {"class": REVIEW_CLASS})
    title_tag = item.find("h1", {"class": TITLE_CLASS}) if item is not None else None
    body_tag = item.find("div", {"class": BODY_CLASS}) if item is not None else None

    if title_tag is None or body_tag is None:
      print(f'Comment idx: {i}, review markup not found: {link}')
      failed_idx.append(i)
    else:
      review_id = int(df.loc[i, 'id'])
      # get_text() also works when the <h1> contains nested tags,
      # where `.string` would be None.
      title = title_tag.get_text()
      text_body = body_tag.getText()
      text = (title + ' ' + text_body.replace('\n', ''))
      r_5 = int(df.loc[i, 'rating'])
      bank = df.loc[i, 'bank']

      # Collapse the 5-point rating into three classes:
      # 0 for ratings below 3, 1 for exactly 3, 2 for ratings above 3.
      if r_5 < 3:
        rating = 0
      elif r_5 > 3:
        rating = 2
      else:
        rating = 1

      data_list.append([review_id, text, bank, rating])

  # Progress report every 50 rows.
  if i % 50 == 0:
    time_delta = round(time.time() - time_start)
    print(f'Comment idx: {i}, time: {time_delta} s.')

Comment idx: 0, time: 3 s.
Comment idx: 50, time: 99 s.
Comment idx: 100, time: 199 s.
Comment idx: 150, time: 300 s.
Comment idx: 200, time: 406 s.
Comment idx: 250, time: 502 s.
Comment idx: 300, time: 594 s.
Comment idx: 350, time: 695 s.
Comment idx: 400, time: 792 s.
Comment idx: 450, time: 880 s.
Comment idx: 500, time: 981 s.
Comment idx: 550, time: 1079 s.
Comment idx: 600, time: 1172 s.
Comment idx: 650, time: 1269 s.
Comment idx: 700, time: 1362 s.
Comment idx: 750, time: 1455 s.
Comment idx: 800, time: 1556 s.
Comment idx: 850, time: 1648 s.
Comment idx: 900, time: 1739 s.
Comment idx: 950, time: 1836 s.


# Save dataset

In [38]:
# Build the batch DataFrame from the parsed rows and save it as data_<n>.csv.
# NOTE: this rebinds `df`, which previously held the links table, so the
# parsing cell cannot be re-run afterwards without reloading links.csv first.
# The column names are passed to the constructor so that an empty batch still
# produces a valid (empty) frame; assigning `df.columns` afterwards raises a
# length-mismatch ValueError when `data_list` is empty.
df = pd.DataFrame(data_list, columns=['id', 'text', 'bank', 'rating'])

# Batch number used in the file name: 0 for the first BATCH rows, 1 for the next, ...
n = START_IDX // BATCH

file_path = f'{DIR}/data_{n}.csv'
df.to_csv(file_path)

In [39]:
# (rows, columns) of the parsed batch that was just saved.
df.shape

(1000, 4)

In [40]:
# Preview the first parsed reviews.
df.head(5)

Unnamed: 0,id,text,bank,rating
0,536617,"Очень довольны работой банка Работаем с ""Точко...",tochka,2
1,536578,Взять кредит— элементарно! Форма заявки элемен...,gazprombank,2
2,536577,Банк №1 в России Мы с мужем клиенты с 2019 год...,gazprombank,2
3,536567,Банк и карточка для жизни Потому что все удобн...,gazprombank,2
4,536564,Реклама кредита точно сработала Последний меся...,gazprombank,2
