In [1]:
import requests
from bs4 import BeautifulSoup
import time
import csv
import logging
import random
from urllib.parse import urljoin
from datetime import datetime

# –ù–∞—Å—Ç—Ä–æ–π–∫–∞ –ª–æ–≥–∏—Ä–æ–≤–∞–Ω–∏—è
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('review_parser.log', encoding='utf-8'),
        logging.StreamHandler()
    ]
)

class ReviewParser:
    def __init__(self, base_url="https://irecommend.ru"):
        self.base_url = base_url
        self.session = requests.Session()
        self.setup_session()
        
    def setup_session(self):
        """–ù–∞—Å—Ç—Ä–æ–π–∫–∞ —Å–µ—Å—Å–∏–∏ —Å —Ä–µ–∞–ª–∏—Å—Ç–∏—á–Ω—ã–º–∏ –∑–∞–≥–æ–ª–æ–≤–∫–∞–º–∏"""
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Referer': 'https://irecommend.ru/',
        })

    def get_page(self, url, retries=3, delay=2):
        """–ü–æ–ª—É—á–µ–Ω–∏–µ —Å—Ç—Ä–∞–Ω–∏—Ü—ã —Å –æ–±—Ä–∞–±–æ—Ç–∫–æ–π –æ—à–∏–±–æ–∫"""
        for attempt in range(retries):
            try:
                time.sleep(random.uniform(delay, delay * 2))
                
                response = self.session.get(url, timeout=15)
                
                if response.status_code == 200:
                    # –ü—Ä–æ–≤–µ—Ä—è–µ–º, —á—Ç–æ –ø–æ–ª—É—á–∏–ª–∏ –∫–æ–Ω—Ç–µ–Ω—Ç —Å –æ—Ç–∑—ã–≤–∞–º–∏
                    if 'smTeaser' in response.text or 'reviewBlock' in response.text:
                        return response.text
                    else:
                        logging.warning(f"–°—Ç—Ä–∞–Ω–∏—Ü–∞ {url} –Ω–µ —Å–æ–¥–µ—Ä–∂–∏—Ç –æ—Ç–∑—ã–≤–æ–≤")
                        return None
                else:
                    logging.warning(f"–°—Ç–∞—Ç—É—Å –∫–æ–¥ {response.status_code} –¥–ª—è {url}")
                    
            except Exception as e:
                logging.error(f"–û—à–∏–±–∫–∞ –ø—Ä–∏ –∑–∞–ø—Ä–æ—Å–µ {url} (–ø–æ–ø—ã—Ç–∫–∞ {attempt+1}): {e}")
            
            if attempt < retries - 1:
                time.sleep(delay * (attempt + 1))
        
        return None

    def parse_reviews_from_list(self, html):
        """–ü–∞—Ä—Å–∏–Ω–≥ –≤—Å–µ—Ö –æ—Ç–∑—ã–≤–æ–≤ —Å–æ —Å—Ç—Ä–∞–Ω–∏—Ü—ã —Å–ø–∏—Å–∫–∞"""
        soup = BeautifulSoup(html, 'html.parser')
        reviews_data = []
        
        # –ù–∞—Ö–æ–¥–∏–º –≤—Å–µ –±–ª–æ–∫–∏ —Å –æ—Ç–∑—ã–≤–∞–º–∏
        review_blocks = soup.find_all('div', class_='smTeaser')
        
        for block in review_blocks:
            review_data = self.parse_review_preview(block)
            if review_data:
                reviews_data.append(review_data)
        
        logging.info(f"–ù–∞–π–¥–µ–Ω–æ –æ—Ç–∑—ã–≤–æ–≤ –Ω–∞ —Å—Ç—Ä–∞–Ω–∏—Ü–µ: {len(reviews_data)}")
        return reviews_data

    def parse_review_preview(self, block):
        """–ü–∞—Ä—Å–∏–Ω–≥ –ø—Ä–µ–≤—å—é –æ—Ç–∑—ã–≤–∞ –∏–∑ —Å–ø–∏—Å–∫–∞"""
        try:
            # –û—Å–Ω–æ–≤–Ω–∞—è –∏–Ω—Ñ–æ—Ä–º–∞—Ü–∏—è
            product_elem = block.find('div', class_='productName')
            product_name = product_elem.get_text(strip=True) if product_elem else "–ù–µ–∏–∑–≤–µ—Å—Ç–Ω—ã–π –ø—Ä–æ–¥—É–∫—Ç"
            
            author_elem = block.find('div', class_='authorName')
            author_name = author_elem.get_text(strip=True) if author_elem else "–ê–Ω–æ–Ω–∏–º"
            
            # –†–µ–π—Ç–∏–Ω–≥
            rating = self.extract_rating(block)
            
            # –î–∞—Ç–∞ –∏ –≤—Ä–µ–º—è
            date_elem = block.find('span', class_='date-created')
            time_elem = block.find('span', class_='time-created')
            date_created = date_elem.get_text(strip=True) if date_elem else ""
            time_created = time_elem.get_text(strip=True) if time_elem else ""
            
            # –ó–∞–≥–æ–ª–æ–≤–æ–∫ –∏ —Ç–µ–∫—Å—Ç –ø—Ä–µ–≤—å—é
            title_elem = block.find('div', class_='reviewTitle')
            title = title_elem.get_text(strip=True) if title_elem else ""
            
            teaser_elem = block.find('span', class_='reviewTeaserText')
            teaser_text = teaser_elem.get_text(strip=True) if teaser_elem else ""
            
            # –°—Å—ã–ª–∫–∞ –Ω–∞ –ø–æ–ª–Ω—ã–π –æ—Ç–∑—ã–≤
            link_elem = block.find('a', class_='reviewTextSnippet')
            review_url = urljoin(self.base_url, link_elem['href']) if link_elem and link_elem.get('href') else ""
            
            return {
                'product_name': product_name,
                'author': author_name,
                'rating': rating,
                'date_created': date_created,
                'time_created': time_created,
                'title': title,
                'teaser_text': teaser_text,
                'review_url': review_url,
                'scraped_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            }
            
        except Exception as e:
            logging.error(f"–û—à–∏–±–∫–∞ –ø—Ä–∏ –ø–∞—Ä—Å–∏–Ω–≥–µ –ø—Ä–µ–≤—å—é –æ—Ç–∑—ã–≤–∞: {e}")
            return None

    def parse_full_review(self, url):
        """–ü–∞—Ä—Å–∏–Ω–≥ –ø–æ–ª–Ω–æ–π –≤–µ—Ä—Å–∏–∏ –æ—Ç–∑—ã–≤–∞"""
        html = self.get_page(url)
        if not html:
            return None
            
        soup = BeautifulSoup(html, 'html.parser')
        review_block = soup.find('div', class_='reviewBlock')
        
        if not review_block:
            logging.warning(f"–ù–µ –Ω–∞–π–¥–µ–Ω –±–ª–æ–∫ –æ—Ç–∑—ã–≤–∞ –¥–ª—è {url}")
            return None
            
        try:
            # –ü–æ–ª–Ω—ã–π —Ç–µ–∫—Å—Ç –æ—Ç–∑—ã–≤–∞
            review_body = review_block.find('div', itemprop='reviewBody')
            full_text = self.clean_text(review_body) if review_body else ""
            
            # –î–æ–ø–æ–ª–Ω–∏—Ç–µ–ª—å–Ω–∞—è –∏–Ω—Ñ–æ—Ä–º–∞—Ü–∏—è
            experience = self.extract_experience(review_block)
            pluses = self.extract_pluses(review_block)
            minuses = self.extract_minuses(review_block)
            verdict = self.extract_verdict(review_block)
            
            # –†–µ–π—Ç–∏–Ω–≥ –∏–∑ –º–µ—Ç–∞-—Ç–µ–≥–∞
            rating_meta = review_block.find('meta', itemprop='ratingValue')
            rating = rating_meta['content'] if rating_meta else ""
            
            return {
                'full_text': full_text,
                'experience': experience,
                'pluses': ' | '.join(pluses),
                'minuses': ' | '.join(minuses),
                'verdict': verdict,
                'detailed_rating': rating
            }
            
        except Exception as e:
            logging.error(f"–û—à–∏–±–∫–∞ –ø—Ä–∏ –ø–∞—Ä—Å–∏–Ω–≥–µ –ø–æ–ª–Ω–æ–≥–æ –æ—Ç–∑—ã–≤–∞ {url}: {e}")
            return None

    def extract_rating(self, block):
        """–ò–∑–≤–ª–µ—á–µ–Ω–∏–µ —Ä–µ–π—Ç–∏–Ω–≥–∞ –∏–∑ –∑–≤–µ–∑–¥"""
        try:
            rating_elem = block.find('div', class_='starsRating')
            if rating_elem:
                # –ò—â–µ–º –∫–ª–∞—Å—Å —Å —Ä–µ–π—Ç–∏–Ω–≥–æ–º
                for cls in rating_elem.get('class', []):
                    if 'fivestarWidgetStatic-' in cls:
                        return cls.split('-')[-1]
                
                # –°—á–∏—Ç–∞–µ–º –∑–∞–ø–æ–ª–Ω–µ–Ω–Ω—ã–µ –∑–≤–µ–∑–¥—ã
                stars = rating_elem.find_all('div', class_='star')
                filled = sum(1 for star in stars if star.find('div', class_='on'))
                return str(filled)
        except Exception as e:
            logging.warning(f"–û—à–∏–±–∫–∞ –ø—Ä–∏ –∏–∑–≤–ª–µ—á–µ–Ω–∏–∏ —Ä–µ–π—Ç–∏–Ω–≥–∞: {e}")
        
        return "0"

    def extract_experience(self, review_block):
        """–ò–∑–≤–ª–µ—á–µ–Ω–∏–µ –æ–ø—ã—Ç–∞ –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏—è"""
        try:
            experience_elem = review_block.find('div', class_='item-data')
            return experience_elem.get_text(strip=True) if experience_elem else ""
        except:
            return ""

    def extract_pluses(self, review_block):
        """–ò–∑–≤–ª–µ—á–µ–Ω–∏–µ –¥–æ—Å—Ç–æ–∏–Ω—Å—Ç–≤"""
        try:
            plus_block = review_block.find('div', class_='plus')
            if plus_block:
                plus_items = plus_block.find_all('li')
                return [item.get_text(strip=True) for item in plus_items]
        except:
            pass
        return []

    def extract_minuses(self, review_block):
        """–ò–∑–≤–ª–µ—á–µ–Ω–∏–µ –Ω–µ–¥–æ—Å—Ç–∞—Ç–∫–æ–≤"""
        try:
            minus_block = review_block.find('div', class_='minus')
            if minus_block:
                minus_items = minus_block.find_all('li')
                return [item.get_text(strip=True) for item in minus_items]
        except:
            pass
        return []

    def extract_verdict(self, review_block):
        """–ò–∑–≤–ª–µ—á–µ–Ω–∏–µ –≤–µ—Ä–¥–∏–∫—Ç–∞ (—Ä–µ–∫–æ–º–µ–Ω–¥—É–µ—Ç/–Ω–µ —Ä–µ–∫–æ–º–µ–Ω–¥—É–µ—Ç)"""
        try:
            verdict_elem = review_block.find('span', class_='verdict')
            return verdict_elem.get_text(strip=True) if verdict_elem else ""
        except:
            return ""

    def clean_text(self, element):
        """–û—á–∏—Å—Ç–∫–∞ —Ç–µ–∫—Å—Ç–∞ –æ—Ç HTML —Ç–µ–≥–æ–≤"""
        if not element:
            return ""
        
        # –°–æ—Ö—Ä–∞–Ω—è–µ–º –ø–µ—Ä–µ–Ω–æ—Å—ã —Å—Ç—Ä–æ–∫
        for br in element.find_all("br"):
            br.replace_with("\n")
        
        text = element.get_text(separator='\n')
        
        # –û—á–∏—Å—Ç–∫–∞ –æ—Ç –ª–∏—à–Ω–∏—Ö –ø—Ä–æ–±–µ–ª–æ–≤
        lines = [line.strip() for line in text.split('\n')]
        lines = [line for line in lines if line]
        
        return '\n'.join(lines)

    def scrape_reviews(self, start_url, pages=5):
        """–û—Å–Ω–æ–≤–Ω–æ–π –º–µ—Ç–æ–¥ –¥–ª—è —Å–±–æ—Ä–∞ –æ—Ç–∑—ã–≤–æ–≤"""
        all_reviews = []
        
        for page in range(pages):
            if page == 0:
                url = start_url
            else:
                url = f"{start_url}?page={page}"
            
            logging.info(f"–û–±—Ä–∞–±–æ—Ç–∫–∞ —Å—Ç—Ä–∞–Ω–∏—Ü—ã {page + 1}: {url}")
            
            html = self.get_page(url)
            if not html:
                logging.warning(f"–ù–µ —É–¥–∞–ª–æ—Å—å –∑–∞–≥—Ä—É–∑–∏—Ç—å —Å—Ç—Ä–∞–Ω–∏—Ü—É {page + 1}")
                continue
            
            # –ü–∞—Ä—Å–∏–º –æ—Ç–∑—ã–≤—ã —Å–æ —Å—Ç—Ä–∞–Ω–∏—Ü—ã —Å–ø–∏—Å–∫–∞
            previews = self.parse_reviews_from_list(html)
            
            for i, preview in enumerate(previews, 1):
                logging.info(f"–û–±—Ä–∞–±–æ—Ç–∫–∞ –æ—Ç–∑—ã–≤–∞ {i}/{len(previews)}: {preview['title'][:30]}...")
                
                # –ü–æ–ª—É—á–∞–µ–º –ø–æ–ª–Ω—ã–π —Ç–µ–∫—Å—Ç –æ—Ç–∑—ã–≤–∞
                full_data = self.parse_full_review(preview['review_url'])
                
                if full_data:
                    # –û–±—ä–µ–¥–∏–Ω—è–µ–º –¥–∞–Ω–Ω—ã–µ
                    complete_review = {**preview, **full_data}
                    all_reviews.append(complete_review)
                else:
                    # –°–æ—Ö—Ä–∞–Ω—è–µ–º —Ö–æ—Ç—è –±—ã –ø—Ä–µ–≤—å—é
                    preview['full_text'] = preview.get('teaser_text', '')
                    all_reviews.append(preview)
                
                # –ó–∞–¥–µ—Ä–∂–∫–∞ –º–µ–∂–¥—É –∑–∞–ø—Ä–æ—Å–∞–º–∏
                time.sleep(random.uniform(1, 3))
            
            logging.info(f"–°—Ç—Ä–∞–Ω–∏—Ü–∞ {page + 1} –æ–±—Ä–∞–±–æ—Ç–∞–Ω–∞. –í—Å–µ–≥–æ –æ—Ç–∑—ã–≤–æ–≤: {len(all_reviews)}")
            
            # –ó–∞–¥–µ—Ä–∂–∫–∞ –º–µ–∂–¥—É —Å—Ç—Ä–∞–Ω–∏—Ü–∞–º–∏
            if page < pages - 1:
                time.sleep(random.uniform(2, 4))
        
        return all_reviews

    def save_to_csv(self, reviews, filename):
        """–°–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ –æ—Ç–∑—ã–≤–æ–≤ –≤ CSV —Ñ–∞–π–ª"""
        if not reviews:
            logging.error("–ù–µ—Ç –¥–∞–Ω–Ω—ã—Ö –¥–ª—è —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∏—è")
            return False
            
        try:
            # –û–ø—Ä–µ–¥–µ–ª—è–µ–º –≤—Å–µ –≤–æ–∑–º–æ–∂–Ω—ã–µ –ø–æ–ª—è
            fieldnames = [
                'product_name', 'author', 'rating', 'date_created', 'time_created',
                'title', 'teaser_text', 'full_text', 'experience', 'pluses', 
                'minuses', 'verdict', 'detailed_rating', 'review_url', 'scraped_at'
            ]
            
            with open(filename, 'w', newline='', encoding='utf-8') as f:
                writer = csv.DictWriter(f, fieldnames=fieldnames)
                writer.writeheader()
                
                for review in reviews:
                    # –ó–∞–ø–∏—Å—ã–≤–∞–µ–º —Ç–æ–ª—å–∫–æ —Å—É—â–µ—Å—Ç–≤—É—é—â–∏–µ –ø–æ–ª—è
                    row = {field: review.get(field, '') for field in fieldnames}
                    writer.writerow(row)
            
            logging.info(f"–£—Å–ø–µ—à–Ω–æ —Å–æ—Ö—Ä–∞–Ω–µ–Ω–æ {len(reviews)} –æ—Ç–∑—ã–≤–æ–≤ –≤ {filename}")
            return True
            
        except Exception as e:
            logging.error(f"–û—à–∏–±–∫–∞ –ø—Ä–∏ —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∏–∏ –≤ CSV: {e}")
            return False

# –ü—Ä–∏–º–µ—Ä –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏—è
def main():
    # URL —Å—Ç—Ä–∞–Ω–∏—Ü—ã —Å –æ—Ç–∑—ã–≤–∞–º–∏ (–∑–∞–º–µ–Ω–∏—Ç–µ –Ω–∞ –∞–∫—Ç—É–∞–ª—å–Ω—ã–π)
    START_URL = "https://irecommend.ru/catalog/reviews/939-13393"
    
    parser = ReviewParser()
    
    logging.info("–ù–∞—á–∞–ª–æ –ø–∞—Ä—Å–∏–Ω–≥–∞ –æ—Ç–∑—ã–≤–æ–≤...")
    
    # –°–æ–±–∏—Ä–∞–µ–º –æ—Ç–∑—ã–≤—ã (–Ω–∞–ø—Ä–∏–º–µ—Ä, —Å 3 —Å—Ç—Ä–∞–Ω–∏—Ü)
    reviews = parser.scrape_reviews(START_URL, pages=1)
    
    if reviews:
        # –°–æ—Ö—Ä–∞–Ω—è–µ–º –≤ CSV
        filename = f"reviews_{datetime.now().strftime('%Y%m%d_%H%M')}.csv"
        parser.save_to_csv(reviews, filename)
        
        # –í—ã–≤–æ–¥–∏–º —Å—Ç–∞—Ç–∏—Å—Ç–∏–∫—É
        logging.info(f"–°–æ–±—Ä–∞–Ω–æ –æ—Ç–∑—ã–≤–æ–≤: {len(reviews)}")
        logging.info(f"–ü–µ—Ä–≤—ã–π –æ—Ç–∑—ã–≤: {reviews[0]['title']}")
        logging.info(f"–¢–µ–∫—Å—Ç –ø–µ—Ä–≤–æ–≥–æ –æ—Ç–∑—ã–≤–∞: {reviews[0]['full_text'][:100]}...")
    else:
        logging.error("–ù–µ —É–¥–∞–ª–æ—Å—å —Å–æ–±—Ä–∞—Ç—å –æ—Ç–∑—ã–≤—ã")

if __name__ == "__main__":
    main()

2025-10-18 22:32:04,583 - INFO - –ù–∞—á–∞–ª–æ –ø–∞—Ä—Å–∏–Ω–≥–∞ –æ—Ç–∑—ã–≤–æ–≤...
2025-10-18 22:32:04,586 - INFO - –û–±—Ä–∞–±–æ—Ç–∫–∞ —Å—Ç—Ä–∞–Ω–∏—Ü—ã 1: https://irecommend.ru/catalog/reviews/939-13393


KeyboardInterrupt: 

In [4]:
import pandas as pd

In [41]:
df = pd.read_csv('reviews_20251018_2220.csv')
df.head(2)

Unnamed: 0,product_name,author,rating,date_created,time_created,title,teaser_text,full_text,experience,pluses,minuses,verdict,detailed_rating,review_url,scraped_at
0,–ß–∏–ø—Å—ã –∫–∞—Ä—Ç–æ—Ñ–µ–ª—å–Ω—ã–µ Twister –ö–æ–ª–±–∞—Å–∫–∏ –≥—Ä–∏–ª—å —Å –≥–æ...,–°–∞–Ω–¥—É –ú–∞–¥–∞–Ω,3,18.10.2025,18:04,–ü–µ—Ä–µ–±–æ—Ä —Å –æ—Å—Ç—Ä–æ—Ç–æ–π,"–Ø –µ–ª —ç—Ç–∏ —á–∏–ø—Å—ã –æ—á–µ–Ω—å –¥–æ–ª–≥–æ, –µ—â–µ –≥–æ–¥–∞ 2 –Ω–∞–∑–∞–¥ —Å...","–Ø –µ–ª —ç—Ç–∏ —á–∏–ø—Å—ã –æ—á–µ–Ω—å –¥–æ–ª–≥–æ, –µ—â–µ –≥–æ–¥–∞ 2 –Ω–∞–∑–∞–¥ —Å...",–≥–æ–¥ –∏–ª–∏ –±–æ–ª–µ–µ,–°—Ç–æ–∏–º–æ—Å—Ç—å,–°–ª–∏—à–∫–æ–º –æ—Å—Ç—Ä—ã–µ | –•–∏–º–æ–∑–Ω–æ–µ –ø–æ—Å–ª–µ–≤–∫—É—Å–∏–µ,–Ω–µ —Ä–µ–∫–æ–º–µ–Ω–¥—É–µ—Ç,3.0,https://irecommend.ru/content/perebor-s-ostrotoi,2025-10-18 22:15:41
1,"–ß–∏–ø—Å—ã –∫–∞—Ä—Ç–æ—Ñ–µ–ª—å–Ω—ã–µ Lays ""–û–ª–∏–≤—å–µ —Å –ø–µ—Ä–µ–ø–µ–ª–∫–æ–π""",Olga Bogdanova,4,17.10.2025,18:14,"–í –ø–∞—á–∫–∞—Ö Lay's –∑–∞–ø–∞—Ö–ª–æ –ù–æ–≤—ã–º –≥–æ–¥–æ–ºüéÑ‚õÑ, –ø—Ä–æ–±—É—é –Ω...",–ü—Ä–∏–≤–µ—Ç—Å—Ç–≤—É—é –≤—Å–µ—Ö –ù–∞ —É–ª–∏—Ü–∞—Ö –µ—â—ë –Ω–µ –∑–∞–∫–æ–Ω—á–∏–ª—Å—è –∑...,–ü—Ä–∏–≤–µ—Ç—Å—Ç–≤—É—é –≤—Å–µ—Ö\nüëã\n–ù–∞ —É–ª–∏—Ü–∞—Ö –µ—â—ë –Ω–µ –∑–∞–∫–æ–Ω—á–∏–ª...,149 —Ä—É–±.,–í –º–µ—Ä—É —Å–æ–ª–µ–Ω—ã–µ | –ï—Å—Ç—å —Å—Ö–æ–∂–µ—Å—Ç—å –≤–æ –≤–∫—É—Å–µ —Å –æ–ª–∏–≤...,–ú–∞–ª–æ –º—è—Å–Ω–æ–π –∞—Ä–æ–º–∞—Ç–∏–∫–∏ | –ù–µ–æ–±—ã—á–Ω—ã–π –≤–∫—É—Å,—Ä–µ–∫–æ–º–µ–Ω–¥—É–µ—Ç,4.0,https://irecommend.ru/content/v-pachkakh-lays-...,2025-10-18 22:15:41


In [11]:
!pip install psycopg2

Collecting psycopg2
  Obtaining dependency information for psycopg2 from https://files.pythonhosted.org/packages/88/5a/18c8cb13fc6908dc41a483d2c14d927a7a3f29883748747e8cb625da6587/psycopg2-2.9.11-cp313-cp313-win_amd64.whl.metadata
  Downloading psycopg2-2.9.11-cp313-cp313-win_amd64.whl.metadata (5.1 kB)
Downloading psycopg2-2.9.11-cp313-cp313-win_amd64.whl (2.7 MB)
   ---------------------------------------- 0.0/2.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.7 MB 217.8 kB/s eta 0:00:13
    --------------------------------------- 0.0/2.7 MB 306.8 kB/s eta 0:00:09
   - -------------------------------------- 0.1/2.7 MB 660.7 kB/s eta 0:00:04
   --- ------------------------------------ 0.3/2.7 MB 1.2 MB/s eta 0:00:03
   ------- -------------------------------- 0.5/2.7 MB 1.9 MB/s eta 0:00:02
   ------------- -------------------------- 0.9/2.7 MB 2.9 MB/s eta 0:00:01
   ---------------------- ------


[notice] A new release of pip is available: 23.2.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [40]:
import psycopg2

In [44]:
from sqlalchemy import create_engine
import pandas as pd

# Create SQLAlchemy engine
engine = create_engine('postgresql+psycopg2://airflow:airflow@localhost:5433/airflow')

df['combined_created'] = pd.to_datetime(df['date_created'] + ' ' + df['time_created'])
display(df.head(2))

# Now use the engine with to_sql
df.to_sql(
    name='reviews',
    schema='parser', 
    con=engine,
    if_exists='append',
    index=False
)

  df['combined_created'] = pd.to_datetime(df['date_created'] + ' ' + df['time_created'])


Unnamed: 0,product_name,author,rating,date_created,time_created,title,teaser_text,full_text,experience,pluses,minuses,verdict,detailed_rating,review_url,scraped_at,combined_created
0,–ß–∏–ø—Å—ã –∫–∞—Ä—Ç–æ—Ñ–µ–ª—å–Ω—ã–µ Twister –ö–æ–ª–±–∞—Å–∫–∏ –≥—Ä–∏–ª—å —Å –≥–æ...,–°–∞–Ω–¥—É –ú–∞–¥–∞–Ω,3,18.10.2025,18:04,–ü–µ—Ä–µ–±–æ—Ä —Å –æ—Å—Ç—Ä–æ—Ç–æ–π,"–Ø –µ–ª —ç—Ç–∏ —á–∏–ø—Å—ã –æ—á–µ–Ω—å –¥–æ–ª–≥–æ, –µ—â–µ –≥–æ–¥–∞ 2 –Ω–∞–∑–∞–¥ —Å...","–Ø –µ–ª —ç—Ç–∏ —á–∏–ø—Å—ã –æ—á–µ–Ω—å –¥–æ–ª–≥–æ, –µ—â–µ –≥–æ–¥–∞ 2 –Ω–∞–∑–∞–¥ —Å...",–≥–æ–¥ –∏–ª–∏ –±–æ–ª–µ–µ,–°—Ç–æ–∏–º–æ—Å—Ç—å,–°–ª–∏—à–∫–æ–º –æ—Å—Ç—Ä—ã–µ | –•–∏–º–æ–∑–Ω–æ–µ –ø–æ—Å–ª–µ–≤–∫—É—Å–∏–µ,–Ω–µ —Ä–µ–∫–æ–º–µ–Ω–¥—É–µ—Ç,3.0,https://irecommend.ru/content/perebor-s-ostrotoi,2025-10-18 22:15:41,2025-10-18 18:04:00
1,"–ß–∏–ø—Å—ã –∫–∞—Ä—Ç–æ—Ñ–µ–ª—å–Ω—ã–µ Lays ""–û–ª–∏–≤—å–µ —Å –ø–µ—Ä–µ–ø–µ–ª–∫–æ–π""",Olga Bogdanova,4,17.10.2025,18:14,"–í –ø–∞—á–∫–∞—Ö Lay's –∑–∞–ø–∞—Ö–ª–æ –ù–æ–≤—ã–º –≥–æ–¥–æ–ºüéÑ‚õÑ, –ø—Ä–æ–±—É—é –Ω...",–ü—Ä–∏–≤–µ—Ç—Å—Ç–≤—É—é –≤—Å–µ—Ö –ù–∞ —É–ª–∏—Ü–∞—Ö –µ—â—ë –Ω–µ –∑–∞–∫–æ–Ω—á–∏–ª—Å—è –∑...,–ü—Ä–∏–≤–µ—Ç—Å—Ç–≤—É—é –≤—Å–µ—Ö\nüëã\n–ù–∞ —É–ª–∏—Ü–∞—Ö –µ—â—ë –Ω–µ –∑–∞–∫–æ–Ω—á–∏–ª...,149 —Ä—É–±.,–í –º–µ—Ä—É —Å–æ–ª–µ–Ω—ã–µ | –ï—Å—Ç—å —Å—Ö–æ–∂–µ—Å—Ç—å –≤–æ –≤–∫—É—Å–µ —Å –æ–ª–∏–≤...,–ú–∞–ª–æ –º—è—Å–Ω–æ–π –∞—Ä–æ–º–∞—Ç–∏–∫–∏ | –ù–µ–æ–±—ã—á–Ω—ã–π –≤–∫—É—Å,—Ä–µ–∫–æ–º–µ–Ω–¥—É–µ—Ç,4.0,https://irecommend.ru/content/v-pachkakh-lays-...,2025-10-18 22:15:41,2025-10-17 18:14:00


40

In [68]:
df1 = pd.read_sql(sql = "select max(combined_created) from parser.reviews", con = engine)

In [69]:
df1

Unnamed: 0,max
0,2025-10-18 18:04:00


In [67]:
data = df1['max'].iloc[0]
data

Timestamp('2025-10-18 18:04:00')

In [54]:
import numpy as np

In [33]:
import requests
from bs4 import BeautifulSoup
import time
import csv
import logging
import random
from urllib.parse import urljoin
from datetime import datetime

# –ù–∞—Å—Ç—Ä–æ–π–∫–∞ –ª–æ–≥–∏—Ä–æ–≤–∞–Ω–∏—è
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('irecommend_parser.log', encoding='utf-8'),
        logging.StreamHandler()
    ]
)

class IRecommendCacheParser:
    def __init__(self, base_url="https://irecommend.ru"):
        self.base_url = base_url
        self.session = requests.Session()
        self.setup_session()
        
    def setup_session(self):
        """–ù–∞—Å—Ç—Ä–æ–π–∫–∞ —Å–µ—Å—Å–∏–∏"""
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
        })

    def get_through_cached_services(self, url):
        """–ü–æ–ª—É—á–µ–Ω–∏–µ —Å—Ç—Ä–∞–Ω–∏—Ü—ã —á–µ—Ä–µ–∑ –∫—ç—à–∏—Ä–æ–≤–∞–Ω–Ω—ã–µ —Å–µ—Ä–≤–∏—Å—ã"""
        cached_services = [
            self._try_google_cache,
            self._try_archive_org,
        ]
        
        for service in cached_services:
            html = service(url)
            if html:
                return html
            time.sleep(random.uniform(2, 5))
        
        return None

    def _try_google_cache(self, url):
        """–ü–æ–ø—ã—Ç–∫–∞ —á–µ—Ä–µ–∑ Google Cache"""
        try:
            cache_url = f"https://webcache.googleusercontent.com/search?q=cache:{url}"
            response = self.session.get(cache_url, timeout=15)
            if response.status_code == 200 and self._validate_content(response.text):
                logging.info("‚úÖ –£—Å–ø–µ—Ö —á–µ—Ä–µ–∑ Google Cache")
                return response.text
        except Exception as e:
            logging.warning(f"Google Cache –Ω–µ —Å—Ä–∞–±–æ—Ç–∞–ª: {e}")
        return None

    def _try_archive_org(self, url):
        """–ü–æ–ø—ã—Ç–∫–∞ —á–µ—Ä–µ–∑ Archive.org"""
        try:
            # –ü—Ä–æ–±—É–µ–º —Ä–∞–∑–Ω—ã–µ –¥–∞—Ç—ã –¥–ª—è –ø–æ–∏—Å–∫–∞ –∞–∫—Ç—É–∞–ª—å–Ω—ã—Ö –¥–∞–Ω–Ω—ã—Ö
            dates = ["20241019", "20241018", "20241015", "20241010", "20241001"]
            for date in dates:
                archive_url = f"https://web.archive.org/web/{date}/{url}"
                response = self.session.get(archive_url, timeout=15)
                if response.status_code == 200 and self._validate_content(response.text):
                    logging.info(f"‚úÖ –£—Å–ø–µ—Ö —á–µ—Ä–µ–∑ Archive.org ({date})")
                    return response.text
        except Exception as e:
            logging.warning(f"Archive.org –Ω–µ —Å—Ä–∞–±–æ—Ç–∞–ª: {e}")
        return None

    def _validate_content(self, html):
        """–ü—Ä–æ–≤–µ—Ä–∫–∞ —á—Ç–æ –∫–æ–Ω—Ç–µ–Ω—Ç —Å–æ–¥–µ—Ä–∂–∏—Ç –æ—Ç–∑—ã–≤—ã"""
        required_elements = ['smTeaser', 'productName', 'reviewTitle']
        return any(element in html for element in required_elements)

    def parse_reviews_list(self, html):
        """–ü–∞—Ä—Å–∏–Ω–≥ —Å–ø–∏—Å–∫–∞ –æ—Ç–∑—ã–≤–æ–≤"""
        if not html:
            return []
            
        soup = BeautifulSoup(html, 'html.parser')
        reviews = []
        
        # –ò—â–µ–º –±–ª–æ–∫–∏ —Å –æ—Ç–∑—ã–≤–∞–º–∏
        review_blocks = soup.find_all('div', class_='smTeaser')
        
        for block in review_blocks:
            review_data = self.parse_review_block(block)
            if review_data:
                reviews.append(review_data)
        
        logging.info(f"üìù –ù–∞–π–¥–µ–Ω–æ –æ—Ç–∑—ã–≤–æ–≤: {len(reviews)}")
        return reviews

    def parse_review_block(self, block):
        """–ü–∞—Ä—Å–∏–Ω–≥ –æ–¥–Ω–æ–≥–æ –±–ª–æ–∫–∞ –æ—Ç–∑—ã–≤–∞"""
        try:
            # –ù–∞–∑–≤–∞–Ω–∏–µ –ø—Ä–æ–¥—É–∫—Ç–∞
            product_elem = block.find('div', class_='productName')
            product_name = product_elem.get_text(strip=True) if product_elem else "–ù–µ–∏–∑–≤–µ—Å—Ç–Ω—ã–π –ø—Ä–æ–¥—É–∫—Ç"
            
            # –ê–≤—Ç–æ—Ä
            author_elem = block.find('div', class_='authorName')
            author_name = author_elem.get_text(strip=True) if author_elem else "–ê–Ω–æ–Ω–∏–º"
            
            # –†–µ–π—Ç–∏–Ω–≥
            rating = self.extract_rating_from_block(block)
            
            # –î–∞—Ç–∞ –∏ –≤—Ä–µ–º—è
            date_elem = block.find('span', class_='date-created')
            time_elem = block.find('span', class_='time-created')
            date_created = date_elem.get_text(strip=True) if date_elem else ""
            time_created = time_elem.get_text(strip=True) if time_elem else ""
            
            # –ó–∞–≥–æ–ª–æ–≤–æ–∫
            title_elem = block.find('div', class_='reviewTitle')
            title = title_elem.get_text(strip=True) if title_elem else ""
            
            # –¢–µ–∫—Å—Ç –ø—Ä–µ–≤—å—é
            teaser_elem = block.find('span', class_='reviewTeaserText')
            teaser_text = teaser_elem.get_text(strip=True) if teaser_elem else ""
            
            # –°—Å—ã–ª–∫–∞ –Ω–∞ –ø–æ–ª–Ω—ã–π –æ—Ç–∑—ã–≤
            link_elem = block.find('a', class_='reviewTextSnippet')
            review_url = urljoin(self.base_url, link_elem['href']) if link_elem and link_elem.get('href') else ""
            
            return {
                'product_name': product_name,
                'author': author_name,
                'rating': rating,
                'date_created': date_created,
                'time_created': time_created,
                'title': title,
                'teaser_text': teaser_text,
                'review_url': review_url,
                'scraped_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            }
            
        except Exception as e:
            logging.error(f"–û—à–∏–±–∫–∞ –ø–∞—Ä—Å–∏–Ω–≥–∞ –±–ª–æ–∫–∞: {e}")
            return None

    def extract_rating_from_block(self, block):
        """–ò–∑–≤–ª–µ—á–µ–Ω–∏–µ —Ä–µ–π—Ç–∏–Ω–≥–∞"""
        try:
            rating_elem = block.find('div', class_='starsRating')
            if rating_elem:
                # –ò—â–µ–º –∫–ª–∞—Å—Å —Å —Ä–µ–π—Ç–∏–Ω–≥–æ–º –≤ —Ñ–æ—Ä–º–∞—Ç–µ fivestarWidgetStatic-X
                for cls in rating_elem.get('class', []):
                    if 'fivestarWidgetStatic-' in cls:
                        return cls.split('-')[-1]
                
                # –ê–ª—å—Ç–µ—Ä–Ω–∞—Ç–∏–≤–Ω—ã–π –º–µ—Ç–æ–¥: —Å—á–∏—Ç–∞–µ–º –∑–∞–ø–æ–ª–Ω–µ–Ω–Ω—ã–µ –∑–≤–µ–∑–¥—ã
                stars = rating_elem.find_all('div', class_='star')
                filled_stars = sum(1 for star in stars if star.find('div', class_='on'))
                return str(filled_stars)
        except Exception as e:
            logging.warning(f"–û—à–∏–±–∫–∞ –∏–∑–≤–ª–µ—á–µ–Ω–∏—è —Ä–µ–π—Ç–∏–Ω–≥–∞: {e}")
        return "0"

    def parse_full_review(self, url):
        """–ü–∞—Ä—Å–∏–Ω–≥ –ø–æ–ª–Ω–æ–≥–æ –æ—Ç–∑—ã–≤–∞"""
        if not url:
            return {}
            
        logging.info(f"üîç –ü–∞—Ä—Å–∏–º –ø–æ–ª–Ω—ã–π –æ—Ç–∑—ã–≤: {url}")
        html = self.get_through_cached_services(url)
        
        if not html:
            logging.warning(f"–ù–µ —É–¥–∞–ª–æ—Å—å –∑–∞–≥—Ä—É–∑–∏—Ç—å –ø–æ–ª–Ω—ã–π –æ—Ç–∑—ã–≤: {url}")
            return {}
            
        soup = BeautifulSoup(html, 'html.parser')
        review_data = {}
        
        try:
            review_block = soup.find('div', class_='reviewBlock')
            if not review_block:
                return {}
            
            # –ü–æ–ª–Ω—ã–π —Ç–µ–∫—Å—Ç –æ—Ç–∑—ã–≤–∞
            review_body = review_block.find('div', itemprop='reviewBody')
            if review_body:
                review_data['full_text'] = self.clean_text(review_body)
            else:
                review_data['full_text'] = ""
            
            # –û–ø—ã—Ç –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏—è (—Å—Ç–æ–∏–º–æ—Å—Ç—å –∏ —Ç.–¥.)
            experience_data = self.extract_experience_info(review_block)
            review_data['experience'] = experience_data
            
            # –î–æ—Å—Ç–æ–∏–Ω—Å—Ç–≤–∞
            pluses = self.extract_pluses(review_block)
            review_data['pluses'] = ' | '.join(pluses) if pluses else ""
            
            # –ù–µ–¥–æ—Å—Ç–∞—Ç–∫–∏
            minuses = self.extract_minuses(review_block)
            review_data['minuses'] = ' | '.join(minuses) if minuses else ""
            
            # –í–µ—Ä–¥–∏–∫—Ç
            verdict = self.extract_verdict(review_block)
            review_data['verdict'] = verdict
            
            # –î–æ–ø–æ–ª–Ω–∏—Ç–µ–ª—å–Ω–∞—è –∏–Ω—Ñ–æ—Ä–º–∞—Ü–∏—è
            additional_info = self.extract_additional_info(review_block)
            review_data.update(additional_info)
            
            logging.info(f"‚úÖ –ü–æ–ª–Ω—ã–π –æ—Ç–∑—ã–≤: {len(review_data.get('full_text', ''))} —Å–∏–º–≤–æ–ª–æ–≤")
            
        except Exception as e:
            logging.error(f"–û—à–∏–±–∫–∞ –ø–∞—Ä—Å–∏–Ω–≥–∞ –ø–æ–ª–Ω–æ–≥–æ –æ—Ç–∑—ã–≤–∞: {e}")
        
        return review_data

    def extract_experience_info(self, review_block):
        """–ò–∑–≤–ª–µ—á–µ–Ω–∏–µ –∏–Ω—Ñ–æ—Ä–º–∞—Ü–∏–∏ –æ–± –æ–ø—ã—Ç–µ –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏—è"""
        try:
            extra_info = review_block.find('div', class_='extraInfo')
            if extra_info:
                item_data = extra_info.find('div', class_='item-data')
                if item_data:
                    return item_data.get_text(strip=True)
        except:
            pass
        return ""

    def extract_pluses(self, review_block):
        """–ò–∑–≤–ª–µ—á–µ–Ω–∏–µ –¥–æ—Å—Ç–æ–∏–Ω—Å—Ç–≤"""
        try:
            plus_block = review_block.find('div', class_='plus')
            if plus_block:
                plus_items = plus_block.find_all('li')
                return [item.get_text(strip=True) for item in plus_items]
        except:
            pass
        return []

    def extract_minuses(self, review_block):
        """–ò–∑–≤–ª–µ—á–µ–Ω–∏–µ –Ω–µ–¥–æ—Å—Ç–∞—Ç–∫–æ–≤"""
        try:
            minus_block = review_block.find('div', class_='minus')
            if minus_block:
                minus_items = minus_block.find_all('li')
                return [item.get_text(strip=True) for item in minus_items]
        except:
            pass
        return []

    def extract_verdict(self, review_block):
        """–ò–∑–≤–ª–µ—á–µ–Ω–∏–µ –≤–µ—Ä–¥–∏–∫—Ç–∞"""
        try:
            conclusion = review_block.find('div', class_='conclusion')
            if conclusion:
                verdict_elem = conclusion.find('span', class_='verdict')
                if verdict_elem:
                    return verdict_elem.get_text(strip=True)
        except:
            pass
        return ""

    def extract_additional_info(self, review_block):
        """–ò–∑–≤–ª–µ—á–µ–Ω–∏–µ –¥–æ–ø–æ–ª–Ω–∏—Ç–µ–ª—å–Ω–æ–π –∏–Ω—Ñ–æ—Ä–º–∞—Ü–∏–∏"""
        info = {}
        try:
            # –î–∞—Ç–∞ –ø—É–±–ª–∏–∫–∞—Ü–∏–∏ –∏–∑ –ø–æ–ª–Ω–æ–≥–æ –æ—Ç–∑—ã–≤–∞
            date_elem = review_block.find('span', class_='dtreviewed')
            if date_elem:
                info['full_date'] = date_elem.get_text(strip=True)
            
            # –†–µ–π—Ç–∏–Ω–≥ –∏–∑ –ø–æ–ª–Ω–æ–≥–æ –æ—Ç–∑—ã–≤–∞ (–¥–ª—è –ø—Ä–æ–≤–µ—Ä–∫–∏)
            rating_meta = review_block.find('meta', itemprop='ratingValue')
            if rating_meta:
                info['rating_verified'] = rating_meta.get('content', '')
                
        except:
            pass
        
        return info

    def clean_text(self, element):
        """–û—á–∏—Å—Ç–∫–∞ —Ç–µ–∫—Å—Ç–∞"""
        if not element:
            return ""
        
        for br in element.find_all("br"):
            br.replace_with("\n")
        
        text = element.get_text(separator='\n')
        lines = [line.strip() for line in text.split('\n')]
        lines = [line for line in lines if line]
        
        return '\n'.join(lines)

    def scrape_page(self, page_url, max_reviews=10, get_full_reviews=True):
        """–°–±–æ—Ä –æ—Ç–∑—ã–≤–æ–≤ —Å –∫–æ–Ω–∫—Ä–µ—Ç–Ω–æ–π —Å—Ç—Ä–∞–Ω–∏—Ü—ã"""
        logging.info(f"üöÄ –°–±–æ—Ä –¥–∞–Ω–Ω—ã—Ö —Å–æ —Å—Ç—Ä–∞–Ω–∏—Ü—ã: {page_url}")
        
        # –ü–æ–ª—É—á–∞–µ–º —Å—Ç—Ä–∞–Ω–∏—Ü—É —á–µ—Ä–µ–∑ –∫—ç—à
        html = self.get_through_cached_services(page_url)
        
        if not html:
            logging.error(f"‚ùå –ù–µ —É–¥–∞–ª–æ—Å—å –ø–æ–ª—É—á–∏—Ç—å –¥–∞–Ω–Ω—ã–µ —Å–æ —Å—Ç—Ä–∞–Ω–∏—Ü—ã: {page_url}")
            return []
        
        # –ü–∞—Ä—Å–∏–º —Å–ø–∏—Å–æ–∫ –æ—Ç–∑—ã–≤–æ–≤
        preview_reviews = self.parse_reviews_list(html)
        
        if not preview_reviews:
            logging.warning(f"‚ùå –ù–∞ —Å—Ç—Ä–∞–Ω–∏—Ü–µ {page_url} –Ω–µ –Ω–∞–π–¥–µ–Ω–æ –æ—Ç–∑—ã–≤–æ–≤")
            return []
        
        # –û–≥—Ä–∞–Ω–∏—á–∏–≤–∞–µ–º –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ
        preview_reviews = preview_reviews[:max_reviews]
        
        complete_reviews = []
        successful_full = 0
        
        for i, preview in enumerate(preview_reviews, 1):
            logging.info(f"üîÑ –û–±—Ä–∞–±–æ—Ç–∫–∞ –æ—Ç–∑—ã–≤–∞ {i}/{len(preview_reviews)}: {preview['title'][:50]}...")
            
            if get_full_reviews and preview['review_url']:
                # –ü–æ–ª—É—á–∞–µ–º –ø–æ–ª–Ω—ã–µ –¥–∞–Ω–Ω—ã–µ –æ—Ç–∑—ã–≤–∞
                full_data = self.parse_full_review(preview['review_url'])
                
                if full_data:
                    complete_review = {**preview, **full_data}
                    complete_reviews.append(complete_review)
                    successful_full += 1
                else:
                    # –ï—Å–ª–∏ –Ω–µ —É–¥–∞–ª–æ—Å—å –ø–æ–ª—É—á–∏—Ç—å –ø–æ–ª–Ω—ã–µ –¥–∞–Ω–Ω—ã–µ, –∏—Å–ø–æ–ª—å–∑—É–µ–º –ø—Ä–µ–≤—å—é
                    preview['full_text'] = preview.get('teaser_text', '')
                    complete_reviews.append(preview)
            else:
                # –ò—Å–ø–æ–ª—å–∑—É–µ–º —Ç–æ–ª—å–∫–æ –ø—Ä–µ–≤—å—é –¥–∞–Ω–Ω—ã–µ
                preview['full_text'] = preview.get('teaser_text', '')
                complete_reviews.append(preview)
            
            # –ó–∞–¥–µ—Ä–∂–∫–∞ –º–µ–∂–¥—É –æ–±—Ä–∞–±–æ—Ç–∫–æ–π –æ—Ç–∑—ã–≤–æ–≤
            if i < len(preview_reviews):
                time.sleep(random.uniform(2, 4))
        
        logging.info(f"üéâ –°–±–æ—Ä –∑–∞–≤–µ—Ä—à–µ–Ω! –ü–æ–ª–Ω—ã–µ –æ—Ç–∑—ã–≤—ã: {successful_full}/{len(complete_reviews)}")
        return complete_reviews

    def scrape_multiple_pages(self, base_url, pages, max_reviews_per_page=5):
        """–°–±–æ—Ä –æ—Ç–∑—ã–≤–æ–≤ —Å –Ω–µ—Å–∫–æ–ª—å–∫–∏—Ö —Å—Ç—Ä–∞–Ω–∏—Ü"""
        all_reviews = []
        
        for page in pages:
            if page == 1:
                page_url = base_url
            else:
                page_url = f"{base_url}?page={page}"
            
            logging.info(f"üìÑ –û–±—Ä–∞–±–æ—Ç–∫–∞ —Å—Ç—Ä–∞–Ω–∏—Ü—ã {page}")
            
            page_reviews = self.scrape_page(page_url, max_reviews_per_page, get_full_reviews=True)
            all_reviews.extend(page_reviews)
            
            # –ó–∞–¥–µ—Ä–∂–∫–∞ –º–µ–∂–¥—É —Å—Ç—Ä–∞–Ω–∏—Ü–∞–º–∏
            if page < pages[-1]:
                time.sleep(random.uniform(5, 10))
        
        # –£–¥–∞–ª—è–µ–º –¥—É–±–ª–∏–∫–∞—Ç—ã (–Ω–∞ —Å–ª—É—á–∞–π –µ—Å–ª–∏ –æ—Ç–∑—ã–≤—ã –ø–æ–≤—Ç–æ—Ä—è—é—Ç—Å—è –Ω–∞ —Ä–∞–∑–Ω—ã—Ö —Å—Ç—Ä–∞–Ω–∏—Ü–∞—Ö)
        unique_reviews = self.remove_duplicates(all_reviews)
        return unique_reviews

    def remove_duplicates(self, reviews):
        """–£–¥–∞–ª–µ–Ω–∏–µ –¥—É–±–ª–∏–∫–∞—Ç–æ–≤ –æ—Ç–∑—ã–≤–æ–≤"""
        seen = set()
        unique = []
        
        for review in reviews:
            # –°–æ–∑–¥–∞–µ–º —É–Ω–∏–∫–∞–ª—å–Ω—ã–π –∫–ª—é—á –Ω–∞ –æ—Å–Ω–æ–≤–µ –∑–∞–≥–æ–ª–æ–≤–∫–∞ –∏ –∞–≤—Ç–æ—Ä–∞
            key = (review['title'], review['author'])
            if key not in seen:
                seen.add(key)
                unique.append(review)
        
        return unique

    def save_to_csv(self, reviews, filename):
        """–°–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ –≤ CSV"""
        if not reviews:
            logging.error("–ù–µ—Ç –¥–∞–Ω–Ω—ã—Ö –¥–ª—è —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∏—è")
            return False
            
        try:
            fieldnames = [
                'product_name', 'author', 'rating', 'date_created', 'time_created',
                'title', 'teaser_text', 'full_text', 'experience', 'pluses', 
                'minuses', 'verdict', 'full_date', 'rating_verified', 'review_url', 
                'scraped_at'
            ]
            
            with open(filename, 'w', newline='', encoding='utf-8') as f:
                writer = csv.DictWriter(f, fieldnames=fieldnames)
                writer.writeheader()
                
                for review in reviews:
                    row = {field: review.get(field, '') for field in fieldnames}
                    writer.writerow(row)
            
            logging.info(f"üíæ –°–æ—Ö—Ä–∞–Ω–µ–Ω–æ {len(reviews)} –æ—Ç–∑—ã–≤–æ–≤ –≤ {filename}")
            return True
            
        except Exception as e:
            logging.error(f"–û—à–∏–±–∫–∞ —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∏—è: {e}")
            return False

# –§—É–Ω–∫—Ü–∏–∏ –¥–ª—è —Ä–∞–∑–Ω—ã—Ö —Å—Ü–µ–Ω–∞—Ä–∏–µ–≤ –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏—è
def scrape_single_page(page_number=1):
    """–°–±–æ—Ä –¥–∞–Ω–Ω—ã—Ö —Å –æ–¥–Ω–æ–π —Å—Ç—Ä–∞–Ω–∏—Ü—ã"""
    parser = IRecommendCacheParser()
    
    if page_number == 1:
        url = "https://irecommend.ru/catalog/reviews/939-13393"
    else:
        url = f"https://irecommend.ru/catalog/reviews/939-13393?page={page_number}"
    
    reviews = parser.scrape_page(url, max_reviews=10, get_full_reviews=True)
    return reviews

def scrape_multiple_pages(page_numbers):
    """–°–±–æ—Ä –¥–∞–Ω–Ω—ã—Ö —Å –Ω–µ—Å–∫–æ–ª—å–∫–∏—Ö —Å—Ç—Ä–∞–Ω–∏—Ü"""
    parser = IRecommendCacheParser()
    
    reviews = parser.scrape_multiple_pages(
        base_url="https://irecommend.ru/catalog/reviews/939-13393",
        pages=page_numbers,
        max_reviews_per_page=5
    )
    return reviews

def main():
    """–¢–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏–µ —Ä–∞–∑–Ω—ã—Ö —Å—Ü–µ–Ω–∞—Ä–∏–µ–≤"""
    logging.info("üéØ –ó–∞–ø—É—Å–∫ –ø–∞—Ä—Å–µ—Ä–∞ iRecommend —á–µ—Ä–µ–∑ –∫—ç—à")
    
    # –°—Ü–µ–Ω–∞—Ä–∏–π 1: –û–¥–Ω–∞ —Å—Ç—Ä–∞–Ω–∏—Ü–∞
    # reviews = scrape_single_page(page_number=1)
    
    # –°—Ü–µ–Ω–∞—Ä–∏–π 2: –ù–µ—Å–∫–æ–ª—å–∫–æ —Å—Ç—Ä–∞–Ω–∏—Ü
    #reviews = scrape_multiple_pages(page_numbers=[1, 2, 3])
    
    # –°—Ü–µ–Ω–∞—Ä–∏–π 3: –ö–æ–Ω–∫—Ä–µ—Ç–Ω–∞—è —Å—Ç—Ä–∞–Ω–∏—Ü–∞ (–Ω–∞–ø—Ä–∏–º–µ—Ä, 99)
    reviews = scrape_single_page(page_number=93)
    
    if reviews:
        filename = f"irecommend_reviews_{datetime.now().strftime('%Y%m%d_%H%M')}.csv"
        parser = IRecommendCacheParser()
        success = parser.save_to_csv(reviews, filename)
        
        if success:
            logging.info(f"‚úÖ –£—Å–ø–µ—à–Ω–æ —Å–æ–±—Ä–∞–Ω–æ –æ—Ç–∑—ã–≤–æ–≤: {len(reviews)}")
            
            # –°—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞
            full_text_count = sum(1 for r in reviews if r.get('full_text') and len(r['full_text']) > 500)
            pluses_count = sum(1 for r in reviews if r.get('pluses'))
            minuses_count = sum(1 for r in reviews if r.get('minuses'))
            
            logging.info(f"üìä –°—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞:")
            logging.info(f"   - –ü–æ–ª–Ω—ã–µ —Ç–µ–∫—Å—Ç—ã: {full_text_count}")
            logging.info(f"   - –° –ø–ª—é—Å–∞–º–∏: {pluses_count}") 
            logging.info(f"   - –° –º–∏–Ω—É—Å–∞–º–∏: {minuses_count}")
            
            if reviews:
                sample = reviews[0]
                logging.info(f"üìÑ –ü—Ä–∏–º–µ—Ä –æ—Ç–∑—ã–≤–∞: '{sample['title']}'")
        else:
            logging.error("‚ùå –û—à–∏–±–∫–∞ —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∏—è –¥–∞–Ω–Ω—ã—Ö")
    else:
        logging.error("‚ùå –ù–µ —É–¥–∞–ª–æ—Å—å —Å–æ–±—Ä–∞—Ç—å –¥–∞–Ω–Ω—ã–µ")

if __name__ == "__main__":
    main()

2025-10-19 23:38:06,763 - INFO - üéØ –ó–∞–ø—É—Å–∫ –ø–∞—Ä—Å–µ—Ä–∞ iRecommend —á–µ—Ä–µ–∑ –∫—ç—à
2025-10-19 23:38:06,784 - INFO - üöÄ –°–±–æ—Ä –¥–∞–Ω–Ω—ã—Ö —Å–æ —Å—Ç—Ä–∞–Ω–∏—Ü—ã: https://irecommend.ru/catalog/reviews/939-13393?page=93
2025-10-19 23:38:21,402 - ERROR - ‚ùå –ù–µ —É–¥–∞–ª–æ—Å—å –ø–æ–ª—É—á–∏—Ç—å –¥–∞–Ω–Ω—ã–µ —Å–æ —Å—Ç—Ä–∞–Ω–∏—Ü—ã: https://irecommend.ru/catalog/reviews/939-13393?page=93
2025-10-19 23:38:21,410 - ERROR - ‚ùå –ù–µ —É–¥–∞–ª–æ—Å—å —Å–æ–±—Ä–∞—Ç—å –¥–∞–Ω–Ω—ã–µ


In [14]:
!pip install cloudscraper fp.free-proxy



ERROR: Could not find a version that satisfies the requirement fp.free-proxy (from versions: none)
ERROR: No matching distribution found for fp.free-proxy

[notice] A new release of pip is available: 23.2.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import pandas as pd

In [None]:
df = pd.read_csv('irecommend_reviews_20251019_2303.csv')
df.head(1)

Unnamed: 0,product_name,author,rating,date_created,time_created,title,teaser_text,full_text,experience,pluses,minuses,verdict,full_date,rating_verified,review_url,scraped_at
0,"–ß–∏–ø—Å—ã –∫–∞—Ä—Ç–æ—Ñ–µ–ª—å–Ω—ã–µ Lays ""–û–ª–∏–≤—å–µ —Å –ø–µ—Ä–µ–ø–µ–ª–∫–æ–π""",–≥–µ–ª–ª–∞ —Ä–∞–Ω—å—à–µ –ø–µ–ª–∞,5,19.10.2025,21:47,–û—Å–µ–Ω–Ω–∏–π –ù–æ–≤—ã–π –≥–æ–¥ —á–∞—Å—Ç—å –≤—Ç–æ—Ä–∞—è. –£–¥–∞–ª–æ—Å—å –ª–∏ Lay...,"–í–°–ï–ú –ü–†–ò–í–ï–¢–´) ¬† –ù–µ —Å–∫–∞–∂—É, —á—Ç–æ —á–∞—Å—Ç–æ –µ–º —á–∏–ø—Å—ã, ...","–í–°–ï–ú –ü–†–ò–í–ï–¢–´)\n–ù–µ —Å–∫–∞–∂—É, —á—Ç–æ —á–∞—Å—Ç–æ –µ–º —á–∏–ø—Å—ã, –µ...",149 —Ä—É–±–ª–µ–π,–í –º–µ—Ä—É —Å–æ–ª–µ–Ω—ã–µ | –ï—Å—Ç—å —Å—Ö–æ–∂–µ—Å—Ç—å –≤–æ –≤–∫—É—Å–µ —Å –æ–ª–∏–≤...,"–í–∫—É—Å —É–ª–µ—Ç—É—á–∏–ª—Å—è –Ω–∞ –∑–∞–≤—Ç—Ä–∞, —Å–ª–æ–≤–Ω–æ –ø—Ä–æ—à–ª—ã–π –≥–æ–¥ ...",—Ä–µ–∫–æ–º–µ–Ω–¥—É–µ—Ç,"19 –û–∫—Ç—è–±—Ä—å, 2025 - 21:47",5,https://irecommend.ru/content/osennii-novyi-go...,2025-10-19 23:02:35


In [31]:
df = pd.read_csv('irecommend_reviews_20251019_2335.csv')
display(df)
df['full_text'][3]


Unnamed: 0,product_name,author,rating,date_created,time_created,title,teaser_text,full_text,experience,pluses,minuses,verdict,full_date,rating_verified,review_url,scraped_at
0,–ß–∏–ø—Å—ã –∫–∞—Ä—Ç–æ—Ñ–µ–ª—å–Ω—ã–µ Lava Lava –ö—Ä–∞–±–æ-–Ω–∏–Ω–¥–∑—è,XeniumX,3,,,"–¢–∞–∫ –ª–∏ —Ö–æ—Ä–æ—à–∏ —á–∏–ø—Å—ã –æ—Ç –±—É–º–∞–∂–Ω–æ–≥–æ –±–ª–æ–≥–µ—Ä–∞, –∫–∞–∫ ...","–í—Å–µ–º –ø—Ä–∏–≤–µ—Ç! –Ø –¥—É–º–∞—é, –º–Ω–æ–≥–∏–µ —Ä–æ–¥–∏—Ç–µ–ª–∏, —É –∫–æ–≥–æ ...","–í—Å–µ–º –ø—Ä–∏–≤–µ—Ç! –Ø –¥—É–º–∞—é, –º–Ω–æ–≥–∏–µ —Ä–æ–¥–∏—Ç–µ–ª–∏, —É –∫–æ–≥–æ ...",,,,,,,,2025-10-19 23:34:35
1,–ß–∏–ø—Å—ã –∫–∞—Ä—Ç–æ—Ñ–µ–ª—å–Ω—ã–µ Lorenz Crunchips X-Cut —Å–º–µ—Ç...,katerina_5512,5,,,–û—á–µ–Ω—å –∫–∞—á–µ—Å—Ç–≤–µ–Ω–Ω—ã–µ —á–∏–ø—Å—ã –ø–æ –Ω–µ–≤—ã—Å–æ–∫–æ–π —Ü–µ–Ω–µ!,–ß–∏–ø—Å—ã –∫–∞—Ä—Ç–æ—Ñ–µ–ª—å–Ω—ã–µ —Ä–∏—Ñ–ª–µ–Ω—ã–µ ¬´Crunchips X-Cut¬ª ...,–ß–∏–ø—Å—ã –∫–∞—Ä—Ç–æ—Ñ–µ–ª—å–Ω—ã–µ —Ä–∏—Ñ–ª–µ–Ω—ã–µ ¬´Crunchips X-Cut¬ª ...,,,,,,,,2025-10-19 23:34:35
2,–ß–∏–ø—Å—ã –∫–∞—Ä—Ç–æ—Ñ–µ–ª—å–Ω—ã–µ Lays –ú–æ—Ü–∞—Ä–µ–ª–ª–∞ —Å –ø–µ—Å—Ç–æ,katerina_5512,5,,,–í–∫—É—Å—ã —á–∏–ø—Å–æ–≤ —Å –∫–∞–∂–¥—ã–º —Ä–∞–∑–æ–º —Å—Ç–∞–Ω–æ–≤—è—Ç—Å—è –º–µ–Ω–µ–µ –±...,–ß–∏–ø—Å—ã –∏–∑ –Ω–∞—Ç—É—Ä–∞–ª—å–Ω–æ–≥–æ –∫–∞—Ä—Ç–æ—Ñ–µ–ª—è Lays —Å–æ –≤–∫—É—Å–æ–º...,–ß–∏–ø—Å—ã –∏–∑ –Ω–∞—Ç—É—Ä–∞–ª—å–Ω–æ–≥–æ –∫–∞—Ä—Ç–æ—Ñ–µ–ª—è Lays —Å–æ –≤–∫—É—Å–æ–º...,,,,,,,,2025-10-19 23:34:35
3,–ß–∏–ø—Å—ã TWISTER –°—ã—Ä,–ï–∫–∞—Ç–µ—Ä–∏–Ω–∞1703niz,5,,,–•—Ä—É—Å—Ç—è—â–∏–µ –∏ –ª–µ–≥–∫–∏–µ —Å—ã—Ä–Ω—ã–µ —á–∏–ø—Å—ãüßÄ,–í—Å–µ–º –ø—Ä–∏–≤–µ—Ç! –ò–Ω–æ–≥–¥–∞ –æ—á–µ–Ω—å —Ö–æ—á–µ—Ç—Å—è —á–µ–º-–Ω–∏–±—É–¥—å –ø...,–í—Å–µ–º –ø—Ä–∏–≤–µ—Ç! –ò–Ω–æ–≥–¥–∞ –æ—á–µ–Ω—å —Ö–æ—á–µ—Ç—Å—è —á–µ–º-–Ω–∏–±—É–¥—å –ø...,,,,,,,,2025-10-19 23:34:35
4,–ß–∏–ø—Å—ã –∫–∞—Ä—Ç–æ—Ñ–µ–ª—å–Ω—ã–µ Lays –†–∏—Ñ–ª–µ–Ω—ã–µ –û—Å—Ç—Ä—ã–µ –∫—Ä—ã–ª—ã—à–∫–∏,katerina_5512,5,,,–ù–∞—Å–∫–æ–ª—å–∫–æ —É–¥–∞—á–Ω–∞ –Ω–æ–≤–∏–Ω–∫–∞?,–ß–∏–ø—Å—ã –∏–∑ –Ω–∞—Ç—É—Ä–∞–ª—å–Ω–æ–≥–æ –∫–∞—Ä—Ç–æ—Ñ–µ–ª—è Lays —Ä–∏—Ñ–ª–µ–Ω—ã–µ ...,–ß–∏–ø—Å—ã –∏–∑ –Ω–∞—Ç—É—Ä–∞–ª—å–Ω–æ–≥–æ –∫–∞—Ä—Ç–æ—Ñ–µ–ª—è Lays —Ä–∏—Ñ–ª–µ–Ω—ã–µ ...,,,,,,,,2025-10-19 23:34:35


'–í—Å–µ–º –ø—Ä–∏–≤–µ—Ç! –ò–Ω–æ–≥–¥–∞ –æ—á–µ–Ω—å —Ö–æ—á–µ—Ç—Å—è —á–µ–º-–Ω–∏–±—É–¥—å –ø–æ—Ö—Ä—É—Å—Ç–µ—Ç—å, —á–∏–ø—Å–∞–º–∏ –∫–∞–∫–∏–º–∏-–Ω–∏–±—É–¥—å –≤–∫—É—Å–Ω—ã–º–∏ –ü–æ—Å–ª–µ–¥–Ω–µ–µ –ø—Ä–∏–æ–±—Ä–µ—Ç–µ–Ω–∏–µ —ç—Ç–æ —á–∏–ø—Å—ã - TWISTER —Å–æ –≤–∫—É—Å–æ–º —Å—ã—Ä–∞ –æ—Ç –ú–æ—Å–∫–æ–≤—Å–∫–æ–≥–æ –∫–∞—Ä—Ç–æ—Ñ–µ–ª—è.–ü—Ä–æ–¥–∞—é—Ç—Å—è –¥–∞–Ω–Ω—ã–µ —á–∏–ø—Å—ã –≤–æ –≤—Å–µ—Ö —Å—É–ø–µ—Ä–º–∞—Ä–∫–µ—Ç–∞—Ö –∏ –æ–±—ã—á–Ω—ã—Ö –º–∞–≥–∞–∑–∏–Ω–∞—Ö, —Ü–µ–Ω–∞ –∑–∞ —É–ø–∞–∫–æ–≤–∫—É 70 –≥—Ä–∞–º–º –≤ —Ä–∞–π–æ–Ω–µ 60 —Ä—É–±–ª–µ–π.'