In [5]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

import requests
from bs4 import BeautifulSoup
import json

In [6]:
# Base URL of the Trustpilot (French site) review pages; the scraper appends the
# company identifier to it.
input_url = "https://fr.trustpilot.com/review/"
# Not referenced by the visible cells.
extension = '.eu'

# HTML tag searched for each element of a review card.
scrap_tag = {
    'card': 'div',
    'title': 'h2',
    'rating': 'div',
    'review': 'p',
    'date': 'p',
    'reply': 'p',
}
# CSS class searched for each element of a review card.
scrap_class = {
    'card': "styles_cardWrapper__LcCPA",
    'title': 'typography_heading-s__f7029',
    'rating': 'styles_reviewHeader__iU9Px',
    'review': 'typography_body-l__KUYFJ',
    'date': 'typography_body-m__xgxZ_',
    'reply': 'typography_body-m__xgxZ_',
}

# French sentence associated with each rating key '1'..'5'
# (not referenced by the visible cells).
matching_rule = {
    '1': '.\nJe ne suis pas content du tout.\n',
    '2': '.\nJe ne suis pas content.\n',
    '3': '.\nJe suis partagé.\n',
    '4': '.\nJe suis content.\n',
    '5': '.\nJe suis très content.\n',
}

# Candidate labels for the zero-shot classifier, grouped by theme.
reactivity = [
    'réactivité', 'rapidité', 'disponibilité', 'joindre', 'joignable',
    'disponible', 'instantané', 'instantanément', 'rapidement', 'répond',
    'réponse', 'compliqué', 'difficulté', 'difficile', 'long', 'pénible',
    'simple', 'sérieux', 'facile', 'efficace', 'fluide', 'procédure',
    'toujours pas',
]
price = ['compétitif', 'cher', 'prix', 'tarif']
offer = ['remboursement', 'règlement', 'sinistre', 'vol', 'déclaration']
# Flat list of every label, in theme order: reactivity, then price, then offer.
labels = [*reactivity, *price, *offer]

In [3]:
class NlpPipeline:
    """Zero-shot scoring of a review against the module-level ``labels`` list.

    The classifier is a ``zero-shot-classification`` pipeline loaded with the
    ``facebook/bart-large-mnli`` model.
    """

    def __init__(self):
        # NOTE(review): `pipeline` is not imported in the visible cells; it is
        # presumably `transformers.pipeline` -- confirm and import it before use.
        self.classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

    def run(self, complete_reviews: "list[str]"):
        """Score the first review against every candidate label.

        Args:
            complete_reviews: Sequence of review texts. Only the first one is
                scored and returned.

        Returns:
            A one-row pandas.DataFrame with a 'review' column holding the text
            and one column per label holding its score. When
            ``complete_reviews`` is empty, an empty DataFrame with those same
            columns is returned.
        """
        # Slicing (rather than indexing) keeps this working for lists, tuples
        # and pandas Series alike, and yields an empty list for empty input.
        head = list(complete_reviews[:1])
        if not head:
            return pd.DataFrame(columns=['review', *labels])

        # Only the first review ends up in the result, so it is the only one
        # sent to the model.
        # The zero-shot pipeline's option is spelled `multi_label`; the previous
        # `multilabel` spelling is not a recognised option, so independent
        # per-label scoring was never actually enabled.
        result = self.classifier(head[0], labels, multi_label=True)

        row = {'review': result['sequence']}
        row.update(zip(result['labels'], result['scores']))
        return pd.DataFrame([row])

In [8]:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
#from ..config import input_url, extension, scrap_tag, scrap_class

# These settings repeat the values of the configuration cell so that this cell
# can be run on its own.
# Base URL of the Trustpilot (French site) review pages.
input_url = "https://fr.trustpilot.com/review/"
#input_url = "https://fr.trustpilot.com/"
extension = '.eu'

# HTML tag searched for each element of a review card.
scrap_tag = {
    'card': 'div',
    'title': 'h2',
    'rating': 'div',
    'review': 'p',
    'date': 'p',
    'reply': 'p',
}
# CSS class searched for each element of a review card.
scrap_class = {
    'card': "styles_cardWrapper__LcCPA",
    'title': 'typography_heading-s__f7029',
    'rating': 'styles_reviewHeader__iU9Px',
    'review': 'typography_body-l__KUYFJ',
    'date': 'typography_body-m__xgxZ_',
    'reply': 'typography_body-m__xgxZ_',
}



class ReviewScrapper:
    """Scrape the Trustpilot review pages of one company into a DataFrame.

    Relies on the module-level ``input_url``, ``scrap_tag`` and ``scrap_class``
    settings for the base URL and for the HTML tags / CSS classes to look up.
    """

    # Seconds to wait for a response before giving up on a request; without a
    # timeout a stalled connection would block the scrape indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self, company: str, page_max=100):
        """Prepare a scraper for one company.

        Args:
            company: Identifier appended to ``input_url`` to form the review
                URL (e.g. 'www.carrefour.fr').
            page_max: Upper bound on the number of result pages to download.
        """
        self.url = input_url + company
        self.page_max = page_max

    def scrap_data(self):
        """Download the review pages one by one and gather their content.

        Returns:
            pandas.DataFrame with one row per review card and the columns
            'titles', 'ratings', 'reviews', 'dates', 'replies', 'reply_dates'.
        """
        # Same order as the tuple returned by `_get_content`.
        columns = ('titles', 'ratings', 'reviews', 'dates', 'replies', 'reply_dates')
        collected = {name: [] for name in columns}
        for index in range(self.number_of_pages):
            # Page numbers in the URL start at 1.
            url = self.url + "?page=" + str(index + 1)
            print('Scrapping page ',index, ' from url ', url)
            page_content = self._get_content(self._get_soup(url))
            for name, values in zip(columns, page_content):
                collected[name] += values
        return pd.DataFrame(collected)

    @staticmethod
    def _get_soup(url: str) -> "BeautifulSoup":
        """Fetch ``url`` and parse the response body with the 'html.parser' backend.

        Raises:
            requests.exceptions.Timeout: if no response arrives within
                ``REQUEST_TIMEOUT`` seconds.
        """
        page = requests.get(url, timeout=ReviewScrapper.REQUEST_TIMEOUT)
        return BeautifulSoup(page.content, 'html.parser')

    @staticmethod
    def _get_content(soup: "BeautifulSoup"):
        """Extract the fields of every review card found in ``soup``.

        Returns:
            A tuple of six equally long lists:
            (titles, ratings, reviews, dates, replies, reply_dates).
            A field that cannot be found in a card is stored as '' (or '-1'
            for the rating).
        """

        def _join_contents(elements):
            """Concatenate the children of the first element, turning '<br/>' into a newline."""
            if not elements:
                return ''
            return ''.join('\n' if str(child) == '<br/>' else str(child)
                           for child in elements[0].contents)

        def _get_review_title(_card):
            return _join_contents(_card.find_all(scrap_tag['title'], class_=scrap_class['title']))

        def _get_review_rating(_card):
            headers = _card.find_all(scrap_tag['rating'], class_=scrap_class['rating'])
            if not headers:
                # Sentinel for "no rating header in this card". The previous code
                # called `.get` on this string and crashed with AttributeError.
                return '-1'
            return headers[0].get('data-service-review-rating')

        def _get_review(_card):
            return _join_contents(_card.find_all(scrap_tag['review'], class_=scrap_class['review']))

        def _get_review_date(_card):
            times = _card.find_all('time')
            return times[0].get('datetime') if times else ''

        def _get_reply(_card):
            paragraphs = _card.find_all(scrap_tag['reply'], class_=scrap_class['reply'])
            # The reply text is read from the third matching paragraph, so at
            # least three are required (the old `1 < len` guard raised
            # IndexError when exactly two paragraphs matched).
            if len(paragraphs) < 3:
                return ''
            contents = paragraphs[2].contents
            # Converted to a plain `str` so the result does not keep a reference
            # to the parsed document.
            return str(contents[0]) if contents else ''

        def _get_reply_date(_card):
            # The second <time> element of a card, when present, is taken as the reply date.
            times = _card.find_all('time')
            return times[1].get('datetime') if 1 < len(times) else ''

        titles, ratings, reviews, dates, replies, reply_dates = [], [], [], [], [], []
        for card in soup.find_all(scrap_tag['card'], class_=scrap_class['card']):
            titles.append(_get_review_title(card))
            ratings.append(_get_review_rating(card))
            reviews.append(_get_review(card))
            dates.append(_get_review_date(card))
            replies.append(_get_reply(card))
            reply_dates.append(_get_reply_date(card))
        return titles, ratings, reviews, dates, replies, reply_dates

    @property
    def number_of_pages(self):
        """Number of pages to scrape.

        When ``page_max`` is 1 or less it is returned as is and no request is
        made. Otherwise the first page is downloaded, the total page count is
        read from the JSON held in the ``__NEXT_DATA__`` script tag, and the
        smaller of that count and ``page_max`` is returned.

        Raises:
            IndexError, AttributeError: if the page does not contain the
                expected script tag / JSON structure (e.g. unexpected URL).
        """
        if self.page_max <= 1:
            return self.page_max

        # @Insaf # These lines of code will throw an error if the url is not as expected
        soup = self._get_soup(self.url)
        raw_json = soup.find_all("script", id="__NEXT_DATA__")[0].contents[0]
        page_data = json.loads(raw_json)
        total_pages: int = \
            page_data.get('props').get('pageProps').get('filters').get('pagination').get('totalPages')
        return min(total_pages, self.page_max)

In [9]:
# @Insaf: `company` is the identifier used in the Trustpilot review URL. It is not
# always the plain company name -- sometimes it is another URL -- so it has to be
# adjusted for each company.
#company = 'leocare.eu'
company = 'www.carrefour.fr'
# @Insaf: upper bound on the number of pages returned; note that scraping takes time.
max_pages = 100
# @Insaf: create the ReviewScrapper object for the selected company.
RC = ReviewScrapper(company, page_max=max_pages)

In [10]:
# DataFrame receiving the scraped Carrefour reviews. The pages are downloaded
# one by one and a progress line is printed for each of them.
carrefour_df = RC.scrap_data()

Scrapping page  0  from url  https://fr.trustpilot.com/review/www.carrefour.fr?page=1
Scrapping page  1  from url  https://fr.trustpilot.com/review/www.carrefour.fr?page=2
Scrapping page  2  from url  https://fr.trustpilot.com/review/www.carrefour.fr?page=3
Scrapping page  3  from url  https://fr.trustpilot.com/review/www.carrefour.fr?page=4
Scrapping page  4  from url  https://fr.trustpilot.com/review/www.carrefour.fr?page=5
Scrapping page  5  from url  https://fr.trustpilot.com/review/www.carrefour.fr?page=6
Scrapping page  6  from url  https://fr.trustpilot.com/review/www.carrefour.fr?page=7
Scrapping page  7  from url  https://fr.trustpilot.com/review/www.carrefour.fr?page=8
Scrapping page  8  from url  https://fr.trustpilot.com/review/www.carrefour.fr?page=9
Scrapping page  9  from url  https://fr.trustpilot.com/review/www.carrefour.fr?page=10
Scrapping page  10  from url  https://fr.trustpilot.com/review/www.carrefour.fr?page=11
Scrapping page  11  from url  https://fr.trustpilot

In [21]:
# NOTE(review): absolute, machine-specific path that is not used by the visible
# cells; consider a path relative to the project (or a configurable data directory).
CARREFOUR_DATA_FILEPATH = "/Users/carlosito/Library/CloudStorage/OneDrive-Personal/Personal Document/Hexamind/hexamind_code/Carrefour/Clustering/FinalClassfier/data"
# The export now carries the ".csv" extension, like the Lidl, Auchan and Leclerc
# exports. NOTE(review): update any reader that still opens "final_carrefour_df".
carrefour_df.to_csv("final_carrefour_df.csv")

## For E.Leclerc

In [11]:
# Scraper for the E.Leclerc review pages, limited to `max_pages` pages.
company = 'www.e-leclerc.com'
max_pages = 100
leclerc_RC = ReviewScrapper(company, page_max=max_pages)

In [13]:
# Scrape the E.Leclerc review pages into a DataFrame (progress is printed per page).
leclerc_df = leclerc_RC.scrap_data()

Scrapping page  0  from url  https://fr.trustpilot.com/review/www.e-leclerc.com?page=1
Scrapping page  1  from url  https://fr.trustpilot.com/review/www.e-leclerc.com?page=2
Scrapping page  2  from url  https://fr.trustpilot.com/review/www.e-leclerc.com?page=3
Scrapping page  3  from url  https://fr.trustpilot.com/review/www.e-leclerc.com?page=4
Scrapping page  4  from url  https://fr.trustpilot.com/review/www.e-leclerc.com?page=5
Scrapping page  5  from url  https://fr.trustpilot.com/review/www.e-leclerc.com?page=6
Scrapping page  6  from url  https://fr.trustpilot.com/review/www.e-leclerc.com?page=7
Scrapping page  7  from url  https://fr.trustpilot.com/review/www.e-leclerc.com?page=8
Scrapping page  8  from url  https://fr.trustpilot.com/review/www.e-leclerc.com?page=9
Scrapping page  9  from url  https://fr.trustpilot.com/review/www.e-leclerc.com?page=10
Scrapping page  10  from url  https://fr.trustpilot.com/review/www.e-leclerc.com?page=11
Scrapping page  11  from url  https://fr

In [14]:
# (rows, columns) of the scraped E.Leclerc reviews.
leclerc_df.shape

(1971, 6)

## For Auchan


In [15]:
# Scrape the Auchan review pages (at most `max_pages` pages) and show the
# resulting (rows, columns).
company = 'www.auchan.fr'
max_pages = 100
auchan_RC = ReviewScrapper(company, page_max=max_pages)
auchan_df = auchan_RC.scrap_data()
print(auchan_df.shape)

Scrapping page  0  from url  https://fr.trustpilot.com/review/www.auchan.fr?page=1
Scrapping page  1  from url  https://fr.trustpilot.com/review/www.auchan.fr?page=2
Scrapping page  2  from url  https://fr.trustpilot.com/review/www.auchan.fr?page=3
Scrapping page  3  from url  https://fr.trustpilot.com/review/www.auchan.fr?page=4
Scrapping page  4  from url  https://fr.trustpilot.com/review/www.auchan.fr?page=5
Scrapping page  5  from url  https://fr.trustpilot.com/review/www.auchan.fr?page=6
Scrapping page  6  from url  https://fr.trustpilot.com/review/www.auchan.fr?page=7
Scrapping page  7  from url  https://fr.trustpilot.com/review/www.auchan.fr?page=8
Scrapping page  8  from url  https://fr.trustpilot.com/review/www.auchan.fr?page=9
Scrapping page  9  from url  https://fr.trustpilot.com/review/www.auchan.fr?page=10
Scrapping page  10  from url  https://fr.trustpilot.com/review/www.auchan.fr?page=11
Scrapping page  11  from url  https://fr.trustpilot.com/review/www.auchan.fr?page=12

## For Lidl

In [16]:
# Scrape the Lidl review pages (at most `max_pages` pages) and show the
# resulting (rows, columns).
company = 'www.lidl.fr'
max_pages = 100
lidl_RC = ReviewScrapper(company, page_max=max_pages)
lidl_df = lidl_RC.scrap_data()
print(lidl_df.shape)

Scrapping page  0  from url  https://fr.trustpilot.com/review/www.lidl.fr?page=1
Scrapping page  1  from url  https://fr.trustpilot.com/review/www.lidl.fr?page=2
Scrapping page  2  from url  https://fr.trustpilot.com/review/www.lidl.fr?page=3
Scrapping page  3  from url  https://fr.trustpilot.com/review/www.lidl.fr?page=4
Scrapping page  4  from url  https://fr.trustpilot.com/review/www.lidl.fr?page=5
Scrapping page  5  from url  https://fr.trustpilot.com/review/www.lidl.fr?page=6
Scrapping page  6  from url  https://fr.trustpilot.com/review/www.lidl.fr?page=7
Scrapping page  7  from url  https://fr.trustpilot.com/review/www.lidl.fr?page=8
Scrapping page  8  from url  https://fr.trustpilot.com/review/www.lidl.fr?page=9
Scrapping page  9  from url  https://fr.trustpilot.com/review/www.lidl.fr?page=10
Scrapping page  10  from url  https://fr.trustpilot.com/review/www.lidl.fr?page=11
Scrapping page  11  from url  https://fr.trustpilot.com/review/www.lidl.fr?page=12
Scrapping page  12  fro

In [17]:
# Write each retailer's scraped reviews to its own CSV file (relative paths).
for _frame, _csv_name in (
    (lidl_df, "final_lidl_df.csv"),
    (auchan_df, "final_auchan_df.csv"),
    (leclerc_df, "final_leclerc_df.csv"),
):
    _frame.to_csv(_csv_name)