# Comeup Services Scraper - Step by Step Analysis

This notebook refactors the Comeup scraper into a progressive, functional approach for better understanding and debugging of the scraping process.

## 1. Import Required Libraries

Import all necessary libraries for web scraping, data manipulation, and analysis.

In [1]:
import json
import re
import time
from datetime import datetime
from pathlib import Path
from urllib.parse import urljoin

import matplotlib.pyplot as plt
import pandas as pd
import requests
import seaborn as sns
from bs4 import BeautifulSoup

# Status marker: this line is only reached once every import above has succeeded.
print("All libraries imported successfully!")

All libraries imported successfully!


## 2. Setup Session and Configuration

Initialize the requests session, set headers, and define configuration variables.

In [2]:
# Configuration
# Site root; scrape_services_list resolves the links it finds against this
# value with urljoin.
BASE_URL = "https://comeup.com"

# Create session with proper headers
# One Session object is shared by every session.get() call in the notebook;
# a browser-like User-Agent is set on it once here.
session = requests.Session()
session.headers.update({
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
})

# Global variables for data collection
# NOTE(review): no code visible in this notebook reads or appends to this
# list (the pipeline returns its own list) — confirm whether it is still needed.
scraped_services = []

# Echo the effective configuration so it is visible in the cell output.
print("Session configured successfully!")
print(f"Base URL: {BASE_URL}")
print(f"User Agent: {session.headers['User-Agent']}")

Session configured successfully!
Base URL: https://comeup.com
User Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36


## 3. Define Data Extraction Functions

Create individual functions for extracting different pieces of data from service pages.

In [3]:
def extract_seller_name(soup):
    """Return the seller's name from a parsed service page.

    The CSS selectors below are tried in order and the stripped text of the
    first element that matches is returned.

    Args:
        soup: Parsed page exposing ``select_one`` (e.g. a BeautifulSoup object).

    Returns:
        The seller name as a string, or "N/A" when no selector matches.
    """
    candidate_selectors = (
        '.seller-name',
        '.username',
        '[data-testid="seller-name"]',
        '.profile-username',
    )
    # Lazy generator: selectors after the first hit are never queried.
    lookups = (soup.select_one(css) for css in candidate_selectors)
    first_hit = next((node for node in lookups if node), None)
    if first_hit is None:
        return "N/A"
    return first_hit.get_text(strip=True)

def extract_service_title(soup):
    """Return the service title from a parsed service page.

    Tries the first ``h1``, then ``.service-title``, then
    ``[data-testid="service-title"]``, and returns the stripped text of the
    first element found.

    Args:
        soup: Parsed page exposing ``select_one`` (e.g. a BeautifulSoup object).

    Returns:
        The title as a string, or "N/A" when none of the selectors matches.
    """
    title_selectors = ('h1', '.service-title', '[data-testid="service-title"]')
    # map/filter are lazy, so lookup stops at the first matching selector.
    found = filter(None, map(soup.select_one, title_selectors))
    heading = next(found, None)
    return "N/A" if heading is None else heading.get_text(strip=True)

def categorize_by_keywords(text):
    """Guess a service category from keywords contained in ``text``.

    Matching is a case-insensitive substring test. Categories are checked in
    the order listed below and the first one with any matching keyword wins.

    Args:
        text: Free text to classify (for example a service title).

    Returns:
        The matching category label, or 'Autre' when no keyword is found.
    """
    keyword_table = (
        ('Marketing Digital', ('email', 'marketing', 'publicité', 'ads', 'seo', 'social media')),
        ('Rédaction', ('rédiger', 'écrire', 'contenu', 'article', 'blog')),
        ('Design', ('logo', 'design', 'graphique', 'bannière', 'visuel')),
        ('Développement', ('site web', 'application', 'développement', 'code')),
        ('Vidéo', ('montage', 'vidéo', 'animation', 'motion')),
        ('Audio', ('voix off', 'musique', 'audio', 'podcast')),
        ('Traduction', ('traduction', 'traduire', 'langue')),
        ('Business', ('business plan', 'stratégie', 'conseil')),
    )

    haystack = text.lower()
    matching_labels = (
        label
        for label, words in keyword_table
        if any(word in haystack for word in words)
    )
    return next(matching_labels, 'Autre')

def extract_category(soup):
    """Return the category of a service page.

    The selectors below are tried in order and the stripped text of the first
    matching element is returned. When none matches, the category is inferred
    from the service title via ``extract_service_title`` and
    ``categorize_by_keywords``.

    Args:
        soup: Parsed page exposing ``select_one`` (e.g. a BeautifulSoup object).

    Returns:
        The category text from the page, or the keyword-based guess
        (possibly 'Autre') when the page exposes no category element.
    """
    for css in ('.breadcrumb li:last-child', '.category-name', '[data-testid="category"]'):
        node = soup.select_one(css)
        if not node:
            continue
        return node.get_text(strip=True)

    # Nothing on the page names the category: fall back to a guess from the title.
    return categorize_by_keywords(extract_service_title(soup))

# Status marker: confirms the definitions in this cell were executed.
print("Extraction functions defined successfully!")

Extraction functions defined successfully!


In [4]:
def extract_price(soup):
    """Extract the service price from a parsed service page.

    The selectors below are tried in order. For each matching element the
    first number in its text is parsed; if the element contains no number the
    next selector is tried.

    Number formats understood:
      * plain integers and decimals with ``,`` or ``.`` ("5", "12,50", "12.5");
      * thousands groups separated by a space, no-break space or narrow
        no-break space, as used in French formatting ("1 200", "1 200,50").
        Previously such prices were cut at the first space ("1 200 €" -> 1.0).

    Not handled: ``.``/``,`` used as a thousands separator ("1.200,50").

    Args:
        soup: Parsed page exposing ``select_one`` (e.g. a BeautifulSoup object).

    Returns:
        The price as a float, or 0.0 when no price could be found.
    """
    selectors = ('.price', '.service-price', '[data-testid="price"]')
    # Alternative 1: 1-3 digits followed by one or more groups of exactly three
    # digits, each introduced by a single spacing character; the (?!\d) guard
    # rejects runs such as "1 2000" that are not real thousands groups.
    # Alternative 2: a plain number.
    number_re = re.compile(
        r'\d{1,3}(?:[ \u00a0\u202f]\d{3})+(?!\d)(?:[.,]\d+)?'
        r'|\d+(?:[.,]\d+)?'
    )

    for selector in selectors:
        element = soup.select_one(selector)
        if not element:
            continue
        price_match = number_re.search(element.get_text(strip=True))
        if price_match:
            # Drop the grouping spaces, then normalise a decimal comma.
            digits = re.sub(r'[ \u00a0\u202f]', '', price_match.group(0))
            return float(digits.replace(',', '.'))
    return 0.0

def extract_rating(soup):
    """Extract the average rating from a parsed service page.

    The selectors below are tried in order. The first number found in a
    matching element's text is returned; a decimal comma is accepted. An
    element without any number is skipped and the next selector is tried.

    Args:
        soup: Parsed page exposing ``select_one`` (e.g. a BeautifulSoup object).

    Returns:
        The rating as a float, or 0.0 when no rating could be found.
    """
    number_pattern = re.compile(r'(\d+(?:[,\.]\d+)?)')

    for css in ('.rating', '.stars', '[data-testid="rating"]'):
        node = soup.select_one(css)
        if not node:
            continue
        found = number_pattern.search(node.get_text(strip=True))
        if found is None:
            continue
        return float(found.group(1).replace(',', '.'))
    return 0.0

def extract_sales_count(soup):
    """Extract the total number of sales from a parsed service page.

    The selectors below are tried in order. The first integer found in a
    matching element's text is returned; an element without digits is skipped.

    Thousands groups separated by a space, no-break space or narrow no-break
    space are read as one number ("1 234 ventes" -> 1234). Previously the
    match stopped at the first space and returned 1.

    Args:
        soup: Parsed page exposing ``select_one`` (e.g. a BeautifulSoup object).

    Returns:
        The sales count as an int, or 0 when none could be found.
    """
    selectors = ('.sales-count', '.orders-completed', '[data-testid="sales"]')
    # Alternative 1: space-grouped thousands (groups of exactly three digits,
    # not followed by a further digit). Alternative 2: a plain run of digits.
    count_re = re.compile(r'\d{1,3}(?:[ \u00a0\u202f]\d{3})+(?!\d)|\d+')

    for selector in selectors:
        element = soup.select_one(selector)
        if not element:
            continue
        sales_match = count_re.search(element.get_text(strip=True))
        if sales_match:
            return int(re.sub(r'[ \u00a0\u202f]', '', sales_match.group(0)))
    return 0

def extract_reviews(soup):
    """Count positive and negative reviews on a parsed service page.

    Every element matching ``.review``, ``.comment`` or ``.feedback`` is
    classified by keyword: it counts as positive if it contains a positive
    keyword, otherwise as negative if it contains a negative keyword,
    otherwise it is ignored.

    A keyword must start at a word boundary (it may still be the beginning of
    a longer word, e.g. "satisfaite"). With plain substring matching,
    "insatisfait" contained the positive keyword "satisfait", so dissatisfied
    reviews were counted as positive and the negative keyword "insatisfait"
    could never match.

    Limitation: negations such as "pas satisfait" are still counted as positive.

    Args:
        soup: Parsed page exposing ``select`` (e.g. a BeautifulSoup object).

    Returns:
        dict with integer counts under the keys 'positifs' and 'negatifs'.
    """
    positive_keywords = ('excellent', 'parfait', 'recommande', 'super', 'génial', 'satisfait')
    negative_keywords = ('décevant', 'mauvais', 'problème', 'insatisfait', 'nul')

    def _word_start_pattern(words):
        # (?<!\w) requires that the keyword is not preceded by a word character.
        return re.compile('|'.join(r'(?<!\w)' + re.escape(word) for word in words))

    # Built once per call instead of once per review.
    positive_re = _word_start_pattern(positive_keywords)
    negative_re = _word_start_pattern(negative_keywords)

    reviews = {'positifs': 0, 'negatifs': 0}

    for review in soup.select('.review, .comment, .feedback'):
        # Join nested text nodes with a space so that words coming from
        # adjacent tags are not glued together before the boundary check.
        review_text = review.get_text(separator=' ', strip=True).lower()

        if positive_re.search(review_text):
            reviews['positifs'] += 1
        elif negative_re.search(review_text):
            reviews['negatifs'] += 1

    return reviews

def extract_seller_since(soup):
    """Return the text telling since when the seller has been registered.

    The selectors below are tried in order and the stripped text of the first
    matching element is returned unchanged (no date parsing is done).

    Args:
        soup: Parsed page exposing ``select_one`` (e.g. a BeautifulSoup object).

    Returns:
        The "member since" text, or "N/A" when no selector matches.
    """
    for css in ('.member-since', '.seller-since', '[data-testid="member-since"]'):
        node = soup.select_one(css)
        if not node:
            continue
        return node.get_text(strip=True)
    return "N/A"

# Status marker: confirms the definitions in this cell were executed.
print("All data extraction functions ready!")

All data extraction functions ready!


## 4. Test Single Service Scraping

Test the extraction functions on a single service URL to verify they work correctly.

In [None]:
def extract_service_data(service_url, timeout=15):
    """Download one service page and extract all of its fields.

    The page is fetched with the notebook-level ``session``, parsed with
    BeautifulSoup and passed to the individual ``extract_*`` helpers.

    Args:
        service_url: Absolute URL of the service page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook indefinitely.

    Returns:
        dict with the keys 'url', 'nom_vendeur', 'titre_service', 'categorie',
        'prix', 'note', 'nombre_vente_total', 'avis_positifs', 'avis_negatifs',
        'vendeur_depuis' and 'date_scraping' (local time, "%Y-%m-%d %H:%M:%S"),
        or None when the download or the extraction fails. Failures are
        printed, never raised.
    """
    try:
        print(f"Extracting data from: {service_url}")
        response = session.get(service_url, timeout=timeout)
        # Treat HTTP errors (403, 404, 5xx...) as failures; otherwise an error
        # page would be parsed and recorded as a row full of "N/A" / 0 values.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Reviews are extracted once and feed two output fields.
        reviews = extract_reviews(soup)

        return {
            'url': service_url,
            'nom_vendeur': extract_seller_name(soup),
            'titre_service': extract_service_title(soup),
            'categorie': extract_category(soup),
            'prix': extract_price(soup),
            'note': extract_rating(soup),
            'nombre_vente_total': extract_sales_count(soup),
            'avis_positifs': reviews['positifs'],
            'avis_negatifs': reviews['negatifs'],
            'vendeur_depuis': extract_seller_since(soup),
            'date_scraping': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        }

    except Exception as e:
        # Deliberately broad: callers treat None as "skip this service".
        print(f"Erreur lors de l'extraction de {service_url}: {e}")
        return None

# Test with a sample URL (you'll need to replace with an actual Comeup service URL)
# The URL below is a placeholder; the extraction call further down is left
# commented out, so running this cell sends no request.
test_url = "https://comeup.com/fr/service/example"  # Replace with actual URL
print("Testing single service extraction...")
print("(Note: Replace test_url with an actual Comeup service URL to test)")

# Uncomment the lines below when you have a real URL to test
# (extract_service_data returns None on failure, hence the `if test_data` guard)
# test_data = extract_service_data(test_url)
# if test_data:
#     for key, value in test_data.items():
#         print(f"{key}: {value}")

## 5. Create Category Scraping Function

Build a function to scrape service URLs from category pages with pagination support.

In [None]:
def scrape_services_list(category_url, max_pages=5, timeout=15):
    """Collect the service URLs listed on the pages of one category.

    Pages 1..max_pages are requested in turn with a ``page`` query parameter.
    Every link whose ``href`` contains "/service/" is resolved against
    ``BASE_URL`` and kept once. Pagination stops as soon as a page adds no new
    URL. A page that fails (network error or HTTP error status) is reported
    and skipped.

    Args:
        category_url: URL of the category listing.
        max_pages: Maximum number of pages to request.
        timeout: Seconds to wait for each response before giving up.

    Returns:
        List of absolute service URLs, without duplicates, in discovery order.
    """
    services_urls = []
    seen_urls = set()  # O(1) duplicate check; the list keeps discovery order

    print(f"Scraping category: {category_url}")

    for page in range(1, max_pages + 1):
        # Pause between consecutive requests, including after a failed page.
        if page > 1:
            time.sleep(1)

        try:
            print(f"  Scraping page {page}...")

            # Passing the page number through `params` lets requests build the
            # query string, which also works when category_url already has one.
            response = session.get(category_url, params={'page': page}, timeout=timeout)
            # Surface HTTP errors instead of parsing an error page, which would
            # look like "no more services" and silently end pagination.
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            page_services = 0
            for link in soup.select('a[href*="/service/"]'):
                service_url = urljoin(BASE_URL, link['href'])
                if service_url in seen_urls:
                    continue
                seen_urls.add(service_url)
                services_urls.append(service_url)
                page_services += 1

            print(f"    Found {page_services} new services on page {page}")

            # If no services found, probably reached the end
            if page_services == 0:
                print(f"    No new services found on page {page}, stopping pagination")
                break

        except Exception as e:
            # Best effort: report the failing page and try the next one.
            print(f"    Erreur page {page}: {e}")
            continue

    print(f"  Total services found: {len(services_urls)}")
    return services_urls

# Test the function (replace with actual category URL)
test_category = "https://comeup.com/fr/best-services"
print("Testing category scraping...")
# NOTE: the message below is printed even though the scraping call is
# commented out; no request is sent until the lines further down are uncommented.
print("(Note: This will attempt to scrape actual URLs)")

# Uncomment to test with real URLs
# (max_pages=2 keeps the trial run to at most two listing requests)
# test_services = scrape_services_list(test_category, max_pages=2)
# print(f"Found {len(test_services)} services")
# if test_services:
#     print("First 5 URLs:")
#     for url in test_services[:5]:
#         print(f"  {url}")

## 6. Build Complete Scraping Pipeline

Combine all functions into a complete pipeline that scrapes multiple categories.

In [None]:
def scrape_multiple_categories(category_urls, max_pages_per_category=3, max_services_per_category=10):
    """Run the full scraping pipeline over several categories.

    For each category the service URLs are collected with
    ``scrape_services_list``, truncated to ``max_services_per_category`` and
    scraped one by one with ``extract_service_data``. Each successful record
    gets a 'source_category' key holding the category URL. A two-second pause
    follows every service request. An error in one category is printed and the
    next category is processed.

    Args:
        category_urls: Sequence of category listing URLs.
        max_pages_per_category: Listing pages to read per category.
        max_services_per_category: Services to scrape per category.

    Returns:
        List of service record dicts (possibly empty).
    """
    collected = []
    category_total = len(category_urls)

    print(f"Starting scraping pipeline for {category_total} categories")
    print(f"Max pages per category: {max_pages_per_category}")
    print(f"Max services per category: {max_services_per_category}")
    print("-" * 50)

    for position, category_url in enumerate(category_urls, 1):
        print(f"\n[{position}/{category_total}] Processing category: {category_url}")

        try:
            # Collect the listing, then keep only the first N services.
            found_urls = scrape_services_list(category_url, max_pages_per_category)
            selected_urls = found_urls[:max_services_per_category]
            selected_total = len(selected_urls)

            for rank, service_url in enumerate(selected_urls, 1):
                print(f"  [{rank}/{selected_total}] Extracting: {service_url}")

                record = extract_service_data(service_url)
                if not record:
                    print(f"    ✗ Failed to extract data")
                else:
                    record['source_category'] = category_url
                    collected.append(record)
                    print(f"    ✓ Success: {record['titre_service'][:50]}...")

                time.sleep(2)  # pause between requests

        except Exception as e:
            print(f"  Error processing category {category_url}: {e}")
            continue

    print("\n" + "=" * 50)
    print(f"Scraping completed! Total services scraped: {len(collected)}")

    return collected

# Example usage (replace with actual URLs)
# Category listing URLs intended as input for scrape_multiple_categories.
# NOTE: run_complete_analysis defines its own list and does not read this one.
categories_to_scrape = [
    "https://comeup.com/fr/best-services",
    "https://comeup.com/fr/category/site-developpement",
]

# Echo the configured categories; nothing is scraped in this cell.
print("Scraping pipeline ready!")
print("Categories configured:")
for cat in categories_to_scrape:
    print(f"  - {cat}")

## 7. Export Data to CSV

Save the collected data to a CSV file and display summary statistics.

In [None]:
def export_data_to_csv(data, filename="comeup_analysis.csv", output_dir=None):
    """Write the scraped records to a CSV file and return them as a DataFrame.

    The file used to be written to a hard-coded absolute Windows path under
    one user's Desktop, which fails on any other machine. It is now written to
    ``output_dir``, defaulting to the current working directory.

    Args:
        data: List of record dicts, as returned by scrape_multiple_categories.
        filename: Name of the CSV file to create.
        output_dir: Directory to write into (str or Path). Created if missing.
            Defaults to the current working directory.

    Returns:
        The DataFrame that was written, or None when ``data`` is empty.
    """
    if not data:
        print("No data to export!")
        return None

    # Create DataFrame
    df = pd.DataFrame(data)

    # Resolve the destination and make sure the directory exists.
    target_dir = Path(output_dir) if output_dir is not None else Path.cwd()
    target_dir.mkdir(parents=True, exist_ok=True)
    filepath = target_dir / filename

    # Save to CSV
    df.to_csv(filepath, index=False, encoding='utf-8')

    print(f"Data exported successfully to: {filepath}")
    print(f"Total records: {len(df)}")

    return df

def display_summary_statistics(df):
    """Print summary statistics for the scraped services.

    Prints the number of services, unique sellers and categories, price
    statistics (mean, median, min, max), rating statistics and the most
    frequent categories and sellers.

    Args:
        df: DataFrame with at least the columns 'nom_vendeur', 'categorie',
            'prix' and 'note'; may be None.

    Returns:
        None. Prints "No data to analyze!" when ``df`` is None or empty.
    """
    if df is None or df.empty:
        print("No data to analyze!")
        return

    rule = "=" * 50
    print("\n" + rule)
    print("SUMMARY STATISTICS")
    print(rule)

    print(f"Total services scraped: {len(df)}")
    sellers = df['nom_vendeur']
    print(f"Unique sellers: {sellers.nunique()}")
    categories = df['categorie']
    print(f"Categories found: {categories.nunique()}")

    print("\nPrice statistics:")
    prices = df['prix']
    print(f"  Average price: €{prices.mean():.2f}")
    print(f"  Median price: €{prices.median():.2f}")
    print(f"  Min price: €{prices.min():.2f}")
    print(f"  Max price: €{prices.max():.2f}")

    print("\nRating statistics:")
    ratings = df['note']
    print(f"  Average rating: {ratings.mean():.2f}")
    # Summing the boolean mask counts the rows strictly above 4.
    print(f"  Services with rating > 4: {int((ratings > 4).sum())}")

    print("\nTop 5 categories:")
    print(categories.value_counts().head())

    print("\nTop 5 sellers by number of services:")
    print(sellers.value_counts().head())

# Example of how to use these functions:
# The lines below only print usage instructions; they do not scrape,
# export or analyse anything themselves.
print("Export and analysis functions ready!")
print("\nTo use these functions:")
print("1. First run the scraping pipeline:")
print("   scraped_data = scrape_multiple_categories(categories_to_scrape)")
print("2. Then export the data:")
print("   df = export_data_to_csv(scraped_data)")
print("3. Finally display statistics:")
print("   display_summary_statistics(df)")

## 8. Data Analysis and Visualization

Perform basic analysis on the scraped data with visualizations.

In [None]:
def create_visualizations(df):
    """Draw a 2x2 overview figure of the scraped services.

    Panels: price histogram, bar chart of the ten most frequent categories,
    price-versus-rating scatter plot, and sales histogram. The figure is
    shown with ``plt.show()``.

    Args:
        df: DataFrame with the columns 'prix', 'categorie', 'note' and
            'nombre_vente_total'; may be None.

    Returns:
        None. Prints "No data to visualize!" when ``df`` is None or empty.
    """
    if df is None or df.empty:
        print("No data to visualize!")
        return

    plt.style.use('default')
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    fig.suptitle('Comeup Services Analysis', fontsize=16, fontweight='bold')
    # Unpack the grid row by row so each panel has a readable name.
    (price_ax, category_ax), (scatter_ax, sales_ax) = axes

    # Top-left: price distribution
    price_ax.hist(df['prix'], bins=20, edgecolor='black', alpha=0.7, color='skyblue')
    price_ax.set_title('Distribution des Prix')
    price_ax.set_xlabel('Prix (€)')
    price_ax.set_ylabel('Nombre de services')

    # Top-right: ten most frequent categories
    top_categories = df['categorie'].value_counts().head(10)
    bar_positions = range(len(top_categories))
    category_ax.bar(bar_positions, top_categories.values, color='lightcoral')
    category_ax.set_title('Top 10 Catégories')
    category_ax.set_xlabel('Catégories')
    category_ax.set_ylabel('Nombre de services')
    category_ax.set_xticks(bar_positions)
    category_ax.set_xticklabels(top_categories.index, rotation=45, ha='right')

    # Bottom-left: price against rating
    scatter_ax.scatter(df['prix'], df['note'], alpha=0.6, color='green')
    scatter_ax.set_title('Prix vs Note')
    scatter_ax.set_xlabel('Prix (€)')
    scatter_ax.set_ylabel('Note')

    # Bottom-right: sales distribution
    sales_ax.hist(df['nombre_vente_total'], bins=15, edgecolor='black', alpha=0.7, color='orange')
    sales_ax.set_title('Distribution des Ventes')
    sales_ax.set_xlabel('Nombre de ventes')
    sales_ax.set_ylabel('Nombre de services')

    plt.tight_layout()
    plt.show()

def analyze_top_performers(df):
    """Print rankings of the best-performing services.

    Prints the ten services with the most sales, the ten best-rated services
    among those with at least five sales (section omitted when there are
    none), and the mean price and service count per category, sorted by mean
    price in descending order.

    Args:
        df: DataFrame with the columns 'titre_service', 'nom_vendeur',
            'nombre_vente_total', 'prix', 'note' and 'categorie'; may be None.

    Returns:
        None. Prints "No data to analyze!" when ``df`` is None or empty.
    """
    if df is None or df.empty:
        print("No data to analyze!")
        return

    rule = "=" * 50
    print("\n" + rule)
    print("TOP PERFORMERS ANALYSIS")
    print(rule)

    # Ranking by sales volume
    print("\nTop 10 services by sales:")
    sales_columns = ['titre_service', 'nom_vendeur', 'nombre_vente_total', 'prix', 'note']
    best_selling = df.nlargest(10, 'nombre_vente_total')
    print(best_selling[sales_columns].to_string(index=False))

    # Ranking by rating, restricted to services with at least 5 sales
    established = df[df['nombre_vente_total'] >= 5]
    if not established.empty:
        print("\nTop 10 services by rating (min 5 sales):")
        rating_columns = ['titre_service', 'nom_vendeur', 'note', 'nombre_vente_total', 'prix']
        best_rated = established.nlargest(10, 'note')
        print(best_rated[rating_columns].to_string(index=False))

    # Mean price and number of services per category
    print("\nAverage price by category:")
    category_prices = (
        df.groupby('categorie')['prix']
        .agg(['mean', 'count'])
        .round(2)
        .sort_values('mean', ascending=False)
    )
    print(category_prices.to_string())

def generate_insights(df):
    """Print price segments, traits of top sellers and per-category averages.

    Prints the 25th and 75th price percentiles, the mean price, mean rating
    and most common categories of the services at or above the 80th percentile
    of sales, and a per-category table (mean price, mean rating, mean sales,
    service count) sorted by mean price, limited to ten rows.

    Args:
        df: DataFrame with the columns 'prix', 'note', 'nombre_vente_total'
            and 'categorie'; may be None.

    Returns:
        None. Prints "No data to analyze!" when ``df`` is None or empty.
    """
    if df is None or df.empty:
        print("No data to analyze!")
        return

    rule = "=" * 50
    print("\n" + rule)
    print("BUSINESS INSIGHTS")
    print(rule)

    # Price quartiles
    prices = df['prix']
    low_price = prices.quantile(0.25)
    high_price = prices.quantile(0.75)
    print("Price segments:")
    print(f"  Low price (bottom 25%): ≤ €{low_price:.2f}")
    print(f"  High price (top 25%): ≥ €{high_price:.2f}")

    # Services at or above the 80th percentile of sales
    sales = df['nombre_vente_total']
    top_services = df[sales >= sales.quantile(0.8)]
    if not top_services.empty:
        print("\nCharacteristics of top 20% sellers:")
        print(f"  Average price: €{top_services['prix'].mean():.2f}")
        print(f"  Average rating: {top_services['note'].mean():.2f}")
        print("  Most common categories:")
        print(top_services['categorie'].value_counts().head(3).to_string())

    # Per-category averages; the count is attached after rounding so it is
    # not affected by round(2).
    print("\nCategory analysis:")
    per_category = (
        df.groupby('categorie')
        .agg({
            'prix': 'mean',
            'note': 'mean',
            'nombre_vente_total': 'mean'
        })
        .round(2)
        .assign(service_count=df['categorie'].value_counts())
        .sort_values('prix', ascending=False)
    )
    print(per_category.head(10).to_string())

# Usage reminder only: these lines print the calls to make once a DataFrame
# `df` of scraped services exists; they do not run any analysis themselves.
print("Visualization and analysis functions ready!")
print("\nTo create visualizations after scraping:")
print("  create_visualizations(df)")
print("  analyze_top_performers(df)")
print("  generate_insights(df)")

## Complete Execution Example

Here's how to run the complete pipeline:

In [None]:
# Complete execution example (uncomment to run with real data)

def run_complete_analysis():
    """Run the whole workflow: scrape, export, summarise, plot and analyse.

    Scrapes two hard-coded category URLs with small limits (2 pages and
    5 services per category), writes the result to
    "comeup_analysis_complete.csv" and then runs every reporting function
    on it.

    Returns:
        The DataFrame returned by export_data_to_csv, or None when nothing
        was scraped.
    """
    print("Starting complete Comeup analysis...")

    # Categories to scrape
    target_categories = [
        "https://comeup.com/fr/best-services",
        "https://comeup.com/fr/category/site-developpement",
    ]

    # Scrape with deliberately small limits for a first test run
    print("Step 1: Scraping data...")
    records = scrape_multiple_categories(
        target_categories,
        max_pages_per_category=2,
        max_services_per_category=5,
    )

    # Nothing collected: stop before the export and reporting steps.
    if not records:
        print("No data scraped! Check your URLs and internet connection.")
        return

    print("\nStep 2: Exporting data...")
    results = export_data_to_csv(records, "comeup_analysis_complete.csv")

    print("\nStep 3: Analyzing data...")
    display_summary_statistics(results)

    print("\nStep 4: Creating visualizations...")
    create_visualizations(results)

    # Remaining text reports
    analyze_top_performers(results)
    generate_insights(results)

    return results

# Instructions for execution
# Printed checklist only; the pipeline itself is not started by this cell
# unless the last line is uncommented.
print("EXECUTION INSTRUCTIONS:")
print("="*50)
print("1. Ensure you have valid Comeup URLs in the categories list")
print("2. Check your internet connection")
print("3. Be mindful of the website's robots.txt and terms of service")
print("4. Start with small limits (few pages, few services) for testing")
print("5. Uncomment and run: df = run_complete_analysis()")
# NOTE(review): the message below says "service URLs", but the list inside
# run_complete_analysis holds category listing URLs.
print("\nIMPORTANT: Replace example URLs with actual Comeup service URLs!")

# Uncomment the line below to run the complete analysis
# df = run_complete_analysis()

## Notes and Best Practices

- Always respect the website's robots.txt and terms of service
- Use appropriate delays between requests to avoid overloading the server
- Test with small datasets first before running large scraping operations
- Handle errors gracefully and implement retry logic for production use
- Consider using proxy rotation for large-scale scraping
- Monitor your scraping performance and adjust parameters accordingly

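The notebook is designed for progressive execution: run the cells in order from the top, because later cells rely on the imports, session, and functions defined in earlier ones. Once the earlier cells have run, each step can be tested on its own before running the complete pipeline.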