# Sisal Scraper Test

This notebook tests the Sisal scraper functionality. Since the Sisal website may not be accessible from all locations due to geographic restrictions, we'll also demonstrate the parsing logic with mock data.

In [1]:
import sys
import os

# Make the project root importable so that the `src.*` packages resolve.
#
# The project root is taken to be the parent of the directory this notebook
# lives in. Jupyter kernels normally do not define `__file__`, so in that case
# the kernel's working directory stands in for the notebook directory
# (assumes the kernel was started in the notebook's folder - adjust if not).
# When the same code runs as a plain script, `__file__` exists and its real
# value is used; previously the literal string '__file__' was passed to
# abspath(), so the script location was never actually consulted.
if '__file__' in globals():
    current_dir = os.path.dirname(os.path.abspath(__file__))
else:
    current_dir = os.getcwd()
project_root = os.path.dirname(current_dir)

# Insert only once so re-running the cell does not keep growing sys.path.
if project_root not in sys.path:
    sys.path.insert(0, project_root)

# Import the improved scraper
from src.scraper.sisal_scraper_v2 import scrape_sisal_odds

In [None]:
# Live check of the scraper that applies anti-detection measures.
# Two event pages are tried in turn: a primary one, and a fallback that is only
# requested when the primary attempt yields nothing.
print("=== Testing Improved Sisal Scraper ===")


def _print_match_and_1x2(result):
    """Print the team names and the 1X2 prices of a scraped result."""
    print(f"Match: {result.home_team} vs {result.away_team}")
    print(f"1X2: {result.home_win} / {result.draw} / {result.away_win}")


# Primary attempt
url = "https://www.sisal.it/scommesse-live/evento/calcio/euro-u19/germania-u19-olanda-u19"
print(f"Testing URL: {url}")
print("This may take 30-60 seconds due to anti-detection measures...")

odds = scrape_sisal_odds(url)

if not odds:
    print("❌ Still failed - website may be using more advanced blocking")
    print("Let's try the original URL that worked in your browser...")

    # Fallback attempt with the URL that was first opened manually in a browser
    original_url = "https://www.sisal.it/scommesse-live/evento/calcio/argentina/primera-b-metropolitana/douglas-haig-gimnasia-y-esgrima-de-concepcion-del-uruguay"
    print(f"\nTrying original URL: {original_url}")

    odds = scrape_sisal_odds(original_url)
    if not odds:
        print("❌ Both URLs failed - may need additional measures")
    else:
        print("✅ SUCCESS with original URL!")
        _print_match_and_1x2(odds)
else:
    print("✅ SUCCESS! Scraper bypassed blocking")
    _print_match_and_1x2(odds)
    # The primary path additionally reports the goal-based markets.
    print(f"Over/Under 2.5: {odds.over_2_5} / {odds.under_2_5}")
    print(f"BTTS: {odds.both_teams_score_yes} / {odds.both_teams_score_no}")

=== Testing Improved Sisal Scraper ===
Testing URL: https://www.sisal.it/scommesse-live/evento/calcio/euro-u19/germania-u19-olanda-u19
This may take 30-60 seconds due to anti-detection measures...
🎯 Starting advanced scraping for: https://www.sisal.it/scommesse-live/evento/calcio/euro-u19/germania-u19-olanda-u19
📋 Trying strategy 1/3: _scrape_with_session_establishment
  🌐 Establishing full browser session...
  📄 Visiting homepage...
  ❌ Session establishment failed: HTTPSConnectionPool(host='www.sisal.it', port=443): Read timed out. (read timeout=20)
❌ Strategy 1 returned no result
⏱️ Waiting 2.2s before next strategy...
  ❌ Session establishment failed: HTTPSConnectionPool(host='www.sisal.it', port=443): Read timed out. (read timeout=20)
❌ Strategy 1 returned no result
⏱️ Waiting 2.2s before next strategy...
📋 Trying strategy 2/3: _scrape_with_mobile_headers
  📱 Trying mobile headers...
📋 Trying strategy 2/3: _scrape_with_mobile_headers
  📱 Trying mobile headers...
  ❌ Mobile headers

In [11]:
# Debug the connection issue
import requests
from src.scraper.sisal_scraper import SisalScraper

print("=== Debugging Sisal Connection ===")


def _probe(step, fetch, ok_label, fail_label):
    """Run one connectivity check and print a one-line verdict.

    Args:
        step: Heading printed before the request is attempted.
        fetch: Zero-argument callable that performs the request and returns
            a response object exposing ``status_code``.
        ok_label: Text for the ✅ line, used when the status code is below 400.
        fail_label: Text for the ⚠️ line (status 400 or above) and the ❌ line
            (the request raised).

    Returns:
        The response returned by ``fetch``, or ``None`` if ``fetch`` raised.
    """
    print(step)
    try:
        response = fetch()
    except Exception as e:
        # Deliberately broad: this is a best-effort diagnostic, and building the
        # project scraper inside `fetch` may fail in ways unrelated to requests.
        print(f"   ❌ {fail_label} failed: {e}")
        return None
    # A reply alone is not success: a 403/5xx block page also "responds".
    if response.status_code < 400:
        print(f"   ✅ {ok_label}: {response.status_code}")
    else:
        print(f"   ⚠️ {fail_label} answered with HTTP {response.status_code}")
    return response


# 1. General outbound HTTP (rules out a local network problem)
_probe(
    "1. Testing basic HTTP connectivity...",
    lambda: requests.get("https://httpbin.org/get", timeout=5),
    "Basic HTTP works",
    "Basic HTTP",
)

# 2. Sisal home page with the default requests headers
_probe(
    "2. Testing Sisal main page...",
    lambda: requests.get("https://www.sisal.it", timeout=15),
    "Sisal main page",
    "Sisal main page",
)

# 3. Sisal home page through the scraper's own session
_probe(
    "3. Testing with scraper headers...",
    lambda: SisalScraper().session.get("https://www.sisal.it", timeout=15),
    "Scraper headers work",
    "Scraper headers",
)

# 4. The specific live-event page
url = "https://www.sisal.it/scommesse-live/evento/calcio/euro-u21/spagna-u21-romania-u21"
response = _probe(
    "4. Testing specific match URL...",
    lambda: SisalScraper().session.get(url, timeout=20),
    "Match page accessible",
    "Match page",
)
if response is not None:
    print(f"   Content length: {len(response.content)} bytes")

    # Check if the page contains expected content (either team name)
    page_text = response.text.lower()
    if "spagna" in page_text or "romania" in page_text:
        print("   ✅ Page contains match content")
    else:
        print("   ⚠️ Page doesn't contain expected match content")

print("\n=== Recommendations ===")
print("If connections are failing:")
print("- The match might no longer be live")
print("- Try a different/current live match URL")
print("- Check if you need a VPN for accessing Sisal")
print("- The website might be blocking automated requests")

=== Debugging Sisal Connection ===
1. Testing basic HTTP connectivity...
   ✅ Basic HTTP works: 200
2. Testing Sisal main page...
   ✅ Basic HTTP works: 200
2. Testing Sisal main page...
   ❌ Sisal main page failed: HTTPSConnectionPool(host='www.sisal.it', port=443): Read timed out. (read timeout=15)
3. Testing with scraper headers...
   ❌ Sisal main page failed: HTTPSConnectionPool(host='www.sisal.it', port=443): Read timed out. (read timeout=15)
3. Testing with scraper headers...
   ❌ Scraper headers failed: HTTPSConnectionPool(host='www.sisal.it', port=443): Read timed out. (read timeout=15)
4. Testing specific match URL...
   ❌ Scraper headers failed: HTTPSConnectionPool(host='www.sisal.it', port=443): Read timed out. (read timeout=15)
4. Testing specific match URL...
   ❌ Match page failed: HTTPSConnectionPool(host='www.sisal.it', port=443): Read timed out. (read timeout=20)

=== Recommendations ===
If connections are failing:
- The match might no longer be live
- Try a different/

In [12]:
# Demo: Scraper with Mock Data (since live site is not accessible)
from bs4 import BeautifulSoup
from datetime import datetime
from src.datamodel.betting_odds import BettingOdds
from src.scraper.sisal_scraper import SisalScraper

print("=== Testing Scraper with Mock Data ===")

# Hand-written stand-in for a Sisal event page. Note that in the odds
# paragraphs the outcome label and its price are fused together
# (e.g. "X3.00", "UNDER2.25").
mock_html = """
<html>
<head><title>Spagna U21 - Romania U21 Live</title></head>
<body>
    <div>
        <h1>Spagna U21 - Romania U21</h1>
        <div class="match-info">1o | 40:29  0 : 1</div>
        <div class="odds">
            <p>ESITO FINALE Info Scommesse  12.75 X3.00 22.60</p>
            <p>Under/Over 2.5  UNDER2.25 OVER1.57</p>
            <p>DOPPIA CHANCE Info Scommesse  1X1.41 X21.37 121.31</p>
            <p>Goal/NoGoal GOAL1.20 NOGOAL4.00</p>
        </div>
    </div>
</body>
</html>
"""

# Feed the mock page straight into the scraper's parsing helpers (no network).
scraper = SisalScraper()
soup = BeautifulSoup(mock_html, 'html.parser')

# Step 1: team names / match id, derived from the parsed page and the event URL
url = "https://www.sisal.it/scommesse-live/evento/calcio/euro-u21/spagna-u21-romania-u21"
match_info = scraper._extract_match_info(soup, url)
print(f"Match Info: {match_info}")

# Step 2: the odds themselves
odds_data = scraper._extract_odds(soup)
print(f"Odds Data: {odds_data}")

# Step 3: assemble a BettingOdds record, but only when the match was identified
if not match_info:
    print("❌ Failed to extract match info")
else:
    betting_odds = BettingOdds(
        timestamp=datetime.now(),
        source="Sisal",
        match_id=match_info['match_id'],
        home_team=match_info['home_team'],
        away_team=match_info['away_team'],
        **odds_data
    )

    print("\n✅ Successfully created BettingOdds:")
    print(f"   Match: {betting_odds.home_team} vs {betting_odds.away_team}")
    print(f"   1X2: {betting_odds.home_win} / {betting_odds.draw} / {betting_odds.away_win}")
    print(f"   O/U 2.5: {betting_odds.over_2_5} / {betting_odds.under_2_5}")
    print(f"   BTTS: {betting_odds.both_teams_score_yes} / {betting_odds.both_teams_score_no}")
    print(f"   Double Chance: {betting_odds.home_or_draw} / {betting_odds.away_or_draw} / {betting_odds.home_or_away}")

=== Testing Scraper with Mock Data ===
Match Info: {'home_team': 'Spagna U21', 'away_team': 'Romania U21', 'match_id': 'spagna-u21-romania-u21'}
Odds Data: {'home_win': 12.75, 'draw': 3.0, 'away_win': 22.6, 'over_2_5': 1.57, 'under_2_5': 2.25, 'both_teams_score_yes': 1.2, 'both_teams_score_no': 4.0, 'home_or_draw': 1.41, 'away_or_draw': 1.37, 'home_or_away': 1.31}

✅ Successfully created BettingOdds:
   Match: Spagna U21 vs Romania U21
   1X2: 12.75 / 3.0 / 22.6
   O/U 2.5: 1.57 / 2.25
   BTTS: 1.2 / 4.0
   Double Chance: 1.41 / 1.37 / 1.31
