preleviamo dati da goldbet big match

In [45]:
import requests
import pandas as pd
from datetime import datetime, timedelta
import time
import json
import os

class VirtualSportsCollector:
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36',
            'Accept': 'application/json, text/plain, */*',
            'Accept-Encoding': 'gzip, deflate, br, zstd',
            'Accept-Language': 'en-US,en;q=0.6',
            'Origin': 'https://www.eurobet.it',
            'X-EB-Accept-Language': 'it_IT',
            'X-EB-MarketId': '5',
            'X-EB-PlatformId': '1',
            'Connection': 'keep-alive'
        }
        self.base_url = "https://virtualservice.eurobet.it/virtual-winning-service/virtual-schedule/services/winningresult/55/22/{}"
        self.csv_filename = "virtual_matches_data.csv"
        self.excel_filename = "virtual_matches_data.xlsx"

    def create_match_id(self, row):
        """Create a unique identifier for each match"""
        return f"{row['date']}_{row['hour']}_{row['home_team']}_{row['away_team']}"

    def load_existing_data(self):
        """Load existing data from CSV if it exists"""
        if os.path.exists(self.csv_filename):
            return pd.read_csv(self.csv_filename)
        return pd.DataFrame()

    def get_virtual_data(self, start_date, end_date):
        all_matches = []
        current_date = start_date

        while current_date <= end_date:
            date_str = current_date.strftime("%d-%m-%Y")
            url = self.base_url.format(date_str)

            try:
                response = requests.get(url, headers=self.headers)
                if response.status_code == 200:
                    data = response.json()
                    if 'result' in data and 'groupDate' in data['result']:
                        for group in data['result']['groupDate']:
                            for event in group['events']:
                                match_data = {
                                    'date': event['date'],
                                    'hour': event['hour'],
                                    'home_team': event['eventDescription'].split(' - ')[0],
                                    'away_team': event['eventDescription'].split(' - ')[1],
                                    'score': event['finalResult'],
                                    'home_goals': int(event['finalResult'].split('-')[0]),
                                    'away_goals': int(event['finalResult'].split('-')[1]),
                                    'datetime': pd.to_datetime(f"{event['date']} {event['hour']}", format='%d-%m-%Y %H:%M:%S')
                                }

                                for odd_group in event['oddGroup']:
                                    if odd_group['betDescriptionAbbr'] == '1X2':
                                        match_data['odds_1'] = odd_group['odds'][0]
                                        match_data['result'] = odd_group['resultDescription'][0]
                                    elif odd_group['betDescriptionAbbr'] == 'U/O 2.5':
                                        match_data['over_under_25'] = odd_group['resultDescription'][0]
                                        match_data['odds_over_under_25'] = odd_group['odds'][0]
                                    elif odd_group['betDescriptionAbbr'] == 'Goal/No Goal':
                                        match_data['goal_no_goal'] = odd_group['resultDescription'][0]
                                        match_data['odds_goal_no_goal'] = odd_group['odds'][0]

                                all_matches.append(match_data)

                time.sleep(1)  # Respect rate limiting
            except Exception as e:
                print(f"Error fetching data for {date_str}: {e}")

            current_date += timedelta(days=1)

        return pd.DataFrame(all_matches)

    def merge_and_save_data(self, new_data):
        """Merge new data with existing data, remove duplicates, and save"""
        existing_data = self.load_existing_data()

        if not existing_data.empty:
            # Convert datetime column in existing data if it's not already datetime
            existing_data['datetime'] = pd.to_datetime(existing_data['datetime'])

        # Combine existing and new data
        combined_data = pd.concat([existing_data, new_data], ignore_index=True)

        # Create unique identifier for each match
        combined_data['match_id'] = combined_data.apply(self.create_match_id, axis=1)

        # Remove duplicates based on match_id
        combined_data = combined_data.drop_duplicates(subset=['match_id'], keep='first')

        # Sort by datetime in descending order (most recent first)
        combined_data = combined_data.sort_values('datetime', ascending=False)

        # Drop the match_id column as it's no longer needed
        combined_data = combined_data.drop('match_id', axis=1)

        # Save to CSV and Excel
        combined_data.to_csv(self.csv_filename, index=False)
        combined_data.to_excel(self.excel_filename, index=False)

        return combined_data

    def collect_data(self, days_back=1):
        """Main method to collect and process data"""
        end_date = datetime.now()
        start_date = end_date - timedelta(days=days_back)

        print(f"Collecting data from {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")

        new_data = self.get_virtual_data(start_date, end_date)
        if not new_data.empty:
            final_data = self.merge_and_save_data(new_data)
            print(f"Data saved successfully")
            print(f"Total matches in database: {len(final_data)}")
            print(f"Files saved as: {self.csv_filename} and {self.excel_filename}")
        else:
            print("No new data collected")

def main(days_back=1):
    collector = VirtualSportsCollector()
    collector.collect_data(days_back)

if __name__ == "__main__":
    main()  # Default 90 days

Collecting data from 2024-11-20 to 2024-11-21
Data saved successfully
Total matches in database: 19811
Files saved as: virtual_matches_data.csv and virtual_matches_data.xlsx


predizione

In [7]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import warnings
import requests
from datetime import datetime, timedelta
import os

warnings.filterwarnings('ignore')


class SoccerPredictor:
    def __init__(self):
        self.team_encoder = LabelEncoder()
        self.scaler = StandardScaler()
        self.lstm_model = None
        self.sequence_length = 3

    def prepare_features(self, df):
        df = df.copy()
        df['datetime'] = pd.to_datetime(df['datetime'])
        df = df.sort_values('datetime')

        # Base features
        features = pd.DataFrame({
            'datetime': df['datetime'],
            'home_team': df['home_team'],
            'away_team': df['away_team'],
            'odds_1': df['odds_1'].astype(float)
        })

        # Encode teams
        all_teams = pd.concat([df['home_team'], df['away_team']]).unique()
        self.team_encoder.fit(all_teams)
        features['home_team_enc'] = self.team_encoder.transform(df['home_team'])
        features['away_team_enc'] = self.team_encoder.transform(df['away_team'])

        # Calculate team stats
        for team in all_teams:
            home_matches = df[df['home_team'] == team]
            away_matches = df[df['away_team'] == team]

            home_rolling = home_matches['home_goals'].rolling(3, min_periods=1).mean()
            away_rolling = away_matches['away_goals'].rolling(3, min_periods=1).mean()

            team_idx = self.team_encoder.transform([team])[0]
            features.loc[features['home_team_enc'] == team_idx, 'home_rolling_goals'] = home_rolling.values
            features.loc[features['away_team_enc'] == team_idx, 'away_rolling_goals'] = away_rolling.values

        feature_cols = ['home_team_enc', 'away_team_enc', 'odds_1', 'home_rolling_goals', 'away_rolling_goals']
        X = features[feature_cols].fillna(0)
        X_scaled = self.scaler.fit_transform(X)

        y = (df['result'] == 'X').astype(int)
        X_train, X_temp, y_train, y_temp = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
        X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

        return X_train, X_valid, X_test, y_train, y_valid, y_test, features[['datetime', 'home_team', 'away_team']]

    def create_sequences(self, features):
        sequences = [features[i:i + self.sequence_length] for i in range(len(features) - self.sequence_length + 1)]
        return np.array(sequences)

    def train_model(self, df):
        try:
            X_train, X_valid, X_test, y_train, y_valid, y_test, _ = self.prepare_features(df)

            X_seq = self.create_sequences(X_train)
            y_seq = y_train[self.sequence_length - 1:]

            self.lstm_model = Sequential([
                LSTM(64, input_shape=(self.sequence_length, X_train.shape[1])),
                BatchNormalization(),
                Dropout(0.3),
                Dense(32, activation='relu'),
                Dropout(0.2),
                Dense(1, activation='sigmoid')
            ])

            self.lstm_model.compile(
                optimizer=Adam(0.001), loss='binary_crossentropy', metrics=['accuracy']
            )

            self.lstm_model.fit(
                X_seq, y_seq, epochs=30, batch_size=32,
                validation_split=0.2, callbacks=[EarlyStopping(patience=5, restore_best_weights=True)], verbose=0
            )

            lstm_pred = (self.lstm_model.predict(self.create_sequences(X_train)) > 0.5).flatten()

            return {
                'accuracy': accuracy_score(y_seq, lstm_pred),
                'precision': precision_score(y_seq, lstm_pred),
                'recall': recall_score(y_seq, lstm_pred),
                'f1': f1_score(y_seq, lstm_pred)
            }

        except Exception as e:
            print(f"Training error: {str(e)}")
            raise

    def predict_draws(self, new_data):
        try:
            X_new, _, _, _, _, _, match_info = self.prepare_features(new_data)
            X_seq = self.create_sequences(X_new)

            lstm_probs = self.lstm_model.predict(X_seq).flatten()

            min_len = min(len(lstm_probs), len(match_info['datetime'].iloc[self.sequence_length - 1:]))

            predictions = pd.DataFrame({
                'datetime': match_info['datetime'].iloc[self.sequence_length - 1: self.sequence_length - 1 + min_len],
                'home_team': match_info['home_team'].iloc[self.sequence_length - 1: self.sequence_length - 1 + min_len],
                'away_team': match_info['away_team'].iloc[self.sequence_length - 1: self.sequence_length - 1 + min_len],
                'lstm_prob': lstm_probs[:min_len],
                'draw_probability': lstm_probs[:min_len]
            })

            return predictions

        except Exception as e:
            print(f"Prediction error: {str(e)}")
            raise


def main():
    try:
        print("Loading data...")
        df = pd.read_csv('virtual_matches_data.csv')
        print(f"Loaded {len(df)} matches")

        print("\nTraining LSTM model...")
        predictor = SoccerPredictor()
        metrics = predictor.train_model(df)

        print("\nLSTM Model Performance:")
        for metric, value in metrics.items():
            print(f"{metric}: {value:.3f}")

        predictor.lstm_model.save('lstm_model.keras')
        print("LSTM model saved successfully.")

        print("\nPredicting upcoming matches...")
        latest_date = pd.to_datetime(df['datetime']).max()
        future_matches = df[pd.to_datetime(df['datetime']) > latest_date - pd.Timedelta(hours=1)]

        if len(future_matches) > 0:
            predictions = predictor.predict_draws(future_matches)

            print("\nDraw Probabilities:")
            for _, row in predictions.iterrows():
                print(f"{row['home_team']} vs {row['away_team']}: {row['draw_probability']:.3f}")
        else:
            print("No upcoming matches found")

    except Exception as e:
        print(f"Error: {str(e)}")
        raise


if __name__ == "__main__":
    main()


Loading data...
Loaded 19811 matches

Training models...
[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step

Model Performance:

XGB Metrics:
accuracy: 0.924
precision: 0.775
recall: 1.000
f1: 0.873

LSTM Metrics:
accuracy: 0.994
precision: 0.979
recall: 0.997
f1: 0.988
Models saved successfully.

Predicting upcoming matches...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step

Draw Probabilities:
ROM vs NAP: 0.156
MIL vs FIO: 0.825
LAZ vs INT: 0.914
JUV vs NAP: 0.850
MIL vs ROM: 0.243
INT vs NAP: 0.161
ROM vs JUV: 0.163
LAZ vs FIO: 0.152


Dati da prevedere.

In [4]:
import requests
import pandas as pd
from datetime import datetime
import time
import json
from pathlib import Path

class VirtualOddsCollector:
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36',
            'Accept': 'application/json, text/plain, */*',
            'Accept-Encoding': 'gzip, deflate, br, zstd',
            'Accept-Language': 'en-US,en;q=0.6',
            'Origin': 'https://www.eurobet.it',
            'X-EB-Accept-Language': 'it_IT',
            'X-EB-MarketId': '5',
            'X-EB-PlatformId': '1',
            'Connection': 'keep-alive',
            'Referer': 'https://www.eurobet.it/'
        }
        self.data_dir = Path('data')
        self.data_dir.mkdir(exist_ok=True)
        self.csv_filename = self.data_dir / 'virtual_odds_detail.csv'
        self.base_url = "https://virtualservice.eurobet.it/virtual-detail-service/virtual-schedule/services/22/sport/{}"

        print(f"File will be saved to: {self.csv_filename}")

    def get_match_odds(self, match_code):
        """Get odds for a specific match"""
        url = self.base_url.format(match_code)
        try:
            print(f"Fetching data for match code: {match_code}")
            response = requests.get(url, headers=self.headers, timeout=10)
            response.raise_for_status()

            data = response.json()
            if 'result' not in data or 'eventdetail' not in data['result']:
                print(f"No valid data found for match {match_code}")
                return None

            event_info = data['result']['eventdetail']['eventInfo']
            bet_groups = data['result']['eventdetail']['betGroupList']

            match_data = {
                'match_code': match_code,
                'timestamp': datetime.fromtimestamp(event_info['eventData']/1000).strftime('%Y-%m-%d %H:%M:%S'),
                'home_team': event_info['teamHomeDescription'],
                'away_team': event_info['teamAwayDescription'],
                'channel': event_info['channelDescription'],
                'event_code': event_info['eventCode'],
                'program_code': event_info['programCode'],
                'collection_time': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            }

            for bet_group in bet_groups:
                bet_type = bet_group['betDescription']
                market_id = bet_group['marketId']

                if bet_group['oddGroupList'] and bet_group['oddGroupList'][0]['oddList']:
                    odds = bet_group['oddGroupList'][0]['oddList']

                    for odd in odds:
                        desc = odd['oddDescription']
                        value = odd['oddValue']
                        result_code = odd['resultCode']

                        key_base = f"{bet_type}_{desc}".lower().replace('/', '_').replace(' ', '_')
                        match_data[f"{key_base}_odds"] = value
                        match_data[f"{key_base}_code"] = result_code

            print(f"Successfully collected odds for {match_data['home_team']} vs {match_data['away_team']}")
            return match_data

        except requests.exceptions.RequestException as e:
            print(f"Network error for match {match_code}: {e}")
        except json.JSONDecodeError as e:
            print(f"JSON parsing error for match {match_code}: {e}")
        except Exception as e:
            print(f"Unexpected error for match {match_code}: {e}")
        return None

    def collect_matches(self, match_codes):
        """Collect odds for multiple matches"""
        all_matches = []

        print(f"\nStarting collection for match codes: {match_codes}")

        for match_code in match_codes:
            match_data = self.get_match_odds(match_code)
            if match_data:
                all_matches.append(match_data)
            time.sleep(1)

        if not all_matches:
            print("No match data collected!")
            return pd.DataFrame()

        df = pd.DataFrame(all_matches)

        if self.csv_filename.exists():
            try:
                existing_df = pd.read_csv(self.csv_filename)
                print(f"Loaded {len(existing_df)} existing records")
                df = pd.concat([existing_df, df], ignore_index=True)
                df = df.drop_duplicates(subset=['match_code', 'timestamp'], keep='last')
                print(f"After merging and removing duplicates: {len(df)} records")
            except Exception as e:
                print(f"Error loading existing data: {e}")

        df['timestamp'] = pd.to_datetime(df['timestamp'])
        df = df.sort_values('timestamp', ascending=False)

        try:
            df.to_csv(self.csv_filename, index=False)
            print(f"\nSuccessfully saved data to {self.csv_filename}")
            print(f"Total matches in database: {len(df)}")

            print("\nPreview of saved data:")
            basic_cols = ['timestamp', 'home_team', 'away_team']
            odds_cols = [col for col in df.columns if '1x2_finale' in col and 'odds' in col]
            print(df[basic_cols + odds_cols].head())

        except Exception as e:
            print(f"Error saving data: {e}")

        return df

# Variabili per generare i codici delle partite
BASE_CODE = "55_2402405092"  # Codice base
START_NUMBER = 411  # Numero iniziale

if __name__ == "__main__":
    # Genera 5 codici incrementando di 2 ogni volta
    match_codes = []
    current_number = START_NUMBER

    for _ in range(5):
        match_code = f"{BASE_CODE}_{current_number}"
        match_codes.append(match_code)
        current_number += 2

    collector = VirtualOddsCollector()
    collector.collect_matches(match_codes)

File will be saved to: data\virtual_odds_detail.csv

Starting collection for match codes: ['55_2402405092_411', '55_2402405092_413', '55_2402405092_415', '55_2402405092_417', '55_2402405092_419']
Fetching data for match code: 55_2402405092_411
Successfully collected odds for INT vs ROM
Fetching data for match code: 55_2402405092_413
Successfully collected odds for WHU vs TOT
Fetching data for match code: 55_2402405092_415
Successfully collected odds for INT vs JUV
Fetching data for match code: 55_2402405092_417
Successfully collected odds for GAL vs IBB
Fetching data for match code: 55_2402405092_419
Successfully collected odds for LAZ vs ROM
Loaded 16 existing records
After merging and removing duplicates: 19 records

Successfully saved data to data\virtual_odds_detail.csv
Total matches in database: 19

Preview of saved data:
             timestamp home_team away_team  1x2_finale_1_odds  \
20 2024-11-21 23:58:00       LAZ       ROM                197   
19 2024-11-21 23:53:00       GA

Loading data...
Loaded 19811 matches

Training LSTM model...
[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step

LSTM Model Performance:
accuracy: 0.996
precision: 0.989
recall: 0.997
f1: 0.993
LSTM model saved successfully.

Predicting upcoming matches...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step

Draw Probabilities:
ROM vs NAP: 0.000
MIL vs FIO: 0.906
LAZ vs INT: 1.000
JUV vs NAP: 0.958
MIL vs ROM: 0.014
INT vs NAP: 0.000
ROM vs JUV: 0.000
LAZ vs FIO: 0.000
