In [1]:
# To get the competition tables:
# https://soccer365.ru/competitions/16/ - current tables
# https://soccer365.ru/competitions/16/2017-2018/ - for seasons

# html = requests.get('https://soccer365.me/?c=live&a=showtable&competition_id=483&season_id=307').content
# <tr class="adv_kef_wgt_odd">

In [1]:
import os
import re
import pickle
import numpy as np
import pandas as pd
import pytz
from datetime import datetime, timedelta

from _html_parser import HtmlParser, ParsingDataPrepare, TableConstant

# Raise the pandas display limits so wide/long frames are not truncated in cell output.
for display_option in ('display.max_rows', 'display.max_columns', 'display.max_colwidth'):
    pd.set_option(display_option, 500)

In [2]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to them are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

### Get games data

In [4]:
# Last EPL season date with a correct datetime on 'soccer365.me': 15.08.1998
# Both arguments use the %m/%d/%Y format: first the start date, then the end date.
# NOTE(review): the start date passed here is the later of the two — presumably the
# range is walked backwards in time; confirm in ParsingDataPrepare.count_days_by_dates.
numdays = ParsingDataPrepare.count_days_by_dates('09/16/2022', '05/27/2022')
numdays

112

In [6]:
# Other date-range file names (presumably from earlier parsing runs):
# from_26_05_2022_to_27_03_2021_games, from_26_03_2021_to_03_11_2013_games, from_02_11_2013_to_10_07_2005_games, from_09_07_2005_to_01_01_2000_games
# from_29_12_1999_to_15_08_1998_games
# Path (no file extension) of the pickle file holding the games for the current date range.
name = 'pickle_files/new_events/16_09_2022/from_16_09_2022_to_26_05_2022_games'   

In [10]:
# HtmlParser.parsing_write_by_date(numdays=numdays, start_year=2022, start_month=9, start_day=16, file_name=name)

In [51]:
# Read back the games that were pickled under `name`.
# NOTE: unpickling can execute arbitrary code — only load files you produced yourself.
with open(name, mode='rb') as games_file:
    games = pickle.load(games_file)

In [52]:
# Number of entries in the `games` object loaded from the pickle file.
len(games)

24863

In [53]:
# Build a frame from `games` and let the project helper reshape it using the field
# names below (presumably one row per game — see ParsingDataPrepare.transform_columns_to_rows).
# The former index is then exposed as the 'game_id' column.
game_fields = ['ligue_header', 'comp_id', 'season_id', 'game_utc', 'game_title', 'goals', 'game_status']
df_to_transform = pd.DataFrame(games)
df = (
    ParsingDataPrepare.transform_columns_to_rows(df_to_transform, game_fields)
    .reset_index()
    .rename(columns={'index': 'game_id'})
)
df.shape

(24863, 8)

#### Check the times against 'soccer365' and, if needed, convert the datetimes from 'UTC+3' to 'UTC'

In [54]:
# Display the frame to inspect the game rows and their 'game_utc' values.
df

Unnamed: 0,game_id,ligue_header,comp_id,season_id,game_utc,game_title,goals,game_status
0,1736289,Premier League,12,381,16.09.2022 22:00,Aston Villa - Southampton,1:0,Finished
1,1736296,Premier League,12,381,16.09.2022 22:00,Nottingham - Fulham,2:3,Finished
2,1747456,Serie A,15,381,16.09.2022 21:45,Salernitana - Lecce,1:2,Finished
3,1744353,Primera Division,16,381,16.09.2022 22:00,Real Valladolid - Cadiz,0:1,Finished
4,1748218,Segunda División,707,381,16.09.2022 22:00,Levante - Cartagena,0:1,Finished
...,...,...,...,...,...,...,...,...
24858,1607985,Landesliga,475,310,28.05.2022 18:30,Volders - Münster,0:1,Finished
24859,1607984,Landesliga,475,310,28.05.2022 18:30,Innsbrucker AC - Mils,3:2,Finished
24860,1607740,Landesliga,475,310,28.05.2022 19:00,St. Jakob - SAK Klagenfurt,0:1,Finished
24861,1607990,Landesliga,475,310,28.05.2022 19:30,Natters - Prutz / Serfaus,3:2,Finished


In [25]:
# games.items()

In [55]:
# Convert every game's time string (field 3 of each record) from local time to UTC.
#
# NOTE(review): the times are interpreted as 'Europe/Kiev' wall-clock time — confirm
# that this matches the time zone the site actually reports.
# NOTE: `games` is rewritten in place, so re-running this cell without reloading the
# pickle first would shift the already-converted times a second time.
utc = pytz.timezone('UTC')
localtz = pytz.timezone('Europe/Kiev')
time_format = "%d.%m.%Y %H:%M"

err_time_keys = []      # keys of games whose time string could not be converted
converted_times = {}    # key -> UTC time string, staged until the whole loop succeeds
for key, val in games.items():
    raw_time = val[3]
    # A usable timestamp ends in 'HH:MM', so its third-from-last character is ':'.
    # Non-string or too-short values are rejected instead of raising.
    if not isinstance(raw_time, str) or len(raw_time) < 3 or raw_time[-3] != ':':
        err_time_keys.append(key)
        continue
    try:
        d_time = datetime.strptime(raw_time, time_format)
    except ValueError:
        # Right shape but not a parsable date/time: record it as an error as well.
        err_time_keys.append(key)
        continue
    localtime = localtz.localize(d_time)
    converted_times[key] = localtime.astimezone(utc).strftime(time_format)

# Write back only after every record has been processed, so an unexpected failure
# above can never leave `games` half-converted.
for key, utc_string in converted_times.items():
    games[key][3] = utc_string

In [57]:
# Number of games skipped because their time string could not be converted.
len(err_time_keys)

570

In [59]:
# Rebuild the frame from `games` now that its time strings have been rewritten, using
# the same project helper and field names (presumably one row per game); the former
# index becomes the 'game_id' column.
game_fields = ['ligue_header', 'comp_id', 'season_id', 'game_utc', 'game_title', 'goals', 'game_status']
df_to_transform = pd.DataFrame(games)
df = (
    ParsingDataPrepare.transform_columns_to_rows(df_to_transform, game_fields)
    .reset_index()
    .rename(columns={'index': 'game_id'})
)
df.shape

(24863, 8)

In [64]:
# Remove the games listed in `err_time_keys` and renumber the remaining rows from zero.
has_bad_time = df['game_id'].isin(err_time_keys)
df = df.loc[~has_bad_time].reset_index(drop=True)
df

Unnamed: 0,game_id,ligue_header,comp_id,season_id,game_utc,game_title,goals,game_status
0,1736289,Premier League,12,381,16.09.2022 19:00,Aston Villa - Southampton,1:0,Finished
1,1736296,Premier League,12,381,16.09.2022 19:00,Nottingham - Fulham,2:3,Finished
2,1747456,Serie A,15,381,16.09.2022 18:45,Salernitana - Lecce,1:2,Finished
3,1744353,Primera Division,16,381,16.09.2022 19:00,Real Valladolid - Cadiz,0:1,Finished
4,1748218,Segunda División,707,381,16.09.2022 19:00,Levante - Cartagena,0:1,Finished
...,...,...,...,...,...,...,...,...
24288,1607985,Landesliga,475,310,28.05.2022 15:30,Volders - Münster,0:1,Finished
24289,1607984,Landesliga,475,310,28.05.2022 15:30,Innsbrucker AC - Mils,3:2,Finished
24290,1607740,Landesliga,475,310,28.05.2022 16:00,St. Jakob - SAK Klagenfurt,0:1,Finished
24291,1607990,Landesliga,475,310,28.05.2022 16:30,Natters - Prutz / Serfaus,3:2,Finished


In [65]:
# Data cleaning: keep only finished games whose 'game_utc' string is longer than
# 10 characters (i.e. more than a bare dd.mm.yyyy date), drop the status column,
# then remove duplicates — first by game id, then by time + title.
is_finished = df['game_status'] == 'Finished'
has_time_part = df['game_utc'].str.len() > 10
df_games = (
    df.loc[is_finished & has_time_part]
    .drop(columns=['game_status'])
    .drop_duplicates(subset=['game_id'])
    .drop_duplicates(subset=['game_utc', 'game_title'])
)

In [66]:
# Display the cleaned, de-duplicated games.
df_games

Unnamed: 0,game_id,ligue_header,comp_id,season_id,game_utc,game_title,goals
0,1736289,Premier League,12,381,16.09.2022 19:00,Aston Villa - Southampton,1:0
1,1736296,Premier League,12,381,16.09.2022 19:00,Nottingham - Fulham,2:3
2,1747456,Serie A,15,381,16.09.2022 18:45,Salernitana - Lecce,1:2
3,1744353,Primera Division,16,381,16.09.2022 19:00,Real Valladolid - Cadiz,0:1
4,1748218,Segunda División,707,381,16.09.2022 19:00,Levante - Cartagena,0:1
...,...,...,...,...,...,...,...
24288,1607985,Landesliga,475,310,28.05.2022 15:30,Volders - Münster,0:1
24289,1607984,Landesliga,475,310,28.05.2022 15:30,Innsbrucker AC - Mils,3:2
24290,1607740,Landesliga,475,310,28.05.2022 16:00,St. Jakob - SAK Klagenfurt,0:1
24291,1607990,Landesliga,475,310,28.05.2022 16:30,Natters - Prutz / Serfaus,3:2


In [67]:
# Label each game with a league tier based on its competition id:
# 'top' if the id is in the top leagues, 'sec' if it is in the second leagues,
# otherwise 'thr'. The first matching condition wins.
top_lgs = TableConstant.top_ligues
sec_lgs = TableConstant.second_ligues

tier_conditions = [df_games['comp_id'].isin(top_lgs), df_games['comp_id'].isin(sec_lgs)]
df_games['ligue'] = np.select(tier_conditions, ['top', 'sec'], default='thr')

In [68]:
# Number of games in each league tier.
df_games.ligue.value_counts()

thr    16689
sec     4714
top     2889
Name: ligue, dtype: int64

In [69]:
# Renumber the rows from zero (discarding the old index) and report the frame's dimensions.
df_games = df_games.reset_index(drop=True)
df_games.shape

(24292, 8)

In [70]:
# Display the final games frame, including the 'ligue' tier column.
df_games

Unnamed: 0,game_id,ligue_header,comp_id,season_id,game_utc,game_title,goals,ligue
0,1736289,Premier League,12,381,16.09.2022 19:00,Aston Villa - Southampton,1:0,top
1,1736296,Premier League,12,381,16.09.2022 19:00,Nottingham - Fulham,2:3,top
2,1747456,Serie A,15,381,16.09.2022 18:45,Salernitana - Lecce,1:2,top
3,1744353,Primera Division,16,381,16.09.2022 19:00,Real Valladolid - Cadiz,0:1,top
4,1748218,Segunda División,707,381,16.09.2022 19:00,Levante - Cartagena,0:1,top
...,...,...,...,...,...,...,...,...
24287,1607985,Landesliga,475,310,28.05.2022 15:30,Volders - Münster,0:1,thr
24288,1607984,Landesliga,475,310,28.05.2022 15:30,Innsbrucker AC - Mils,3:2,thr
24289,1607740,Landesliga,475,310,28.05.2022 16:00,St. Jakob - SAK Klagenfurt,0:1,thr
24290,1607990,Landesliga,475,310,28.05.2022 16:30,Natters - Prutz / Serfaus,3:2,thr


In [72]:
# file = open('pickle_files/new_events/df_16_09_2022_to_26_05_2022_games', 'wb')
# pickle.dump(df_games, file)  
# file.close()

### Get events data

In [50]:
# with open('pickle_files/new_events/02_11_2013/df_02_11_2013_to_10_07_2005_games', 'rb') as f:
#     df_load = pickle.load(f)

In [150]:
# df_load

In [151]:
# tls_one_ids_02_11_13 = ['12', '13', '14', '15', '16', '17', '18', '419', '420']
# tls_two_ids_02_11_13 = ['454', '456', '474', '483', '485', '550', '554', '560', '565']
# tls_three_ids_02_11_13 =  ['577', '581', '587', '591','596', '601', '677', '681'] 
# tls_four_ids_02_11_13 = ['684', '695', '699', '707', '712', '716', '723', '727'] 

In [152]:
# tls_new_ids_26_03_21 = tls_one_new_ids_26_03_21 + tls_two_new_ids_26_03_21 + tls_three_new_ids_26_03_21 + tls_four_new_ids_26_03_21

In [153]:
# sls_one_new_ids_26_03_21 = [ '419', '424', '435', '436', '437', '444', '446', '449', '450', '457', '459', '464', '473', '477', '491']
# sls_two_new_ids_26_03_21  = ['493', '496', '498', '500', '502', '504', '507', '512', '516', '518', '523', '532', '539', '540', '542']
# slm_one_new_ids_26_03_21 = [ '544', '545', '546', '548', '551', '555', '562', '567', '576', '585', '586', '589', '595', '599', '602']
# slm_two_new_ids_26_03_21 =  ['606', '621', '622', '625', '626', '628', '632', '634', '636', '637', '642', '646', '647', '648', '653']
# sle_one_new_ids_26_03_21 = ['654', '655', '657', '659', '660', '667', '672', '674', '675', '676', '679', '685', '687', '691', '692', '697']
# sle_two_new_ids_26_03_21 = ['700', '703', '704', '714', '715', '720', '721', '725', '727', '733', '735', '747', '761', '912', '1157', '1259'] 

In [154]:
# sls_new_ids_26_03_21 = sls_one_new_ids_26_03_21 + sls_two_new_ids_26_03_21 + slm_one_new_ids_26_03_21 + slm_two_new_ids_26_03_21 +\
#                        sle_one_new_ids_26_03_21 + sle_two_new_ids_26_03_21

In [155]:
# tls_sls_new_ids_26_03_21 = tls_new_ids_26_03_21 + sls_new_ids_26_03_21

In [156]:
# leagues      =  tls_four_ids_02_11_13
# leagues_name = 'tls_four_ids_02_11_13'

In [157]:
# game_ids = df_load.game_id[df_load.comp_id.isin(leagues)].to_list()
# len(game_ids)

In [158]:
# After 'HtmlParser.find_game_events' is started, the events are saved to a file every 500 ids; the most recently saved file contains all events saved before it.

In [159]:
# HtmlParser.find_game_events(game_ids, name_saved_file = leagues_name)# 

# To find the position of the last saved game_id and resume from there:
# game_ids.index('1574921') 
# HtmlParser.find_game_events(game_ids[1500:], name_saved_file = leagues_name)

In [160]:
# with open('pickle_files/new_events/26_03_2021/events_old_ids/second_ligues_middle_2_10884423-Copy1', 'rb') as f:
#     events_games = pickle.load(f)

In [161]:
# From_02_11_2013_to_10_07_2005_games
# first id   count     name               last id
# 10886833 - 501  -  tls_four_ids_02_11_13_10909409

In [162]:
# # The 'teams_ids' column is not present in all events files
# df_pickle = pd.DataFrame(events_games)
# df_events = ParsingDataPrepare.transform_columns_to_rows(df_pickle, ['teams_ids', 'event_mins', 'event_desc', 'stats_dict', 
#                                                                      'city_country', 'viewers', 'weath_temp', 'bets'])

# df_events.reset_index(inplace=True)
# df_events.rename(columns={'index':'game_id'}, inplace=True)
# df_events.shape

In [163]:
# df_events.shape

In [164]:
# df_events.head()

In [165]:
# Combine the parsed events files into a single dataframe:

In [166]:
# pickle_files/new_events/26_05_2022/events_ids/, 
# path = 'pickle_files/new_events/26_05_2022/events_ids/'
# all_events_games = list()

# for file in os.listdir(path):
#     with open(path + file, 'rb') as f:
#         events_games = pickle.load(f)
#         all_events_games.append(events_games)

In [167]:
# len(all_events_games)

In [168]:
# # 'teams_ids' is only available for data obtained after 26_05_2022
# df_list   = []
# col_names = ['teams_ids', 'event_mins', 'event_hts_ats', 'stats_dict', 'city_country', 'viewers', 'weath_temp', 'bet_coefs']

# for game in all_events_games:
#     df_to_transform = pd.DataFrame(game)
#     df_create = ParsingDataPrepare.transform_columns_to_rows(df_to_transform, col_names)
    
#     df_list.append(df_create)

# df_events = pd.concat(df_list) 

In [169]:
# df_events = df_events.reset_index().rename(columns={'index': 'game_id'})

In [170]:
# df_events.head(1)

In [171]:
# file = open('pickle_files/new_events/df_evns_26_05_2022_to_27_03_2021', 'wb')
# pickle.dump(df_events, file)  
# file.close()