In [1]:
import pickle
import time
import requests
from bs4 import BeautifulSoup
import datetime
import pandas as pd
import numpy as np
import re
import datetime
# Raise pandas' display limits (rows, columns, cell width) to 500 so that
# scraped tables are rendered without truncation.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.max_colwidth = 500

In [2]:
def transform_columns_to_rows(df: pd.DataFrame, col_names: list):
    """Turn every row of ``df`` into a named column of a new DataFrame.

    Row ``i`` of ``df`` is looked up as ``df.stack()[i]`` and stored as column
    ``col_names[i]`` of the result, so the result is indexed by the column
    labels of ``df``.

    Args:
        df: Frame whose rows are to become columns.
        col_names: Names for the new columns. The number of names must equal
            the number of rows in ``df``; rows and names are paired with
            ``zip``, so any surplus on either side is silently ignored.

    Returns:
        A new DataFrame with one column per (row, name) pair.
    """
    pd_series = pd.DataFrame()
    pairs = list(zip(range(df.shape[0]), col_names))
    if not pairs:
        # Nothing to transpose; skip stacking altogether.
        return pd_series
    # stack() does not depend on the loop variable, so compute it once instead
    # of re-stacking the whole frame for every row.
    stacked = df.stack()
    for i, col in pairs:
        pd_series[col] = stacked[i]
    return pd_series

In [3]:
def transform_date(list_obj: list):
    """Normalise a list of scraped time/status strings.

    Each entry is handled by the first rule that applies:

    * it contains an ASCII letter: only its letter runs are kept, concatenated;
    * it is exactly 12 characters long: every ``','`` is replaced by ``'.'``
      followed by the current year;
    * otherwise: it is rebuilt as characters 0-5, the first two digits of the
      current year, characters 6-7 and characters 9-14.

    NOTE(review): the two date rules presumably target 'DD.MM, HH:MM' and
    'DD.MM.YY, HH:MM' strings - confirm against the scraped page.

    Returns:
        A list of the transformed strings, in the input order.
    """
    current_year = str(datetime.datetime.now().year)

    def _normalise(raw):
        if re.search(r'[a-zA-Z]', raw):
            return ''.join(re.findall(r'[a-zA-Z]+', raw))
        if len(raw) == 12:
            return raw.replace(',', '.' + current_year)
        return raw[:6] + current_year[:2] + raw[6:8] + raw[9:15]

    return [_normalise(raw) for raw in list_obj]

In [4]:
def double_slice_list(list_obj: list, septener: str):
    """Join consecutive pairs of ``list_obj`` with ``septener`` between them.

    Items at positions 0, 2, 4, ... are paired with the items at positions
    1, 3, 5, ...; a trailing unpaired item is dropped.

    Returns:
        A list with one ``first + septener + second`` string per pair.
    """
    firsts = list_obj[0::2]
    seconds = list_obj[1::2]
    return [first + septener + second for first, second in zip(firsts, seconds)]

In [5]:
def cut_part_of_string(str_obj: str, start_board: str, end_board: str):
    """Return every shortest substring found between two boundaries.

    ``start_board`` and ``end_board`` are inserted into the regular expression
    unescaped, so they are interpreted as regex fragments, not literal text.

    Returns:
        A list with the captured text of each non-overlapping match in
        ``str_obj``.
    """
    pattern = re.compile('{}(.*?){}'.format(start_board, end_board))
    return pattern.findall(str_obj)

In [6]:
def scrape_html(str_url: str, timeout: float = 30):
    """Download one results page and extract the matches listed on it.

    Every ``div`` with class ``live_comptt_bd`` is treated as one competition
    block. Within a block the game links, time/status spans and goal cells are
    collected separately and then combined positionally with ``zip``, so a
    block yields only as many matches as its shortest list.

    Args:
        str_url: URL of the page to fetch.
        timeout: Seconds passed to ``requests.get`` as ``timeout`` so that a
            stalled connection raises instead of blocking forever.

    Returns:
        dict mapping each game's ``dt-id`` attribute to the list
        ``[ligue_header, comp_id, season_id, game_utc, game_title, goals,
        game_status]``.
    """
    html = requests.get(str_url, timeout=timeout).content
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so results could differ
    # between machines.
    soup = BeautifulSoup(html, 'html.parser')

    all_matches_db = {}
    for each_tb in soup.find_all('div', {'class': 'live_comptt_bd'}):
        header_text = each_tb.find('div', {'class': 'block_header'}).get_text()
        # A block whose header yields no text is labelled 'Friendly'.
        ligue_header = ''.join(re.findall(r'\n (.*?)\n', header_text)) or 'Friendly'

        # NOTE(review): all 'season_id=...' occurrences in the block markup are
        # concatenated; confirm each block carries at most one distinct value.
        season_id = ''.join(cut_part_of_string(str(each_tb), 'season_id=', '\''))
        if ligue_header == 'Friendly':
            season_id = 'Friendly'
        elif season_id == '':
            # Named competition without a season id in its markup.
            season_id = 'Cup'

        # The element id minus its first three characters.
        comp_id = each_tb.get('id')[3:]

        game_links = each_tb.find_all('a', {'class': 'game_link'})
        game_ids = [link.get('dt-id') for link in game_links]
        game_titles = [link.get('title') for link in game_links]

        raw_times = [span.get_text() for span in each_tb.find_all('span', {'class': 'size10'})]
        normalised_times = transform_date(raw_times)

        # Entries that still contain letters are status texts; purely numeric
        # entries are dates, whose status is recorded as 'Finished'.
        game_statuses = [x if re.search('[a-zA-Z]', x) else 'Finished' for x in normalised_times]

        # Keep every entry (truncated to 16 characters) so the list stays
        # aligned with the other per-game lists in the zip below.
        game_times_utc = [x[:16] for x in normalised_times]

        goal_cells = [cell.get_text() for cell in each_tb.find_all('div', {'class': 'gls'})]
        all_goals = double_slice_list(goal_cells, ':')

        for game_id, game_utc, game_title, goals, game_status in zip(
                game_ids, game_times_utc, game_titles, all_goals, game_statuses):
            all_matches_db[game_id] = [ligue_header, comp_id, season_id, game_utc,
                                       game_title, goals, game_status]

    return all_matches_db

In [7]:
# 7-11-2013 is the last day with betting data on soccer365 (2697 days before the time of writing)
def create_date_list(numdays: int, start_year: int, start_month: int, start_day: int):
    """Build a list of ``numdays`` consecutive dates going backwards in time.

    The first element is the date given by ``start_year``/``start_month``/
    ``start_day``; each following element is one day earlier.

    Returns:
        A list of ``datetime.date`` objects, newest first.
    """
    newest = datetime.date(start_year, start_month, start_day)
    date_list = []
    for offset in range(numdays):
        date_list.append(newest - datetime.timedelta(days=offset))
    return date_list

In [8]:
def parsing_write_by_date(numdays: int, start_year: int, start_month: int, start_day: int, start_url='https://soccer365.me/online/&date='):
    """Scrape a run of consecutive days and pickle the combined result.

    Starting at the given date and walking backwards one day at a time for
    ``numdays`` days, each page is fetched from ``start_url`` plus the date and
    merged into a single dict. The dict is then pickled to the file
    ``all_matches`` in the current working directory.

    Args:
        numdays: Number of days to scrape.
        start_year, start_month, start_day: The first (most recent) date.
        start_url: Base URL without the date; ``str(date)`` is appended to it
            for every request.

    Returns:
        None. Progress and a final confirmation are printed.
    """
    date_list = create_date_list(numdays, start_year, start_month, start_day)
    all_matches_db = {}
    for date in date_list:
        print(date)
        main_url = start_url + str(date)
        all_matches_db.update(scrape_html(main_url))
        # Pause between requests.
        time.sleep(3)
    # The context manager closes the file even if pickling fails.
    with open('all_matches', 'wb') as all_matches:
        pickle.dump(all_matches_db, all_matches)
    print('Data is saved')

In [10]:
# parsing_write_by_date(numdays=5, start_year=2021, start_month=3, start_day=24)

In [12]:
# with open('all_matches', 'rb') as f:
#     all_matches = pickle.load(f)

In [9]:
# Scrape a single day. scrape_html returns {game_id: [7 fields]}, so this frame
# has one column per game and one row per field.
df_to_transform = pd.DataFrame(scrape_html('https://soccer365.me/online/&date=2020-03-20'))

In [10]:
# Flip the frame so that each game becomes a row; the names are given in the
# same order as the per-game field list assembled in scrape_html.
df = transform_columns_to_rows(df_to_transform, ['ligue_header', 'comp_id', 'season_id', 'game_utc', 'game_title','goals', 'game_status'])

In [11]:
# Move the game ids from the index into a regular 'game_id' column.
# Done without inplace=True and guarded, so that re-running this cell does not
# silently add a second 'game_id' column.
if 'game_id' not in df.columns:
    df = df.reset_index().rename(columns={'index': 'game_id'})

In [13]:
# df.head()