In [1]:
import pickle
import pandas as pd
import numpy as np
import re
import os
import unicodedata

from _html_parser import ParsingDataPrepare

# Raise pandas display limits so wide frames and long cell values are shown in full.
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_colwidth', 500)

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
# Load every pickled batch in pickle_files/ whose file name contains '-Copy1'.
# NOTE(review): pickle.load can execute arbitrary code; only use it on trusted files.
copy_files = [file for file in os.listdir('pickle_files') if re.search('-Copy1', file)]

all_events_games = []
for file in copy_files:
    with open('pickle_files/' + file, 'rb') as f:
        events_games = pickle.load(f)
    all_events_games.append(events_games)

In [4]:
len(all_events_games)

38

In [5]:
# Load a single batch; everything below is built from this one file.
# NOTE(review): `all_events_games` (all '-Copy1' batches) is not used in the visible
# cells after this point - confirm that processing one file only is intended.
single_batch_path = 'pickle_files/top_ligues_start_1_11920200-Copy1'
with open(single_batch_path, mode='rb') as batch_file:
    events_games = pickle.load(batch_file)

In [6]:
# Wrap the loaded batch in a DataFrame so it can be reshaped by the project helper.
df_to_transform = pd.DataFrame(events_games)

In [7]:
# Fields of the raw frame that are handed to the project helper for reshaping.
# NOTE(review): transform_columns_to_rows is defined in _html_parser and not visible
# here; presumably it produces one row per game with these fields as columns - confirm.
columns_to_transform = ['event_mins', 'event_hts_ats', 'stats_dict',
                        'city_country', 'viewers', 'weath_temp', 'bet_coefs']
df = ParsingDataPrepare.transform_columns_to_rows(df_to_transform, columns_to_transform)

In [8]:
# Move the index into a regular column and call it game_id.
df = df.reset_index().rename(columns={'index': 'game_id'})

In [9]:
df.shape

(7995, 8)

In [11]:
df.head(1)

Unnamed: 0,game_id,event_mins,event_hts_ats,stats_dict,city_country,viewers,weath_temp,bet_coefs
0,14898157,"[8, 14, 25, 32, 35, 67, 70, 79, 83]","[goal, goal, yellowcard, pengoal, yellowcard, yellowcard, yellowcard, mispen, yellowcard]","[(Shots, 15, 16), (Shots on target, 7, 7), (Shots Blocked, 1, 0), (Saves, 5, 5), (Possession %, 59, 41), (Corner, 7, 10), (Fouls, 7, 17), (Offsides, 2, 3), (Yellow cards, 1, 4), (Red cards, 0, 0), (Counterattacks, 2, 2), (Free Kicks, 20, 9), (Goal Kicks, 7, 7), (Throw Ins, 22, 19), (Crosses, 15, 24), (Treatment, 2, 1)]","Paderborn, Germany",14217,"(+1, overcast)","([1, X, 2, ТМ 2.5, ТБ 2.5], [2.49, 3.66, 2.80, 3.20, 1.37])"


#### Calculate the minute of the first goal

In [10]:
def _goal_events_by_minute(row):
    """Map minute -> event name for every event of the row whose name contains 'goal'."""
    timeline = zip(row['event_mins'], row['event_hts_ats'])
    return {minute: event for minute, event in timeline if re.findall('goal', event)}


# Substring match: any event name containing 'goal' qualifies (e.g. 'pengoal', 'owngoal').
# Events that share a minute collapse into one dict key (the last one wins).
sr_goals = df[['event_mins', 'event_hts_ats']].apply(_goal_events_by_minute, axis=1)

In [11]:
# First listed goal minute of each game; None when the game has no goal events.
df['fg_minute'] = sr_goals.map(lambda goals: next(iter(goals), None))

#### Create cards columns

In [12]:
def _yellow_card_total(stats):
    """Return the first + second side 'Yellow cards' total as a string ('' if the stat is absent)."""
    totals = [int(stat[1]) + int(stat[2]) for stat in stats if re.findall('Yellow cards', stat[0])]
    # NOTE(review): several matching entries would be concatenated digit-wise, not summed.
    return ''.join(str(total) for total in totals)


df['y_cards'] = df.stats_dict.map(_yellow_card_total)

In [13]:
def _red_card_total(stats):
    """Return the first + second side 'Red cards' total as a string ('' if the stat is absent)."""
    totals = [int(stat[1]) + int(stat[2]) for stat in stats if re.findall('Red cards', stat[0])]
    # NOTE(review): several matching entries would be concatenated digit-wise, not summed.
    return ''.join(str(total) for total in totals)


df['r_cards'] = df.stats_dict.map(_red_card_total)

In [14]:
df.r_cards.value_counts()

0    6410
1    1204
      191
2     168
3      18
4       3
6       1
Name: r_cards, dtype: int64

In [15]:
def _cards_or_event_count(row, cards_col, event_name):
    """Keep the stats-based total; when it is '' count `event_name` in the event list instead."""
    if row[cards_col] == '':
        return row['event_hts_ats'].count(event_name)
    return row[cards_col]


df.y_cards = df[['event_hts_ats', 'y_cards']].apply(
    _cards_or_event_count, axis=1, args=('y_cards', 'yellowcard'))
df.r_cards = df[['event_hts_ats', 'r_cards']].apply(
    _cards_or_event_count, axis=1, args=('r_cards', 'redcard'))

In [16]:
# The card columns hold a mix of digit strings and ints at this point; make them integer.
for card_col in ('y_cards', 'r_cards'):
    df[card_col] = df[card_col].astype(int)

In [17]:
df.r_cards.value_counts()

0    6584
1    1219
2     170
3      18
4       3
6       1
Name: r_cards, dtype: int64

In [18]:
# Card score per game: each yellow card counts once, each red card counts twice.
df['card_coef'] = df['y_cards'] + 2 * df['r_cards']

#### Flag 'mispen' and 'owngoal' events in event_hts_ats

In [19]:
# Boolean flag per game: does the event list contain this exact event name?
for event_flag in ('mispen', 'owngoal'):
    df[event_flag] = df.event_hts_ats.map(lambda events, name=event_flag: name in events)

In [20]:
df.mispen.value_counts()

False    7480
True      515
Name: mispen, dtype: int64

In [21]:
df.owngoal.value_counts()

False    7454
True      541
Name: owngoal, dtype: int64

#### Extract bet coefficients

In [22]:
# bet_coefs is a (labels, values) pair; keep the values keyed by their position.
sr_bets = df.bet_coefs.map(lambda coefs: dict(enumerate(coefs[1])))

In [23]:
# One-column frame (named after the source series) holding the per-game dicts.
df_bets = sr_bets.to_frame()
df_bets.head(1)

Unnamed: 0,bet_coefs
0,"{0: '2.49', 1: '3.66', 2: '2.80', 3: '3.20', 4: '1.37'}"


In [24]:
# Expand the per-game dicts into one column per position, then label the positions.
# NOTE(review): the preview above shows quoted values ('2.49'), so these columns
# presumably hold strings, not floats - convert before doing arithmetic on them.
bet_labels = ['1', 'X', '2', 'TM', 'TB']
coef_rows = df_bets['bet_coefs'].tolist()
df_bets = pd.DataFrame(coef_rows, index=df_bets.index)
df_bets.columns = bet_labels

In [25]:
df_bets.head(3)

Unnamed: 0,1,X,2,TM,TB
0,2.49,3.66,2.8,3.2,1.37
1,1.71,4.32,4.82,2.32,1.58
2,2.32,3.09,3.6,1.36,3.24


In [26]:
# Attach the bet coefficient columns to df under the same column labels.
df[df_bets.columns] = df_bets

In [27]:
df.city_country.value_counts().head(5)

                          593
London, United Kingdom    262
Sao Paulo, Brazil         244
Rio de Janeiro, Brazil    195
Belo Horizonte, Brazil    149
Name: city_country, dtype: int64

#### Create country and city columns

In [28]:
def _split_city_country(city_country):
    """Split a 'City, Country' string into a (city, country) pair.

    - 'City, Country' -> ('City', 'Country'), both stripped of surrounding whitespace
      (only the first two comma-separated parts are used).
    - A value without a comma is treated as a country only -> (None, 'Country').
    - None, non-string or blank values -> (None, None), so that a later
      `isna()`-based fill can detect the missing country as well as the missing city.
    """
    if not isinstance(city_country, str) or not city_country.strip():
        return None, None
    parts = [part.strip() for part in city_country.split(',')]
    if len(parts) > 1:
        # Empty fragments (e.g. ', Brazil') are reported as missing, not as ''.
        return parts[0] or None, parts[1] or None
    return None, parts[0]


_city_country_pairs = df.city_country.map(_split_city_country)
df['city'] = _city_country_pairs.map(lambda pair: pair[0])
df['country'] = _city_country_pairs.map(lambda pair: pair[1])

#### Merge df (game statistics) with df_all_matches

In [29]:
# Keep only the engineered per-game features (plus the key) for the merge.
col_names = [
    'game_id',
    'fg_minute', 'y_cards', 'r_cards', 'card_coef',
    'mispen', 'owngoal',
    '1', 'X', '2', 'TM', 'TB',
    'city', 'country',
]
df = df.loc[:, col_names].copy()

In [30]:
# Load the pickled table of all matches.
# NOTE(review): pickle.load can execute arbitrary code; only use it on trusted files.
all_matches_path = 'pickle_files/df_all_matches'
with open(all_matches_path, mode='rb') as matches_file:
    df_all_matches = pickle.load(matches_file)

In [31]:
# Restrict the match table to the games present in df.
has_game_stats = df_all_matches['game_id'].isin(df['game_id'])
df_matches = df_all_matches[has_game_stats]

In [32]:
df_matches.shape

(7995, 8)

In [33]:
# Left-join the per-game features onto the match rows by game_id.
df_matches = df_matches.merge(df, how='left', on='game_id').copy()

In [34]:
df_matches.head(3)

Unnamed: 0,game_id,ligue_header,comp_id,season_id,game_utc,game_title,goals,game_status,fg_minute,y_cards,r_cards,card_coef,mispen,owngoal,1,X,2,TM,TB,city,country
0,14898157,1. Bundesliga,17,300,05.10.2019 16:30,Paderborn - Mainz,1:2,Finished,8,5,0,5,True,False,2.49,3.66,2.8,3.2,1.37,Paderborn,Germany
1,14898156,1. Bundesliga,17,300,05.10.2019 19:30,Schalke - Köln,1:1,Finished,72,8,0,8,False,False,1.71,4.32,4.82,2.32,1.58,Gelsenkirchen,Germany
2,14900649,Premier Liga,13,300,05.10.2019 11:30,Ufa - Ahmat,0:1,Finished,9,4,0,4,False,False,2.32,3.09,3.6,1.36,3.24,Ufa,Russia


#### Create home_team and guest_team columns

In [35]:
def _split_game_title(game_title):
    """Split a 'Home - Guest' title into a (home_team, guest_team) pair.

    The title is split once on the spaced separator ' - ', so hyphens inside a team
    name are preserved. Both names are stripped of surrounding whitespace (the guest
    name otherwise keeps the space that follows the separator).
    A title without any hyphen yields (title, '').
    """
    home, separator, guest = game_title.partition(' - ')
    if not separator:
        # Fall back to a bare hyphen for titles written without surrounding spaces.
        home, _, guest = game_title.partition('-')
    return home.strip(), guest.strip()


_title_pairs = df_matches.game_title.map(_split_game_title)
df_matches['home_team'] = _title_pairs.map(lambda pair: pair[0])
df_matches['guest_team'] = _title_pairs.map(lambda pair: pair[1])

#### Load df_teams_data to fill in missing team city and country

In [36]:
# Load the pickled club reference table (club, club_country, club_city).
# NOTE(review): pickle.load can execute arbitrary code; only use it on trusted files.
teams_data_path = 'pickle_files/df_teams_data'
with open(teams_data_path, mode='rb') as teams_file:
    df_teams_data = pickle.load(teams_file)

In [37]:
df_teams_data.head(3)

Unnamed: 0,club,club_country,club_city
0,ABC,Brazil,Natal
1,ACS Poli Timisoara,Romania,Timisoara
2,ADO Den Haag,Netherlands,Den Haag


##### Replace non-ASCII symbols

In [38]:
def find_no_utf_symbols(sr: pd.Series):
    """Return, for each element, the characters that fall outside the allow-list.

    Every value is converted with str(); characters not matched by the allow-list
    (ASCII letters, digits, whitespace and the punctuation listed in the pattern)
    are concatenated in order of appearance. Elements made only of allowed
    characters map to ''.
    """
    disallowed_pattern = r'[^.a-zA-Z0-9)(&"\s>;<=,-/}{\']'

    def _collect(value):
        return ''.join(re.findall(disallowed_pattern, str(value)))

    return sr.map(_collect)

In [39]:
# Collect the characters outside the allow-list found in home team names,
# and show the distinct combinations.
no_utf_symbols = find_no_utf_symbols(df_matches.home_team)
str(set(no_utf_symbols))

"{'', 'ö', 'áá', 'ê', 'ó', 'ü', 'ñ', 'ã', 'á', 'é', 'ú'}"

In [40]:
def str_decode(str_obj: str):
    """Strip diacritics: decompose the string to NFD and drop every combining mark."""
    decomposed = unicodedata.normalize('NFD', str_obj)
    kept_chars = []
    for char in decomposed:
        if unicodedata.combining(char):
            # Combining marks (accents, umlauts, tildes ...) are discarded.
            continue
        kept_chars.append(char)
    return u"".join(kept_chars)

In [41]:
# Strip diacritics from both team-name columns (values are passed through str() first).
for team_col in ('home_team', 'guest_team'):
    df_matches[team_col] = df_matches[team_col].map(lambda name: str_decode(str(name)))

In [42]:
df_matches.head(3)

Unnamed: 0,game_id,ligue_header,comp_id,season_id,game_utc,game_title,goals,game_status,fg_minute,y_cards,r_cards,card_coef,mispen,owngoal,1,X,2,TM,TB,city,country,home_team,guest_team
0,14898157,1. Bundesliga,17,300,05.10.2019 16:30,Paderborn - Mainz,1:2,Finished,8,5,0,5,True,False,2.49,3.66,2.8,3.2,1.37,Paderborn,Germany,Paderborn,Mainz
1,14898156,1. Bundesliga,17,300,05.10.2019 19:30,Schalke - Köln,1:1,Finished,72,8,0,8,False,False,1.71,4.32,4.82,2.32,1.58,Gelsenkirchen,Germany,Schalke,Koln
2,14900649,Premier Liga,13,300,05.10.2019 11:30,Ufa - Ahmat,0:1,Finished,9,4,0,4,False,False,2.32,3.09,3.6,1.36,3.24,Ufa,Russia,Ufa,Ahmat


In [43]:
len(df_matches[df_matches.city.isna()].home_team.unique())

32

In [44]:
# Distinct home teams of the matches that have no city.
missing_city_mask = df_matches.city.isna()
miss_city_teams = df_matches.loc[missing_city_mask, 'home_team'].unique()
miss_city_teams

array(['Sao Bento', 'Parana', 'Vila Nova', 'Guarani Campinas', 'Oeste',
       'Brasil de Pelotas', 'Atletico Goianiense', 'Criciuma', 'Cuiaba',
       'Operario Ferroviario', 'Bragantino', 'CRB', 'Vitoria',
       'Botafogo SP', 'Nimes', 'Avai', 'Goias', 'Paysandu', 'Boa',
       'Juventude', 'Sampaio Correa', 'Figueirense', 'Ponte Preta',
       'Alagoano', 'Luverdense', 'ABC', 'Nautico', 'Ceara', 'Veres',
       'Shakhtar', 'Internacional', 'Santa Cruz'], dtype=object)

In [45]:
# For every home team without a city, collect the rows of df_teams_data whose club
# name contains the team name (plain substring match, not a regex; NaN club names
# never match).
# NOTE(review): substring matching also returns other clubs whose name merely
# contains the team name; only exact names are useful for an equality join on `club`.
matched_frames = [
    df_teams_data[df_teams_data.club.str.contains(team, regex=False, na=False)]
    for team in miss_city_teams
]

if matched_frames:
    # Concatenate once instead of growing a frame inside the loop, and drop rows
    # that were matched by more than one team name.
    df_miss_city = pd.concat(matched_frames).drop_duplicates()
else:
    # No team to look up: keep an empty frame that still has the expected columns.
    df_miss_city = df_teams_data.iloc[0:0].copy()

df_miss_city = df_miss_city.reset_index(drop=True)

In [46]:
df_miss_city.head(3)

Unnamed: 0,club,club_country,club_city
0,Sao Bento,Brazil,Sorocaba
1,Athletico Paranaense,Brazil,Curitiba
2,Parana,Brazil,Curitiba


In [47]:
# Team names without a city that have no exactly-equal club name in df_miss_city;
# an empty result means every such team can be joined on its exact name.
clubs_not_in_teams_data = np.setdiff1d(miss_city_teams, df_miss_city.club)
clubs_not_in_teams_data

array([], dtype=object)

In [48]:
# Club -> (country, city) lookup. Keep one row per club name so the left merge cannot
# multiply match rows when df_miss_city lists a club more than once.
# NOTE(review): if one club name maps to different cities, the first row wins - confirm.
club_lookup = df_miss_city[['club', 'club_country', 'club_city']].drop_duplicates(subset='club')

# validate='many_to_one' makes pandas raise instead of silently duplicating matches
# should the lookup ever stop being unique on `club`.
df_matches = df_matches.merge(club_lookup, how='left', left_on='home_team', right_on='club',
                              validate='many_to_one')

In [49]:
df_matches.club_city.value_counts().head()

Goiania          151
Campinas         102
Florianopolis    100
Curitiba         100
Maceio            81
Name: club_city, dtype: int64

In [50]:
# Where city / country is missing, take the value from the merged club columns.
df_matches['city'] = df_matches['city'].mask(df_matches['city'].isna(), df_matches['club_city'])
df_matches['country'] = df_matches['country'].mask(df_matches['country'].isna(), df_matches['club_country'])

In [51]:
df_matches[df_matches.city.isna()]

Unnamed: 0,game_id,ligue_header,comp_id,season_id,game_utc,game_title,goals,game_status,fg_minute,y_cards,r_cards,card_coef,mispen,owngoal,1,X,2,TM,TB,city,country,home_team,guest_team,club,club_country,club_city


In [52]:
df_matches[df_matches.country.isna()]

Unnamed: 0,game_id,ligue_header,comp_id,season_id,game_utc,game_title,goals,game_status,fg_minute,y_cards,r_cards,card_coef,mispen,owngoal,1,X,2,TM,TB,city,country,home_team,guest_team,club,club_country,club_city


In [53]:
# The club lookup columns are no longer needed once city / country are filled.
df_matches = df_matches.drop(columns=['club', 'club_country', 'club_city'])

### Get cities' longitude and latitude

In [54]:
df_matches.head(3)

Unnamed: 0,game_id,ligue_header,comp_id,season_id,game_utc,game_title,goals,game_status,fg_minute,y_cards,r_cards,card_coef,mispen,owngoal,1,X,2,TM,TB,city,country,home_team,guest_team
0,14898157,1. Bundesliga,17,300,05.10.2019 16:30,Paderborn - Mainz,1:2,Finished,8,5,0,5,True,False,2.49,3.66,2.8,3.2,1.37,Paderborn,Germany,Paderborn,Mainz
1,14898156,1. Bundesliga,17,300,05.10.2019 19:30,Schalke - Köln,1:1,Finished,72,8,0,8,False,False,1.71,4.32,4.82,2.32,1.58,Gelsenkirchen,Germany,Schalke,Koln
2,14900649,Premier Liga,13,300,05.10.2019 11:30,Ufa - Ahmat,0:1,Finished,9,4,0,4,False,False,2.32,3.09,3.6,1.36,3.24,Ufa,Russia,Ufa,Ahmat


In [55]:
# Geocoding step, currently disabled.
# NOTE(review): `geolocator` is not defined or imported anywhere in the visible
# notebook; it has to be created before the line below can be re-enabled.
# df_matches['location'] = df_matches[['city', 'country']].apply(lambda x: geolocator.geocode(x['city']+','+ x['country']), axis=1)

In [56]:
len(df_matches.city.unique())

190