In [14]:
import pyexasol
import pandas as pd
import boto3
import time
import io
import numpy as np
from matplotlib import pyplot as plt

class QueryAthena:

    def __init__(self, query, database):
        self.database = 'btb-serving'
        self.folder = 'api/'
        self.bucket = 'btb-athena-query-results'
        self.s3_input = 's3://' + self.bucket + '/my_folder_input'
        self.s3_output =  's3://' + self.bucket + '/' + self.folder
        self.region_name = 'eu-central-1'
        self.query = query
        self.access_key = '***'
        self.secret_key = '***'

    def load_conf(self, q):
        
        try:
            self.client = boto3.client('athena', region_name = self.region_name, aws_access_key_id=self.access_key,
                                  aws_secret_access_key=self.secret_key)
            
            response = self.client.start_query_execution(
                QueryString = q,
                    QueryExecutionContext={
                    'Database': self.database
                    },
                    ResultConfiguration={
                    'OutputLocation': self.s3_output,
                    }
            )
            self.filename = response['QueryExecutionId']
            print('Execution ID: ' + response['QueryExecutionId'])

        except Exception as e:
            print(e)
        return response                

    def run_query(self):
        queries = [self.query]
        for q in queries:
            res = self.load_conf(q)
        try:              
            query_status = None
            while query_status == 'QUEUED' or query_status == 'RUNNING' or query_status is None:
                query_status = self.client.get_query_execution(QueryExecutionId=res["QueryExecutionId"])['QueryExecution']['Status']['State']
                query_status2 = self.client.get_query_execution(QueryExecutionId=res["QueryExecutionId"])['QueryExecution']['Status']
                print(query_status2)
                if query_status == 'FAILED' or query_status == 'CANCELLED':
                    raise Exception('Athena query with the string "{}" failed or was cancelled'.format(self.query))
                time.sleep(0.2)
            print('Query "{}" finished.'.format(self.query))

            df = self.obtain_data()
            return df

        except Exception as e:
            print(e)      

    def obtain_data(self):
        try:
            self.resource = boto3.resource('s3', 
                                  region_name = self.region_name,
                                  aws_access_key_id=self.access_key,
                                  aws_secret_access_key=self.secret_key)
                                  
            print(self.bucket)
            print(self.folder)
            print(self.filename)
            
            # S3 Object
            res_obj = self.resource.Object(bucket_name=self.bucket, key=self.folder + self.filename + '.csv')
            response = res_obj.get()
            
            return pd.read_csv(io.BytesIO(response['Body'].read()), encoding='utf8')   
        except Exception as e:
            print(e) 

def plot_grid_search(cv_results, grid_param_1, grid_param_2, name_param_1, name_param_2):
    # Get Test Scores Mean and std for each grid search
    scores_mean = cv_results['mean_test_score']
    scores_mean = np.array(scores_mean).reshape(len(grid_param_2),len(grid_param_1))

    scores_sd = cv_results['std_test_score']
    scores_sd = np.array(scores_sd).reshape(len(grid_param_2),len(grid_param_1))

    # Plot Grid search scores
    _, ax = plt.subplots(1,1)

    # Param1 is the X-axis, Param 2 is represented as a different curve (color line)
    for idx, val in enumerate(grid_param_2):
        ax.plot(grid_param_1, scores_mean[idx,:], '-o', label= name_param_2 + ': ' + str(val))

    ax.set_title("Grid Search Scores", fontsize=20, fontweight='bold')
    ax.set_xlabel(name_param_1, fontsize=16)
    ax.set_ylabel('CV Average Score', fontsize=16)
    ax.legend(loc="best", fontsize=15)
    ax.grid('on')

In [30]:

def get_training_data_home():

    #load training data based on provided filter
    v_sql = """
    select
        round(home_goals_for_ema5,1)            home_goals_for_ema5,
        round(home_goals_against_ema5,1)        home_goals_against_ema5,
        round(away_goals_for_ema5,1)            away_goals_for_ema5,
        round(away_goals_against_ema5,1)        away_goals_against_ema5,
        round(home_xg_for_ema5,1)               home_xg_for_ema5,
        round(home_xg_against_ema5,1)           home_xg_against_ema5,
        round(away_xg_for_ema5,1)               away_xg_for_ema5,
        round(away_xg_against_ema5,1)           away_xg_against_ema5,
        round(home_shots_for_ema5,1)               home_shots_for_ema5,
        round(home_shots_against_ema5,1)           home_shots_against_ema5,
        round(away_shots_for_ema5,1)               away_shots_for_ema5,
        round(away_shots_against_ema5,1)           away_shots_against_ema5,
        round(home_sot_for_ema5,1)               home_sot_for_ema5,
        round(home_sot_against_ema5,1)           home_sot_against_ema5,
        round(away_sot_for_ema5,1)               away_sot_for_ema5,
        round(away_sot_against_ema5,1)           away_sot_against_ema5,
        round(home_corners_for_ema5,1)               home_corners_for_ema5,
        round(home_corners_against_ema5,1)           home_corners_against_ema5,
        round(away_corners_for_ema5,1)               away_corners_for_ema5,
        round(away_corners_against_ema5,1)           away_corners_against_ema5,
        round(home_deep_for_ema5,1)               home_deep_for_ema5,
        round(home_deep_against_ema5,1)           home_deep_against_ema5,
        round(away_deep_for_ema5,1)               away_deep_for_ema5,
        round(away_deep_against_ema5,1)           away_deep_against_ema5,
        round(home_ppda_for_ema5,1)               home_ppda_for_ema5,
        round(home_ppda_against_ema5,1)           home_ppda_against_ema5,
        round(away_ppda_for_ema5,1)               away_ppda_for_ema5,
        round(away_ppda_against_ema5,1)           away_ppda_against_ema5,
        avg(full_time_home_goals)               full_time_home_goals
from
    "ml_poisson_features"
where
    feature_type = 'EMA5 HA'
    and full_time_home_goals is not null
    and home_shots_for_ema5 is not null
    and away_shots_against_ema5 is not null
    and SEASON in ('2015_2016','2016_2017','2017_2018','2018_2019','2019_2020','2020_2021')
group by 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28
    """
    
    qa = QueryAthena(query=v_sql, database='btb-serving')
    df_data = qa.run_query()
        
    return df_data


def get_training_data_away():

    #load training data based on provided filter
    v_sql = """
    select
        round(home_goals_for_ema5,1)            home_goals_for_ema5,
        round(home_goals_against_ema5,1)        home_goals_against_ema5,
        round(away_goals_for_ema5,1)            away_goals_for_ema5,
        round(away_goals_against_ema5,1)        away_goals_against_ema5,
        round(home_xg_for_ema5,1)               home_xg_for_ema5,
        round(home_xg_against_ema5,1)           home_xg_against_ema5,
        round(away_xg_for_ema5,1)               away_xg_for_ema5,
        round(away_xg_against_ema5,1)           away_xg_against_ema5,
        round(home_shots_for_ema5,1)               home_shots_for_ema5,
        round(home_shots_against_ema5,1)           home_shots_against_ema5,
        round(away_shots_for_ema5,1)               away_shots_for_ema5,
        round(away_shots_against_ema5,1)           away_shots_against_ema5,
        round(home_sot_for_ema5,1)               home_sot_for_ema5,
        round(home_sot_against_ema5,1)           home_sot_against_ema5,
        round(away_sot_for_ema5,1)               away_sot_for_ema5,
        round(away_sot_against_ema5,1)           away_sot_against_ema5,
        round(home_corners_for_ema5,1)               home_corners_for_ema5,
        round(home_corners_against_ema5,1)           home_corners_against_ema5,
        round(away_corners_for_ema5,1)               away_corners_for_ema5,
        round(away_corners_against_ema5,1)           away_corners_against_ema5,
        round(home_deep_for_ema5,1)               home_deep_for_ema5,
        round(home_deep_against_ema5,1)           home_deep_against_ema5,
        round(away_deep_for_ema5,1)               away_deep_for_ema5,
        round(away_deep_against_ema5,1)           away_deep_against_ema5,
        round(home_ppda_for_ema5,1)               home_ppda_for_ema5,
        round(home_ppda_against_ema5,1)           home_ppda_against_ema5,
        round(away_ppda_for_ema5,1)               away_ppda_for_ema5,
        round(away_ppda_against_ema5,1)           away_ppda_against_ema5,
        avg(full_time_away_goals)               full_time_away_goals
from
    "ml_poisson_features"
where
    feature_type = 'EMA5 HA'
    and full_time_home_goals is not null
    and home_shots_for_ema5 is not null
    and away_shots_against_ema5 is not null
    and SEASON in ('2015_2016','2016_2017','2017_2018','2018_2019','2019_2020','2020_2021')
group by 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28
    """
    
    qa = QueryAthena(query=v_sql, database='btb-serving')
    df_data = qa.run_query()
        
    return df_data


def get_pred_data():

    #load training data based on provided filter
    v_sql = """
    select
        football_match_his_lid,
        match_date,
        home_team,
        away_team,
        --model features
        round(home_goals_for_ema5,1)            home_goals_for_ema5,
        round(home_goals_against_ema5,1)        home_goals_against_ema5,
        round(away_goals_for_ema5,1)            away_goals_for_ema5,
        round(away_goals_against_ema5,1)        away_goals_against_ema5,
        round(home_xg_for_ema5,1)               home_xg_for_ema5,
        round(home_xg_against_ema5,1)           home_xg_against_ema5,
        round(away_xg_for_ema5,1)               away_xg_for_ema5,
        round(away_xg_against_ema5,1)           away_xg_against_ema5,
        round(home_shots_for_ema5,1)               home_shots_for_ema5,
        round(home_shots_against_ema5,1)           home_shots_against_ema5,
        round(away_shots_for_ema5,1)               away_shots_for_ema5,
        round(away_shots_against_ema5,1)           away_shots_against_ema5,
        round(home_sot_for_ema5,1)               home_sot_for_ema5,
        round(home_sot_against_ema5,1)           home_sot_against_ema5,
        round(away_sot_for_ema5,1)               away_sot_for_ema5,
        round(away_sot_against_ema5,1)           away_sot_against_ema5,
        round(home_corners_for_ema5,1)               home_corners_for_ema5,
        round(home_corners_against_ema5,1)           home_corners_against_ema5,
        round(away_corners_for_ema5,1)               away_corners_for_ema5,
        round(away_corners_against_ema5,1)           away_corners_against_ema5,
        round(home_deep_for_ema5,1)               home_deep_for_ema5,
        round(home_deep_against_ema5,1)           home_deep_against_ema5,
        round(away_deep_for_ema5,1)               away_deep_for_ema5,
        round(away_deep_against_ema5,1)           away_deep_against_ema5,
        round(home_ppda_for_ema5,1)               home_ppda_for_ema5,
        round(home_ppda_against_ema5,1)           home_ppda_against_ema5,
        round(away_ppda_for_ema5,1)               away_ppda_for_ema5,
        round(away_ppda_against_ema5,1)           away_ppda_against_ema5
    from
        "ml_poisson_features"
    where
        feature_type = 'EMA5 HA'
        and full_time_home_goals is not null
        and home_shots_for_ema5 is not null
        and away_shots_against_ema5 is not null
        and SEASON = '2021_2022'
    """       


    qa = QueryAthena(query=v_sql, database='btb-serving')
    df_data = qa.run_query()
        
    return df_data

In [31]:
import pandas
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor


#get data to train home expGoals model
df_train_data = get_training_data_home()

#split features and pred classes
df_train_home_x = df_train_data.iloc[:,:28]
df_train_home_y = df_train_data.iloc[:,28]

#model = RandomForestRegressor()
model_home = RandomForestRegressor()
model_home.fit(df_train_home_x,df_train_home_y) 


#get data to train home expGoals model
df_train_data = get_training_data_away()

#split features and pred classes
df_train_away_x = df_train_data.iloc[:,:28]
df_train_away_y = df_train_data.iloc[:,28]

model_away = RandomForestRegressor()
model_away.fit(df_train_away_x,df_train_away_y) 


#get data for prediction
df_pred_data = get_pred_data()


#do prediction for home expected goals
df_pred_x = df_pred_data.iloc[:,4:32]
df_pred_y_home = model_home.predict(df_pred_x)
df_pred_y_away = model_away.predict(df_pred_x)


df_result_data = df_pred_data.iloc[:,:4]
df_result_data['feature type'] = 'EMA5'
df_result_data['home expected goals'] = pd.Series(df_pred_y_home)
df_result_data['away expected goals'] = pd.Series(df_pred_y_away)


print(df_result_data)


Execution ID: d86c5864-ce5b-414e-ad44-e607fd7eba07
{'State': 'QUEUED', 'SubmissionDateTime': datetime.datetime(2022, 5, 31, 0, 11, 51, 993000, tzinfo=tzlocal())}
{'State': 'RUNNING', 'SubmissionDateTime': datetime.datetime(2022, 5, 31, 0, 11, 51, 993000, tzinfo=tzlocal())}
{'State': 'RUNNING', 'SubmissionDateTime': datetime.datetime(2022, 5, 31, 0, 11, 51, 993000, tzinfo=tzlocal())}
{'State': 'RUNNING', 'SubmissionDateTime': datetime.datetime(2022, 5, 31, 0, 11, 51, 993000, tzinfo=tzlocal())}
{'State': 'RUNNING', 'SubmissionDateTime': datetime.datetime(2022, 5, 31, 0, 11, 51, 993000, tzinfo=tzlocal())}
{'State': 'RUNNING', 'SubmissionDateTime': datetime.datetime(2022, 5, 31, 0, 11, 51, 993000, tzinfo=tzlocal())}
{'State': 'RUNNING', 'SubmissionDateTime': datetime.datetime(2022, 5, 31, 0, 11, 51, 993000, tzinfo=tzlocal())}
{'State': 'RUNNING', 'SubmissionDateTime': datetime.datetime(2022, 5, 31, 0, 11, 51, 993000, tzinfo=tzlocal())}
{'State': 'RUNNING', 'SubmissionDateTime': datetime.da

{'State': 'RUNNING', 'SubmissionDateTime': datetime.datetime(2022, 5, 31, 0, 12, 40, 398000, tzinfo=tzlocal())}
{'State': 'RUNNING', 'SubmissionDateTime': datetime.datetime(2022, 5, 31, 0, 12, 40, 398000, tzinfo=tzlocal())}
{'State': 'RUNNING', 'SubmissionDateTime': datetime.datetime(2022, 5, 31, 0, 12, 40, 398000, tzinfo=tzlocal())}
{'State': 'SUCCEEDED', 'SubmissionDateTime': datetime.datetime(2022, 5, 31, 0, 12, 40, 398000, tzinfo=tzlocal()), 'CompletionDateTime': datetime.datetime(2022, 5, 31, 0, 12, 42, 664000, tzinfo=tzlocal())}
Query "
    select
        football_match_his_lid,
        match_date,
        home_team,
        away_team,
        --model features
        round(home_goals_for_ema5,1)            home_goals_for_ema5,
        round(home_goals_against_ema5,1)        home_goals_against_ema5,
        round(away_goals_for_ema5,1)            away_goals_for_ema5,
        round(away_goals_against_ema5,1)        away_goals_against_ema5,
        round(home_xg_for_ema5,1)        

In [28]:
df_result_data['home expected goals'] = pd.Series(df_pred_y)

print(df_result_data.head())

             football_match_his_lid  match_date home_team away_team  \
0  b1519b4b45bc69c52c9c75fe417a8173  2022-01-09      Roma  Juventus   
1  27e9a7a7a198237a8fa163a1b99c4720  2021-12-11   Udinese     Milan   
2  e42e2599a23454579c23bb5f5d5bfc1c  2022-05-01  West Ham   Arsenal   
3  60ee5200005cfc9d228e3ed1843d569a  2022-04-03  Valencia     Cadiz   
4  8fcaa77ec715ff48b795885f362b1580  2021-11-20     Lazio  Juventus   

  feature type  home expected goals  
0         EMA5                 1.25  
1         EMA5                 1.01  
2         EMA5                 1.54  
3         EMA5                 1.29  
4         EMA5                 1.95  
