In [1]:
import pyexasol
import pandas as pd
import boto3
import time
import io
import numpy as np
from matplotlib import pyplot as plt

class QueryAthena:

    def __init__(self, query, database):
        self.database = 'btb-serving'
        self.folder = 'api/'
        self.bucket = 'btb-athena-query-results'
        self.s3_input = 's3://' + self.bucket + '/my_folder_input'
        self.s3_output =  's3://' + self.bucket + '/' + self.folder
        self.region_name = 'eu-central-1'
        self.query = query
        self.access_key = '****'
        self.secret_key = '***'

    def load_conf(self, q):
        
        try:
            self.client = boto3.client('athena', region_name = self.region_name, aws_access_key_id=self.access_key,
                                  aws_secret_access_key=self.secret_key)
            
            response = self.client.start_query_execution(
                QueryString = q,
                    QueryExecutionContext={
                    'Database': self.database
                    },
                    ResultConfiguration={
                    'OutputLocation': self.s3_output,
                    }
            )
            self.filename = response['QueryExecutionId']
            print('Execution ID: ' + response['QueryExecutionId'])

        except Exception as e:
            print(e)
        return response                

    def run_query(self):
        queries = [self.query]
        for q in queries:
            res = self.load_conf(q)
        try:              
            query_status = None
            while query_status == 'QUEUED' or query_status == 'RUNNING' or query_status is None:
                query_status = self.client.get_query_execution(QueryExecutionId=res["QueryExecutionId"])['QueryExecution']['Status']['State']
                query_status2 = self.client.get_query_execution(QueryExecutionId=res["QueryExecutionId"])['QueryExecution']['Status']
                print(query_status2)
                if query_status == 'FAILED' or query_status == 'CANCELLED':
                    raise Exception('Athena query with the string "{}" failed or was cancelled'.format(self.query))
                time.sleep(0.2)
            print('Query "{}" finished.'.format(self.query))

            df = self.obtain_data()
            return df

        except Exception as e:
            print(e)      

    def obtain_data(self):
        try:
            self.resource = boto3.resource('s3', 
                                  region_name = self.region_name,
                                  aws_access_key_id=self.access_key,
                                  aws_secret_access_key=self.secret_key)
                                  
            print(self.bucket)
            print(self.folder)
            print(self.filename)
            
            # S3 Object
            res_obj = self.resource.Object(bucket_name=self.bucket, key=self.folder + self.filename + '.csv')
            response = res_obj.get()
            
            return pd.read_csv(io.BytesIO(response['Body'].read()), encoding='utf8')   
        except Exception as e:
            print(e) 

## data function to get home and away data

In [32]:
v_sql = """
select
    round(home_goals_for_ema5,1)            home_goals_for_ema5,
    round(home_goals_against_ema5,1)        home_goals_against_ema5,
    round(away_goals_for_ema5,1)            away_goals_for_ema5,
    round(away_goals_against_ema5,1)        away_goals_against_ema5,
    round(home_xg_for_ema5,1)               home_xg_for_ema5,
    round(home_xg_against_ema5,1)           home_xg_against_ema5,
    round(away_xg_for_ema5,1)               away_xg_for_ema5,
    round(away_xg_against_ema5,1)           away_xg_against_ema5,
    round(home_shots_for_ema5,1)               home_shots_for_ema5,
    round(home_shots_against_ema5,1)           home_shots_against_ema5,
    round(away_shots_for_ema5,1)               away_shots_for_ema5,
    round(away_shots_against_ema5,1)           away_shots_against_ema5,
    round(home_sot_for_ema5,1)               home_sot_for_ema5,
    round(home_sot_against_ema5,1)           home_sot_against_ema5,
    round(away_sot_for_ema5,1)               away_sot_for_ema5,
    round(away_sot_against_ema5,1)           away_sot_against_ema5,
    round(home_corners_for_ema5,1)               home_corners_for_ema5,
    round(home_corners_against_ema5,1)           home_corners_against_ema5,
    round(away_corners_for_ema5,1)               away_corners_for_ema5,
    round(away_corners_against_ema5,1)           away_corners_against_ema5,
    round(home_deep_for_ema5,1)               home_deep_for_ema5,
    round(home_deep_against_ema5,1)           home_deep_against_ema5,
    round(away_deep_for_ema5,1)               away_deep_for_ema5,
    round(away_deep_against_ema5,1)           away_deep_against_ema5,
    round(home_ppda_for_ema5,1)               home_ppda_for_ema5,
    round(home_ppda_against_ema5,1)           home_ppda_against_ema5,
    round(away_ppda_for_ema5,1)               away_ppda_for_ema5,
    round(away_ppda_against_ema5,1)           away_ppda_against_ema5,
    full_time_home_goals               full_time_home_goals
from
    "ml_poisson_features"
where
    feature_type = 'EMA5 HA'
    and full_time_home_goals is not null
    and home_shots_for_ema5 is not null
    and away_shots_against_ema5 is not null
    """
    
qa = QueryAthena(query=v_sql, database='btb-serving')
df_data = qa.run_query()

Execution ID: 982b2e1f-7657-4665-a599-901587db812d
{'State': 'QUEUED', 'SubmissionDateTime': datetime.datetime(2022, 5, 27, 21, 32, 38, 48000, tzinfo=tzlocal())}
{'State': 'RUNNING', 'SubmissionDateTime': datetime.datetime(2022, 5, 27, 21, 32, 38, 48000, tzinfo=tzlocal())}
{'State': 'RUNNING', 'SubmissionDateTime': datetime.datetime(2022, 5, 27, 21, 32, 38, 48000, tzinfo=tzlocal())}
{'State': 'RUNNING', 'SubmissionDateTime': datetime.datetime(2022, 5, 27, 21, 32, 38, 48000, tzinfo=tzlocal())}
{'State': 'RUNNING', 'SubmissionDateTime': datetime.datetime(2022, 5, 27, 21, 32, 38, 48000, tzinfo=tzlocal())}
{'State': 'RUNNING', 'SubmissionDateTime': datetime.datetime(2022, 5, 27, 21, 32, 38, 48000, tzinfo=tzlocal())}
{'State': 'RUNNING', 'SubmissionDateTime': datetime.datetime(2022, 5, 27, 21, 32, 38, 48000, tzinfo=tzlocal())}
{'State': 'RUNNING', 'SubmissionDateTime': datetime.datetime(2022, 5, 27, 21, 32, 38, 48000, tzinfo=tzlocal())}
{'State': 'RUNNING', 'SubmissionDateTime': datetime.da

In [34]:

#get data for home expected goals prediction
def get_home_data():

    #get iterator datam
    
    v_sql = """
select
    round(home_goals_for_ema5,1)            home_goals_for_ema5,
    round(home_goals_against_ema5,1)        home_goals_against_ema5,
    round(away_goals_for_ema5,1)            away_goals_for_ema5,
    round(away_goals_against_ema5,1)        away_goals_against_ema5,
    round(home_xg_for_ema5,1)               home_xg_for_ema5,
    round(home_xg_against_ema5,1)           home_xg_against_ema5,
    round(away_xg_for_ema5,1)               away_xg_for_ema5,
    round(away_xg_against_ema5,1)           away_xg_against_ema5,
    round(home_shots_for_ema5,1)               home_shots_for_ema5,
    round(home_shots_against_ema5,1)           home_shots_against_ema5,
    round(away_shots_for_ema5,1)               away_shots_for_ema5,
    round(away_shots_against_ema5,1)           away_shots_against_ema5,
    round(home_sot_for_ema5,1)               home_sot_for_ema5,
    round(home_sot_against_ema5,1)           home_sot_against_ema5,
    round(away_sot_for_ema5,1)               away_sot_for_ema5,
    round(away_sot_against_ema5,1)           away_sot_against_ema5,
    round(home_corners_for_ema5,1)               home_corners_for_ema5,
    round(home_corners_against_ema5,1)           home_corners_against_ema5,
    round(away_corners_for_ema5,1)               away_corners_for_ema5,
    round(away_corners_against_ema5,1)           away_corners_against_ema5,
    round(home_deep_for_ema5,1)               home_deep_for_ema5,
    round(home_deep_against_ema5,1)           home_deep_against_ema5,
    round(away_deep_for_ema5,1)               away_deep_for_ema5,
    round(away_deep_against_ema5,1)           away_deep_against_ema5,
    round(home_ppda_for_ema5,1)               home_ppda_for_ema5,
    round(home_ppda_against_ema5,1)           home_ppda_against_ema5,
    round(away_ppda_for_ema5,1)               away_ppda_for_ema5,
    round(away_ppda_against_ema5,1)           away_ppda_against_ema5,
    full_time_home_goals               full_time_home_goals
from
    "ml_poisson_features"
where
    feature_type = 'EMA5 HA'
    and full_time_home_goals is not null
    and home_shots_for_ema5 is not null
    and away_shots_against_ema5 is not null
    """
    
    qa = QueryAthena(query=v_sql, database='btb-serving')
    df_data = qa.run_query()

    return df_data



#get data for away expected goals prediction
def get_away_data(self):

    #get iterator datam
    
        v_sql = """
select
    round(home_goals_for_ema5,1)            home_goals_for_ema5,
    round(home_goals_against_ema5,1)        home_goals_against_ema5,
    round(away_goals_for_ema5,1)            away_goals_for_ema5,
    round(away_goals_against_ema5,1)        away_goals_against_ema5,
    round(home_xg_for_ema5,1)               home_xg_for_ema5,
    round(home_xg_against_ema5,1)           home_xg_against_ema5,
    round(away_xg_for_ema5,1)               away_xg_for_ema5,
    round(away_xg_against_ema5,1)           away_xg_against_ema5,
    round(home_shots_for_ema5,1)               home_shots_for_ema5,
    round(home_shots_against_ema5,1)           home_shots_against_ema5,
    round(away_shots_for_ema5,1)               away_shots_for_ema5,
    round(away_shots_against_ema5,1)           away_shots_against_ema5,
    round(home_sot_for_ema5,1)               home_sot_for_ema5,
    round(home_sot_against_ema5,1)           home_sot_against_ema5,
    round(away_sot_for_ema5,1)               away_sot_for_ema5,
    round(away_sot_against_ema5,1)           away_sot_against_ema5,
    round(home_corners_for_ema5,1)               home_corners_for_ema5,
    round(home_corners_against_ema5,1)           home_corners_against_ema5,
    round(away_corners_for_ema5,1)               away_corners_for_ema5,
    round(away_corners_against_ema5,1)           away_corners_against_ema5,
    round(home_deep_for_ema5,1)               home_deep_for_ema5,
    round(home_deep_against_ema5,1)           home_deep_against_ema5,
    round(away_deep_for_ema5,1)               away_deep_for_ema5,
    round(away_deep_against_ema5,1)           away_deep_against_ema5,
    round(home_ppda_for_ema5,1)               home_ppda_for_ema5,
    round(home_ppda_against_ema5,1)           home_ppda_against_ema5,
    round(away_ppda_for_ema5,1)               away_ppda_for_ema5,
    round(away_ppda_against_ema5,1)           away_ppda_against_ema5,
    full_time_away_goals                  full_time_home_goals
from
    "ml_poisson_features"
where
    feature_type = 'EMA5 HA'
    and full_time_home_goals is not null
    and home_shots_for_ema5 is not null
    and away_shots_against_ema5 is not null
    """
    
    qa = QueryAthena(query=v_sql, database='btb-serving')
    df_data = qa.run_query()

    return df_data


IndentationError: unindent does not match any outer indentation level (<tokenize>, line 98)

## Grid search for home expected goals

In [None]:

#
#
# logistict regression
# Grid Search
#

import pandas
from sklearn.datasets import make_blobs
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression


v_sql = """
select
    round(home_goals_for_ema5,1)            home_goals_for_ema5,
    round(home_goals_against_ema5,1)        home_goals_against_ema5,
    round(away_goals_for_ema5,1)            away_goals_for_ema5,
    round(away_goals_against_ema5,1)        away_goals_against_ema5,
    round(home_xg_for_ema5,1)               home_xg_for_ema5,
    round(home_xg_against_ema5,1)           home_xg_against_ema5,
    round(away_xg_for_ema5,1)               away_xg_for_ema5,
    round(away_xg_against_ema5,1)           away_xg_against_ema5,
    round(home_shots_for_ema5,1)               home_shots_for_ema5,
    round(home_shots_against_ema5,1)           home_shots_against_ema5,
    round(away_shots_for_ema5,1)               away_shots_for_ema5,
    round(away_shots_against_ema5,1)           away_shots_against_ema5,
    round(home_sot_for_ema5,1)               home_sot_for_ema5,
    round(home_sot_against_ema5,1)           home_sot_against_ema5,
    round(away_sot_for_ema5,1)               away_sot_for_ema5,
    round(away_sot_against_ema5,1)           away_sot_against_ema5,
    round(home_corners_for_ema5,1)               home_corners_for_ema5,
    round(home_corners_against_ema5,1)           home_corners_against_ema5,
    round(away_corners_for_ema5,1)               away_corners_for_ema5,
    round(away_corners_against_ema5,1)           away_corners_against_ema5,
    round(home_deep_for_ema5,1)               home_deep_for_ema5,
    round(home_deep_against_ema5,1)           home_deep_against_ema5,
    round(away_deep_for_ema5,1)               away_deep_for_ema5,
    round(away_deep_against_ema5,1)           away_deep_against_ema5,
    round(home_ppda_for_ema5,1)               home_ppda_for_ema5,
    round(home_ppda_against_ema5,1)           home_ppda_against_ema5,
    round(away_ppda_for_ema5,1)               away_ppda_for_ema5,
    round(away_ppda_against_ema5,1)           away_ppda_against_ema5,
    full_time_home_goals               full_time_home_goals
from
    "ml_poisson_features"
where
    feature_type = 'EMA5 HA'
    and full_time_home_goals is not null
    and home_shots_for_ema5 is not null
    and away_shots_against_ema5 is not null
    """
    
qa = QueryAthena(query=v_sql, database='btb-serving')
df_data = qa.run_query()

x = df_data.iloc[:,:28]
y = df_data.iloc[:,28]

#print(x.head(5))
#print(y.head(5))

# define models and parameters
model = LogisticRegression(max_iter=200)
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [0.5, 0.1, 0.01, 0.005]

# define grid search
grid = dict(solver=solvers,penalty=penalty,C=c_values)
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=59)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(x, y)

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Execution ID: d4e5cc60-dfc0-4e86-9672-778fc167d2d5
{'State': 'QUEUED', 'SubmissionDateTime': datetime.datetime(2022, 5, 27, 21, 33, 35, 600000, tzinfo=tzlocal())}
{'State': 'RUNNING', 'SubmissionDateTime': datetime.datetime(2022, 5, 27, 21, 33, 35, 600000, tzinfo=tzlocal())}
{'State': 'RUNNING', 'SubmissionDateTime': datetime.datetime(2022, 5, 27, 21, 33, 35, 600000, tzinfo=tzlocal())}
{'State': 'RUNNING', 'SubmissionDateTime': datetime.datetime(2022, 5, 27, 21, 33, 35, 600000, tzinfo=tzlocal())}
{'State': 'RUNNING', 'SubmissionDateTime': datetime.datetime(2022, 5, 27, 21, 33, 35, 600000, tzinfo=tzlocal())}
{'State': 'RUNNING', 'SubmissionDateTime': datetime.datetime(2022, 5, 27, 21, 33, 35, 600000, tzinfo=tzlocal())}
{'State': 'RUNNING', 'SubmissionDateTime': datetime.datetime(2022, 5, 27, 21, 33, 35, 600000, tzinfo=tzlocal())}
{'State': 'RUNNING', 'SubmissionDateTime': datetime.datetime(2022, 5, 27, 21, 33, 35, 600000, tzinfo=tzlocal())}
{'State': 'SUCCEEDED', 'SubmissionDateTime': d



## Grid search for away expected goals

In [18]:

#
#
# logistict regression
# Grid Search
#

import pandas
from sklearn.datasets import make_blobs
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression


df_data = get_away_data()

x = df_data.iloc[:,:28]
y = df_data.iloc[:,28]

#print(x.head(5))
#print(y.head(5))

# define models and parameters
model = LogisticRegression(max_iter=200)
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [0.5, 0.1, 0.01, 0.005]

# define grid search
grid = dict(solver=solvers,penalty=penalty,C=c_values)
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=59)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(x, y)

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

NameError: name 'get_away_data' is not defined