In [17]:
import pyexasol
import pandas as pd
import boto3
import time
import io
import numpy as np
from matplotlib import pyplot as plt

class QueryAthena:

    def __init__(self, query, database):
        self.database = 'btb-serving'
        self.folder = 'api/'
        self.bucket = 'btb-athena-query-results'
        self.s3_input = 's3://' + self.bucket + '/my_folder_input'
        self.s3_output =  's3://' + self.bucket + '/' + self.folder
        self.region_name = 'eu-central-1'
        self.query = query
        self.access_key = '***'
        self.secret_key = '***'

    def load_conf(self, q):
        
        try:
            self.client = boto3.client('athena', region_name = self.region_name, aws_access_key_id=self.access_key,
                                  aws_secret_access_key=self.secret_key)
            
            response = self.client.start_query_execution(
                QueryString = q,
                    QueryExecutionContext={
                    'Database': self.database
                    },
                    ResultConfiguration={
                    'OutputLocation': self.s3_output,
                    }
            )
            self.filename = response['QueryExecutionId']
            #print('Execution ID: ' + response['QueryExecutionId'])

        except Exception as e:
            print(e)
        return response                

    def run_query(self):
        queries = [self.query]
        for q in queries:
            res = self.load_conf(q)
        try:              
            query_status = None
            while query_status == 'QUEUED' or query_status == 'RUNNING' or query_status is None:
                query_status = self.client.get_query_execution(QueryExecutionId=res["QueryExecutionId"])['QueryExecution']['Status']['State']
                query_status2 = self.client.get_query_execution(QueryExecutionId=res["QueryExecutionId"])['QueryExecution']['Status']
                #print(query_status2)
                if query_status == 'FAILED' or query_status == 'CANCELLED':
                    raise Exception('Athena query with the string "{}" failed or was cancelled'.format(self.query))
                time.sleep(0.2)
            #print('Query "{}" finished.'.format(self.query))

            df = self.obtain_data()
            return df

        except Exception as e:
            print(e)      

    def obtain_data(self):
        try:
            self.resource = boto3.resource('s3', 
                                  region_name = self.region_name,
                                  aws_access_key_id=self.access_key,
                                  aws_secret_access_key=self.secret_key)
                                  
            #print(self.bucket)
            #print(self.folder)
            #print(self.filename)
            
            # S3 Object
            res_obj = self.resource.Object(bucket_name=self.bucket, key=self.folder + self.filename + '.csv')
            response = res_obj.get()
            
            return pd.read_csv(io.BytesIO(response['Body'].read()), encoding='utf8')   
        except Exception as e:
            print(e) 

## data function to get iterator data

In [20]:

#get iterator month data
def get_feature_types():

    #get iterator datam
    
    v_sql = """
SELECT distinct
        feature_type
from
    "ml_poisson_features"
where
    full_time_home_goals is not null
    and home_shots_for_ema is not null
    and away_shots_against_ema is not null
    """
    
    qa = QueryAthena(query=v_sql, database='btb-serving')
    df_data = qa.run_query()

    return df_data


def get_home_data(feature_type_filter):
    
    v_sql = """
select
    round(home_goals_for_ema,1)            home_goals_for_ema,
    round(home_goals_against_ema,1)        home_goals_against_ema,
    round(away_goals_for_ema,1)            away_goals_for_ema,
    round(away_goals_against_ema,1)        away_goals_against_ema,
    round(home_xg_for_ema,1)               home_xg_for_ema,
    round(home_xg_against_ema,1)           home_xg_against_ema,
    round(away_xg_for_ema,1)               away_xg_for_ema,
    round(away_xg_against_ema,1)           away_xg_against_ema,
    round(home_shots_for_ema,1)               home_shots_for_ema,
    round(home_shots_against_ema,1)           home_shots_against_ema,
    round(away_shots_for_ema,1)               away_shots_for_ema,
    round(away_shots_against_ema,1)           away_shots_against_ema,
    round(home_sot_for_ema,1)               home_sot_for_ema,
    round(home_sot_against_ema,1)           home_sot_against_ema,
    round(away_sot_for_ema,1)               away_sot_for_ema,
    round(away_sot_against_ema,1)           away_sot_against_ema,
    round(home_corners_for_ema,1)               home_corners_for_ema,
    round(home_corners_against_ema,1)           home_corners_against_ema,
    round(away_corners_for_ema,1)               away_corners_for_ema,
    round(away_corners_against_ema,1)           away_corners_against_ema,
    round(home_deep_for_ema,1)               home_deep_for_ema,
    round(home_deep_against_ema,1)           home_deep_against_ema,
    round(away_deep_for_ema,1)               away_deep_for_ema,
    round(away_deep_against_ema,1)           away_deep_against_ema,
    round(home_ppda_for_ema,1)               home_ppda_for_ema,
    round(home_ppda_against_ema,1)           home_ppda_against_ema,
    round(away_ppda_for_ema,1)               away_ppda_for_ema,
    round(away_ppda_against_ema,1)           away_ppda_against_ema,
    avg(full_time_home_goals)               full_time_home_goals,
    avg(full_time_away_goals)               full_time_away_goals
from
    "ml_poisson_features"
where
    feature_type = '""" + str(feature_type_filter) + """' and
    home_goals_for_ema is not null and
    home_goals_against_ema is not null and
    away_goals_for_ema is not null and
    away_goals_against_ema is not null and
    home_xg_for_ema is not null and
    home_xg_against_ema is not null and
    away_xg_for_ema is not null and
    away_xg_against_ema is not null and
    home_shots_for_ema is not null and
    home_shots_against_ema is not null and
    away_shots_for_ema is not null and
    away_shots_against_ema is not null and
    home_sot_for_ema is not null and
    home_sot_against_ema is not null and
    away_sot_for_ema is not null and
    away_sot_against_ema is not null and
    home_corners_for_ema is not null and
    home_corners_against_ema is not null and
    away_corners_for_ema is not null and
    away_corners_against_ema is not null and
    home_deep_for_ema is not null and
    home_deep_against_ema is not null and
    away_deep_for_ema is not null and
    away_deep_against_ema is not null and
    home_ppda_for_ema is not null and
    home_ppda_against_ema is not null and
    away_ppda_for_ema is not null and
    away_ppda_against_ema is not null and
    full_time_home_goals is not null and
    full_time_away_goals is not null
group by 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28
    """
    
    
    qa = QueryAthena(query=v_sql, database='btb-serving')
    df_data = qa.run_query()
    
    return df_data

    
    


## data function to get home and away data

In [23]:

import pandas
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor



#get different feature types
df_iterator = get_feature_types()

print(df_iterator)

for index, row in df_iterator.iterrows():
    print('Loop for: ' + str(row['feature_type']))
    
    
    print('...getting data')
    
    #get data to train home expGoals model
    df_data = get_home_data(row['feature_type'])
    
    x = df_data.iloc[:,:28]
    y = df_data.iloc[:,29]

    #print(x.head(5))
    #print(y.head(5))
    
    print('...training models')

    # define models and parameters
    model = RandomForestRegressor()

    #home expected goals -> -1.498672
    #max_depth = [15]
    #min_samples_leaf = [10]
    #min_samples_split = [150]
    #max_leaf_nodes = [100]
    #n_estimators = [600]
    #max_features = [10]

    #away expected goals -> -1,24
    max_depth = [15]
    min_samples_leaf = [10]
    min_samples_split = [150]
    max_leaf_nodes = [100]
    n_estimators = [500]
    max_features = [10]


    # define grid search
    grid = dict(max_depth=max_depth, 
                min_samples_leaf=min_samples_leaf, 
                min_samples_split=min_samples_split, 
                max_leaf_nodes=max_leaf_nodes, 
                n_estimators=n_estimators,
               max_features=max_features)

    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
    #grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='neg_mean_squared_error',error_score=0)
    grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='neg_mean_absolute_error',error_score=0)
    grid_result = grid_search.fit(x, y)

    # summarize results
    print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
    

  feature_type
0   EMA20 COMB
1   EMA10 COMB
2      EMA5 HA
3     EMA10 HA
4   EMA15 COMB
5    EMA5 COMB
6     EMA20 HA
7     EMA15 HA
Loop for: EMA20 COMB
...getting data
...training models




Best: -0.855821 using {'max_depth': 15, 'max_features': 10, 'max_leaf_nodes': 100, 'min_samples_leaf': 10, 'min_samples_split': 150, 'n_estimators': 500}
Loop for: EMA10 COMB
...getting data
...training models




Best: -0.858795 using {'max_depth': 15, 'max_features': 10, 'max_leaf_nodes': 100, 'min_samples_leaf': 10, 'min_samples_split': 150, 'n_estimators': 500}
Loop for: EMA5 HA
...getting data
...training models




Best: -0.864313 using {'max_depth': 15, 'max_features': 10, 'max_leaf_nodes': 100, 'min_samples_leaf': 10, 'min_samples_split': 150, 'n_estimators': 500}
Loop for: EMA10 HA
...getting data
...training models




Best: -0.859257 using {'max_depth': 15, 'max_features': 10, 'max_leaf_nodes': 100, 'min_samples_leaf': 10, 'min_samples_split': 150, 'n_estimators': 500}
Loop for: EMA15 COMB
...getting data
...training models




Best: -0.856964 using {'max_depth': 15, 'max_features': 10, 'max_leaf_nodes': 100, 'min_samples_leaf': 10, 'min_samples_split': 150, 'n_estimators': 500}
Loop for: EMA5 COMB
...getting data
...training models




Best: -0.864090 using {'max_depth': 15, 'max_features': 10, 'max_leaf_nodes': 100, 'min_samples_leaf': 10, 'min_samples_split': 150, 'n_estimators': 500}
Loop for: EMA20 HA
...getting data
...training models




Best: -0.857201 using {'max_depth': 15, 'max_features': 10, 'max_leaf_nodes': 100, 'min_samples_leaf': 10, 'min_samples_split': 150, 'n_estimators': 500}
Loop for: EMA15 HA
...getting data
...training models




Best: -0.857258 using {'max_depth': 15, 'max_features': 10, 'max_leaf_nodes': 100, 'min_samples_leaf': 10, 'min_samples_split': 150, 'n_estimators': 500}
