In [1]:
import keras


Using TensorFlow backend.


# Dataset Preprocessing


### Einlesen der Daten aus dem JSON der BBL

In [2]:
import urllib.request, json 
# Download the BBL schedule for the 2017 season as JSON and pretty-print it for inspection.
SCHEDULE_URL = "http://statistik.easycredit-bbl.de/XML/exchange/540/Schedule.php?type=json&saison=2017&fixedGamesOnly=0"

with urllib.request.urlopen(SCHEDULE_URL) as response:
    raw_schedule = response.read().decode()

games = json.loads(raw_schedule)
print(json.dumps(games, indent=4, sort_keys=True))
    

{
    "competition": [
        {
            "@attributes": {
                "ID": "1",
                "title": "easyCredit BBL Hauptrunde"
            },
            "spiel": [
                {
                    "arenaLat": "49.77337",
                    "arenaLon": "9.93923",
                    "arenaName": "S.Oliver-Arena",
                    "bbl_spielID": "20826",
                    "datum": "2017-09-29",
                    "gast": "Brose Bamberg",
                    "gastCity": "Brose Bamberg",
                    "gast_id": "420",
                    "gast_result": "73",
                    "home": "s.Oliver W\u00fcrzburg",
                    "homeCity": "W\u00fcrzburg",
                    "home_id": "540",
                    "home_result": "76",
                    "init_url": "http://live.beko-bbl.de/data/bbl/540/20826.JSN",
                    "live_url": "http://live.beko-bbl.de/data/bbl/540/20826.JSN",
                    "spiel_nummer": "0",
                 

### Daten aufbereiten

#### Erstellen einer Liste für die Arenen & Teams

In [None]:
# Walk through all games of the first competition and record every home team once,
# together with the arena name of the first game in which that team appears as host.
arena = []
home_ids = []
for spiel in games['competition'][0]['spiel']:
    team_id = spiel['home_id']
    if team_id in home_ids:
        continue  # this home team has already been recorded
    arena.append(spiel['arenaName'])
    home_ids.append(team_id)

#### Datum + Uhrzeit auslesen


In [None]:
from datetime import datetime

# Parse date and kickoff time of the first listed game into a single datetime
# and show the two calendar features that are derived from it later on.
first_game = games['competition'][0]['spiel'][0]
kickoff_text = first_game['datum'] + " " + first_game['uhrzeit']
datetime_object = datetime.strptime(kickoff_text, '%Y-%m-%d %H:%M:%S')

print(datetime_object)
print(format(datetime_object, '%U'))  # week of the year (%U: weeks start on Sunday)
print(format(datetime_object, '%w'))  # weekday as a number (%w: 0 = Sunday)


#### Erstellen eines dictionary für die Hallenkapazitäten

In [None]:
# Arena capacity per team, keyed by the numeric team id (looked up via home_id when
# the dataset is assembled). Insertion order is kept as in the original literal.
arenakap = dict([
    (486, 6594),
    (413, 14500),
    (433, 4200),
    (420, 6150),
    (415, 6000),
    (425, 3300),
    (430, 6000),
    (426, 5002),
    (540, 3140),
    (418, 6200),
    (421, 4003),
    (422, 3603),
    (483, 3076),
    (477, 3447),
    (428, 3000),
    (439, 4200),
    (517, 3533),
    (432, 3132),
])
print(arenakap)
team_count = len(arenakap)
print(team_count)


### Dataset zusammenstellen 

In [None]:
# Build one feature row per game of the first competition:
#   [home_id, gast_id, home_win, zuschauer, arena capacity, calendar week, weekday]
# While doing so, collect the distinct calendar weeks and weekdays that occur.
dataset = []
calendarWeeks = []
weekDays = []

for spiel in games['competition'][0]['spiel']:
    # The scores arrive as strings. Comparing strings is lexicographic, so
    # "100" > "99" would be False and the home-win label would be wrong for any
    # game in which a team scores 100 or more. Compare the numeric values instead.
    # `or 0` makes an empty score field count as 0, which yields the same label
    # the string comparison produced for such games instead of raising.
    home_points = int(spiel['home_result'] or 0)
    guest_points = int(spiel['gast_result'] or 0)

    datetime_object = datetime.strptime(spiel['datum'] + " " + spiel['uhrzeit'], '%Y-%m-%d %H:%M:%S')
    kw = datetime_object.strftime('%U')    # week of the year (%U: weeks start on Sunday)
    days = datetime_object.strftime('%w')  # weekday as a number (%w: 0 = Sunday)

    datasetrow = [
        spiel['home_id'],
        spiel['gast_id'],
        int(home_points > guest_points),   # 1 = home win, 0 = otherwise
        int(spiel['zuschauer']),
        arenakap[int(spiel['home_id'])],   # capacity of the home team's arena
        kw,
        days,
    ]

    if kw not in calendarWeeks:
        calendarWeeks.append(kw)
    if days not in weekDays:
        weekDays.append(days)

    dataset.append(datasetrow)

print(dataset)
# Note from the original author: the number of input neurons depends on the
# dataset --> calendar weeks (32), weekdays (7).

#### Umwandlung des Datasets in ein Numpy Array 

In [None]:
import numpy as np
# Convert the list of rows into a 2-D NumPy array so that whole columns can be sliced.
# NOTE: the rows mix strings and integers, so NumPy stores every cell as a string.
dataset = np.asarray(dataset)
first_column = dataset[:, 0]  # ':' selects all rows, 0 picks the first column
print(first_column)
print(dataset.shape[0])  # number of rows

#### One-Hot-Encoding der Teams, Wochentage, Kalenderwochen



In [None]:
from sklearn.preprocessing import LabelBinarizer
# One-hot encode the home team column. The fitted encoder fixes the mapping from
# team id to output column, so it can be reused for the guest team column.
home_id_column = dataset[:, 0]
encoder = LabelBinarizer()
encoder.fit(home_id_column)
transformed_home_ids = encoder.transform(home_id_column)

print(transformed_home_ids)

In [None]:
# Transform only (no refit): reusing the encoder fitted on the home teams keeps the
# team-to-column mapping identical on the home side and the guest side.
guest_id_column = dataset[:, 1]
transformed_gast_ids = encoder.transform(guest_id_column)
print(transformed_gast_ids)

In [None]:
# One-hot encode calendar week (column 5) and weekday (column 6), each with its own
# encoder. Re-fitting the shared `encoder` here would overwrite the team mapping it
# holds, so re-running the guest-team cell afterwards would encode teams against the
# wrong classes.
calendarWeek_encoder = LabelBinarizer()
transformed_calendarWeek = calendarWeek_encoder.fit_transform(dataset[:, 5])

weekDay_encoder = LabelBinarizer()
transformed_weekDay = weekDay_encoder.fit_transform(dataset[:, 6])

In [None]:
#Umformung der Zuschauer in eine Spalte (vorher war es nur eine Zeile)
#print(np.reshape(dataset[:,3],(306,1)))
# -> siehe beim Featurescaling

#### Featurescaling der Zuschaueranzahl & Hallenkapazitäten

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Scale attendance and arena capacity with one shared scaler so that both features
# are expressed relative to the same fixed range.
arenaKap_scaler = MinMaxScaler()
arenaKap_scaler.fit([[0], [14500]])  # minimum 0, maximum 14500 (Berlin, the largest capacity)

# The scaler expects a column, i.e. shape (n_games, 1). reshape(-1, 1) derives the row
# count from the data instead of hard-coding 306, so the cell keeps working when the
# number of games changes. `dataset` holds strings, hence the explicit float cast.
zuschauer_column = dataset[:, 3].astype(float).reshape(-1, 1)
kap_column = dataset[:, 4].astype(float).reshape(-1, 1)

transformed_zuschauer = arenaKap_scaler.transform(zuschauer_column)
transformed_kap = arenaKap_scaler.transform(kap_column)
print(transformed_kap)

### Zusammenfügen der einzelnen Spalten home_ids, gast_ids, zuschauer, Hallenkapazität, home_win, calendarWeek, weekDay und Shuffeln der Daten

In [None]:
# Assemble the model matrix: one-hot home team, one-hot guest team and, as the last
# column, the home-win label.
# `dataset` is a string array, so the label column is cast to int first; otherwise
# np.c_ would promote the entire matrix (including the one-hot columns) to strings.
home_win = dataset[:, 2].astype(int)
data = np.c_[transformed_home_ids, transformed_gast_ids, home_win]
# Currently not included: transformed_zuschauer, transformed_kap,
# transformed_calendarWeek, transformed_weekDay
# Shuffling is disabled: np.random.shuffle(data)

In [None]:
# Width of one data row; every column except the last one (the label) is a model input.
column_count = len(data[0])
print(column_count)
neuronen = column_count - 1

# Netz Modellierung

In [None]:
# Importing the Keras libraries and packages 
from keras.models import Sequential
from keras.layers import Dense
from keras import optimizers
from sklearn.model_selection import StratifiedKFold

# 10-fold stratified cross-validation of a small feed-forward network that predicts
# the home-win label (last column of `data`) from the remaining columns.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
cvscores = []

# Cast explicitly to numeric dtypes so the network never receives a string array
# (which `data` is whenever it was assembled from string columns).
X = data[:, 0:neuronen].astype('float32')
Y = data[:, neuronen].astype(int)

for train, test in kfold.split(X, Y):

    # A fresh optimizer per fold: an optimizer instance carries internal state
    # (e.g. its iteration counter), and sharing one instance across independently
    # initialised models lets that state leak from one fold into the next.
    adam = optimizers.Adam(lr=0.001)  # learning rate

    # Initialise the network
    regressor = Sequential()

    # Input layer and first hidden layer
    regressor.add(Dense(units=neuronen, kernel_initializer='uniform', activation='relu', input_shape=(neuronen,)))

    # Second hidden layer
    regressor.add(Dense(units=18, kernel_initializer='uniform', activation='relu'))

    # Output layer: a single sigmoid unit
    regressor.add(Dense(units=1, kernel_initializer='uniform', activation='sigmoid'))

    # Compile: defines how the network learns (alternative loss noted by the author: binary_crossentropy)
    regressor.compile(optimizer=adam, loss='mean_squared_error', metrics=['accuracy'])

    # Train on the training part of this fold
    history = regressor.fit(X[train], Y[train], batch_size=10, epochs=100, verbose=0)

    # Evaluate on the held-out part of this fold
    scores = regressor.evaluate(X[test], Y[test], verbose=0)
    print("%s: %.2f%%" % (regressor.metrics_names[1], scores[1]*100))
    cvscores.append(scores[1] * 100)

# Mean and standard deviation of the accuracy over all folds
print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

 