In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.model_selection import train_test_split
from google.colab import files
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [3]:
#Prompt for a file upload through google.colab's files helper; the recorded output below shows sports_survey_clean.csv being saved
uploaded = files.upload()

Saving sports_survey_clean.csv to sports_survey_clean.csv


In [4]:
#Read the survey CSV into a DataFrame; the filename has to match the file saved by the upload cell above
df = pd.read_csv('sports_survey_clean.csv')

In [5]:
#Hold out 20% of the rows as the test set (rows are shuffled, random state 555)
#Only the training part is used from here on; the test part is kept aside until the best model has been selected
fan_train, fan_test = train_test_split(
    df,
    test_size=0.2,
    random_state=555,
    shuffle=True,
)

In [6]:
#Split the training data once more, into a training part and a validation part (80/20, stratified on the target)
#To start off simple, the first model classifies VL1r4 (placed a bet at a casino etc.) from VL2 (interest in betting) and Fan_magnitude
#Note: we have since decided to drop VL2 as an input variable
x_vl2_fan_mag, x_val_vl2_fan_mag, y_vl1r4, y_val_vl1r4 = train_test_split(
    fan_train[['VL2', 'Fan_magnitude']],
    fan_train[['VL1r4']],
    stratify=fan_train[['VL1r4']],
    random_state=555,
    test_size=0.2,
)

In [7]:
#Here, we're building a binary classification neural network with two hidden layers of dimension 10 each
#The hidden layers use ReLU as the activation function; only the single output unit uses a sigmoid, so the output lies in (0, 1)
#We chose two layers because the relationship between the features and output variables is 'simple' enough so that all the information can be captured with two layers
#We chose 10 nodes in each layer to have the number of nodes be comparable to the number of input variables, which in this case is 2
#The input shape is declared with an explicit Input object instead of the input_shape argument of the first Dense layer, which newer Keras versions warn about
model_vl2_fan_mag_vl1r4 = Sequential([
    tf.keras.Input(shape=(2,)),
    Dense(10, activation='relu'),
    Dense(10, activation='relu'),
    Dense(1, activation='sigmoid')])

In [8]:
#Compile the model: Adam optimizer, binary cross-entropy loss, and accuracy tracked as the reported metric
model_vl2_fan_mag_vl1r4.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
#Train for 50 epochs in batches of 32; the validation set is passed in so that its loss and accuracy are recorded for each epoch
history_vl2_fan_mag_vl1r4 = model_vl2_fan_mag_vl1r4.fit(
    x=x_vl2_fan_mag,
    y=y_vl1r4,
    validation_data=(x_val_vl2_fan_mag, y_val_vl1r4),
    batch_size=32,
    epochs=50,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [13]:
#Next, a similar model with input variables D4 (income), S2 (age) and Fan_magnitude, which we have also decided are the
#best performing features, and with output variable VL1r12 (purchased multi-game tickets)
#Same 80/20 training/validation split as before, stratified on the target
x_D4_S2_fan_mag, x_val_D4_S2_fan_mag, y_vl1r12, y_val_vl1r12 = train_test_split(
    fan_train[['D4', 'Fan_magnitude', 'S2']],
    fan_train[['VL1r12']],
    stratify=fan_train[['VL1r12']],
    random_state=555,
    test_size=0.2,
)

In [11]:
#We build the model here: three inputs, two ReLU hidden layers of 10 units each, one sigmoid output unit
#The input shape is declared with an explicit Input object instead of the input_shape argument of the first Dense layer, which newer Keras versions warn about
model_D4_S2_fan_mag_vl1r12 = Sequential([
    tf.keras.Input(shape=(3,)),
    Dense(10, activation='relu'),
    Dense(10, activation='relu'),
    Dense(1, activation='sigmoid')])

In [12]:
#And compile it here: Adam optimizer, binary cross-entropy loss, accuracy as the reported metric
model_D4_S2_fan_mag_vl1r12.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [14]:
#As in the previous case: 50 epochs, batches of 32, with the validation data passed in so its accuracy is recorded per epoch
history_D4_S2_fan_mag_vl1r12 = model_D4_S2_fan_mag_vl1r12.fit(
    x=x_D4_S2_fan_mag,
    y=y_vl1r12,
    validation_data=(x_val_D4_S2_fan_mag, y_val_vl1r12),
    batch_size=32,
    epochs=50,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [15]:
#After doing EDA, we decided on the following features and output variables

#Input features shared by all the models below
best_features = ['D4', 'S2', 'Fan_magnitude']

#Fan behaviours we want to be able to predict (one binary target column each)
VL1s = [
    'VL1r1', 'VL1r2', 'VL1r4', 'VL1r5', 'VL1r7',
    'VL1r10', 'VL1r11', 'VL1r12', 'VL1r13', 'VL1r14',
]

In [17]:
#We now want to build a model to predict each of the VL1s as a variable dependent on 'best_features'
#Therefore, we write a function which takes the number of features and returns a compiled NN with two ReLU hidden layers
def create_model(input_dim, hidden_units=10):
    """Build and compile a small binary-classification network.

    The network has two ReLU hidden layers of ``hidden_units`` nodes each,
    followed by a single sigmoid output unit. It is compiled with the Adam
    optimizer, binary cross-entropy loss and accuracy as the tracked metric.

    Parameters
    ----------
    input_dim : int
        Number of input features per sample.
    hidden_units : int, default 10
        Width of each of the two hidden layers. The default reproduces the
        10-node layers used throughout this notebook.

    Returns
    -------
    A freshly initialised, compiled ``Sequential`` model.
    """
    model = Sequential([
        # Declare the input shape with an explicit Input object; passing
        # input_shape to the first Dense layer triggers a warning in newer Keras.
        tf.keras.Input(shape=(input_dim,)),
        Dense(hidden_units, activation='relu'),
        Dense(hidden_units, activation='relu'),
        Dense(1, activation='sigmoid'),
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [None]:
#We are now incorporating stratified k-fold validation into the NN
#Number of folds = k = 5
k = 5
epochs = 50
batch_size = 10

#Seed Python, NumPy and TensorFlow so that weight initialisation and batch shuffling are repeatable
#Without a seed, the same configuration gives a slightly different accuracy on every run
tf.keras.utils.set_random_seed(42)

#Initialize StratifiedKFold
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

#Standardize features by removing the mean and scaling to unit variance
#The scaler is re-fitted on the training part of every fold, so no validation rows leak into the scaling
scaler = StandardScaler()

accuracies = []   #initialising an empty list, where we will be storing the accuracy score for each fold

#Doing a first run with features D4, S2, Fan_mag and output VL1r12

X = fan_train[best_features]   #this is the feature data we will be using for each of our models
y = fan_train['VL1r12']        #target column, sliced once as a 1-D Series (same form as in the per-target loop)

for train_index, val_index in skf.split(X, y):
    # Split data
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # Standardize the data (fit on the training fold only, then apply to the validation fold)
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)

    # Drop the Keras state left over from the previous fold's model so memory does not grow with every fold
    tf.keras.backend.clear_session()

    # Create the model
    model = create_model(X_train.shape[1])

    # Train the model
    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=0)

    # Evaluate the model: threshold the sigmoid output at 0.5 to get class labels
    y_val_pred = (model.predict(X_val, verbose=0) > 0.5).astype("int32")
    accuracy = accuracy_score(y_val, y_val_pred)
    accuracies.append(accuracy)

print(f'Average cross-validation accuracy: {np.mean(accuracies)}')  #we take the mean over the 5-folds and print that out

Average cross-validation accuracy: 0.9196529254476932


In [None]:
#Iterating over the list VL1s to generate and test models for each VL1, with inputs remaining the same: D4, S2, Fan_magnitude
#NOTE(review): accuracy alone can look good on an imbalanced target; worth comparing each score with the majority-class rate — TODO confirm class balance
for v in VL1s:

    # Re-seed for every target so each result is repeatable and does not depend on the order of the loop
    tf.keras.utils.set_random_seed(42)

    accuracies = []   #accuracy score of each fold for the current target

    y = fan_train[v]

    for train_index, val_index in skf.split(X, y):
        # Split data
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        # Standardize the data (fit on the training fold only, then apply to the validation fold)
        X_train = scaler.fit_transform(X_train)
        X_val = scaler.transform(X_val)

        # Drop the Keras state left over from the previous model; this loop builds one model per fold per target
        tf.keras.backend.clear_session()

        # Create the model
        model = create_model(X_train.shape[1])

        # Train the model
        model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=0)

        # Evaluate the model: threshold the sigmoid output at 0.5 to get class labels
        y_val_pred = (model.predict(X_val, verbose=0) > 0.5).astype("int32")
        accuracy = accuracy_score(y_val, y_val_pred)
        accuracies.append(accuracy)

    print(f'Average cross-validation accuracy for {v}: {np.mean(accuracies)}')


Average cross-validation accuracy for VL1r1: 0.6570151268187817
Average cross-validation accuracy for VL1r2: 0.6414517348995707
Average cross-validation accuracy for VL1r4: 0.8003378582509557
Average cross-validation accuracy for VL1r5: 0.6491734084226511
Average cross-validation accuracy for VL1r7: 0.5972980803740101
Average cross-validation accuracy for VL1r10: 0.7278304637342909
Average cross-validation accuracy for VL1r11: 0.7711422753781342
Average cross-validation accuracy for VL1r12: 0.9184463616357261
Average cross-validation accuracy for VL1r13: 0.7807932571035043
Average cross-validation accuracy for VL1r14: 0.8429251055397542
