In [1]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Import data into Pandas Dataframes. 
# By setting header=None, we ensure pandas does not interpret the first row as headers.
train_df = pd.read_csv("dota2Train.csv",header=None)
test_df = pd.read_csv("dota2Test.csv",header=None)

In [3]:
# Rename the Dataframe Columns, as x1,x2,...
# Keep in mind that the first column is the target
train_df.columns = ["target"] + [f"x{i}" for i in range(1,train_df.shape[1])]
test_df.columns = ["target"] + [f"x{i}" for i in range(1,test_df.shape[1])]

In [4]:
# Display the first 5 instances, to check if everything works fine
train_df.head()

Unnamed: 0,target,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x107,x108,x109,x110,x111,x112,x113,x114,x115,x116
0,-1,223,2,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,152,2,2,0,0,0,1,0,-1,...,0,0,0,0,0,0,0,0,0,0
2,1,131,2,2,0,0,0,1,0,-1,...,0,0,0,0,0,0,0,0,0,0
3,1,154,2,2,0,0,0,0,0,0,...,-1,0,0,0,0,0,0,0,0,0
4,-1,171,2,3,0,0,0,0,0,-1,...,0,0,0,0,0,0,0,0,0,0


In [5]:
from sklearn.model_selection import train_test_split
import numpy as np
# Let's now create the validation set, picking randomly from the train test, so we can evaluate the model.
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

# Run this once, because it removes 'target' column
train_labels = np.array(train_df.pop('target'))
val_labels = np.array(val_df.pop('target'))
test_labels = np.array(test_df.pop('target'))

# Make target in range 0 to 1
train_labels = (train_labels == 1).astype(int)
val_labels = (val_labels == 1).astype(int)
test_labels = (test_labels == 1).astype(int)

In [6]:
train_features = np.array(train_df)
val_features = np.array(val_df)
test_features = np.array(test_df)

In [7]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

train_features = scaler.fit_transform(train_features)
val_features = scaler.transform(val_features)
test_features = scaler.transform(test_features)

In [8]:
print('Training labels shape:', train_labels.shape)
print('Validation labels shape:', val_labels.shape)
print('Test labels shape:', test_labels.shape)

print('Training features shape:', train_features.shape)
print('Validation features shape:', val_features.shape)
print('Test features shape:', test_features.shape)

Training labels shape: (74120,)
Validation labels shape: (18530,)
Test labels shape: (10294,)
Training features shape: (74120, 116)
Validation features shape: (18530, 116)
Test features shape: (10294, 116)


In [9]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
model = Sequential()
model.add(Dense(256, activation='relu', input_shape = (116,)))
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
model.add(Flatten())

In [10]:
from tensorflow.keras.optimizers import Adam
adam = Adam(learning_rate=0.001)
model.compile(loss="binary_crossentropy", optimizer=adam, metrics= ['accuracy'])

from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(
    monitor='val_prc', 
    verbose=1,
    patience=10,
    mode='max',
    restore_best_weights=True)

model.summary()

In [11]:
# increasing the batch size made a huge difference 
baseline_history = model.fit(
    train_features,
    train_labels,
    batch_size=1024, 
    epochs=30,
    callbacks=[early_stopping],
    validation_data=(val_features, val_labels))

Epoch 1/30
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.5121 - loss: 0.7623 - val_accuracy: 0.5318 - val_loss: 0.6902
Epoch 2/30
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.5208 - loss: 0.6953 - val_accuracy: 0.5370 - val_loss: 0.6869
Epoch 3/30
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.5441 - loss: 0.6880 - val_accuracy: 0.5740 - val_loss: 0.6804
Epoch 4/30
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.5664 - loss: 0.6797 - val_accuracy: 0.5910 - val_loss: 0.6748
Epoch 5/30
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.5776 - loss: 0.6755 - val_accuracy: 0.5948 - val_loss: 0.6735
Epoch 6/30
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.5852 - loss: 0.6725 - val_accuracy: 0.5981 - val_loss: 0.6708
Epoch 7/30
[1m73/73[0m [32m━━━━

In [12]:
from sklearn.metrics import roc_auc_score
test_predictions = model.predict(test_features)
roc_auc_score(y_true=test_labels,y_score=test_predictions)

[1m322/322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step  


0.632280400085202

In [13]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(train_features,train_labels)
lr_test_predictions = lr.predict(test_features)
roc_auc_score(test_labels, lr_test_predictions)

0.5904890633183016