In [None]:
#IMPORTS

import numpy as np
import random
import tensorflow as tf
import tensorflow.keras as kr
import tensorflow.keras.backend as K
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense
from tensorflow.keras.datasets import mnist

from scipy.spatial.distance import euclidean
from sklearn.metrics import confusion_matrix

from time import sleep
from tqdm import tqdm

import copy

import pandas as pd
import matplotlib.pyplot as plt
import math
import seaborn as sns
from numpy.random import RandomState
import scipy as scp
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from keras.models import Sequential
from keras.models import load_model
from keras.layers import Dense
from keras import optimizers
from keras.callbacks import EarlyStopping,ModelCheckpoint
from keras.utils import to_categorical
from keras import backend as K
from itertools import product
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report, roc_curve, auc

from sklearn import mixture

from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Seeded NumPy random state so that runs of this notebook can be repeated
rs = RandomState(seed=92)

In [None]:
# Read the athlete events CSV into a DataFrame; '?' cells are parsed as missing values
df = pd.read_csv(
    'athlete_events.csv',
    na_values='?',
)

In [None]:
# Replace missing 'Medal' entries with the explicit label 'No Medal'.
# The result is assigned back to the column: calling fillna(inplace=True) on the
# selection `df['Medal']` is a chained in-place update, which pandas warns about
# and which does not modify `df` when Copy-on-Write is enabled.
df['Medal'] = df['Medal'].fillna('No Medal')

## Preliminary Data Analysis

In [None]:
# Discard every row that still contains a missing value, then renumber the index
df = df.dropna().reset_index(drop=True)

# Column names and dtypes, followed by a preview of the first ten rows
df.info()
df.head(10)

In [None]:
# Read the NOC-to-region lookup table; '?' cells are parsed as missing values
region = pd.read_csv(
    'noc_regions.csv',
    na_values='?',
)

In [None]:
# Left-join the region table onto the athlete rows using the shared 'NOC' key
merged = df.merge(region, how='left', on='NOC')

In [None]:
# The 'notes' column from the region table is not used further; remove it
merged = merged.drop(columns='notes')

In [None]:
# Report whether the empty string '' occurs anywhere in the merged frame's values
print(
    '\n This value exists in Dataframe'
    if '' in merged.values
    else '\n This value does not exists in Dataframe'
)

In [None]:
# Median of the 'Height' column across all remaining athletes
merged.Height.median()

In [None]:
# Smallest value in the 'Height' column
min(merged['Height'])

In [None]:
# Largest value in the 'Height' column
max(merged['Height'])

In [None]:
# Minimum height per sex together with the row label where it occurs.
# Both statistics are returned in one table: a notebook cell only displays its
# last expression, so a separate leading `.min()` call would be computed and
# silently discarded.
merged.groupby('Sex')['Height'].agg(['min', 'idxmin'])

In [None]:
# Maximum height per sex together with the row label where it occurs.
# Both statistics are returned in one table: a notebook cell only displays its
# last expression, so a separate leading `.max()` call would be computed and
# silently discarded.
merged.groupby('Sex')['Height'].agg(['max', 'idxmax'])

##### Labelling height according to the median:

In [None]:
# Binary height label stored as strings: '0' when Height is below 175.0, otherwise '1'
merged['Height_class'] = np.where(merged.Height.lt(175.0), '0', '1')

##### The countries in `l` will be labelled as South Asian (SA):

In [None]:
# Region names that receive the SA label (Country == 0) in the next step
l = [
    'Indonesia', 'Vietnam', 'Philippines', 'Malaysia', 'Sri Lanka', 'Thailand',
    'India', 'Pakistan', 'Maldives', 'Afghanistan', 'Bangladesh', 'Bhutan',
    'Nepal', 'Brunei', 'Cambodia', 'Laos', 'Myanmar', 'Japan',
]

In [None]:
# 0 for rows whose region is listed in `l`, 1 for every other row
# (rows with a missing region are not in the list and therefore get 1)
label = (~merged['region'].isin(l)).astype('int64')

In [None]:
# Attach the 0/1 region flag to the frame as the 'Country' column
merged = merged.assign(Country=label)

In [None]:
# Two boolean masks: rows with Sex == 'M', and rows with Country == 0
new = [
    merged['Sex'].eq('M'),
    merged['Country'].eq(0),
]

In [None]:
# Two boolean masks: rows with Sex == 'M', and rows with Country == 1
new1 = [
    merged['Sex'].eq('M'),
    merged['Country'].eq(1),
]

In [None]:
# Encode sex numerically: 'M' -> 0, 'F' -> 1.
# The mapping is applied in one call and assigned back to the column: calling
# replace(inplace=True) on the selection `merged['Sex']` is a chained in-place
# update, which pandas warns about and which leaves `merged` unchanged when
# Copy-on-Write is enabled.
merged['Sex'] = merged['Sex'].replace({'M': 0, 'F': 1})

In [None]:
# Keep only the rows encoded as male (Sex == 0)
males = merged[merged['Sex'].eq(0)]

##### Plotting the proportion distribution of countries according to height:

In [None]:
# Count male athletes per (Country flag, height class), then turn each row into
# proportions rounded to two decimals
height_counts = pd.crosstab(males.Country, males.Height_class)
Males = height_counts.div(height_counts.sum(axis=1), axis=0).round(2)

# Order the rows by the share of the '1' height class
Males = Males.sort_values(by='1')

ax = Males.plot(
    kind='bar',
    color=['#deb887', '#8b4513'],
    title='Proportion distribution of countries according to height ',
    figsize=(8, 6),
)
ax.set_xlabel('Country')
ax.set_ylabel('Proportion of population')

# Emits an empty line so the cell does not end on the Axes call's return value
print()

In [None]:
# Remove the columns that are not passed on to the encoding step
merged = merged.drop(columns=['Team', 'region', 'Games', 'Event', 'Name'])

In [None]:
# Columns that are one-hot encoded in the next step
categorical_features = [
    'NOC',
    'City',
    'Season',
    'Sport',
    'Medal',
    'Year',
]

In [None]:
# One-hot encode the categorical columns, dropping the first level of each
athletes = pd.get_dummies(
    data=merged,
    columns=categorical_features,
    drop_first=True,
)

##### Find the height column:

In [None]:
# Number of columns in the encoded frame
m = athletes.shape[1]

In [None]:
# Positions of every column of `athletes` except the one at index 5
# NOTE(review): position 5 is presumably the 'Height_class' column — confirm
idx = list(filter(lambda position: position != 5, range(m)))

In [None]:
# Feature matrix: the columns of `athletes` selected by `idx` (every position except 5).
# NOTE(review): the raw 'Height' column presumably remains among these features and
# would fully determine the height-class target — confirm whether it should be removed.
X = athletes.iloc[:, idx].values

# Target: the binary height class, selected by name and converted from its
# '0'/'1' string encoding to integers. The earlier positional lookup (column 6)
# appears to pick the 'Country' flag, which is also part of X, instead of the
# height label — TODO confirm against the actual column order.
y = athletes['Height_class'].astype(int).values

# Pass the seeded RandomState so the split is the same on every fresh run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=rs)
print ("Training set has {} samples.".format(X_train.shape[0]))
print ("Testing set has {} samples.".format(X_test.shape[0]))
print(X_train.shape)

##### Normalization:

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn the per-feature min/max on the training split only, then apply the same
# scaling to both splits
sc = MinMaxScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

#### Training with neural nets:

In [None]:
# BASELINE SCENARIO
def build_model(epochs=100, batch_size=10):
    """Build, compile and train the baseline feed-forward binary classifier.

    The network has two ReLU hidden layers (350 and 200 units) and one sigmoid
    output unit. It is compiled with Adam (learning rate 3e-4), binary
    cross-entropy loss and the accuracy metric, and is trained on the
    notebook-level ``X_train`` / ``y_train`` arrays with 20% of them held out
    for validation.

    Args:
        epochs: Number of training epochs. Defaults to 100.
        batch_size: Mini-batch size. Defaults to 10.

    Returns:
        The fitted ``Sequential`` model.
    """
    model = Sequential()

    # Input layer + first hidden layer. The input width is read from the data
    # rather than hard-coded, so it stays correct if the encoded columns change.
    # `units` replaces the removed Keras 1 keyword `output_dim`.
    model.add(Dense(units=350, activation='relu', input_dim=X_train.shape[1]))
    # Second hidden layer
    model.add(Dense(units=200, activation='relu'))
    # Output layer: a single sigmoid unit for the binary target
    model.add(Dense(units=1, activation='sigmoid'))

    # Use the `optimizers` module imported from keras; the bare name `keras`
    # is never imported in this notebook and would raise a NameError.
    opt = optimizers.Adam(learning_rate=3e-4)
    model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])

    # `epochs` replaces the removed Keras 1 keyword `nb_epoch`
    model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.2)
    return model

model = build_model()

#### Confusion matrix for SA countries:

In [None]:
# Find the 'Country' flag inside the feature matrix by name. X is built from
# athletes.iloc[:, idx], so the position of 'Country' within that selection is
# its column position in X_test. The earlier hard-coded position 6 appears to
# point at a one-hot column instead — TODO confirm.
country_col = athletes.columns[idx].get_loc('Country')

# Country == 0 marks the SA group. The comparison assumes min-max scaling left
# the 0/1 flag unchanged, i.e. both values occur in the training split.
filter_rows = X_test[:, country_col] == 0
X_test_sa = X_test[filter_rows, :]
y_test_sa = y_test[filter_rows]

# Flatten the model output to 1-D so it lines up with y_test_sa
y_pred_sa = model.predict(X_test_sa).ravel()
# Threshold the sigmoid output at 0.5 to obtain hard 0/1 predictions
y_pred_s = np.where(y_pred_sa >= 0.5, 1, 0)

print(classification_report(y_test_sa, y_pred_s))
# Fix the label order so the matrix is always 2x2 and the four-way unpacking
# below cannot fail when one class is absent from this subset
cm_sa = confusion_matrix(y_test_sa, y_pred_s, labels=[0, 1])
tn, fp, fn, tp = cm_sa.ravel()

#### Performance measures:

In [None]:
# Scores for the SA subset; `tn` and `fp` come from the confusion-matrix cell
accuracy = accuracy_score(y_test_sa, y_pred_s)    # (tp + tn) / (tp + tn + fp + fn)
precision = precision_score(y_test_sa, y_pred_s)  # tp / (tp + fp)
recall = recall_score(y_test_sa, y_pred_s)        # tp / (tp + fn)
fnr = 1 - recall                                  # false-negative rate
fpr = fp / (fp + tn)                              # false-positive rate
f1 = f1_score(y_test_sa, y_pred_s)                # 2 tp / (2 tp + fp + fn)

# Report the scores in a fixed order
print('Accuracy: %f' % accuracy)
print('Precision: %f' % precision)
print('Recall: %f' % recall)
print('FNR: %f' % fnr)
print('FPR: %f' % fpr)
print('F1 score: %f' % f1)

#### ROC AUC score and Gini coefficient:

In [None]:
# ROC curve for the SA subset. The curve is built from the predicted
# probabilities (y_pred_sa), not the thresholded 0/1 labels: hard labels give a
# single operating point, so the resulting "curve" and AUC would not describe
# the classifier's ranking quality.
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test_sa, np.ravel(y_pred_sa))
roc_auc = auc(false_positive_rate, true_positive_rate)


plt.title('Receiver Operating Characteristic Curve')
plt.plot(false_positive_rate, true_positive_rate, 'b',label='AUC = %0.2f'% roc_auc)
plt.legend(loc='lower right')
# Diagonal reference line
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.1,1.1])
plt.ylim([-0.1,1.1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

# Gini coefficient derived from the AUC: 2 * AUC - 1
Gini_coefficient=2*roc_auc - 1
print ("Gini_coefficient =",Gini_coefficient)

#### Confusion matrix for non-South-Asian (NSA) countries:

In [None]:
# Find the 'Country' flag inside the feature matrix by name. X is built from
# athletes.iloc[:, idx], so the position of 'Country' within that selection is
# its column position in X_test. The earlier hard-coded position 6 appears to
# point at a one-hot column instead — TODO confirm.
country_col = athletes.columns[idx].get_loc('Country')

# Country == 1 marks the non-SA group. The comparison assumes min-max scaling
# left the 0/1 flag unchanged, i.e. both values occur in the training split.
filter_rows = X_test[:, country_col] == 1
X_test_nsa = X_test[filter_rows, :]
y_test_nsa = y_test[filter_rows]

# Flatten the model output to 1-D so it lines up with y_test_nsa
y_pred_nsa = model.predict(X_test_nsa).ravel()
# Threshold the sigmoid output at 0.5 to obtain hard 0/1 predictions
y_pred_ns = np.where(y_pred_nsa >= 0.5, 1, 0)

print(classification_report(y_test_nsa, y_pred_ns))
# Fix the label order so the matrix is always 2x2 and the four-way unpacking
# below cannot fail when one class is absent from this subset
cm_nsa = confusion_matrix(y_test_nsa, y_pred_ns, labels=[0, 1])
tn, fp, fn, tp = cm_nsa.ravel()

#### Performance measures: 

In [None]:
# Scores for the NSA subset; `tn` and `fp` come from the confusion-matrix cell
accuracy = accuracy_score(y_test_nsa, y_pred_ns)    # (tp + tn) / (tp + tn + fp + fn)
precision = precision_score(y_test_nsa, y_pred_ns)  # tp / (tp + fp)
recall = recall_score(y_test_nsa, y_pred_ns)        # tp / (tp + fn)
fnr = 1 - recall                                    # false-negative rate
fpr = fp / (fp + tn)                                # false-positive rate
f1 = f1_score(y_test_nsa, y_pred_ns)                # 2 tp / (2 tp + fp + fn)

# Report the scores in a fixed order
print('Accuracy: %f' % accuracy)
print('Precision: %f' % precision)
print('Recall: %f' % recall)
print('FNR: %f' % fnr)
print('FPR: %f' % fpr)
print('F1 score: %f' % f1)

#### ROC AUC score and Gini coefficient:

In [None]:
# ROC curve for the NSA subset. The curve is built from the predicted
# probabilities (y_pred_nsa), not the thresholded 0/1 labels: hard labels give a
# single operating point, so the resulting "curve" and AUC would not describe
# the classifier's ranking quality.
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test_nsa, np.ravel(y_pred_nsa))
roc_auc = auc(false_positive_rate, true_positive_rate)


plt.title('Receiver Operating Characteristic Curve')
plt.plot(false_positive_rate, true_positive_rate, 'b',label='AUC = %0.2f'% roc_auc)
plt.legend(loc='lower right')
# Diagonal reference line
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.1,1.1])
plt.ylim([-0.1,1.1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

# Gini coefficient derived from the AUC: 2 * AUC - 1
Gini_coefficient=2*roc_auc - 1
print ("Gini_coefficient =",Gini_coefficient)