In [None]:
# For compatibility across multiple platforms
import os
import numpy as np
import pandas as pd
from scipy import spatial


# Plotting, gradient boosting and scikit-learn utilities
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn import cluster
from sklearn import preprocessing
from sklearn.model_selection import KFold
import sklearn


from ultimate.mlp import MLP 

from keras.models import Sequential
from keras.layers import Dropout
from keras.layers import Dense
from keras.callbacks import ModelCheckpoint
from keras.layers import BatchNormalization

In [None]:
# Location of the cleaned training table.
train_filepath = "pubg-finish-placement-prediction/train_V2_clean.csv"

# The first CSV column is used as the DataFrame index.
trainset = pd.read_csv(
    train_filepath,
    index_col=0,
)

In [None]:
# Report how many rows were loaded.
print(len(trainset))

In [None]:
def feature_engineering_two(moddedTrain):
    """Add engineered columns to ``moddedTrain`` and drop filtered rows, in place.

    New columns, in order: playersJoined, killsNorm, damageDealtNorm,
    maxPlaceNorm, matchDurationNorm, healsandboosts, totalDistance,
    killsWithoutMoving, headshot_rate.

    Rows are then removed when the player has kills but zero total distance,
    or when roadKills exceeds 8. The frame passed in is modified directly and
    nothing is returned.
    """
    frame = moddedTrain  # alias only: every change below lands on the caller's frame

    # Number of rows sharing each matchId.
    frame['playersJoined'] = frame.groupby('matchId')['matchId'].transform('count')

    # One shared factor for all *Norm columns: 1 + (100 - playersJoined) / 100.
    scale = (100 - frame['playersJoined']) / 100 + 1
    frame['killsNorm'] = frame['kills'] * scale
    frame['damageDealtNorm'] = frame['damageDealt'] * scale
    frame['maxPlaceNorm'] = frame['maxPlace'] * scale
    frame['matchDurationNorm'] = frame['matchDuration'] * scale

    frame['healsandboosts'] = frame['heals'] + frame['boosts']
    frame['totalDistance'] = (
        frame['rideDistance'] + frame['walkDistance'] + frame['swimDistance']
    )

    has_kills = frame['kills'] > 0
    never_moved = frame['totalDistance'] == 0
    frame['killsWithoutMoving'] = has_kills & never_moved

    # 0 / 0 yields NaN for players without kills; those become a rate of 0.
    frame['headshot_rate'] = (frame['headshotKills'] / frame['kills']).fillna(0)

    # Row filters are applied last, so playersJoined still counts the dropped rows.
    stationary_killers = frame.loc[frame['killsWithoutMoving']].index
    frame.drop(stationary_killers, inplace=True)
    heavy_road_killers = frame.loc[frame['roadKills'] > 8].index
    frame.drop(heavy_road_killers, inplace=True)
    

In [None]:
def feature_engineering(trainset,is_train=True):
    """Build group-level and match-level features from player-level rows.

    Parameters
    ----------
    trainset : pandas.DataFrame
        Player-level rows; only read when ``is_train`` is True, and never
        modified (the function works on a filtered copy).
    is_train : bool, default True
        True: use ``trainset``, keep rows with ``maxPlace > 1`` and return one
        output row per (matchId, groupId) together with the targets.
        False: read ``INPUT_DIR + 'test_V2.csv'`` and return one output row per
        input row, with no targets.

    Returns
    -------
    df_out : pandas.DataFrame
        For every feature: ``<f>_mean``, ``<f>_mean_rank``, ``<f>_max``,
        ``<f>_max_rank``, ``<f>_min``, ``<f>_min_rank``; then ``group_size``,
        the per-match mean of every feature under its bare name, and
        ``match_size``. ``matchId`` and ``groupId`` are dropped.
    y : list of float or None
        Mean ``winPlacePerc`` per (matchId, groupId), in the same order as the
        rows of ``df_out``, when ``is_train`` is True; otherwise None.

    Raises
    ------
    ValueError
        If one of the excluded columns (Id, matchId, groupId, matchDuration,
        matchType) or, for training data, the target is missing.
    """
    group_keys = ['matchId', 'groupId']
    target = 'winPlacePerc'

    if is_train:
        print("processing train_V2.csv")
        # Keep only rows whose match has maxPlace > 1. The explicit copy makes
        # the column assignments below act on an independent frame instead of
        # a slice of the caller's data.
        df = trainset[trainset['maxPlace'] > 1].copy()
    else:
        print("processing test_V2.csv")
        # NOTE(review): INPUT_DIR is not defined in any visible notebook cell;
        # it has to exist as a global before this branch is used.
        df = pd.read_csv(INPUT_DIR + 'test_V2.csv')

    # Total distance covered by a player (vehicle + foot + swimming).
    df['totalDistance'] = df['rideDistance'] + df["walkDistance"] + df["swimDistance"]

    # Clamp non-positive rankPoints (e.g. -1) to 0.
    df['rankPoints'] = np.where(df['rankPoints'] <= 0 ,0 , df['rankPoints'])

    # Every remaining column is a feature, except identifiers and these two.
    features = list(df.columns)
    for excluded in ("Id", "matchId", "groupId", "matchDuration", "matchType"):
        features.remove(excluded)

    grouped = df.groupby(group_keys)

    y = None
    if is_train:
        # Target: mean winPlacePerc of each group. groupby sorts by key, so the
        # order matches the grouped rows of df_out built below.
        y = np.array(grouped[target].agg('mean'), dtype=np.float64).tolist()
        features.remove(target)

    if is_train:
        # One output row per (matchId, groupId).
        df_out = grouped.size().reset_index()[group_keys]
    else:
        # One output row per input row.
        df_out = df[group_keys]

    # Group-level statistic of every feature, plus the percentile rank of that
    # statistic among the groups of the same match (largest value ranks highest).
    for stat in ('mean', 'max', 'min'):
        print("get group " + stat + " feature")
        agg = grouped[features].agg(stat)
        agg_rank = agg.groupby('matchId')[features].rank(pct=True).reset_index()

        # No feature column overlaps here; empty suffixes turn an unexpected
        # overlap into an error instead of a silent rename.
        df_out = df_out.merge(agg.reset_index(), suffixes=("", ""), how='left', on=group_keys)
        # This merge does overlap: the raw statistic gets "_<stat>", the rank
        # gets "_<stat>_rank".
        df_out = df_out.merge(agg_rank, suffixes=("_" + stat, "_" + stat + "_rank"),
                              how='left', on=group_keys)

    # Number of player rows in each group.
    print("get group size feature")
    agg = grouped.size().reset_index(name='group_size')
    df_out = df_out.merge(agg, how='left', on=group_keys)

    # Per-match mean of every feature. df_out holds no un-suffixed feature
    # columns at this point, so "_match_mean" is never applied and these
    # columns keep the bare feature names.
    print("get match mean feature")
    agg = df.groupby('matchId')[features].agg('mean').reset_index()
    df_out = df_out.merge(agg, suffixes=["", "_match_mean"], how='left', on=['matchId'])

    # Number of player rows in each match.
    print("get match size feature")
    agg = df.groupby('matchId').size().reset_index(name='match_size')
    df_out = df_out.merge(agg, how='left', on=['matchId'])

    # The identifiers were only needed as merge keys.
    df_out.drop(["matchId", "groupId"], axis=1, inplace=True)

    return df_out,y

In [None]:
# Adds the engineered columns to trainset and removes the filtered rows in place
# (the function returns nothing).
feature_engineering_two(trainset)
# Builds the aggregated feature table (df_out) and the matching list of targets (y).
df_out,y = feature_engineering(trainset)

In [None]:
# Hold out 20% of the rows for validation. The fixed seed keeps the split (and
# therefore every score below) reproducible across kernel restarts.
x_train, x_val, y_train, y_val = train_test_split(df_out, y, test_size=0.2, random_state=42)

# SET UP FOR MODEL ONE

## Select the trainset features for tree one 

In [None]:
# Model one (gradient-boosted trees) trains and validates on the unmodified split.
x_one, y_one = x_train, y_train
x_one_val, y_one_val = x_val, y_val

In [None]:
# Wrap the model-one split in DMatrix containers, passing the column labels as feature names.
trainset_one_dmatrix = xgb.DMatrix(
    x_one.values,
    label=y_train,
    feature_names=x_one.columns,
)
valset_one_dmatrix = xgb.DMatrix(
    x_one_val.values,
    label=y_one_val,
    feature_names=x_one_val.columns,
)

In [None]:
# Preview the first rows of the model-one training features.
x_one.head()

## Configure tree one

In [None]:
# Booster settings handed to xgb.train for model one.
# - max_depth: depth limit for each tree.
# - eval_metric: "mae", the metric reported for the evaluation sets.
# - lambda: presumably the L2 regularisation weight; confirm against the
#   xgboost parameter documentation.
params = {
    "max_depth" : 7,
    "eval_metric" : ["mae"],
    "lambda": 1.1,   
}


In [None]:
iters = 30  # number of boosting rounds

# Data sets whose metric xgb.train reports while fitting.
eval_sets = [(trainset_one_dmatrix, "train"), (valset_one_dmatrix, 'val')]
tree_one = xgb.train(
    params,
    trainset_one_dmatrix,
    num_boost_round=iters,
    evals=eval_sets,
)

# Two views of feature importance for the fitted booster.
xgb.plot_importance(tree_one, importance_type='cover')
xgb.plot_importance(tree_one, importance_type='weight')

# SET UP FOR MODEL TWO

## Select the trainset features for model two (neural network)

In [None]:
# Model two (neural network) reuses exactly the same split as model one.
x_two, y_two = x_train, y_train
x_two_val, y_two_val = x_val, y_val

# Min-max scaling of the inputs is currently disabled:
#scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1), copy=False).fit(x_two)

In [None]:
#scaler.transform(x_two)

## Define the NN

In [None]:
# Fully connected regression network for model two.
n_features = x_two.shape[1]

NN_model = Sequential()

# Input layer is as wide as the feature table.
NN_model.add(Dense(n_features, input_dim=n_features, activation='relu'))

# First hidden layer, followed by the only batch-normalisation layer.
NN_model.add(Dense(136, activation='relu'))
NN_model.add(BatchNormalization())

# Remaining hidden layers, narrowing from 136 to 80 units.
for width in (136, 136, 136, 100, 100, 100, 100, 80, 80):
    NN_model.add(Dense(width, activation='relu'))

# Single linear output unit for the regression target.
NN_model.add(Dense(1, activation='linear'))

# Mean absolute error is both the optimised loss and the reported metric.
NN_model.compile(
    loss='mean_absolute_error',
    optimizer='adam',
    metrics=['mean_absolute_error'],
)
NN_model.summary()

In [None]:
# Checkpoint file name template: epoch number and validation loss are filled in on save.
checkpoint_name = 'Weights-{epoch:03d}--{val_loss:.5f}.hdf5'

# Save only when val_loss improves on the best value seen so far.
checkpoint = ModelCheckpoint(
    checkpoint_name,
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    mode='auto',
)
callbacks_list = [checkpoint]

In [None]:
# Train model two.
# Features and targets are handed to Keras as float32 NumPy arrays: y_two is a
# plain Python list and x_two a DataFrame that may mix dtypes, a combination
# that not every Keras version accepts.
# validation_split holds back 20% of these training rows to compute the
# val_loss monitored by the checkpoint callback. Arguments that only restated
# Keras defaults were dropped.
NN_model.fit(x=np.asarray(x_two, dtype=np.float32),
             y=np.asarray(y_two, dtype=np.float32),
             batch_size=1000, epochs=20, verbose=1,
             callbacks=callbacks_list,
             validation_split=0.20, shuffle=True)


# ENSEMBLE MODELS ON VALSET

### Predict - Update - Predict

In [None]:
def gen_predictions(models,labels,num_samples):
    """Blend the predictions of the first two supported models.

    Parameters
    ----------
    models : list of dict
        Each entry provides 'data' (the feature table to predict on),
        'predictor' (an object with a ``predict`` method) and 'type'
        ("tree" for an xgboost booster, "nn" for a network). Entries with any
        other type are skipped. Only the first two usable entries take part in
        the blend; further ones are predicted but ignored.
    labels, num_samples
        Kept for backward compatibility; neither influences the result.
        NOTE(review): earlier code sliced ``labels`` to ``num_samples`` and
        discarded the slice. Confirm whether the predictions were meant to be
        truncated as well.

    Returns
    -------
    list of float
        One blended value per row: the average of both models where
        ``first - second <= 0.0001``, otherwise the first model's prediction.

    Raises
    ------
    ValueError
        If fewer than two supported models are given, or if the two
        prediction vectors differ in length.
    """
    predictions = []

    for model in models:
        data = model['data']
        predictor = model['predictor']
        kind = model['type']

        if kind == "tree":
            matrix = xgb.DMatrix(data,feature_names=data.columns)
            prediction = predictor.predict(matrix)
        elif kind == "nn":
            prediction = predictor.predict(data)
        else:
            continue

        # Flatten so that (n, 1) network output and (n,) booster output line
        # up element by element.
        predictions.append(np.asarray(prediction).ravel())
        print(kind)

    if len(predictions) < 2:
        raise ValueError(
            "gen_predictions needs at least two models of type 'tree' or 'nn', got %d"
            % len(predictions)
        )

    first, second = predictions[0], predictions[1]
    print(len(first))
    print(len(second))

    if len(first) != len(second):
        raise ValueError(
            "prediction lengths differ: %d vs %d" % (len(first), len(second))
        )

    # NOTE(review): the comparison uses the signed difference, so the average
    # is also taken whenever the first model predicts far below the second.
    # Confirm whether abs(first - second) was intended.
    blended = np.where(first - second <= .0001, (first + second) / 2.0, first)
    avg_predictions = blended.tolist()

    print(len(avg_predictions))
    return avg_predictions
            

### Ensemble model predictions

In [None]:
# Both models predict on the same validation features; the network is listed
# first, so it is the first entry gen_predictions sees.
models = [
    {'data': x_one_val, 'predictor': NN_model, 'type': "nn"},
    {'data': x_one_val, 'predictor': tree_one, 'type': "tree"},
]

labels = y_val
num_samples = 200000

predictions = gen_predictions(models, labels, num_samples)

In [None]:
# Mean absolute error of the blended predictions against the validation targets.
print(sklearn.metrics.mean_absolute_error(labels,predictions))

In [None]:
# Same metric for the tree model alone on the validation DMatrix, for comparison with the blend.
print(sklearn.metrics.mean_absolute_error(labels,tree_one.predict(valset_one_dmatrix)))