In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import lightgbm as lgbm
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split
import gc


from sklearn.metrics import mean_squared_error

In [3]:
def means_for_teams(data):
    """Replace every numeric feature of a row with the mean of its group.

    Rows are grouped by ``groupId``; each numeric column (other than
    ``groupId`` itself) is overwritten with the per-group mean, so all
    members of a team end up with identical numeric features.
    Non-numeric columns are left untouched.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``groupId`` column. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with the numeric columns replaced by
        float group means.
    """
    cols = [c for c in data.select_dtypes(['number']).columns if c != 'groupId']
    if not cols:
        # Nothing to average; keep the diagnostic print for consistency.
        print((data['groupId'].nunique(), 0))
        return data

    grouped = data.groupby('groupId')[cols]
    # Same diagnostic as before: (number of groups, number of averaged columns).
    print((grouped.ngroups, len(cols)))

    # transform() broadcasts each group's mean back onto the original rows,
    # aligned on the original index. The previous row loop assigned into a
    # temporary (chained indexing) and used index labels as positions, so it
    # never actually updated ``data``.
    data[cols] = grouped.transform('mean')
    return data

# features for duo or squad mode, these features are valued 0 in solo mode games
def drop_team_features(data):
    """Return a copy of ``data`` without the team-only feature columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns ``assists``, ``DBNOs``, ``revives``
        and ``teamKills``.

    Returns
    -------
    pandas.DataFrame
        New frame with those four columns removed; ``data`` is not modified.

    Raises
    ------
    KeyError
        If any of the four columns is missing from ``data``.
    """
    # The column is spelled 'teamKills' (capital K) in the dtype map used to
    # load the CSVs; the previous lowercase 'teamkills' raised KeyError.
    team_features = ['assists', 'DBNOs', 'revives', 'teamKills']
    return data.drop(team_features, axis=1)

def xy(data):
    """Split a frame into a numeric feature matrix and the target column.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the target column ``winPlacePerc``.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        ``X`` holds every numeric column except the target; ``y`` is the
        ``winPlacePerc`` column.
    """
    # Remove the target first, then keep only numeric predictors.
    without_target = data.drop(columns='winPlacePerc')
    features = without_target.select_dtypes(['number'])
    target = data['winPlacePerc']
    return features, target

def fill_rankPoints(input_data, model=None, dropWinKillPoints=True):
    """Impute non-positive ``rankPoints`` values with a regression model.

    The model is fitted on the rows whose ``rankPoints`` is > 0, using all
    remaining numeric columns as predictors, and then used to predict
    ``rankPoints`` for the rows where it is <= 0.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Must contain ``rankPoints`` (and ``winPoints``/``killPoints`` when
        ``dropWinKillPoints`` is True). Not modified.
    model : estimator with ``fit(X, y)`` and ``predict(X)``, optional
        Defaults to a fresh ``LinearRegression()`` per call, so a fitted
        estimator is no longer shared between calls through the default
        argument.
    dropWinKillPoints : bool, default True
        When True, ``winPoints`` and ``killPoints`` are removed before
        fitting (and are absent from the returned frame).

    Returns
    -------
    pandas.DataFrame
        Numeric-only copy of ``input_data`` with the non-positive
        ``rankPoints`` replaced by model predictions.
    """
    if model is None:
        model = LinearRegression()

    # select_dtypes already returns a new frame; copy() makes that explicit.
    work_data = input_data.select_dtypes(['number']).copy()

    # drop the other point features, since these will not be useful in predicting rankPoints
    if dropWinKillPoints:
        work_data = work_data.drop(columns=['winPoints', 'killPoints'])

    # Masks are built from work_data itself; the previous version read an
    # unrelated global named ``data`` here.
    known = work_data['rankPoints'] > 0
    missing = work_data['rankPoints'] <= 0
    predictors = work_data.drop(columns='rankPoints')

    # train the model with data where there are rankPoints
    model.fit(predictors.loc[known], work_data.loc[known, 'rankPoints'])

    # use model to predict missing rankPoints, skipping the call entirely
    # when nothing is missing (estimators reject empty input)
    if missing.any():
        # Predictions are floats; widen the column first so the assignment
        # does not depend on implicit integer -> float upcasting.
        work_data['rankPoints'] = work_data['rankPoints'].astype('float64')
        work_data.loc[missing, 'rankPoints'] = model.predict(predictors.loc[missing])

    return work_data


def print_top_feature_correlations_to_target_by_matchType(data, nrows):
    """Print, per match type, the features most correlated with the target.

    For every distinct value of ``matchType`` (most frequent first) the
    numeric feature columns of the matching rows are correlated with
    ``winPlacePerc`` and the five highest correlations are printed.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``matchType`` and a numeric ``winPlacePerc`` column.
    nrows : int
        Only echoed in the header as "Number of games".

    Returns
    -------
    None
    """
    # One pass gives both the match types and their row counts, ordered by
    # descending frequency.
    type_counts = data['matchType'].value_counts()

    print("-----------------------------------------------------")
    print("Highest correlations to target feature BY matchType")
    print("Number of games:", nrows, "\n")
    for mt, cnt in type_counts.items():
        # for each matchtype
        # look at all the rows for that matchtype
        # and build a correlation matrix
        rows = data.loc[data['matchType'] == mt]
        # Restrict to numeric predictors explicitly: correlating object
        # columns (ids, matchType) with the target is not defined and raises
        # on pandas versions that no longer drop them silently.
        features = rows.select_dtypes(['number']).drop(columns='winPlacePerc')
        corr = features.corrwith(rows['winPlacePerc'])
        corr = corr.sort_values(ascending=False)
        print("Type:", mt)
        print("Nr of games:", cnt)
        print(corr.head())
        print()


def scoreSets(dfs, random_state=1):
    """Fit a linear regression on each frame and print its hold-out score.

    Each frame is split into features/target with ``xy``, divided into
    train and test parts by ``train_test_split`` (seeded with
    ``random_state``), and the score returned by ``LinearRegression.score``
    on the test part is printed together with the 1-based set number.

    Parameters
    ----------
    dfs : iterable of pandas.DataFrame
        Frames accepted by ``xy``.
    random_state : int, default 1
        Seed forwarded to ``train_test_split``.
    """
    for set_nr, frame in enumerate(dfs, start=1):
        features, target = xy(frame)
        x_fit, x_eval, y_fit, y_eval = train_test_split(
            features, target, random_state=random_state
        )
        regressor = LinearRegression()
        regressor.fit(x_fit, y_fit)
        print("Score for set nr: ", set_nr, regressor.score(x_eval, y_eval))

def divide_by_matchType_and_aggregate_by_groupId(data):
    """Split the rows by game mode and add per-group means for team modes.

    Match types are bucketed by substring: names containing ``'squad'`` or
    ``'flare'`` are squad matches, otherwise names containing ``'duo'`` or
    ``'crash'`` are duo matches, otherwise names containing ``'solo'`` are
    solo matches. Rows of any other match type are not returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``matchType`` and ``groupId`` columns.

    Returns
    -------
    list
        ``[[squadSet, squadMeanSet], [duoSet, duoMeanSet], soloSet]`` where
        the ``*Set`` frames are row subsets of ``data`` and the ``*MeanSet``
        frames hold the float16 mean of every numeric column per
        ``groupId`` (``groupId`` is the index).
    """
    squadMatches = []
    duoMatches = []
    soloMatches = []
    for match in data['matchType'].dropna().unique():
        if 'squad' in match or 'flare' in match:
            squadMatches.append(match)
        elif 'duo' in match or 'crash' in match:
            duoMatches.append(match)
        elif 'solo' in match:
            soloMatches.append(match)

    def _rows_for(match_names):
        # Boolean-mask selection already yields an independent frame.
        return data.loc[data['matchType'].isin(match_names), :]

    def _group_means(frame):
        # numeric_only=True keeps object columns (ids, matchType) out of the
        # aggregation; averaging them is undefined and raises on pandas
        # versions that no longer drop such columns silently.
        return frame.groupby('groupId').mean(numeric_only=True).astype('float16')

    squadSet = _rows_for(squadMatches)
    duoSet = _rows_for(duoMatches)
    soloSet = _rows_for(soloMatches)

    squadSets = [squadSet, _group_means(squadSet)]
    duoSets = [duoSet, _group_means(duoSet)]

    return [squadSets, duoSets, soloSet]


In [4]:
# assign dtypes to minimize memory usage
# Columns not listed here are left to pandas' default type inference.
dtypes = {
    'Id': 'object',
    'groupId': 'object',
    'matchId': 'object',
    'assists': 'uint8',
    'boosts': 'uint8',
    'damageDealt': 'float16',
    'DBNOs': 'uint8',
    'headshotKills': 'uint8',
    'heals': 'uint8',
    'killPlace': 'uint8',
    'killPoints': 'uint16',
    'kills': 'uint8',
    'killStreaks': 'uint8',
    'longestKill': 'float16',
    'maxPlace': 'uint8',
    'numGroups': 'uint8',
    'revives': 'uint8',
    'rideDistance': 'float16',
    'roadKills': 'uint8',
    'swimDistance': 'float16',
    'teamKills': 'uint8',
    'vehicleDestroys': 'uint8',
    'walkDistance': 'float16',
    'weaponsAcquired': 'uint8',
    # winPoints is a rating-style score like killPoints, so it gets the same
    # 16-bit width; uint8 tops out at 255.
    # NOTE(review): confirm the column's maximum against the CSV.
    'winPoints': 'uint16',
    'winPlacePerc': 'float16'
}

In [5]:
# Load both CSVs with the compact dtype map defined above.
train_data = pd.read_csv("train_V2.csv", dtype=dtypes)
test_data = pd.read_csv("test_V2.csv", dtype=dtypes)

# Names of the columns that contain at least one missing value.
print(train_data.loc[:, train_data.isnull().any()].columns)
print(test_data.loc[:, test_data.isnull().any()].columns)

# Rows without a target cannot be used for training. Report how many there
# are explicitly: a bare expression in the middle of a cell is evaluated and
# discarded, so the previous "show the offending rows" line displayed nothing.
print("Rows missing winPlacePerc:", int(train_data['winPlacePerc'].isnull().sum()))
train_data = train_data.dropna(subset=['winPlacePerc'])

# Last expression of the cell: displays the (now empty) set of rows that
# still lack a target.
train_data[train_data['winPlacePerc'].isnull()]

FileNotFoundError: [Errno 2] No such file or directory: 'train_V2.csv'

In [None]:
# Local validation split: the first HOLDOUT_START labelled rows are used for
# fitting, every remaining labelled row is held out for evaluation.
# (The earlier full-frame copies of train_data/test_data were overwritten
# immediately and only cost memory, and the "[4000000:-1]" slice silently
# discarded the last labelled row.)
HOLDOUT_START = 4000000

traino = train_data[:HOLDOUT_START]
testo = train_data[HOLDOUT_START:]

# Keep the held-out target separately and remove it from the features.
testo_y = testo.winPlacePerc
testo = testo.drop('winPlacePerc', axis=1)

# Optional feature subset (disabled):
#traino = traino[['matchType','groupId','kills', 'damageDealt', 'killPlace', 'walkDistance', 'weaponsAcquired', 'boosts', 'heals', 'winPlacePerc']]
#testo = testo[['matchType','groupId','kills', 'damageDealt', 'killPlace', 'walkDistance', 'weaponsAcquired', 'boosts', 'heals']]

In [None]:
# Split both frames by game mode. Each call returns
# [[squadSet, squadMeanSet], [duoSet, duoMeanSet], soloSet], so the two
# results line up element by element (squad, duo, solo).
trainSets = divide_by_matchType_and_aggregate_by_groupId(traino)
testSets = divide_by_matchType_and_aggregate_by_groupId(testo)

In [None]:
# Alternative (disabled): one random-forest regressor per game mode.
#solo_mod = RandomForestRegressor(n_estimators=120, max_features=0.5, min_samples_leaf=3, n_jobs=-1)
#duo_mod = RandomForestRegressor(n_estimators=120, max_features=0.5, min_samples_leaf=3, n_jobs=-1)
#squad_mod = RandomForestRegressor(n_estimators=120, max_features=0.5, min_samples_leaf=3, n_jobs=-1)

# Active choice: one independent linear regression per game mode.
solo_mod = LinearRegression()
duo_mod = LinearRegression()
squad_mod = LinearRegression()

In [None]:

#testSets = divide_by_matchType_and_aggregate_by_groupId(test)

# trainSets/testSets are ordered [squad, duo, solo]; the models must follow
# the same order so that each *_mod is fitted on the mode it is named after
# (the previous [solo, duo, squad] order fitted solo_mod on squad data).
models = [squad_mod, duo_mod, solo_mod]

prediction_parts = []
for train_part, test_part, model in zip(trainSets, testSets, models):
    # Team modes arrive as [rows, per-group means]; solo arrives as a plain
    # DataFrame. Test the type instead of len(...) == 2, which would also be
    # true for a DataFrame that happens to have exactly two rows.
    if isinstance(train_part, list):
        meanTrainSet = train_part[1]  # only numeric values, the index is 'groupId'
        testset, X_test = test_part

        X_train = meanTrainSet.drop('winPlacePerc', axis=1)
        y_train = meanTrainSet.winPlacePerc

        model.fit(X_train, y_train)

        # One prediction per group, indexed by groupId ...
        group_pred = pd.DataFrame(model.predict(X_test), index=X_test.index, columns=['prediction'])
        # ... now rejoin the prediction with the original row index
        yhat = testset.join(group_pred, on='groupId')[['prediction']]

    else:
        X_train = train_part.select_dtypes(['number']).drop('winPlacePerc', axis=1)
        y_train = train_part.winPlacePerc
        X_test = test_part.select_dtypes(['number'])

        model.fit(X_train, y_train)

        yhat = pd.DataFrame({'prediction': model.predict(X_test)}, index=X_test.index, dtype='float32')

    prediction_parts.append(yhat)

# DataFrame.append was removed in pandas 2.0; collect the parts and
# concatenate once, then restore the original row order.
all_predictions = pd.concat(prediction_parts).sort_index()

# Sanity check against the full held-out frame: the row counts should match
# when every match type falls into one of the three modes.
print(all_predictions.shape, testo.shape)
print(all_predictions.head())


In [None]:
# read in data. If memory is not an issue, lose the nrows parameter
# TODO: this cell repeats the fit/predict loop of the previous cell on the
# Kaggle input files; factor the loop into a shared function.
nrows = 2500000
train = pd.read_csv("/kaggle/input/pubg-finish-placement-prediction/train_V2.csv", nrows=nrows, dtype=dtypes)
test = pd.read_csv("/kaggle/input/pubg-finish-placement-prediction/test_V2.csv", nrows=nrows, dtype=dtypes)


# Both results are ordered [[squadSet, squadMeanSet], [duoSet, duoMeanSet], soloSet].
trainSets = divide_by_matchType_and_aggregate_by_groupId(train)
testSets = divide_by_matchType_and_aggregate_by_groupId(test)
models = [LinearRegression(), LinearRegression(), LinearRegression()]

prediction_parts = []
# Distinct loop names: the previous loop reused ``train``/``test`` and
# thereby overwrote the frames loaded above.
for train_part, test_part, model in zip(trainSets, testSets, models):
    # Team modes arrive as [rows, per-group means]; solo arrives as a plain
    # DataFrame. Test the type instead of len(...) == 2, which would also be
    # true for a DataFrame that happens to have exactly two rows.
    if isinstance(train_part, list):
        meanTrainSet = train_part[1]  # only numeric values, the index is 'groupId'
        testset, X_test = test_part

        X_train = meanTrainSet.drop('winPlacePerc', axis=1)
        y_train = meanTrainSet.winPlacePerc

        model.fit(X_train, y_train)

        # One prediction per group, indexed by groupId ...
        group_pred = pd.DataFrame(model.predict(X_test), index=X_test.index, columns=['prediction'])
        # ... now rejoin the prediction with the original row index
        yhat = testset.join(group_pred, on='groupId')[['prediction']]

    else:
        X_train = train_part.select_dtypes(['number']).drop('winPlacePerc', axis=1)
        y_train = train_part.winPlacePerc
        X_test = test_part.select_dtypes(['number'])

        model.fit(X_train, y_train)

        yhat = pd.DataFrame({'prediction': model.predict(X_test)}, index=X_test.index, dtype='float32')

    prediction_parts.append(yhat)

# DataFrame.append was removed in pandas 2.0; collect the parts and
# concatenate once, then restore the original row order.
all_predictions = pd.concat(prediction_parts).sort_index()

# Sanity check against the full test frame: the row counts should match when
# every match type falls into one of the three modes.
print(all_predictions.shape, test.shape)
print(all_predictions.head())

In [1]:
#split the data
# NOTE(review): this cell needs train_data from the loading cell. Its recorded
# run raised NameError because train_data did not exist when it was executed;
# run the notebook top to bottom.
# X keeps every column except the target (no numeric-only filtering, unlike
# xy()); y is the target column.
X=train_data.drop(['winPlacePerc'],axis=1)
y=train_data['winPlacePerc']

NameError: name 'train_data' is not defined