#### TSV files are essentially identical to CSV files except that TSV files use "tabs (\t)" while CSV files use commas to store data in tabular structure. As a result, loading TSV files are slightly different from how we've been loading CSV files.
##### Thanks to Clara Meister for providing this tutorial.

In [1]:
# For compatibility across multiple platforms
import os
import numpy as np
import pandas as pd
from scipy import spatial
import seaborn as sn

# Load files using DictReader in Python
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import cluster
from sklearn import preprocessing
from sklearn.model_selection import KFold
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.metrics import confusion_matrix
from sklearn import svm
from sklearn.svm import SVR
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
from sklearn_pandas import DataFrameMapper
import sklearn
import sklearn_pandas


In [2]:
train_filepath = "pubg-finish-placement-prediction/train_V2_clean.csv"

test_filepath = "pubg-finish-placement-prediction/test_V2.csv"

#trainset_file = open(train_filepath,'rU')
trainset = pd.read_csv(train_filepath)
trainset = trainset.drop(['matchType'],axis=1)

#testset_file = open(test_filepath,'rU')
testset = pd.read_csv(test_filepath)

print(trainset.head())
testset.head()
print(len(trainset))

   Unnamed: 0                 Id            groupId            matchId  \
0           0  35913017459246474  21756414768994750  45321147693812369   
1           1  67264846101073980  29358430787743646  49173965273764108   
2           2   8637285204745842  29917998133566068   4786602953643182   
3           3  19728345572649043  47622776820651809  68101503675608446   
4           4  13894076435569324  62491847359224029  30901772270576102   

   assists  boosts  damageDealt  DBNOs  headshotKills  heals      ...       \
0        0       0         0.00      0              0      0      ...        
1        0       0        91.47      0              0      0      ...        
2        1       0        68.00      0              0      0      ...        
3        0       0        32.90      0              0      0      ...        
4        0       0       100.00      0              0      0      ...        

   revives  rideDistance  roadKills  swimDistance  teamKills  vehicleDestroys  \
0    

In [3]:
#STEP 2


#make baseline train and validation set --> 80:20 --> we can do kfold validation later 
print("len old trainset",len(trainset))

trainset_split = int(.8 * len(trainset))
valset = trainset[trainset_split:]
trainset = trainset[:trainset_split]

print("len new trainset",len(trainset))
print("len valset", len(valset))

len old trainset 4446965
len new trainset 3557572
len valset 889393


In [4]:
trainset.columns.values

array(['Unnamed: 0', 'Id', 'groupId', 'matchId', 'assists', 'boosts',
       'damageDealt', 'DBNOs', 'headshotKills', 'heals', 'killPlace',
       'killPoints', 'kills', 'killStreaks', 'longestKill',
       'matchDuration', 'maxPlace', 'numGroups', 'rankPoints', 'revives',
       'rideDistance', 'roadKills', 'swimDistance', 'teamKills',
       'vehicleDestroys', 'walkDistance', 'weaponsAcquired', 'winPoints',
       'winPlacePerc'], dtype=object)

In [5]:
trainset_sorted = dict()

#sort by matches 
trainset_sorted['matchId'] = trainset.groupby('matchId')

In [8]:
#check to make sure sorting is all good
print("len of trainset",len(trainset))
print("num matches",len(trainset_sorted['matchId']))
print(trainset.shape)



len of trainset 3557572
num matches 47964
(3557572, 29)


In [7]:
#bucket labels 
def bucket_labels(set_a,set_b,n_classes):
    le = preprocessing.LabelEncoder() #100 classes

    trainset_labels = le.fit_transform(set_a[outcome])
    print(trainset_labels)
    print(len(trainset_labels))
    valset_labels = le.fit_transform(set_a[outcome])
    labels = [trainset_labels,valset_labels]
    result = []
    for curr_labels in labels:
        current_classes = max(curr_labels)
        curr_result = []
        for label in curr_labels:
            n = ((1.0 * label)/current_classes)*n_classes
            curr_result.append(round(n))
        result.append(curr_result)
    
    print(min(result[0]))
    return (result[0],result[1])
            



In [31]:
#STEP 1
#start of fraction section
#KNN

outcome = 'winPlacePerc'

features = ['kills','assists','damageDealt','heals','weaponsAcquired','winPoints','assists']
neighbors = 15
abbrev_trainset_labels = trainset[outcome]
abbrev_valset_labels = valset[outcome]
abbrev_trainset = trainset[features]
abbrev_valset = valset[features]
scaler = StandardScaler()
scaler.fit(abbrev_trainset)  # Don't cheat - fit only on training data
X_train = scaler.transform(abbrev_trainset)
X_test = scaler.transform(abbrev_valset)  # apply same transformation to test data
X_test = pd.DataFrame(X_test, columns=features)
X_train = pd.DataFrame(X_train, columns=features)
print(type(X_train))
print(X_train.head())




# trainset_labels,valset_labels = bucket_labels(abbrev_trainset,abbrev_valset,100)

# predictions = dict()
# classifier = None 
# for i in range(1,neighbors+1):
#     print(i)
#     classifier = KNeighborsClassifier(i)
#     classifier.fit(abbrev_trainset[features], trainset_labels)
#     predictions[i] = classifier.predict(abbrev_valset[features])


# cm = confusion_matrix(valset_labels, multi_class_predict)
# plt.figure(figsize = (100,70))
# sn.heatmap(cm, annot=True)
# sn.set(font_scale=1.4)
# plt.xlabel("Predicted")
# plt.ylabel("Truth")

#Step3 
#SVM RBF kernel
# svm = SVR().fit(abbrev_trainset[features],abbrev_trainset[outcome])
# svm_predictions = svm.predict(abbrev_valset[features])
# mea_svm_predictions = sklearn.metrics.mean_absolute_error(abbrev_valset[outcome], svm_predictions)
# print(mea_svm_predictions)
# cm_rbf = confusion_matrix(valset_labels, svm_predictions)
# plt.figure(figsize = (100,70))
# sn.heatmap(cm_rbf, annot=True)
# sn.set(font_scale=1.4)
# plt.xlabel("Predicted")
# plt.ylabel("Truth")

    

  return self.partial_fit(X, y)


<class 'pandas.core.frame.DataFrame'>
      kills   assists  damageDealt     heals  weaponsAcquired  winPoints  \
0 -0.592951 -0.397281    -0.764970 -0.511302        -1.081570   1.161743   
1 -0.592951 -0.397281    -0.229705 -0.511302         0.543929  -0.820045   
2 -0.592951  1.302284    -0.367047 -0.511302        -0.675195  -0.820045   
3 -0.592951 -0.397281    -0.572445 -0.511302        -0.268820  -0.820045   
4  0.048165 -0.397281    -0.179789 -0.511302        -0.675195  -0.820045   

    assists  
0 -0.397281  
1 -0.397281  
2  1.302284  
3 -0.397281  
4 -0.397281  


  from ipykernel import kernelapp as app
  app.launch_new_instance()


In [32]:
#Step 2 
#Linear Regression
clf = linear_model.SGDRegressor(tol=1e-3, verbose=1)
clf.fit(X_train,abbrev_trainset_labels)
lr_predict = clf.predict(X_test)
mae_lr = sklearn.metrics.mean_absolute_error(abbrev_valset_labels, lr_predict)
# lr_train_predict = lr.predict(abbrev_trainset[features])
# mea_train_lr = sklearn.metrics.mean_absolute_error(abbrev_trainset[outcome], lr_train_predict)
print(mae_lr)

-- Epoch 1
Norm: 0.14, NNZs: 7, Bias: 0.471820, T: 3557572, Avg. loss: 0.026285
Total training time: 0.76 seconds.
-- Epoch 2
Norm: 0.16, NNZs: 7, Bias: 0.470480, T: 7115144, Avg. loss: 0.026207
Total training time: 1.52 seconds.
-- Epoch 3
Norm: 0.16, NNZs: 7, Bias: 0.474566, T: 10672716, Avg. loss: 0.026185
Total training time: 2.26 seconds.
-- Epoch 4
Norm: 0.16, NNZs: 7, Bias: 0.473068, T: 14230288, Avg. loss: 0.026161
Total training time: 3.01 seconds.
-- Epoch 5
Norm: 0.16, NNZs: 7, Bias: 0.475016, T: 17787860, Avg. loss: 0.026157
Total training time: 3.75 seconds.
-- Epoch 6
Norm: 0.17, NNZs: 7, Bias: 0.470724, T: 21345432, Avg. loss: 0.026152
Total training time: 4.49 seconds.
Convergence after 6 epochs took 4.49 seconds
0.1804092407802669


In [None]:

n_classes = max(valset_labels)
for i in range(1,neighbors):
    print(i,sklearn.metrics.mean_absolute_error(valset_labels,predictions[i]))
    

In [None]:
#Forest of trees
features = ['year','age','genre1_num','gender_num']
trees = 10
rf = RandomForestClassifier(n_estimators=trees)
rf.fit(train[features],train['rating'])
predictions_FT = rf.predict(test[features])
# Calculate accuracy
numtrain = len(train)
numtest = len(test)
correct = 0
for i in range(numtest):
#    print 'Predicted:', predictions[i], ' Actual:', citiesTest.loc[numtrain+i]['category']
#    if predictions_FT[i] == test.loc[numtrain+i]['rating']: correct +=1
#print 'Accuracy:', float(correct)/float(numtest)
       
       
    print 'Predicted:', predictions_FT[i], ' Actual:', train.iloc[i]['rating']
    if predictions_FT[i] == test.iloc[i]['rating']: correct +=1
print 'Accuracy:', float(correct)/float(numtest)

In [None]:
#Naive bayes
features = ['year','age','genre1_num','gender_num']
nb = GaussianNB()
nb.fit(train[features],train['rating'])
predictions_NB = nb.predict(test[features])
# Calculate accuracy
numtrain = len(train)
numtest = len(test)
correct = 0
for i in range(numtest):
    print('Predicted:', predictions_NB[i], ' Actual:', train.iloc[i]['rating'])
    if predictions_NB[i] == test.iloc[i]['rating']: correct +=1
print('Accuracy:', float(correct)/float(numtest))

In [None]:
#find mean
avgpredicts = []
for i in range(201):
    avgpredict=1.0*(predictions_KNN[i]+predictions_FT[i]+predictions_NB[i])/2
    avgpredicts.append(avgpredict)
for i, avgpredict in enumerate(avgpredicts):
    if avgpredict > 5: 
        avgpredicts[i] = 5.0
print(avgpredicts)

In [None]:
#STEP 3


#Exports real numbers predictions to csv - named V1predict.csv
import csv
input_file = csv.reader(open("predict.csv"))
# This is the file where you will save the result.
ofile = open('V1predict.csv', "w")
# This is a way to use csv.writer.
writer = csv.writer(ofile, quotechar='"', quoting=csv.QUOTE_ALL)
# loop over each row of predict.csv
for row_num, row in enumerate(input_file):
    # if header row, just write it and does nothing else
    if row_num == 0:
        writer.writerow(row)
        continue
    # if not header row, put your own prediction for the second column of the current row
    row[2] = str(avgpredicts[row_num-1])
    # write out the row to the file
    writer.writerow(row)
# finish writing to the file and close it
ofile.close()

In [None]:
#STEP 3


#Exports integers predictions to csv - named V2predict.csv

import csv
input_file = csv.reader(open("predict.csv"))
# This is the file where you will save the result.
ofile = open('V2predict.csv', "w")
# This is a way to use csv.writer.
writer = csv.writer(ofile, quotechar='"', quoting=csv.QUOTE_ALL)
# loop over each row of predict.csv
for row_num, row in enumerate(input_file):
    # if header row, just write it and does nothing else
    if row_num == 0:
        writer.writerow(row)
        continue
    # if not header row, put your own prediction for the second column of the current row
    row[2] = str(predictions[row_num-1])
    # write out the row to the file
    writer.writerow(row)
# finish writing to the file and close it
ofile.close()