In [1]:
# Linear Regression - 선형회귀분석
# formula = Y = w*x + b (weight, bias)

from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml import Pipeline
from pyspark.sql.functions import *

import seaborn as sns
sns.set()

In [2]:
%sh
pip install kaggle
kaggle competitions download -c pubg-finish-placement-prediction
unzip train_V2.csv.zip
unzip test_V2.csv.zip

In [3]:
dbutils.fs.cp("file:///databricks/driver/train_V2.csv", "dbfs:/FileStore/tables/pubg/PUBG_train.csv")
dbutils.fs.cp("file:///databricks/driver/test_V2.csv", "dbfs:/FileStore/tables/pubg/PUBG_test.csv")

In [4]:
train = sqlContext.read.format("com.databricks.spark.csv") \
  .option("header", "true") \
  .option("inferSchema", "true") \
  .load("FileStore/tables/pubg/pubg_train.csv")

In [5]:
display(train)

Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,maxPlace,numGroups,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
0,24,0,0,5,247.3,2,0,4,17,1050,2,1,65.32,29,28,1,591.3,0,0.0,0,0,782.4,4,1458,0.8571
1,440875,1,1,0,37.65,1,1,0,45,1072,1,1,13.55,26,23,0,0.0,0,0.0,0,0,119.6,3,1511,0.04
2,878242,2,0,1,93.73,1,0,2,54,1404,0,0,0.0,28,28,1,0.0,0,0.0,0,0,3248.0,5,1583,0.7407
3,1319841,3,0,0,95.88,0,0,0,86,1069,0,0,0.0,97,94,0,0.0,0,0.0,0,0,21.49,1,1489,0.1146
4,1757883,4,0,1,0.0,0,0,1,58,1034,0,0,0.0,47,41,0,0.0,0,0.0,0,0,640.8,4,1475,0.5217
5,2200824,5,0,2,128.1,0,0,0,25,1000,1,1,27.3,96,96,0,2221.0,0,0.0,0,0,1016.0,4,1500,0.9368
6,2568717,6,1,0,130.3,0,0,0,28,1037,1,1,5.954,44,40,0,721.7,0,0.0,0,0,280.1,3,1495,0.3721
7,2612473,7,1,1,661.8,2,3,2,3,1148,5,2,36.64,46,46,0,0.0,0,0.0,0,0,2617.0,4,1479,1.0
8,2656377,8,0,3,94.72,0,0,5,50,1286,0,0,0.0,28,28,0,2963.0,0,28.9,0,0,3139.0,5,1528,0.7037
9,2700597,9,0,0,137.6,0,0,0,81,1000,0,0,0.0,25,23,0,0.0,0,0.0,0,0,238.7,3,1500,0.0417


In [6]:
display(train.describe())

summary,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,maxPlace,numGroups,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
count,4357336.0,4357336.0,4357336.0,4357336.0,4357336.0,4357336.0,4357336.0,4357336.0,4357336.0,4357336.0,4357336.0,4357336.0,4357336.0,4357336.0,4357336.0,4357336.0,4357336.0,4357336.0,4357336.0,4357336.0,4357336.0,4357336.0,4357336.0,4357336.0,4357336.0,4357336.0
mean,3102678.01069828,1024776.560199397,23855.392916451703,0.2656320283769716,0.9636856097395288,132.90326821422178,0.6901455384666227,0.2385866042921638,1.1871689491010105,47.03440198323012,1080.8976083092973,0.9344957561225484,0.5547651133628437,19.87924673527583,42.07758846230816,40.58034634005732,0.1649344920841541,423.8835335611028,0.0025579849706334,4.136260867465809,0.013885548417657,0.0051462636803771,1055.12213177269,3.457289270324804,1500.5039951015942,0.4718663017495083
stddev,1797477.3197264962,696719.6919302202,13782.272299694396,0.6342160303411152,1.560642710308334,169.94389352650492,1.191514206438664,0.61030326379283,2.3663885075188,27.32772143176811,123.71191546557176,1.566192327320556,0.7223451790430968,45.86551689620742,22.67769352351892,22.066673463904937,0.4672004289860906,1222.9265825360785,0.0634667857351662,27.57014633014081,0.1329266179898138,0.0742536162372149,1116.1224223860593,2.4021091989660914,42.53570709030178,0.3079147083927654
min,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,155.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,350.0,0.0
max,6224522.0,2700605.0,47733.0,20.0,18.0,6384.0,63.0,26.0,59.0,100.0,2047.0,60.0,14.0,1323.0,100.0,100.0,41.0,48390.0,42.0,5286.0,6.0,5.0,17300.0,76.0,1923.0,1.0


In [7]:
print(train.columns)
print(len(train.columns))

In [8]:
train.cache()
train.count()

In [9]:
train.registerTempTable("data_geo")
train.printSchema()

In [10]:
display(train.select("winPlacePerc"))

winPlacePerc
0.8571
0.04
0.7407
0.1146
0.5217
0.9368
0.3721
1.0
0.7037
0.0417


In [11]:
display(train.groupBy("matchId").count())

matchId,count
148,98
463,93
471,99
496,92
833,96
1088,98
1238,96
1342,95
1580,94
1591,99


In [12]:
display(train.filter(train.winPlacePerc == 1).sort(train.winPlacePerc, ascending=False))

Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,maxPlace,numGroups,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
1571815,2678195,9492,1,6,434.4,2,2,1,2,1000,5,1,236.5,44,41,1,1192.0,0,0.0,0,0,4622.0,5,1500,1.0
1571834,2679123,9511,1,3,333.3,3,0,0,5,1000,3,1,29.11,18,18,0,0.0,0,0.0,0,0,1906.0,6,1500,1.0
1571896,2682102,9573,2,2,473.4,2,0,0,4,1259,4,2,44.29,48,46,0,0.0,0,0.0,0,0,3475.0,8,1484,1.0
1571944,2684117,9621,0,2,43.86,1,0,2,22,1167,1,1,23.72,29,27,0,0.0,0,0.0,0,0,3298.0,2,1524,1.0
1571948,2684284,9625,0,2,134.6,0,0,1,11,1009,2,1,28.58,48,43,0,0.0,0,0.0,0,0,2108.0,5,1473,1.0
1572082,2690359,9760,0,2,52.79,0,0,0,39,1011,0,0,0.0,29,28,0,0.0,0,0.0,0,0,2341.0,4,1513,1.0
1572096,2691016,9774,2,1,358.7,3,0,3,4,1000,3,2,45.13,24,22,1,0.0,0,0.0,0,0,2159.0,6,1500,1.0
1572121,2691976,9800,1,4,273.4,1,2,4,10,1263,2,2,91.0,31,28,1,472.3,0,0.0,0,1,2572.0,4,1507,1.0
1572190,2694894,9869,5,1,291.2,1,0,1,11,1000,2,1,11.67,22,21,2,0.0,0,0.0,0,0,2346.0,3,1500,1.0
1572229,2696572,9908,1,1,656.7,3,3,4,1,1098,6,2,152.4,49,46,1,0.0,0,0.0,0,0,3158.0,6,1499,1.0


In [13]:
display(train.filter(train.winPlacePerc <= 0.3).sort(train.winPlacePerc, ascending=False))

Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,maxPlace,numGroups,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
1786903,546513,22416,2,1,116.5,0,0,1,70,962,0,0,0.0,31,30,0,0.0,0,0.0,0,0,210.5,3,1488,0.3
367,1177498,367,0,1,304.9,3,1,0,8,1000,3,2,50.42,21,20,0,0.0,0,0.0,0,0,684.7,3,1500,0.3
1785585,489304,21081,0,0,87.25,0,0,1,50,1000,0,0,0.0,21,20,0,0.0,0,0.0,0,0,520.4,3,1500,0.3
1643,285107,1643,0,3,0.0,0,0,5,26,903,0,0,0.0,11,10,0,2774.0,0,0.0,0,0,826.1,3,1445,0.3
1748,331306,1748,0,0,49.14,0,0,0,66,1262,0,0,0.0,31,29,0,0.0,0,0.0,0,0,999.7,7,1631,0.3
1778734,185667,14154,1,0,99.45,0,0,0,68,1142,0,0,0.0,91,87,0,0.0,0,0.0,0,0,447.4,3,1486,0.3
1785520,486517,21016,0,0,43.56,0,0,0,58,1000,0,0,0.0,21,20,0,0.0,0,0.0,0,0,715.1,3,1500,0.3
2939,853081,2939,0,0,0.0,0,0,0,71,997,0,0,0.0,31,28,0,0.0,0,0.0,0,0,386.3,3,1507,0.3
3747,1208206,3747,1,1,126.9,0,0,3,53,1000,0,0,0.0,21,21,0,0.0,0,0.0,0,0,159.7,2,1500,0.3
4274,1440984,4274,0,1,275.0,3,1,2,19,1000,2,1,8.184,21,21,1,0.0,0,0.0,0,0,136.4,2,1500,0.3


In [14]:
features = train.columns[0:-1]
print(features)

In [15]:
# pdtrain = train.toPandas()
# pdtrain.head()

pubg_data = train.select(col('winPlacePerc').alias("label"), *features)
print(pubg_data.printSchema())
print(pubg_data.show(3))

In [16]:
(training, test) = pubg_data.randomSplit([.7, .3])

In [17]:
from pyspark.ml.feature import VectorAssembler
VectorAssembler = VectorAssembler(inputCols=features, outputCol="unscaled_features")

VectorAssembler

# 2. numeric data일 때는 피쳐들을 scale한다. 
StandardScaler = StandardScaler(inputCol = "unscaled_features", outputCol="features")

# 3. Linear Regression modeling
lr = LinearRegression(maxIter=10, regParam=.01)

stages = [VectorAssembler, StandardScaler, lr]
pipeline = Pipeline(stages=stages)

In [18]:
model = pipeline.fit(training)
prediction = model.transform(test)

In [19]:
prediction.show(5)

In [20]:
from pyspark.ml.evaluation import RegressionEvaluator
eval = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")

# Root Mean Square Error
rmse = eval.evaluate(prediction)
print("RMSE: %.3f" % rmse)

# Mean Square Error
mse = eval.evaluate(prediction, {eval.metricName: "mse"})
print("MSE: %.3f" % mse)

# Mean Absolute Error
mae = eval.evaluate(prediction, {eval.metricName: "mae"})
print("MAE: %.3f" % mae)

# r2 - coefficient of determination
r2 = eval.evaluate(prediction, {eval.metricName: "r2"})
print("r2: %.3f" %r2)