# Imports

In [1]:
import csv
import os
import sys
# Spark imports
from pyspark.rdd import RDD
from pyspark.sql import DataFrame
from pyspark.sql.types import IntegerType
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import operator
# Dask imports
import dask.bag as db
import dask.dataframe as df  # you can use Dask bags or dataframes
from csv import reader
import numpy as np
import pandas as pd
import datetime
from pyspark.sql.functions import *
import sklearn
from sklearn import preprocessing
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
# from scipy.sparse import csr_matrix
# import scipy as sp
import heapq
# from surprise import CoClustering
# from surprise import Dataset, Reader, SVD, accuracy
# from surprise import KNNBaseline


# Packages for model evaluation
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from time import time

# from surprise.model_selection import train_test_split
# from sklearn.model_selection import train_test_split

# Package to suppress warnings
import warnings
warnings.filterwarnings("ignore")
from pyspark.sql.functions import monotonically_increasing_id
# Packages for saving models
import pickle

from re import split
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, DoubleType, IntegerType, StringType, DateType
from pyspark.sql.functions import *
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import RandomForestRegressor
to_exclude = ['round']

for name in to_exclude:
    del globals()[name]

In [2]:
def init_spark():
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    return spark

In [3]:
spark=init_spark()

22/04/12 21:31:26 WARN Utils: Your hostname, Sams-MacBook-Pro-2.local resolves to a loopback address: 127.0.0.1; using 172.30.12.119 instead (on interface en0)
22/04/12 21:31:26 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/04/12 21:31:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/04/12 21:31:27 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


## Generate Data For Base Game DataSet

In [4]:
def generateData():
    path_to_write = parentPath+'/data/datasets'
    originalDataSet = pd.read_csv(parentPath+'/data/datasets/steam.csv')
    if os.path.exists(path_to_write+'/games.csv'):
        return pd.read_csv(path_to_write+'/games.csv')
    for row in originalDataSet.iterrows():
        row =row[1]
        totalRatings = (row['positive_ratings'] + row['negative_ratings'])
        if totalRatings == 0:
            ratings = 0
        else:
            ratings = round((row['positive_ratings']/totalRatings) * 100,2)
        dictionnary[row['appid']] = {
                    "name": row['name'],
                    "price": row['price'],
                    "release_date": row['release_date'],
                    "required_age": row['required_age'],
                    "publishers": row['publisher'],
                    "developers": row['developer'],
                    "categories": row['categories'],
                    "genres": row['genres'],
                    "ratings": ratings,
                    "totalRatings": totalRatings,
                    "average_playtime": row['average_playtime'],
                    "median_playtime": row['median_playtime'],
                    "num_owners": row['owners']
        }
    
    

    df = pd.DataFrame.from_dict(dictionnary, orient='index')
    df.index.name = 'appid'
    if not os.path.exists(path_to_write):
        os.mkdir(path_to_write)
    
    df.to_csv(path_to_write+'/games.csv')
    return df

## Generate the User Steam Dataset

In [5]:
url_user_info = "https://api.steampowered.com/IPlayerService/GetOwnedGames/v1/?key=D399B9CAE07939881781DD36CC8CC442&steamid={}&include_appinfo=true&include_played_free_games=true"
def generateDataForUserSteamDataset():
    dictionary=[]
    path_to_write = parentPath+'/data/datasets'
    if os.path.exists(path_to_write+'/steam_id_games.csv'):
        return pd.read_csv(path_to_write+'/steam_id_games.csv')
    originalDataSet = pd.read_csv(parentPath+'/data/datasets/steam_id.csv')
    for row in originalDataSet.iterrows():
        row =row[1]
        try:
            games = requests.get(url_user_info.format(row['steamid_a'])).json()['response']
        except:
            continue
        if 'games' not in games:
            continue
        dataset = [{'steam_id':row['steamid_a'],"appid" :str(game['appid']), "time_played_in_minutes": game['playtime_forever'] } for game in games['games']]
        dictionary = dataset+dictionary
    df = pd.DataFrame.from_records(dictionary)
    if not os.path.exists(path_to_write):
        os.mkdir(path_to_write)
    
    df.to_csv(path_to_write+'/steam_id_games.csv')
    return df

In [6]:
filename="../data/datasets/games.csv"
filename2="../data/datasets/steam_id_games.csv"

In [7]:
from pyspark.sql.types import *
games_schema = StructType() \
    .add("appid", IntegerType(), True) \
    .add("name", StringType(), True) \
    .add("price", DoubleType(), True) \
    .add("release_date", StringType(), True) \
    .add("required_age", IntegerType(), True) \
    .add("publishers", StringType(), True) \
    .add("developers", StringType(), True) \
    .add("categories", StringType(), True) \
    .add("genres", StringType(), True) \
    .add("ratings", DoubleType(), True) \
    .add("totalRatings", IntegerType(), True) \
    .add("average_playtime", DoubleType(), True) \
    .add("median_playtime", IntegerType(), True) \
    .add("num_owners", StringType(), True)
games = spark.read.schema(games_schema).csv(filename, header=True).dropna()


In [8]:
games.take(5)

[Row(appid=10, name='Counter-Strike', price=7.19, release_date='2000-11-01', required_age=0, publishers='Valve', developers='Valve', categories='Multi-player;Online Multi-Player;Local Multi-Player;Valve Anti-Cheat enabled', genres='Action', ratings=97.39, totalRatings=127873, average_playtime=17612.0, median_playtime=317, num_owners='10000000-20000000'),
 Row(appid=20, name='Team Fortress Classic', price=3.99, release_date='1999-04-01', required_age=0, publishers='Valve', developers='Valve', categories='Multi-player;Online Multi-Player;Local Multi-Player;Valve Anti-Cheat enabled', genres='Action', ratings=83.98, totalRatings=3951, average_playtime=277.0, median_playtime=62, num_owners='5000000-10000000'),
 Row(appid=30, name='Day of Defeat', price=3.99, release_date='2003-05-01', required_age=0, publishers='Valve', developers='Valve', categories='Multi-player;Valve Anti-Cheat enabled', genres='Action', ratings=89.56, totalRatings=3814, average_playtime=187.0, median_playtime=34, num_ow

## Normalization

In [9]:
# normalizer = Normalizer(inputCol="price", outputCol="price")
# myGames = normalizer.transform(myGames)
#hasher = FeatureHasher(inputCols=["price", "release_date", "required_age", "ratings","totalRatings","average_playtime","median_playtime","Action"],
#                       outputCol="features")

#featurized = hasher.transform(myGames)
#listOfGames=featurized.take(100)

#featurized.head()

#myGames3 = myGames.take(10000)
#normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0)
#l1NormData = normalizer.transform(featurized)
#l1NormData.head()

# lInfNormData = normalizer.transform(featurized, {normalizer.p: float("inf")})
#listOfGames=l1NormData.take(10)
# listOfGames.reshape(-1, 1)
# X_normalized = normalizer(myGames, copy=True)
#type(listOfGames)
#listOfGames=np.array([listOfGames], dtype=object)

## Data Preprocessing

In [10]:
# get the yaer of release_date
games = games.withColumn('release_date', col('release_date')[0:4])
games = games.withColumn("release_date", games["release_date"].cast(IntegerType()))

In [11]:
#create columns for genres
games = games.withColumn('genres', split(col('genres'), ';'))
uniq_genres = games.select('genres').distinct().collect()
unic_genres_set = set()
for u in uniq_genres:
    for i in u.genres:
        unic_genres_set.add(i)
unic_genres_set

for i in unic_genres_set:
    games = games.withColumn(i,when(array_contains(games.genres,i), 1).otherwise(0))

                                                                                

In [12]:
# create columns for the categories
games = games.withColumn('categories', split(col('categories'), ';'))
uniq_cat = games.select('categories').distinct().collect()
unic_cat_set = set()
for u in uniq_cat:
    for i in u.categories:
        # if not i in games2.columns:
        #     games2 = games2.withColumn(i, 0)
        unic_cat_set.add(i)
unic_cat_set

for i in unic_cat_set:
    games = games.withColumn(i,when(array_contains(games.categories,i), 1).otherwise(0))

22/04/12 21:31:34 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


In [13]:
# add the column to store the index of the row
games = games.select("*").withColumn("RowNumber", monotonically_increasing_id())
games.take(1)

[Row(appid=10, name='Counter-Strike', price=7.19, release_date=2000, required_age=0, publishers='Valve', developers='Valve', categories=['Multi-player', 'Online Multi-Player', 'Local Multi-Player', 'Valve Anti-Cheat enabled'], genres=['Action'], ratings=97.39, totalRatings=127873, average_playtime=17612.0, median_playtime=317, num_owners='10000000-20000000', Indie=0, Audio Production=0, Free to Play=0, Utilities=0, Photo Editing=0, Video Production=0, Nudity=0, Massively Multiplayer=0, Strategy=0, Animation & Modeling=0, Documentary=0, Education=0, Action=1, Sports=0, Casual=0, Design & Illustration=0, Web Publishing=0, Adventure=0, Violent=0, Sexual Content=0, RPG=0, Racing=0, Gore=0, Early Access=0, Game Development=0, Tutorial=0, Simulation=0, Accounting=0, Software Training=0, Includes Source SDK=0, Steam Cloud=0, Shared/Split Screen=0, Online Co-op=0, Partial Controller Support=0, Local Multi-Player=1, Commentary available=0, Single-player=0, Stats=0, Mods (require HL2)=0, Multi-p

In [14]:
# create a dataset with features that we need for the model
myGames=games.drop('appid',"developers","genres","name","publishers","categories","num_owners",)

In [15]:
myGames.head(1)

[Row(price=7.19, release_date=2000, required_age=0, ratings=97.39, totalRatings=127873, average_playtime=17612.0, median_playtime=317, Indie=0, Audio Production=0, Free to Play=0, Utilities=0, Photo Editing=0, Video Production=0, Nudity=0, Massively Multiplayer=0, Strategy=0, Animation & Modeling=0, Documentary=0, Education=0, Action=1, Sports=0, Casual=0, Design & Illustration=0, Web Publishing=0, Adventure=0, Violent=0, Sexual Content=0, RPG=0, Racing=0, Gore=0, Early Access=0, Game Development=0, Tutorial=0, Simulation=0, Accounting=0, Software Training=0, Includes Source SDK=0, Steam Cloud=0, Shared/Split Screen=0, Online Co-op=0, Partial Controller Support=0, Local Multi-Player=1, Commentary available=0, Single-player=0, Stats=0, Mods (require HL2)=0, Multi-player=1, MMO=0, Online Multi-Player=1, Includes level editor=0, Full controller support=0, Steam Workshop=0, VR Support=0, Cross-Platform Multiplayer=0, Local Co-op=0, Valve Anti-Cheat enabled=1, Captions available=0, Steam Tr

## First Model : Cosine Similarity

In [16]:
from sklearn.metrics.pairwise import cosine_similarity
myGames2 = myGames.take(27000)

cos_sim_data = cosine_similarity(myGames2)
cos_sim_data

                                                                                

array([[1.00000000e+00, 8.97414146e-01, 8.89050930e-01, ...,
        3.76547723e-09, 3.99609567e-09, 6.07002769e-09],
       [8.97414146e-01, 1.00000000e+00, 9.99682562e-01, ...,
        2.25443188e-04, 2.25443395e-04, 2.25445218e-04],
       [8.89050930e-01, 9.99682562e-01, 1.00000000e+00, ...,
        4.63813819e-04, 4.63814024e-04, 4.63815831e-04],
       ...,
       [3.76547723e-09, 2.25443188e-04, 4.63813819e-04, ...,
        1.00000000e+00, 1.00000000e+00, 1.00000000e+00],
       [3.99609567e-09, 2.25443395e-04, 4.63814024e-04, ...,
        1.00000000e+00, 1.00000000e+00, 1.00000000e+00],
       [6.07002769e-09, 2.25445218e-04, 4.63815831e-04, ...,
        1.00000000e+00, 1.00000000e+00, 1.00000000e+00]])

In [17]:
type(myGames2)

list

In [18]:
# temp = pd.DataFrame(cos_sim_data)
# cos_sim_df = spark.createDataFrame(temp)
# cos_sim_df.take(5)

In [19]:
# The users dataframe is not currently used.
#users_df = spark.read.csv(users_csv, header=True, inferSchema=True)
# Init the users/games relation dataframe.
users_games_df = spark.read.csv(filename2, header=True, inferSchema=True).drop("_c0")

# Feature assembler: make a vector of all features.
inputColsList = ["ratings", "price", "release_date", "totalRatings"]
inputColsList.extend(unic_cat_set)
inputColsList.extend(unic_genres_set)
vector_assembler = VectorAssembler().setInputCols(inputColsList).setOutputCol("features")
vectorized_games = vector_assembler.transform(games)

# Join the vectorized games with the user/games relation table.
users_games_joined_df = users_games_df.join(vectorized_games, "appid", "inner")

In [20]:
# // I need to create a function that get the appId and find the N top similar apps!!!
def generate_top_N_recommendationsX(appId, N=10):
    # appId = 0
    emp_RDD = spark.sparkContext.emptyRDD()
    row = games.select("RowNumber", "appid").rdd.filter(lambda x: x['appid'] == appId).collect()
    if len(row) == 0:
        columns = StructType([StructField('name',
                                  StringType(), True),
                    StructField('RowNumber',
                                IntegerType(), True),
                    StructField('index',
                                IntegerType(), True),
                    StructField('score',
                                DoubleType(), True)])
        df = spark.createDataFrame(data = emp_RDD,
                           schema = columns)
        return df
    row = row[0]['RowNumber']
    sim_scores = list(enumerate(cos_sim_data[row]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:N]
    game_indices = [i[0] for i in sim_scores]
    tempArray = []
    tempArrayScore =[]
    for val in range(0,len(game_indices)):
        tempArray.append((game_indices[val], val))
        tempArrayScore.append((game_indices[val],sim_scores[val][1]))
    tempArray = dict(tempArray)
    tempArrayScore= dict(tempArrayScore)
    names = games.where(col("RowNumber").isin(game_indices)).select("name","RowNumber").rdd.map(lambda row: (row[0], row[1],tempArray[row[1]],float(tempArrayScore[row[1]]) )).toDF(['name','RowNumber','index','score']).sort('index')
    return names

def GetValueFromDataframe(_df,columnName):
    for row in _df.rdd.collect():
        return row[columnName]

In [21]:
equalnames = generate_top_N_recommendationsX(str(10))
equalnames.show()

[Stage 11:>                                                         (0 + 2) / 2]

+----+---------+-----+-----+
|name|RowNumber|index|score|
+----+---------+-----+-----+
+----+---------+-----+-----+





In [22]:
def getRecommendationForUserCosineSim(user_id:int,num_recommendations:int =10):
    owned_games_df = users_games_df.filter(users_games_df.steam_id == user_id)
    emp_RDD = spark.sparkContext.emptyRDD()
    # Create an expected schema
    columns = StructType([StructField('name',
                                  StringType(), True),
                    StructField('RowNumber',
                                IntegerType(), True),
                    StructField('index',
                                IntegerType(), True),
                    StructField('score',
                                DoubleType(), True)])
    df = spark.createDataFrame(data = emp_RDD,
                           schema = columns)
    for row in owned_games_df.collect():
        df_temp = generate_top_N_recommendationsX(row.appid)
        df = df.union(df_temp)
    df.sort(col('score').desc()).show(num_recommendations)



In [23]:
getRecommendationForUserCosineSim(user_id=76561197960360459)

22/04/12 21:34:46 WARN DAGScheduler: Broadcasting large task binary with size 4.5 MiB

+--------------------+---------+-----+------------------+
|                name|RowNumber|index|             score|
+--------------------+---------+-----+------------------+
| Dimension Hunter VR|     9981|    0|0.9999997790062494|
|  Lode Runner Legacy|    14449|    0|0.9999997723123002|
|   DUNGEONS OF CHAOS|    14000|    0|0.9999996607369963|
|     Fausts Alptraum|    12673|    0|0.9999996328669667|
| Necromancer Returns|    14542|    1|0.9999995668308166|
|             Blockle|    14558|    2|0.9999995110937684|
|Alien Arena: Warr...|    14482|    3|0.9999995074103457|
|         Beyond Eden|    14561|    4|0.9999994309347743|
|Game Character Hu...|    11068|    0|0.9999993911961074|
|      Mount Wingsuit|     9971|    1|0.9999993357544238|
+--------------------+---------+-----+------------------+
only showing top 10 rows





###  Normalizer 

In [31]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import MinMaxScaler
def normalzier(dataframe:DataFrame,columns:list)->DataFrame:
    transformVector_to_double = udf(lambda x: round(float(list(x)[0]),3), DoubleType())
    for i in columns:
        assembler = VectorAssembler(inputCols=[i],outputCol=i+"_Vect")
        scaler = MinMaxScaler(inputCol=i+"_Vect", outputCol=i+"_Scaled")
        pipeline = Pipeline(stages=[assembler, scaler])
        dataframe = pipeline.fit(dataframe).transform(dataframe).withColumn(i+"_Scaled", transformVector_to_double(i+"_Scaled")).drop(i+"_Vect")
    print(type(dataframe))
    dataframe.show()
    return dataframe

## Model2 :  : Random Forest

In [32]:
def DoRandomForest(user_id, num_recommendations = 10, seed = 2022):
    # Get the data for the games the player has played.
    owned_games_df = users_games_joined_df.filter(users_games_joined_df.steam_id == user_id) #.filter(users_games_joined_df.time_played_in_minutes > 0)
    user_games_count = owned_games_df.count()
    print("Owned Games:", owned_games_df.count())
    if user_games_count > 0:
        training_df, testing_df = owned_games_df.randomSplit([0.8, 0.2], seed)
        norm=training_df.select("name","time_played_in_minutes")
        normalzier(dataframe=norm,columns=["time_played_in_minutes"])
        # Print the games this user has played the most (for comparison to final predictions).
        print("User's top games (from training set):")
        training_df.sort(training_df.time_played_in_minutes.desc()).select("name", "genres", "time_played_in_minutes").show()
        #owned_games_df.sort(owned_games_df.time_played_in_minutes.desc()).select("name", "genres", "time_played_in_minutes").show()

        # Extract the list of unowned games by removing the already owned ones.
        unowned_games_df = vectorized_games.join(owned_games_df, "appid", "leftanti")

        # Build Random Forest Regression based on time the user played each game in their library.
        randForest = RandomForestRegressor()
        model = randForest.fit(training_df.withColumnRenamed("time_played_in_minutes", "label"))

        # Run the model on unowned game data.
        predictions = model.transform(unowned_games_df).sort(col("prediction").desc())
        print("The user's top " + str(num_recommendations) + " predicted games (with predicted minutes played if they owned it):")
        predictions.select('appid', 'features', 'genres', 'name', 'prediction').show(num_recommendations)

        # Model evaluation via RMSE
        evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction")
        testing_df = model.transform(testing_df.withColumnRenamed("time_played_in_minutes", "label"))
        rmse = evaluator.evaluate(testing_df, {evaluator.metricName: "rmse"})
        print("RMSE:", rmse)
        testing_df.select('name', 'label', 'prediction').show()

        # Determine the importance of each feature
        print("Feature Importances:")
        featuresArr = model.featureImportances.toArray()
        featureImportanceCombined = []
        for i in range(0, len(inputColsList)):
            featureImportanceCombined.append((inputColsList[i], featuresArr[i].item()))
        feature_df = spark.createDataFrame(featureImportanceCombined).toDF('feature', 'importance').sort('importance', ascending=False)
        feature_df.show()

In [33]:
DoRandomForest(76561197960360459)

Owned Games: 216


                                                                                

<class 'pyspark.sql.dataframe.DataFrame'>


                                                                                

+--------------------+----------------------+-----------------------------+
|                name|time_played_in_minutes|time_played_in_minutes_Scaled|
+--------------------+----------------------+-----------------------------+
|      Counter-Strike|                  1538|                        0.234|
|Team Fortress Cla...|                    42|                        0.006|
|  Deathmatch Classic|                     0|                          0.0|
|Half-Life: Opposi...|                     0|                          0.0|
|            Ricochet|                     0|                          0.0|
|           Half-Life|                     0|                          0.0|
|         Half-Life 2|                   481|                        0.073|
|Counter-Strike: S...|                    61|                        0.009|
|Half-Life 2: Lost...|                     0|                          0.0|
|Half-Life 2: Epis...|                     2|                          0.0|
|           

                                                                                

+--------------------+--------------------+----------------------+
|                name|              genres|time_played_in_minutes|
+--------------------+--------------------+----------------------+
|                DOOM|            [Action]|                  6566|
|          Anno 2070™|          [Strategy]|                  3850|
|           Tropico 5|[RPG, Simulation,...|                  3836|
|LEGO® Batman™ 3: ...| [Action, Adventure]|                  3699|
|Call of Duty®: Bl...|            [Action]|                  2774|
|     Game Dev Tycoon|[Casual, Indie, S...|                  2627|
|  Duke Nukem Forever|            [Action]|                  2432|
|    Castle Crashers®|[Action, Adventur...|                  2198|
|Jurassic World Ev...|[Simulation, Stra...|                  2025|
|Call of Duty®: In...| [Action, Adventure]|                  1922|
|Counter-Strike: G...|[Action, Free to ...|                  1884|
|      Rocket League®|[Action, Indie, R...|                  1

                                                                                

The user's top 10 predicted games (with predicted minutes played if they owned it):


                                                                                

+------+--------------------+--------------------+--------------------+------------------+
| appid|            features|              genres|                name|        prediction|
+------+--------------------+--------------------+--------------------+------------------+
|413150|(62,[0,1,2,3,5,8,...|[Indie, RPG, Simu...|      Stardew Valley|2926.3480012324844|
|582010|(62,[0,1,2,3,5,8,...|            [Action]|MONSTER HUNTER: W...| 2801.330814384891|
|242760|(62,[0,1,2,3,5,8,...|[Action, Adventur...|          The Forest|2675.7936182476947|
|438100|(62,[0,2,3,7,8,11...|[Adventure, Casua...|              VRChat| 2553.625254872564|
|322330|(62,[0,1,2,3,11,1...|[Adventure, Indie...|Don't Starve Toge...| 2410.713249178982|
|286160|(62,[0,1,2,3,5,6,...|[Casual, Indie, R...|  Tabletop Simulator| 2365.056740196079|
|359550|(62,[0,1,2,3,8,11...|            [Action]|Tom Clancy's Rain...|2357.9481552229045|
|552520|(62,[0,1,2,3,8,11...| [Action, Adventure]|          Far Cry® 5|2309.1445763305323|

                                                                                

RMSE: 910.6252111854186


[Stage 1035:>                                                       (0 + 1) / 1]

+--------------------+-----+------------------+
|                name|label|        prediction|
+--------------------+-----+------------------+
|       Day of Defeat|    0|24.411534904289887|
|Half-Life: Blue S...|    0|20.817009169309046|
|Half-Life 2: Deat...|    0|24.411534904289887|
|Half-Life Deathma...|    0|14.899846054272265|
|       Left 4 Dead 2| 2002|354.93022788647914|
|              Dota 2| 2324|1679.3457826096078|
|         Garry's Mod|    0|  993.566078892372|
|Trackmania United...| 1099| 280.3155187168669|
|Runaway, A Road A...|   19| 17.23413747902628|
|                RAGE|  103|  392.650269733016|
|Grand Theft Auto:...|  105| 142.1050838380864|
|           Max Payne|    0|20.937009169309043|
|            Far Cry®|    0| 21.08932398412386|
|Tom Clancy's Rain...|    0| 20.13445789408295|
|Tom Clancy's Spli...|    0| 19.67242030730911|
|Tom Clancy's Spli...|   18|  30.5993729216882|
|Tom Clancy's Rain...|    0|31.677006415144056|
|Tom Clancy's Rain...|    6| 58.10258379

                                                                                

In [34]:
# Kyle's Steam ID
DoRandomForest(76561197982716241)

Owned Games: 136


                                                                                

<class 'pyspark.sql.dataframe.DataFrame'>


                                                                                

+--------------------+----------------------+-----------------------------+
|                name|time_played_in_minutes|time_played_in_minutes_Scaled|
+--------------------+----------------------+-----------------------------+
|      Counter-Strike|                  1969|                        0.065|
|Team Fortress Cla...|                    19|                        0.001|
|           Half-Life|                  3298|                        0.109|
|Counter-Strike: C...|                   119|                        0.004|
|Half-Life: Blue S...|                   386|                        0.013|
|         Half-Life 2|                   675|                        0.022|
|   Half-Life: Source|                     0|                          0.0|
|Day of Defeat: So...|                     0|                          0.0|
|Half-Life 2: Lost...|                    14|                          0.0|
|Half-Life 2: Epis...|                   295|                         0.01|
|           

                                                                                

+--------------------+--------------------+----------------------+
|                name|              genres|time_played_in_minutes|
+--------------------+--------------------+----------------------+
|Counter-Strike: G...|[Action, Free to ...|                 30157|
|Sid Meier's Civil...|          [Strategy]|                 17603|
|       Killing Floor|            [Action]|                 16366|
|Kerbal Space Program| [Indie, Simulation]|                 11131|
|MechWarrior Onlin...|[Action, Free to ...|                  9815|
|Sid Meier’s Civil...|          [Strategy]|                  9192|
|Sins of a Solar E...|   [Indie, Strategy]|                  8750|
|Red Orchestra 2: ...|[Action, Massivel...|                  6684|
|             XCOM® 2|          [Strategy]|                  5412|
|Supreme Commander...|          [Strategy]|                  5283|
|Defense Grid: The...|   [Indie, Strategy]|                  5262|
|          BATTLETECH|[Action, Adventur...|                  4

                                                                                

The user's top 10 predicted games (with predicted minutes played if they owned it):


                                                                                

+------+--------------------+--------------------+--------------------+------------------+
| appid|            features|              genres|                name|        prediction|
+------+--------------------+--------------------+--------------------+------------------+
|   570|(62,[0,2,3,14,19,...|[Action, Free to ...|              Dota 2|13588.201947713533|
|304930|(62,[0,2,3,5,7,11...|[Action, Adventur...|            Unturned|11137.970448717948|
|299360|(62,[0,2,3,11,12,...|[Action, Free to ...|        Block N Load|10756.104447923533|
|252490|(62,[0,1,2,3,7,12...|[Action, Adventur...|                Rust|10695.020454545454|
|346330|(62,[0,2,3,4,7,8,...|[Action, Adventur...|        BrainBread 2|  9667.52585907336|
|204300|(62,[0,2,3,5,6,11...|[Action, Free to ...|Awesomenauts - th...| 9661.727583737584|
| 17570|(62,[0,2,3,4,5,12...|[Action, Free to ...|Pirates, Vikings,...| 9442.266216216216|
|312280|(62,[0,2,3,5,11,1...|[Casual, Free to ...|        Simply Chess| 8322.804196875919|

                                                                                

RMSE: 2054.229303356947


[Stage 1072:>                                                       (0 + 1) / 1]

+--------------------+-----+------------------+
|                name|label|        prediction|
+--------------------+-----+------------------+
|Half-Life: Opposi...| 1836| 760.3416032681887|
|Counter-Strike: S...| 1959| 5644.819064828849|
|Half-Life 2: Deat...|   10| 543.1726096784452|
|Half-Life Deathma...|    0|437.83465076236115|
|       Left 4 Dead 2|    0| 6628.793365384615|
|            Portal 2| 1135|  4271.70584345479|
|          Multiwinia|    0|217.32496986507408|
|      Wolfenstein 3D|  224| 455.8661828822866|
|       Ultimate Doom|    3| 265.7000527689358|
|            QUAKE II|  811|405.84047392457757|
|Star Wars: Battle...| 1895|   874.22935013722|
|Civilization IV: ...|    0|1205.0256211015042|
|Call of Duty: Wor...|   33|3052.8929777024305|
|Unreal 2: The Awa...|  491| 289.7360185686133|
|Unreal Tournament...| 6126| 2086.085240215375|
|Unreal Tournament...| 5266|1179.1666337815784|
|            Far Cry®|  377|362.31074220061635|
|Sid Meier's Civil...|    0| 596.2302863

                                                                                