# Importing Libraries

In [1]:
import time

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.feature import StringIndexer,IndexToString
from pyspark.ml.recommendation import ALS
from pyspark.sql import SparkSession
from pymongo import MongoClient
import pandas as pd

# Initialising Spark Session

In [2]:
# Create the Spark session (or reuse one that is already running); the cells
# below pass it to the recommender.
# NOTE(review): the app name looks like a leftover from a word-count template —
# confirm before renaming, since it is what shows up in the Spark UI.
spark = (
    SparkSession.builder
    .appName("Word count")
    .getOrCreate()
)


# Collecting data from MongoDB Atlas Cluster

In [3]:
# FETCH DATA
# Read every user's like history from MongoDB and flatten it into one row per
# (user, game) entry for the recommender.
start_time = time.time()
client = MongoClient("MongoDB Collection URL Here") # Collection URL Not specified for Data Security
db = client.get_database('steam_data')

# Materialise the cursor into a list so the raw documents stay available to
# later cells (the sample-document cell indexes into it).
user_like_dataa = list(db.user_likes.find({}, {'user_id': 1, 'user_data': 1}))

# Collect all rows first and build the DataFrame once. Appending row by row
# copies the whole frame on every iteration, and DataFrame.append no longer
# exists in pandas 2.x.
records = [
    {
        'user_id': user_doc['user_id'],
        'game_id': entry['game_id'],
        'rate': entry['rate'],
        'percent': entry['percent'],
        'liked': entry['liked'],
    }
    for user_doc in user_like_dataa
    for entry in user_doc['user_data']
]

# Passing `columns` keeps the expected schema even when no rows were fetched.
df = pd.DataFrame(records, columns=['user_id', 'game_id', 'rate', 'percent', 'liked'])
recom_mongo_data = df
print("--- %s seconds ---" % (time.time() - start_time))

--- 14.41849422454834 seconds ---


## Sample User data

In [4]:
# Show one raw document as a reference: likes data should be inserted in this
# format, i.e. a `user_id` plus a `user_data` list whose entries carry
# `game_id`, `rate`, `percent` and `liked`.
user_like_dataa[1]

{'_id': ObjectId('5fafc994336c36400e1135f3'),
 'user_id': 1,
 'user_data': [{'game_id': 560130, 'rate': 6, 'percent': 91, 'liked': 0},
  {'game_id': 362490, 'rate': 5, 'percent': 92, 'liked': 1},
  {'game_id': 6020, 'rate': 7, 'percent': 100, 'liked': 1},
  {'game_id': 6000, 'rate': 5, 'percent': 98, 'liked': 0},
  {'game_id': 404013, 'rate': 7, 'percent': 78, 'liked': 0},
  {'game_id': 311210, 'rate': 8, 'percent': 97, 'liked': 0},
  {'game_id': 221380, 'rate': 5, 'percent': 10, 'liked': 0},
  {'game_id': 632470, 'rate': 8, 'percent': 96, 'liked': 0},
  {'game_id': 1162130, 'rate': 6, 'percent': 70, 'liked': 1},
  {'game_id': 311560, 'rate': 4, 'percent': 78, 'liked': 0},
  {'game_id': 1379930, 'rate': 8, 'percent': 74, 'liked': 0},
  {'game_id': 1071200, 'rate': 4, 'percent': 32, 'liked': 0},
  {'game_id': 586140, 'rate': 8, 'percent': 98, 'liked': 0},
  {'game_id': 704450, 'rate': 8, 'percent': 97, 'liked': 1},
  {'game_id': 763890, 'rate': 2, 'percent': 10, 'liked': 0},
  {'game_id

# Recommender Function

In [4]:
def recom_engine(spark, df_name, column_name):
    """Fit an ALS recommender that uses one column of the likes table as rating.

    Parameters
    ----------
    spark : SparkSession
        Session used to build the Spark DataFrame.
    df_name : DataFrame-like
        Data handed to ``spark.createDataFrame``; it must provide ``user_id``,
        ``game_id`` and the column named by ``column_name``.
    column_name : str
        Name of the column passed to ALS as ``ratingCol``.

    Returns
    -------
    list
        ``[distinct game_id_new DataFrame, indexed DataFrame,
        fitted ALS model, fitted StringIndexer model]`` in that order.
    """
    # Rows with any missing value are discarded before indexing and training.
    ratings = spark.createDataFrame(df_name).na.drop()

    # Add the indexed item column `game_id_new` that ALS is trained on.
    indexer_model = StringIndexer(
        inputCol="game_id",
        outputCol="game_id_new",
    ).fit(ratings)
    indexed_ratings = indexer_model.transform(ratings)

    # The model is trained on the full indexed data set (no hold-out split).
    als = ALS(
        maxIter=10,
        regParam=0.01,
        userCol='user_id',
        itemCol='game_id_new',
        ratingCol=column_name,
        nonnegative=True,
        coldStartStrategy="drop",
    )
    als_model = als.fit(indexed_ratings)

    game_indices = indexed_ratings.select('game_id_new').distinct()
    return [game_indices, indexed_ratings, als_model, indexer_model]

# Using the Model from the Recommender Function

In [6]:
def top_games(dataa, user_id,n):
    """Return the ``n`` highest-scoring games for a user among games they have no row for.

    Parameters
    ----------
    dataa : sequence
        The artefacts returned by ``recom_engine``, in order: the DataFrame of
        distinct ``game_id_new`` values, the indexed ratings DataFrame, the
        fitted ALS model, and the fitted ``StringIndexer`` model.
    user_id : int or int-convertible
        User to recommend for; converted once with ``int()``.
    n : int
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        At most ``n`` rows ordered by descending ``prediction``, with columns
        ``game_id_new``, ``user_id``, ``prediction`` and ``game_id``.
    """
    unique_games, indexed, rec_model, indexer_model = dataa[:4]
    # Normalise once so the filter and the literal column use the same value.
    uid = int(user_id)

    # Games this user already has a row for in the indexed data.
    seen_games = indexed.filter(indexed['user_id'] == uid).select('game_id_new')

    # An anti join keeps the games with no match among the user's games. Joining
    # on the column name avoids ambiguous column references, since both sides
    # derive from the same parent DataFrame.
    remaining_games = (
        unique_games
        .join(seen_games, on='game_id_new', how='left_anti')
        .distinct()
        .withColumn("user_id", lit(uid))
    )

    # Score every candidate and keep the n best predictions.
    recommendations = (
        rec_model.transform(remaining_games)
        .orderBy('prediction', ascending=False)
        .limit(n)
    )

    # Map the indexed values back to the original game ids.
    index_to_game = IndexToString(
        inputCol="game_id_new",
        outputCol="game_id",
        labels=indexer_model.labels,
    )
    return index_to_game.transform(recommendations).toPandas()

# Recommending Games

Games are recommended using three different signals:

- The rating given by the user (`rate`)
- The percentage of users who liked the game (`percent`)
- Whether the user liked the game (`liked`)

In [7]:
# Train one model per rating signal and fetch recommendations for the same user.
TARGET_USER_ID = 4  # user to recommend for
TOP_N = 10          # number of games returned per signal

# The generator is consumed left to right, so the models are trained in the
# order rate -> percent -> liked.
by_rate, by_percent, by_liked = (
    top_games(recom_engine(spark, recom_mongo_data, rating_col), TARGET_USER_ID, TOP_N)
    for rating_col in ('rate', 'percent', 'liked')
)


In [8]:
# Print the recommended game ids for each rating signal, one line per signal.
prediction_report = (
    ('Prediction by Rate : ', by_rate),
    ('Prediction by Percent : ', by_percent),
    ('Prediction by Likes : ', by_liked),
)
for heading, recommended in prediction_report:
    print(heading + str(list(recommended['game_id'])))


Prediction by Rate : ['760060', '613100', '920690', '215280', '205650', '445190', '1148590', '503580', '494840', '673610']
Prediction by Percent : ['1179210', '774461', '418370', '947930', '1162750', '32460', '553310', '323190', '6020', '440']
Prediction by Likes : ['1222730', '15610', '548430', '971996', '1159060', '214490', '1382220', '397540', '606150', '976730']
