In [1]:
import json
import os 
import numpy as np
import pandas as pd
import requests
import time 
# Spark imports
from pyspark import SparkContext
from pyspark.rdd import RDD
from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
from pyspark.sql.functions import desc


In [2]:
def init_spark():
    """Configure a SparkSession builder and return the session it yields.

    Returns:
        The session produced by the builder's ``getOrCreate()`` call.
    """
    builder = SparkSession.builder
    builder = builder.appName("Python Spark SQL basic example")
    # Option name/value are passed through to the builder unchanged.
    builder = builder.config("spark.some.config.option", "some-value")
    return builder.getOrCreate()


In [3]:
# Create the Spark session (via init_spark) that later cells use to build Spark DataFrames.
spk = init_spark()

22/03/28 21:13:52 WARN Utils: Your hostname, Sams-MacBook-Pro-2.local resolves to a loopback address: 127.0.0.1; using 10.0.0.108 instead (on interface en0)
22/03/28 21:13:52 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/03/28 21:13:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


Constants

In [4]:
# Module-level settings shared by the cells below.

# Steam Store "appdetails" endpoint; the URL ends at `appids=`, so callers
# presumably append the app id to query (not used in the visible cells).
url_to_get_app_information = "https://store.steampowered.com/api/appdetails?appids="

# Filled by generateData(): one entry per game, keyed by appid.
dictionnary = dict()

# Filesystem anchors, all derived from the kernel's working directory:
# `directory_path` is one level up, `parentPath` is two levels up.
path = os.getcwd()
directory_path = os.path.abspath(os.path.join(path, os.pardir))
parentPath = os.path.abspath(os.path.join(path, os.pardir, os.pardir))

## Generate Data for the Base Game Dataset

In [5]:
def generateData():
    """Build, or load from cache, the per-game table used by the notebook.

    Reads ``<parentPath>/data/datasets/steam.csv``, derives the percentage of
    positive ratings for every game, and caches the result as ``games.csv`` in
    the same directory. When ``games.csv`` already exists it is returned
    directly and the raw file is not read at all.

    Side effects:
        Repopulates the module-level ``dictionnary`` (appid -> record) and
        writes ``games.csv`` when the cache is missing.

    Returns:
        pandas.DataFrame: one row per game with ``appid`` as a regular column,
        whether the data was freshly built or loaded from the cache.
    """
    path_to_write = parentPath + '/data/datasets'
    cache_file = path_to_write + '/games.csv'

    # Check the cache first so the raw CSV is only parsed when it is needed.
    if os.path.exists(cache_file):
        return pd.read_csv(cache_file)

    originalDataSet = pd.read_csv(path_to_write + '/steam.csv')

    # Start from an empty mapping so records from an earlier call (for example
    # against a different steam.csv) cannot leak into this result.
    dictionnary.clear()
    for _, row in originalDataSet.iterrows():
        totalRatings = row['positive_ratings'] + row['negative_ratings']
        # Positive share in percent; a game with no ratings scores 0 instead
        # of dividing by zero.
        if totalRatings == 0:
            ratings = 0
        else:
            ratings = round((row['positive_ratings'] / totalRatings) * 100, 2)
        dictionnary[row['appid']] = {
            "name": row['name'],
            "price": row['price'],
            "release_date": row['release_date'],
            "required_age": row['required_age'],
            "publishers": row['publisher'],
            "developers": row['developer'],
            "categories": row['categories'],
            "genres": row['genres'],
            "ratings": ratings,
            "totalRatings": totalRatings,
            "average_playtime": row['average_playtime'],
            "median_playtime": row['median_playtime'],
            "num_owners": row['owners'],
        }

    df = pd.DataFrame.from_dict(dictionnary, orient='index')
    df.index.name = 'appid'

    # makedirs tolerates an existing directory and creates missing parents.
    os.makedirs(path_to_write, exist_ok=True)
    df.to_csv(cache_file)

    # The cached branch yields `appid` as a column (read_csv has no index_col).
    # Return the same layout here: spk.createDataFrame() drops the pandas
    # index, which would otherwise lose `appid` on the first run.
    return df.reset_index()

In [6]:
# Build or load the games table with pandas (generateData) and convert it to a Spark DataFrame.
game_dataframe=spk.createDataFrame(generateData())

## Generate the User Steam Dataset

In [7]:
# The Steam Web API key is read from the STEAM_API_KEY environment variable so
# the credential never lives in the notebook or its version history.
STEAM_API_KEY = os.environ.get("STEAM_API_KEY", "")
# `{}` is filled with the Steam ID of the user whose library is requested.
url_user_info = (
    "https://api.steampowered.com/IPlayerService/GetOwnedGames/v1/"
    "?key=" + STEAM_API_KEY +
    "&steamid={}&include_appinfo=true&include_played_free_games=true"
)


def generateDataForUserSteamDataset():
    """Build, or load from cache, the user -> owned-games playtime table.

    For every ``steamid_a`` in ``<parentPath>/data/datasets/steam_id.csv`` the
    GetOwnedGames endpoint is queried and one record per owned game is
    collected. The result is cached as ``steam_id_games.csv`` in the same
    directory; when that file already exists it is returned without any
    network access.

    Users whose request fails, whose response cannot be decoded, or whose
    response carries no ``games`` entry are skipped.

    Returns:
        pandas.DataFrame: columns ``steam_id``, ``appid`` (as string on a
        fresh build) and ``time_played_in_minutes``.

    Raises:
        RuntimeError: if the data has to be fetched but STEAM_API_KEY is unset.
    """
    path_to_write = parentPath + '/data/datasets'
    cache_file = path_to_write + '/steam_id_games.csv'
    if os.path.exists(cache_file):
        return pd.read_csv(cache_file)

    # Fail loudly: without a key every request is rejected, and an empty
    # result would otherwise be written to the cache.
    if not STEAM_API_KEY:
        raise RuntimeError(
            "Set the STEAM_API_KEY environment variable before fetching user data."
        )

    originalDataSet = pd.read_csv(path_to_write + '/steam_id.csv')

    # One list of records per user. Iterating the column directly keeps the
    # ids in their column dtype (iterrows does not preserve dtypes per row).
    batches = []
    for steam_id in originalDataSet['steamid_a']:
        try:
            response = requests.get(url_user_info.format(steam_id), timeout=10)
            games = response.json()['response']
        except (requests.RequestException, ValueError, KeyError, TypeError):
            # Network failure, non-JSON body or unexpected payload: skip user.
            continue
        if 'games' not in games:
            continue
        batches.append([
            {
                'steam_id': steam_id,
                "appid": str(game['appid']),
                "time_played_in_minutes": game['playtime_forever'],
            }
            for game in games['games']
        ])

    # Each user's records were previously prepended to the running list, so
    # the most recently processed user comes first. Reversing the batches once
    # keeps that order without re-copying the whole list on every iteration.
    records = [record for batch in reversed(batches) for record in batch]

    # Explicit columns keep the schema stable even when nothing was collected.
    df = pd.DataFrame(
        records, columns=['steam_id', 'appid', 'time_played_in_minutes']
    )

    os.makedirs(path_to_write, exist_ok=True)
    # index=False: the positional index carries no information and would come
    # back as a stray "Unnamed: 0" column when the cache is read.
    df.to_csv(cache_file, index=False)
    return df
   

In [8]:
# Build or load the per-user owned-games table with pandas and convert it to a Spark DataFrame.
user_dataframe = spk.createDataFrame(generateDataForUserSteamDataset())

## Cosine Similarity

### Cleaning Data for Cosine Similarity

In [33]:
def cleanData(row):
    """Split a ';'-separated field into its individual values.

    Args:
        row: Raw cell value. Usually a string; may be None or NaN when the
            cell is empty, and other scalars are converted with ``str``.

    Returns:
        A list of strings when the value contains ';', otherwise the value
        itself as a string. Missing values (None/NaN) yield ``''`` so callers
        that join the result keep working.
    """
    # NaN is the only float that is not equal to itself.
    if row is None or (isinstance(row, float) and row != row):
        return ''
    if not isinstance(row, str):
        row = str(row)
    return row.split(';') if ';' in row else row

In [35]:
def cosine_similarity(game_dataframe):
    """Build one text document per game for the similarity features.

    Despite its name, this function does not compute a similarity yet: it
    concatenates the publishers, developers, categories, genres and ratings of
    each game into a single space-separated string.

    Args:
        game_dataframe: Spark DataFrame exposing the columns ``appid``,
            ``publishers``, ``developers``, ``categories``, ``genres`` and
            ``ratings``.

    Returns:
        Spark DataFrame with the columns ``appid`` and ``result`` (the text
        document). The frame is also previewed with ``show()``.
    """
    text_features = ['publishers', 'developers', 'categories', 'genres']

    def _as_text(value):
        # cleanData returns a list for ';'-separated values and a plain string
        # otherwise. List items are joined with a space so each value stays a
        # separate token ("A;B" -> "A B", not "AB").
        cleaned = cleanData(value)
        if isinstance(cleaned, str):
            return cleaned
        return ' '.join(cleaned)

    def _to_document(row):
        parts = [_as_text(row[name]) for name in text_features]
        parts.append(str(row['ratings']))
        return (row['appid'], ' '.join(parts))

    finalVal = game_dataframe.rdd.map(_to_document).toDF(["appid", "result"])
    finalVal.show()
    # Returned so later cells can feed the documents to a vectoriser.
    return finalVal

In [None]:
# Build and preview the per-game feature text for the games table.
cosine_similarity(game_dataframe)

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer

ModuleNotFoundError: No module named 'sklearn'

In [None]:
# TF-IDF needs one text document per game; fit_transform cannot be called
# without them. Each document is built from the descriptive columns of the
# games table, with ';' separators turned into spaces so every publisher,
# category or genre value stays a separate token.
text_columns = ["publishers", "developers", "categories", "genres", "ratings"]
documents = []
for row in game_dataframe.select(*text_columns).collect():
    values = (row[column] for column in text_columns)
    documents.append(" ".join(
        str(value).replace(";", " ")
        for value in values
        # Skip NULL and NaN cells (NaN is not equal to itself).
        if value is not None and value == value
    ))

tf = TfidfVectorizer()
tfidf_matrix = tf.fit_transform(documents)