<a href="https://colab.research.google.com/github/AUT-Student/BigData-HW2/blob/main/BigData_HW2_Q2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Libraries

In [1]:
import pandas as pd
import numpy as np

# Dataset

In [2]:
# Download the homework dataset archive from Google Drive (by file id) and
# extract it; the q2/ folder holds the games.csv and ratings.csv files read below.
!gdown 1PdXgb4w0gtsocKHmeGQ_zPfb3VbW9YhL
!unzip /content/Bigdata_hw2_datasets.zip

Downloading...
From: https://drive.google.com/uc?id=1PdXgb4w0gtsocKHmeGQ_zPfb3VbW9YhL
To: /content/Bigdata_hw2_datasets.zip
100% 6.28M/6.28M [00:00<00:00, 17.2MB/s]
Archive:  /content/Bigdata_hw2_datasets.zip
   creating: Bigdata_hw2_datasets/
   creating: Bigdata_hw2_datasets/q1/
  inflating: Bigdata_hw2_datasets/q1/stream_data_dgim.txt  
   creating: Bigdata_hw2_datasets/q2/
  inflating: Bigdata_hw2_datasets/q2/games.csv  
  inflating: Bigdata_hw2_datasets/q2/ratings.csv  
   creating: Bigdata_hw2_datasets/q3/
  inflating: Bigdata_hw2_datasets/q3/c1.txt  
  inflating: Bigdata_hw2_datasets/q3/c2.txt  
  inflating: Bigdata_hw2_datasets/q3/data.txt  


In [3]:
# Games table; the `game_id` and `name` columns are used further down.
games_dataset = pd.read_csv("/content/Bigdata_hw2_datasets/q2/games.csv")

In [4]:
# Discard the rows whose game_id is missing.
games_dataset = games_dataset.dropna(subset=["game_id"])

In [5]:
# Ratings table; the `user_id`, `game_id` and `rating` columns are used further down.
ratings_dataset = pd.read_csv("/content/Bigdata_hw2_datasets/q2/ratings.csv")

In [6]:
# One rating per (user, game): when a pair occurs several times, the last row wins.
ratings_dataset = ratings_dataset.drop_duplicates(
    subset=["user_id", "game_id"],
    keep="last",
)

# Auxiliary Functions

In [7]:
def rated_item_list(user_id):
  """Return the `game_id` values of every row in `ratings_dataset` that belongs to `user_id`.

  The result is a NumPy array (empty when the user has no rows).
  """
  is_user_row = ratings_dataset["user_id"] == user_id
  return ratings_dataset.loc[is_user_row, "game_id"].values

In [8]:
def cosine_similarity(user_x, user_y):
  """Cosine similarity between the rating vectors of two users.

  Each user is treated as a sparse vector indexed by `game_id` whose entries
  are the `rating` values stored in the global `ratings_dataset`; a game the
  user did not rate contributes nothing.

  Args:
    user_x: id of the first user.
    user_y: id of the second user.

  Returns:
    1 when both ids are equal. Otherwise the dot product over the games both
    users rated, divided by the product of the two users' full rating-vector
    norms. When either norm is zero (the user has no ratings, or only zero
    ratings) the similarity is defined as 0.0 instead of dividing by zero.
  """
  if user_x == user_y:
    return 1

  x_ratings = ratings_dataset.loc[ratings_dataset["user_id"] == user_x, ["game_id", "rating"]]
  y_ratings = ratings_dataset.loc[ratings_dataset["user_id"] == user_y, ["game_id", "rating"]]

  # The norms cover every rating of a user, not only the co-rated games.
  x_norm = float((x_ratings["rating"] ** 2).sum()) ** 0.5
  y_norm = float((y_ratings["rating"] ** 2).sum()) ** 0.5

  if x_norm == 0 or y_norm == 0:
    return 0.0

  # The inner join keeps exactly the games rated by both users. It expects at
  # most one row per (user_id, game_id); duplicated pairs would be multiplied
  # by the join.
  co_rated = x_ratings.merge(y_ratings, on="game_id", suffixes=("_x", "_y"))
  dot_product = float((co_rated["rating_x"] * co_rated["rating_y"]).sum())

  return dot_product / (x_norm * y_norm)

# Most Similar Users

In [9]:
def most_similar_users(user):
  """Rank every other user in `ratings_dataset` by cosine similarity to `user`.

  The similarity of two users is the dot product of their ratings over the
  games both rated, divided by the product of their full rating-vector norms.
  All norms and all dot products with `user` are computed in a single
  groupby/merge pass over the ratings instead of re-filtering the whole frame
  once per candidate user.

  Args:
    user: id of the target user.

  Returns:
    A list of `{"score": float, "user-id": id}` dicts, one per user other than
    `user`, sorted by descending score. Users sharing no rated game with
    `user`, or having a zero norm, get a score of 0.0. Progress is printed
    every 2000 users.
  """
  all_user_id_list = set(ratings_dataset["user_id"].values)

  # Norm of every user's rating vector.
  squared_ratings = ratings_dataset["rating"] ** 2
  norm_by_user = (squared_ratings.groupby(ratings_dataset["user_id"]).sum() ** 0.5).to_dict()
  target_norm = norm_by_user.get(user, 0.0)

  # Pair each rating in the dataset with the target user's rating of the same
  # game, then sum the products per user to get the dot products.
  # Expects at most one row per (user_id, game_id).
  target_ratings = ratings_dataset.loc[ratings_dataset["user_id"] == user, ["game_id", "rating"]]
  paired = ratings_dataset[["user_id", "game_id", "rating"]].merge(
      target_ratings, on="game_id", suffixes=("", "_target"))
  dot_by_user = (paired["rating"] * paired["rating_target"]).groupby(paired["user_id"]).sum().to_dict()

  similarity_scores = []

  for i, user_id in enumerate(all_user_id_list):
    if i%2000==0: print(int(i*100/len(all_user_id_list)), "%")
    if user_id == user: continue

    denominator = target_norm * norm_by_user.get(user_id, 0.0)
    # A zero norm means there is nothing to compare; score it 0.0 rather than
    # producing NaN, which would make the sort below unreliable.
    score = float(dot_by_user.get(user_id, 0.0) / denominator) if denominator else 0.0
    similarity_scores.append({"score": score, "user-id": user_id})

  # Stable sort: users with equal scores keep their iteration order.
  similarity_scores.sort(key=lambda record: -record["score"])
  return similarity_scores

# Prediction

In [10]:
def predict_rate(similar_users, item_id, number):
  """Predict a rating for `item_id` as a similarity-weighted average.

  Walks `similar_users` in the given order and uses the first `number` users
  that have a rating for `item_id` in the global `ratings_dataset`.

  Args:
    similar_users: list of `{"score": ..., "user-id": ...}` dicts, expected to
      be ordered from most to least similar.
    item_id: `game_id` of the item to predict.
    number: maximum number of neighbours that contribute to the prediction.

  Returns:
    sum(score * rating) / sum(score) over the neighbours used, rounded to
    3 decimals, or None when the scores of the neighbours used sum to 0
    (including the case where no listed user rated the item).
  """
  item_ratings = ratings_dataset[ratings_dataset["game_id"]==item_id]

  # Build the user -> rating lookup once, instead of scanning the rater array
  # and re-filtering the frame for every neighbour. `setdefault` keeps the
  # first row of a user should the same user appear more than once.
  rating_by_user = {}
  for rater_id, rating in zip(item_ratings["user_id"].values, item_ratings["rating"].values):
    rating_by_user.setdefault(rater_id, rating)

  if not rating_by_user:
    return None

  weighted_rating_sum = 0
  score_sum = 0
  used_neighbours = 0

  for record in similar_users:
    user_id = record["user-id"]
    if user_id not in rating_by_user:
      continue

    score = record["score"]
    weighted_rating_sum += score * rating_by_user[user_id]
    score_sum += score
    used_neighbours += 1

    if used_neighbours == number:
      break

  if score_sum == 0:
    return None

  return round(weighted_rating_sum / score_sum, 3)

# Recommendation

In [16]:
def recommend(user_id, number_recommendation, number_neighbors=10, candidate_item_ids=None):
  """Recommend the games with the highest predicted rating for `user_id`.

  Every candidate game the user has not rated yet is scored with
  `predict_rate`, using the ranking returned by `most_similar_users`.

  Args:
    user_id: id of the user to recommend for.
    number_recommendation: maximum number of games to return.
    number_neighbors: how many similar users may contribute to each
      prediction (default 10).
    candidate_item_ids: iterable of game ids to consider; defaults to the
      ids 1..10000.

  Returns:
    A list of `{"name": ..., "game_id": ..., "rate": ...}` dicts ordered by
    descending predicted rate, ties broken by ascending game id. `name` is
    None when the game has no row in `games_dataset`. Progress is printed
    after every 1000 candidates.
  """
  similar_users = most_similar_users(user_id)

  # A set makes the "already rated" check O(1) per candidate.
  already_rated = set(
      ratings_dataset.loc[ratings_dataset["user_id"] == user_id, "game_id"].tolist())

  if candidate_item_ids is None:
    candidate_item_ids = range(1, 10001)
  candidate_item_ids = list(candidate_item_ids)
  total_candidates = len(candidate_item_ids)

  rate_list = []
  for position, item_id in enumerate(candidate_item_ids, start=1):
    if position % 1000 == 0: print(int(position * 100 / total_candidates), "%")
    if item_id in already_rated: continue

    rate = predict_rate(similar_users, item_id, number_neighbors)

    if rate is not None:
      rate_list.append({"rate": rate, "item-id": item_id})

  # Tie-break on the record's own id, not on the loop variable left over from
  # the loop above.
  rate_list.sort(key=lambda record: (-record["rate"], record["item-id"]))

  output_list = []

  for record in rate_list[:number_recommendation]:
    item_id = record["item-id"]

    # A recommended id may have no row in games_dataset; report name=None
    # instead of failing on an empty lookup.
    matching_names = games_dataset.loc[games_dataset["game_id"] == item_id, "name"].values
    name = matching_names[0] if len(matching_names) else None

    output_list.append({"name": name, "game_id": item_id, "rate": record["rate"]})

  return output_list

# Results

In [None]:
# Top-5 recommendations for user 5461.
recommend(user_id=5461, number_recommendation=5)

0 %
3 %
7 %


In [None]:
# Top-5 recommendations for user 10140.
recommend(user_id=10140, number_recommendation=5)