# Content-based Recommendation On Steam Games

## Imports
---

In [47]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# import stemmer from sklearn
from nltk.stem import PorterStemmer
from ast import literal_eval

# import one-hot encoder 
from sklearn.preprocessing import MultiLabelBinarizer

# Function to reduce the memory usage of a DataFrame.
def reduce_memory(df):
    for col in df.columns:
        if df[col].dtype == 'float64':
            df[col] = df[col].astype('float32')
        if df[col].dtype == 'int64':
            df[col] = df[col].astype('int32')
    return df

# Generator function to load data in chunks.
def data_generator(df, chunksize=10000):
    for i in range(0, df.shape[0], chunksize):
        yield df.iloc[i:i+chunksize]

# Load data
games_df = reduce_memory(pd.read_csv('./data/games.csv'))
users_df = reduce_memory(pd.read_csv('./data/users.csv'))
recommendations_df = reduce_memory(pd.read_csv('./data/recommendations.csv'))

In [48]:
games_df.head()

Unnamed: 0,app_id,title,date_release,win,mac,linux,rating,positive_ratio,user_reviews,price_final,price_original,discount,steam_deck
0,13500,Prince of Persia: Warrior Within™,2008-11-21,True,False,False,Very Positive,84,2199,9.99,9.99,0.0,True
1,22364,BRINK: Agents of Change,2011-08-03,True,False,False,Positive,85,21,2.99,2.99,0.0,True
2,113020,Monaco: What's Yours Is Mine,2013-04-24,True,True,True,Very Positive,92,3722,14.99,14.99,0.0,True
3,226560,Escape Dead Island,2014-11-18,True,False,False,Mixed,61,873,14.99,14.99,0.0,True
4,249050,Dungeon of the ENDLESS™,2014-10-27,True,True,False,Very Positive,88,8784,11.99,11.99,0.0,True


In [49]:
users_df.head()

Unnamed: 0,user_id,products,reviews
0,7360263,359,0
1,14020781,156,1
2,8762579,329,4
3,4820647,176,4
4,5167327,98,2


In [50]:
recommendations_df.head()

Unnamed: 0,app_id,helpful,funny,date,is_recommended,hours,user_id,review_id
0,975370,0,0,2022-12-12,True,36.299999,51580,0
1,304390,4,0,2017-02-17,False,11.5,2586,1
2,1085660,2,0,2019-11-17,True,336.5,253880,2
3,703080,0,0,2022-09-23,True,27.4,259432,3
4,526870,0,0,2021-01-10,True,7.9,23869,4


In [51]:
games_df.shape

(50872, 13)

In [52]:
users_df.shape

(14306064, 3)

In [53]:
recommendations_df.shape

(41154794, 8)

## Data visualization
---

## Pre-processing
---

## Experiment One - Modeling
---

In [54]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Idrk what this does completely, just good for the text 
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(games_df['title'])

# Content-Based Filtering
cosine_similarities = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Get top 5 recommendations for the first game
similar_indices = cosine_similarities[0].argsort()[:-6:-1]
recommended_games = [games_df['title'].iloc[i] for i in similar_indices]
print(f'Recommended games for {games_df['title'].iloc[0]} are: {recommended_games}')

Recommended games for Prince of Persia: Warrior Within™ are: ['Prince of Persia: Warrior Within™', 'Prince of Persia®', 'Prince of Persia: The Two Thrones™', 'Prince of Persia®: The Sands of Time', 'Prince of Persia: The Forgotten Sands™']


In [55]:
# Get top 5 recommendations for the second game
similar_indices = cosine_similarities[1].argsort()[:-6:-1]
recommended_games = [games_df['title'].iloc[i] for i in similar_indices]
print(f'Recommended games for {games_df['title'].iloc[1]} are: {recommended_games}')

Recommended games for BRINK: Agents of Change are: ['BRINK: Agents of Change', 'BRINK', 'Change', 'BRINK Traveler', 'Agents of Mayhem']


## Evaulation
---

## Storytelling
---

## Conclusion
---