In [28]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import joblib


In [2]:
# Load the raw Steam interaction log
def get_steam_data(file_path:str) -> pd.DataFrame:
    """Read the header-less Steam CSV at ``file_path`` into a DataFrame.

    Only the first four columns of the file are read; they are labelled
    ``user_id``, ``item_id``, ``behaviour`` and ``hours`` in that order.
    """
    labels = ['user_id', 'item_id', 'behaviour', 'hours']
    first_four = [0, 1, 2, 3]
    return pd.read_csv(file_path, usecols=first_four, names=labels, header=None)


In [3]:
# Forward slashes avoid the invalid "\d" / "\s" escape sequences of a
# backslash-separated literal and resolve on Windows, Linux and macOS alike.
df = get_steam_data('../data/steam-200k.csv')
df.head()

Unnamed: 0,user_id,item_id,behaviour,hours
0,151603712,The Elder Scrolls V Skyrim,purchase,1.0
1,151603712,The Elder Scrolls V Skyrim,play,273.0
2,151603712,Fallout 4,purchase,1.0
3,151603712,Fallout 4,play,87.0
4,151603712,Spore,purchase,1.0


In [4]:
# Keep only the "play" records, i.e. the rows that carry hours played
df_play = df[df['behaviour'] == 'play']
df_play.head()

Unnamed: 0,user_id,item_id,behaviour,hours
1,151603712,The Elder Scrolls V Skyrim,play,273.0
3,151603712,Fallout 4,play,87.0
5,151603712,Spore,play,14.9
7,151603712,Fallout New Vegas,play,12.1
9,151603712,Left 4 Dead 2,play,8.9


In [5]:
# Total hours played by each user
df_user_total_hours = df_play.groupby('user_id', as_index=False)['hours'].sum()
df_user_total_hours.head()

Unnamed: 0,user_id,hours
0,5250,225.5
1,76767,1227.0
2,86540,255.0
3,144736,0.1
4,181212,2.2


In [6]:
# Number of games played by each user
df_games_played = df_play.groupby('user_id', as_index=False)['item_id'].count()
df_games_played.head()

Unnamed: 0,user_id,item_id
0,5250,6
1,76767,20
2,86540,15
3,144736,1
4,181212,2


In [7]:
# Average hours each user spends per game played:
# total hours divided by the number of games, one row per user.
df_user_mean_hours = (
    df_games_played
    .merge(df_user_total_hours, on='user_id')
    .assign(mean_hours=lambda t: t['hours'] / t['item_id'])
    .drop(columns=['item_id', 'hours'])
)
df_user_mean_hours.head()

Unnamed: 0,user_id,mean_hours
0,5250,37.583333
1,76767,61.35
2,86540,17.0
3,144736,0.1
4,181212,1.1


In [8]:
# Attach each user's mean hours to their play rows and drop the now
# constant 'behaviour' column.
df_play = pd.merge(df_play, df_user_mean_hours, how='left', on='user_id').drop(columns='behaviour')
df_play.head()

Unnamed: 0,user_id,item_id,hours,mean_hours
0,151603712,The Elder Scrolls V Skyrim,273.0,16.919231
1,151603712,Fallout 4,87.0,16.919231
2,151603712,Spore,14.9,16.919231
3,151603712,Fallout New Vegas,12.1,16.919231
4,151603712,Left 4 Dead 2,8.9,16.919231


In [9]:
# Implicit rating: the hours played on a game divided by the average hours
# the same player spends on each game they played.
# - The metric weighs heavily when a game holds the player for many hours.
# - It spikes for players who played many games but concentrated their
#   hours on one specific title.
# - It reflects how addictive a game is rather than its quality.
df_play['rating'] = df_play['hours'].div(df_play['mean_hours'])
df_play.head()

Unnamed: 0,user_id,item_id,hours,mean_hours,rating
0,151603712,The Elder Scrolls V Skyrim,273.0,16.919231,16.135485
1,151603712,Fallout 4,87.0,16.919231,5.142078
2,151603712,Spore,14.9,16.919231,0.880655
3,151603712,Fallout New Vegas,12.1,16.919231,0.715163
4,151603712,Left 4 Dead 2,8.9,16.919231,0.526029


In [10]:
# Games dataframe: start with the total hours played on each game
df_games = df_play.groupby('item_id', as_index=False)['hours'].sum()
df_games.head()

Unnamed: 0,item_id,hours
0,007 Legends,0.7
1,0RBITALIS,1.2
2,1... 2... 3... KICK IT! (Drop That Beat Like a...,20.0
3,10 Second Ninja,5.9
4,10000000,3.6


In [11]:
# Add the mean rating of each game
df_games = pd.merge(
    df_games,
    df_play.groupby('item_id', as_index=False)['rating'].mean(),
    on='item_id',
)
df_games.head()

Unnamed: 0,item_id,hours,rating
0,007 Legends,0.7,0.014867
1,0RBITALIS,1.2,0.027865
2,1... 2... 3... KICK IT! (Drop That Beat Like a...,20.0,0.369785
3,10 Second Ninja,5.9,0.227901
4,10000000,3.6,0.262751


In [12]:
# Add the number of purchase records of each game
df_games = pd.merge(
    df_games,
    df[df['behaviour'] == 'purchase'].groupby('item_id', as_index=False)['behaviour'].count(),
    on='item_id',
)
df_games.head()

Unnamed: 0,item_id,hours,rating,behaviour
0,007 Legends,0.7,0.014867,1
1,0RBITALIS,1.2,0.027865,3
2,1... 2... 3... KICK IT! (Drop That Beat Like a...,20.0,0.369785,7
3,10 Second Ninja,5.9,0.227901,6
4,10000000,3.6,0.262751,1


In [13]:
# Give the aggregated columns descriptive names
df_games = df_games.rename(
    columns=dict(hours='total_hours', rating='mean_rating', behaviour='total_purchases')
)
df_games.head()

Unnamed: 0,item_id,total_hours,mean_rating,total_purchases
0,007 Legends,0.7,0.014867,1
1,0RBITALIS,1.2,0.027865,3
2,1... 2... 3... KICK IT! (Drop That Beat Like a...,20.0,0.369785,7
3,10 Second Ninja,5.9,0.227901,6
4,10000000,3.6,0.262751,1


In [14]:
# Top 10 most-played games
df_games.sort_values('total_hours', ascending=False).iloc[:10]

Unnamed: 0,item_id,total_hours,mean_rating,total_purchases
922,Dota 2,981684.6,1.756914,4841
673,Counter-Strike Global Offensive,322771.6,3.918016,1412
2994,Team Fortress 2,173673.3,1.783639,2323
670,Counter-Strike,134261.1,1.596892,856
2691,Sid Meier's Civilization V,99821.3,2.589208,596
675,Counter-Strike Source,96075.5,1.883556,978
3067,The Elder Scrolls V Skyrim,70889.3,2.958108,717
1313,Garry's Mod,49725.3,2.281203,731
490,Call of Duty Modern Warfare 2 - Multiplayer,42009.9,2.38316,343
1733,Left 4 Dead 2,33596.7,1.147005,951


In [15]:
# Top 10 most-purchased games
df_games.sort_values('total_purchases', ascending=False).iloc[:10]

Unnamed: 0,item_id,total_hours,mean_rating,total_purchases
922,Dota 2,981684.6,1.756914,4841
2994,Team Fortress 2,173673.3,1.783639,2323
3350,Unturned,16096.4,0.926801,1563
673,Counter-Strike Global Offensive,322771.6,3.918016,1412
1448,Half-Life 2 Lost Coast,184.4,0.122918,981
675,Counter-Strike Source,96075.5,1.883556,978
1733,Left 4 Dead 2,33596.7,1.147005,951
670,Counter-Strike,134261.1,1.596892,856
3426,Warframe,27074.6,1.700296,847
1445,Half-Life 2 Deathmatch,3712.9,0.359893,823


In [16]:
# Top 10 best-rated games.
# This ranking is not very meaningful: the games at the top have few purchases.
df_games.sort_values('mean_rating', ascending=False).iloc[:10]

Unnamed: 0,item_id,total_hours,mean_rating,total_purchases
1119,FIFA Manager 09,411.0,18.564076,1
2146,Out of the Park Baseball 16,660.8,12.209546,2
354,Black Ink,239.3,10.681597,3
1973,Movie Studio 13 Platinum - Steam Powered,166.0,7.145301,1
1563,Imperial Glory,170.0,6.830738,4
368,BlazBlue Continuum Shift Extend,233.8,5.410861,4
2909,Sunrider Academy,43.0,5.402448,1
2145,Out of the Park Baseball 15,549.2,5.043428,6
3505,Worldwide Soccer Manager 2009,295.0,4.986543,1
1120,FIFA Manager 11,229.0,4.863753,1


In [17]:
# Top 10 best-rated games among those with more than 800 purchases;
# restricting to widely bought games gives a more sensible ranking.
df_games.query('total_purchases > 800').sort_values('mean_rating', ascending=False).iloc[:10]

Unnamed: 0,item_id,total_hours,mean_rating,total_purchases
673,Counter-Strike Global Offensive,322771.6,3.918016,1412
675,Counter-Strike Source,96075.5,1.883556,978
2994,Team Fortress 2,173673.3,1.783639,2323
922,Dota 2,981684.6,1.756914,4841
3426,Warframe,27074.6,1.700296,847
670,Counter-Strike,134261.1,1.596892,856
1733,Left 4 Dead 2,33596.7,1.147005,951
3350,Unturned,16096.4,0.926801,1563
1445,Half-Life 2 Deathmatch,3712.9,0.359893,823
1448,Half-Life 2 Lost Coast,184.4,0.122918,981


Os tops indicados seriam tops de todos os tempos: a maioria dos jogos que aparecem neles é gratuita para jogar, enquanto outros são clássicos baratos.

In [18]:
# Preview the first rows of the per-user ratings frame
df_play.head(n=5)

Unnamed: 0,user_id,item_id,hours,mean_hours,rating
0,151603712,The Elder Scrolls V Skyrim,273.0,16.919231,16.135485
1,151603712,Fallout 4,87.0,16.919231,5.142078
2,151603712,Spore,14.9,16.919231,0.880655
3,151603712,Fallout New Vegas,12.1,16.919231,0.715163
4,151603712,Left 4 Dead 2,8.9,16.919231,0.526029


Vamos criar uma tabela de avaliação:
* (Rating Matrix)
Crie uma tabela onde as linhas são os item_id (jogos) e as colunas são os user_id (usuários), e os valores são os ratings implícitos.

In [19]:
# Prepare the pivot input: discard the columns the rating matrix does not use
df_play_drop = df_play.drop(['hours', 'mean_hours'], axis='columns')

In [21]:
# Print every user that has the same game listed more than once.
# One vectorised duplicated() pass over (user_id, item_id) replaces running a
# separate query() per user; users are printed in order of first appearance.
repeated_rows = df_play_drop.duplicated(subset=['user_id', 'item_id'])
users_with_repeats = df_play_drop.loc[repeated_rows, 'user_id']
for user in df_play_drop.loc[df_play_drop['user_id'].isin(users_with_repeats), 'user_id'].unique():
    print(user)

118664413
50769696
71411882
33865373
71510748
28472068
59925638
148362155
176261926


In [22]:
# Users that have a repeated (user_id, item_id) pair, derived from the data
# rather than pasted from printed output, so the list stays correct if the
# input file changes.
lista_duplicatas = (
    df_play_drop.loc[df_play_drop.duplicated(subset=['user_id', 'item_id']), 'user_id']
    .unique()
    .tolist()
)


In [23]:
# Keep only the first record of every (user_id, item_id) pair.
# drop_duplicates removes the repeated rows directly, so no row is blanked
# to NaN along the way and user_id keeps its integer dtype.
df_play_drop = df_play_drop.drop_duplicates(subset=['user_id', 'item_id'], keep='first')

In [24]:
# Remove every row that has a missing value in any column
df_play_drop = df_play_drop.dropna(axis=0, how='any')

In [25]:
# Re-run the duplicate check: print every user that still has the same game
# listed more than once. One vectorised duplicated() pass replaces running a
# separate query() per user. Nothing is printed once the duplicates are gone.
repeated_rows = df_play_drop.duplicated(subset=['user_id', 'item_id'])
users_with_repeats = df_play_drop.loc[repeated_rows, 'user_id']
for user in df_play_drop.loc[df_play_drop['user_id'].isin(users_with_repeats), 'user_id'].unique():
    print(user)

In [26]:
# Rating matrix: one row per game, one column per user, implicit rating as value
ratings_matrix = df_play_drop.pivot(index='item_id', columns='user_id', values='rating')
# A game the user never played gets a rating of 0
ratings_matrix = ratings_matrix.fillna(0)
ratings_matrix.head()

user_id,5250.0,76767.0,86540.0,144736.0,181212.0,229911.0,298950.0,381543.0,547685.0,554278.0,...,309228590.0,309255941.0,309262440.0,309265377.0,309404240.0,309434439.0,309554670.0,309626088.0,309824202.0,309903146.0
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
007 Legends,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0RBITALIS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1... 2... 3... KICK IT! (Drop That Beat Like an Ugly Baby),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10 Second Ninja,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
# Compute the item-to-item cosine similarity and label both axes with the game names
item_similarity = cosine_similarity(ratings_matrix)
item_similarity_df = pd.DataFrame(
    data=item_similarity,
    columns=ratings_matrix.index,
    index=ratings_matrix.index,
)
item_similarity_df.head()

item_id,007 Legends,0RBITALIS,1... 2... 3... KICK IT! (Drop That Beat Like an Ugly Baby),10 Second Ninja,"10,000,000",100% Orange Juice,1000 Amps,12 Labours of Hercules,12 Labours of Hercules II The Cretan Bull,12 Labours of Hercules III Girl Power,...,rFactor,rFactor 2,realMyst,realMyst Masterpiece Edition,resident evil 4 / biohazard 4,rymdkapsel,sZone-Online,the static speaks my name,theHunter,theHunter Primal
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
007 Legends,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0RBITALIS,0.0,1.0,0.0,0.082879,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.005016,0.0
1... 2... 3... KICK IT! (Drop That Beat Like an Ugly Baby),0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000146,0.0,0.0,0.0
10 Second Ninja,0.0,0.082879,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10000000,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
def recommend_items(item_id, item_similarity_df, num_recommendations=5):
    """Return the items most similar to ``item_id``.

    Parameters
    ----------
    item_id : label
        Column (and row) label of the reference item in ``item_similarity_df``.
    item_similarity_df : pd.DataFrame
        Item-by-item similarity table. The column named ``item_id`` is read
        and the entry labelled ``item_id`` is excluded from the result.
    num_recommendations : int, default 5
        Number of entries to return.

    Returns
    -------
    pd.Series
        Similarity scores in descending order, indexed by item label.

    Raises
    ------
    KeyError
        If ``item_id`` is not a column of ``item_similarity_df``.
    """
    if item_id not in item_similarity_df.columns:
        raise KeyError(f"{item_id!r} is not present in the similarity table")
    # Remove the reference item itself before ranking.
    scores = item_similarity_df[item_id].drop(item_id)
    # A stable sort makes the order of tied scores reproducible between runs.
    ranked = scores.sort_values(ascending=False, kind='mergesort')
    return ranked.head(num_recommendations)


In [35]:
# Ten games most similar to Counter-Strike
recommend_items('Counter-Strike', item_similarity_df, num_recommendations=10)

item_id
Containment The Zombie Puzzler           0.455379
Weird Worlds Return to Infinite Space    0.455379
Space Warp                               0.455379
Actual Sunlight                          0.455379
Overture                                 0.455185
Reprisal Universe                        0.455101
Gigantic Army                            0.419344
ProtoGalaxy                              0.417254
Orborun                                  0.413713
Counter-Strike Nexon Zombies             0.375760
Name: Counter-Strike, dtype: float64

In [39]:
# Ten games most similar to Fallout 4
recommend_items('Fallout 4', item_similarity_df, num_recommendations=10)

item_id
Mega Man Legacy Collection          0.492852
Myst V                              0.435477
The Witcher 3 Wild Hunt             0.420982
Satellite Reign                     0.328512
Middle-earth Shadow of Mordor       0.318584
Fallout New Vegas                   0.313947
Overlord II                         0.312697
Sins of a Solar Empire Rebellion    0.302871
Gods Will Be Watching               0.302772
Zeno Clash 2                        0.301537
Name: Fallout 4, dtype: float64

###### A matriz de similaridade é feita a partir da matriz de avaliações.
###### As recomendações são baseadas nos padrões de jogos de usuários. Jogos frequentemente jogados pelos mesmos usuários são considerados semelhantes.

NameError: name 'pd' is not defined

In [None]:
# Path to the data file
# NOTE(review): the loading cell earlier in this notebook reads a different
# file ('steam-200k.csv' under ../data) — confirm this name is intended.
file_path = 'steam_data.csv'

# Create an instance of the class
# NOTE(review): RecommenderSystem is not defined or imported anywhere in the
# visible notebook — confirm where it comes from; as shown, this line would
# presumably raise NameError.
recommender = RecommenderSystem(file_path)

# Persist the instance to disk with joblib
joblib.dump(recommender, 'recommender_system.pkl')
