In [2]:
import concurrent.futures as cf
import os
import threading
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import trange, tqdm

In [5]:
def request(url, timeout=30):
    """Download ``url`` and return its body parsed as a ``BeautifulSoup`` tree.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without it ``requests.get`` may block indefinitely on a stalled
        connection, which would freeze the calling worker.

    Returns
    -------
    bs4.BeautifulSoup
        The response text parsed with Python's built-in ``html.parser``.
        The HTTP status code is not checked: an error page is parsed like
        any other page.

    Raises
    ------
    requests.exceptions.RequestException
        On connection errors or when ``timeout`` expires.
    """
    # Browser-like User-Agent sent with every request
    # (presumably required because the site rejects the default agent -- TODO confirm).
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36'}
    response = requests.get(url, headers=headers, timeout=timeout)
    return BeautifulSoup(response.text, 'html.parser')

In [6]:
def get_nb_pages(url):
    """Return the number of result pages advertised by the page at ``url``.

    The page is fetched with ``request`` and searched for a
    ``<div class="pages">`` pager; the text of the link inside its
    ``<li class="last_page">`` item is taken as the page count.

    Parameters
    ----------
    url : str
        Address of the first page of a paginated listing.

    Returns
    -------
    int
        The number shown on the "last page" link, or 1 when the page has no
        pager, no ``last_page`` item, or no link inside that item.

    Raises
    ------
    ValueError
        If the "last page" link text is not an integer.
    """
    soup = request(url)

    pager = soup.find('div', class_='pages')
    if pager is None:
        return 1

    # Walk the pager step by step so that a partially rendered pager falls
    # back to a single page instead of raising AttributeError on None.
    last_item = pager.find('li', class_='last_page')
    last_link = last_item.find('a') if last_item is not None else None
    if last_link is None:
        return 1

    return int(last_link.text)

In [7]:
def get_game_reviews(url):
    """Collect the names of the users who reviewed a game.

    Parameters
    ----------
    url : str
        Address of the game page; ``'/user-reviews'`` is appended to reach
        the user-review listing.

    Returns
    -------
    pandas.DataFrame
        One column, ``'name'``, holding the text of each reviewer link in
        the order the reviews were encountered.

    Notes
    -----
    Pages are requested as ``?page=0 .. ?page=num_lp-1`` where ``num_lp``
    comes from ``get_nb_pages``. A review without a reviewer link is skipped
    on its own; the remaining reviews on the page are still collected.
    """
    url = url + '/user-reviews'
    num_lp = get_nb_pages(url)

    names = []
    for page in tqdm(range(num_lp)):
        soup = request(url + '?page=' + str(page))

        for rv in soup.find_all('div', class_='review_content'):
            name_box = rv.find('div', class_='name')
            link = name_box.find('a') if name_box is not None else None
            if link is None:
                # No usable reviewer link on this entry: skip only this one.
                continue
            names.append(link.text)

    return pd.DataFrame({'name': names})

In [8]:
def get_review_user(user):
    """Collect the titles and scores listed on a Metacritic user profile.

    Parameters
    ----------
    user : object
        User identifier; ``str(user)`` is appended to
        ``https://www.metacritic.com/user/`` to build the profile address.

    Returns
    -------
    pandas.DataFrame
        Two columns of equal length: ``'name'`` (text of the product-title
        link) and ``'rating'`` (text of the first ``<div>`` inside the
        ``review_score`` block).

    Notes
    -----
    An entry missing its title link or its score is skipped on its own, so
    one malformed review neither aborts the whole profile nor leaves the two
    columns with different lengths.
    """
    url = 'https://www.metacritic.com/user/' + str(user)
    # NOTE(review): the page count is read from the unfiltered profile while
    # the pages below are requested with '?myscore-filter=Game'; if the
    # filtered listing is shorter, some requests return no reviews -- confirm.
    num_lp = get_nb_pages(url)

    titles, ratings = [], []
    for i in range(num_lp):
        soup = request(url + '?myscore-filter=Game&page=' + str(i))

        for entry in soup.find_all('div', class_='review_content'):
            title_box = entry.find('div', class_='product_title')
            score_box = entry.find('div', class_='review_score')
            title_link = title_box.find('a') if title_box is not None else None
            score_divs = score_box.find_all('div') if score_box is not None else []
            if title_link is None or not score_divs:
                # Incomplete entry: skip it rather than drop the rest of the page.
                continue
            # Both values are appended together so the columns stay aligned.
            titles.append(title_link.text)
            ratings.append(score_divs[0].text)

    return pd.DataFrame({'name': titles, 'rating': ratings})

In [9]:
# Shared accumulator filled by get_profil: profils['name'][k] is a user and
# profils['games'][k] is the DataFrame returned by get_review_user for that user.
profils = {'name': [], 'games': []}
# Serialises the paired appends below when get_profil runs in several threads.
_profils_lock = threading.Lock()


def get_profil(user):
    """Scrape one user's reviews and record them in ``profils``.

    The reviews are fetched first; the user name and the resulting frame are
    then appended together while holding a lock. This keeps the two lists
    index-aligned even when several threads call this function at once, and
    appends nothing at all if the scrape raises.

    Parameters
    ----------
    user : object
        User identifier, passed unchanged to ``get_review_user``.

    Returns
    -------
    None
    """
    games = get_review_user(user)
    with _profils_lock:
        profils['name'].append(user)
        profils['games'].append(games)


In [35]:
# Scrape the names of every user who reviewed this game; `review` is a
# DataFrame with a single 'name' column (see get_game_reviews).
url = 'https://www.metacritic.com/game/playstation-4/the-witcher-3-wild-hunt'
review = get_game_reviews(url)

  0%|          | 0/23 [00:00<?, ?it/s]

In [36]:
# Display the reviewer names returned by get_game_reviews.
review

Unnamed: 0,name
0,jakeman25
1,NeOmega
2,ChiLocc
3,RecoveryRx
4,taress
...,...
2289,VeryGoodReviews
2290,IDDQYU
2291,Goji_54
2292,SauceBox6


In [37]:
# Keep only the reviewers whose name is not already stored in the saved dataset.
data = pd.read_csv("data/data.csv")
already_saved = review['name'].isin(data['name'])
review = review.loc[~already_saved]


In [38]:
# Display the reviewers left after removing names already present in data/data.csv.
review

Unnamed: 0,name
2,ChiLocc
3,RecoveryRx
4,taress
5,adorian89
7,darthsouheil
...,...
2283,Thosevelvetygms
2284,tonyhank232
2286,jorge7ejea
2290,IDDQYU


In [39]:
start = time.time()

# Scrape every remaining reviewer's profile with two worker threads.
# Each call is submitted explicitly and its outcome inspected, so an exception
# raised inside a worker is recorded instead of being silently discarded.
failed_profiles = []  # (user name, exception) for every call that raised
with cf.ThreadPoolExecutor(max_workers=2) as executor:
    pending = {executor.submit(get_profil, name): name for name in review.name}
    for future in cf.as_completed(pending):
        error = future.exception()
        if error is not None:
            failed_profiles.append((pending[future], error))

end = time.time()
print(end - start)
if failed_profiles:
    # Best effort: report the failures but keep whatever was scraped.
    print(f"{len(failed_profiles)} profile(s) could not be scraped")

1198.6308524608612


In [40]:
# Empty long-format table: one row per (user name, game title, rating).
df = pd.DataFrame(columns=['name', 'title', 'rating'])

In [41]:
# Flatten `profils` into one (user, title, rating) row per review.
# The rows are gathered in a plain list and the frame is built once at the
# end: growing a DataFrame one row at a time copies it on every insertion.
# Rebuilding from `profils` also makes this cell safe to re-run.
records = []
for user, games in tqdm(zip(profils['name'], profils['games']), total=len(profils['name'])):
    for title, rating in zip(games['name'], games['rating']):
        records.append([user, title, rating])

df = pd.DataFrame(records, columns=['name', 'title', 'rating'])

  0%|          | 0/3096 [00:00<?, ?it/s]

In [42]:
# Display the flattened (name, title, rating) table.
df

Unnamed: 0,name,title,rating
0,osmondxboxlive,The Last of Us Remastered,4
1,Nataraja,The Last of Us Remastered,10
2,Juankal,The Last of Us Remastered,10
3,Juankal,Grand Theft Auto V,10
4,Stefke,The Last of Us Remastered,10
...,...,...,...
25223,tonyhank232,Far Cry 4,9
25224,jorge7ejea,The Witcher 3: Wild Hunt,0
25225,IDDQYU,The Witcher 3: Wild Hunt,10
25226,Estev0n,The Witcher 3: Wild Hunt,10


In [43]:
# Remove rows that are exact duplicates across all three columns.
# Note: this rebinds `df`, so the earlier, unfiltered frame is no longer reachable.
df = df.drop_duplicates()

In [44]:
# Display the table after duplicate removal.
df

Unnamed: 0,name,title,rating
0,osmondxboxlive,The Last of Us Remastered,4
1,Nataraja,The Last of Us Remastered,10
2,Juankal,The Last of Us Remastered,10
3,Juankal,Grand Theft Auto V,10
4,Stefke,The Last of Us Remastered,10
...,...,...,...
25223,tonyhank232,Far Cry 4,9
25224,jorge7ejea,The Witcher 3: Wild Hunt,0
25225,IDDQYU,The Witcher 3: Wild Hunt,10
25226,Estev0n,The Witcher 3: Wild Hunt,10


In [45]:
# Append the new rows to the dataset, without header or index.
# NOTE(review): duplicates were only removed within `df`, not against rows
# already in the file, and re-running this cell appends the same rows again.
df.to_csv('data/data.csv', mode='a', header=False, index=False)

In [14]:
import json

# Request the first pages of the RAWG game listing.
RAWG_GAMES_URL = "https://api.rawg.io/api/games"
# NOTE(review): the literal fallback is a credential committed to the notebook;
# set the RAWG_API_KEY environment variable, then rotate and remove the literal.
RAWG_API_KEY = os.environ.get("RAWG_API_KEY", "62015356d5514b0c9cac8884f64d643a")
# Upper bound on attempts per page, so a persistent error cannot loop forever.
RAWG_MAX_ATTEMPTS = 5

r = []  # accumulated "results" entries of every fetched page
for i in tqdm(range(1, 3), desc="Extraction"):
    query = {"key": RAWG_API_KEY, "page_size": 40, "page": i, "ordering": "released"}
    for attempt in range(1, RAWG_MAX_ATTEMPTS + 1):
        x = requests.get(RAWG_GAMES_URL, params=query, timeout=30)
        if x.status_code == 200:
            break
        if attempt == RAWG_MAX_ATTEMPTS:
            raise RuntimeError(
                f"RAWG page {i} failed with HTTP {x.status_code} after {attempt} attempts"
            )
        print("Problème, on recommence")
        # Wait a little longer after each failure instead of retrying at once.
        time.sleep(attempt)
    r += x.json()["results"]

Extraction:   0%|          | 0/2 [00:00<?, ?it/s]

In [15]:
# Display the raw list of game records collected from the RAWG responses.
r

[{'id': 892902,
  'slug': 'armored-core-vi-fires-of-rubicon',
  'name': 'ARMORED CORE VI FIRES OF RUBICON',
  'released': None,
  'tba': True,
  'background_image': 'https://media.rawg.io/media/games/b59/b59f7aa35d656cb35f8fe534878ef536.jpg',
  'rating': 0.0,
  'rating_top': 0,
  'ratings': [],
  'ratings_count': 0,
  'reviews_text_count': 0,
  'added': 1,
  'added_by_status': {'toplay': 1},
  'metacritic': None,
  'playtime': 0,
  'suggestions_count': 329,
  'updated': '2022-12-10T20:47:25',
  'user_game': None,
  'reviews_count': 0,
  'community_rating': 0,
  'saturated_color': '0f0f0f',
  'dominant_color': '0f0f0f',
  'platforms': [{'platform': {'id': 186,
     'name': 'Xbox Series S/X',
     'slug': 'xbox-series-x',
     'image': None,
     'year_end': None,
     'year_start': 2020,
     'games_count': 678,
     'image_background': 'https://media.rawg.io/media/games/dcb/dcbb67f371a9a28ea38ffd73ee0f53f3.jpg'},
    'released_at': None,
    'requirements_en': None,
    'requirements_r

In [16]:
# Serialise the collected records to a JSON string; the result is only
# displayed here, not stored in a variable or written to disk.
json.dumps(r)

'[{"id": 892902, "slug": "armored-core-vi-fires-of-rubicon", "name": "ARMORED CORE VI FIRES OF RUBICON", "released": null, "tba": true, "background_image": "https://media.rawg.io/media/games/b59/b59f7aa35d656cb35f8fe534878ef536.jpg", "rating": 0.0, "rating_top": 0, "ratings": [], "ratings_count": 0, "reviews_text_count": 0, "added": 1, "added_by_status": {"toplay": 1}, "metacritic": null, "playtime": 0, "suggestions_count": 329, "updated": "2022-12-10T20:47:25", "user_game": null, "reviews_count": 0, "community_rating": 0, "saturated_color": "0f0f0f", "dominant_color": "0f0f0f", "platforms": [{"platform": {"id": 186, "name": "Xbox Series S/X", "slug": "xbox-series-x", "image": null, "year_end": null, "year_start": 2020, "games_count": 678, "image_background": "https://media.rawg.io/media/games/dcb/dcbb67f371a9a28ea38ffd73ee0f53f3.jpg"}, "released_at": null, "requirements_en": null, "requirements_ru": null}, {"platform": {"id": 1, "name": "Xbox One", "slug": "xbox-one", "image": null, "