In [1]:
import pandas as pd
import re
import json
import os
import numpy as np
import matplotlib.pyplot as plt

### Load internet top lists

Alexa global top 1 Million websites

In [3]:
#Alexa top1M
df_alexa= pd.read_csv("./internet_top_lists/alexa-top-1m.txt", header = None)
df_alexa
# df_alexa[1] = df_alexa[0]
# df_alexa[0] = df_alexa.index + 1
#search by str.contain
 #  df_alexa[df_alexa[1].str.contains('poki')].head()
df_alexa.columns = ['Rank', 'Website']
df_alexa.head(5)

Unnamed: 0,Rank,Website
0,1,google.com
1,2,youtube.com
2,3,baidu.com
3,4,facebook.com
4,5,bilibili.com


Majestic global top 1 Million domains (based on subnets)

In [4]:
#Majestic top1M
df_majestic = pd.read_csv("./internet_top_lists/majestic_million.csv")
df_majestic.head()

Unnamed: 0,GlobalRank,TldRank,Domain,TLD,RefSubNets,RefIPs,IDN_Domain,IDN_TLD,PrevGlobalRank,PrevTldRank,PrevRefSubNets,PrevRefIPs
0,1,1,google.com,com,504710,2632872,google.com,com,1,1,505887,2644923
1,2,2,facebook.com,com,503645,2785660,facebook.com,com,2,2,504788,2797727
2,3,3,youtube.com,com,455709,2272575,youtube.com,com,3,3,456951,2283594
3,4,4,twitter.com,com,447261,2246848,twitter.com,com,4,4,448381,2256993
4,5,5,instagram.com,com,379059,1833615,instagram.com,com,5,5,380176,1842150


Cisco Umbrella global top 1 Million domains (based on DNS)

In [5]:
#Cisco Umbrella top 1M
df_umbrella = pd.read_csv("./internet_top_lists/cisco_umbrella_top1M.csv", header=None)
df_umbrella.columns = ['Rank', 'Domain']
df_umbrella.head()

Unnamed: 0,Rank,Domain
0,1,netflix.com
1,2,ftl.netflix.com
2,3,prod.ftl.netflix.com
3,4,api-global.netflix.com
4,5,ichnaea.netflix.com


Quantcast top 1M websites (US-ONLY!)

In [24]:
#Quantcast top 1M US
df_quantcast = pd.read_csv("./internet_top_lists/Quantcast-top1M-USonly.txt", sep='\t')
df_umbrella.columns = ['Rank', 'Website']
df_quantcast

Unnamed: 0,Rank,Site
0,1,google.com
1,2,facebook.com
2,3,amazon.com
3,4,youtube.com
4,5,twitter.com
...,...,...
467517,464839,zerowastenerd.com
467518,464839,zevyjoy.com
467519,464839,zombietechs.com
467520,464839,zonacooks.com


### Parse "io games" search results (json to DataFrame)

A score is added, based on ranking and prominence in search results

Every time a website is found in search results, **do:** $score = score + x$ 

where $x \in [0,1.0]$ (x=1 if website is the first search result, x=0 if it is the last one) 

In [7]:
#convert json search results in list of tuples 
def json_to_result_tuples(json_file, engine, lan="english"):
    results_from_json = []
    with open(json_file) as f:
        data = json.load(f)
    n_res = len(data)
    for tup in data:
        #extract website
        if tup['url'][0:8] == "https://":
            website = tup['url'].split("https://")[1].split("/")[0]
        elif tup['url'][0:7] == "http://":
            website = tup['url'].split("http://")[1].split("/")[0]
        else:
            continue

        #add tuple (add "score" value e.g., rank1-->100, rank2-->99, etc..)
        results_from_json.append ({'engine':engine,
                            'keywords': "io games",
                            'language':lan,
                            'website':website, 
                            'search_rank':tup['rank'],
                            'search_score': 1-tup['rank']/n_res}) #relative
    return results_from_json

#### Include one of the following: 

#### Case 1: Google only, top30 languages, 100 results per language

In [10]:
res_tuples = []

# include google, different languages, no proxy
f_list = %sx ls ./output/google*.json
for f_json in f_list:
    lan = f_json.split("google_")[1].split("_")[0]
    res_tuples.extend(json_to_result_tuples(f_json, "google", lan))
    
#create dataframe from result tuples
df_res = pd.DataFrame(res_tuples)

#website with top score
df_topscore = df_res.groupby("website")['search_score'].sum().sort_values(ascending=False)

df_res

Unnamed: 0,engine,keywords,language,website,search_rank,search_score
0,google,io games,arabic,www.pacogames.com,1,0.989899
1,google,io games,arabic,www.pacogames.com,2,0.979798
2,google,io games,arabic,suppsington.ml,3,0.969697
3,google,io games,arabic,www.dm.gov.ae,4,0.959596
4,google,io games,arabic,www.ar.obfog.com,5,0.949495
...,...,...,...,...,...,...
5371,google,io games,vietnamese,yeoldeenglishflyshop.com,95,0.040404
5372,google,io games,vietnamese,vi.aliexpress.com,96,0.030303
5373,google,io games,vietnamese,listfortmcmurray.com,97,0.020202
5374,google,io games,vietnamese,topgame.vn,98,0.010101


In [11]:
res_tuples = []

# include google, different languages, no proxy
f_list = %sx ls ./output/baidu*.json
for f_json in f_list:
    lan = f_json.split("baidu_")[1].split("_")[0]
    res_tuples.extend(json_to_result_tuples(f_json, "baidu", lan))
    
#create dataframe from result tuples
df_res = pd.DataFrame(res_tuples)

#website with top score
df_topscore = df_res.groupby("website")['search_score'].sum().sort_values(ascending=False)

df_res

Unnamed: 0,engine,keywords,language,website,search_rank,search_score
0,baidu,io games,arabic,www.100try.com,1,0.99
1,baidu,io games,arabic,tv.sohu.com,2,0.98
2,baidu,io games,arabic,www.chayiba.com,3,0.97
3,baidu,io games,arabic,www.iapolo.com,4,0.96
4,baidu,io games,arabic,www.iapolo.com,5,0.95
...,...,...,...,...,...,...
4980,baidu,io games,vietnamese,www.doc88.com,6,0.40
4981,baidu,io games,vietnamese,space.bilibili.com,7,0.30
4982,baidu,io games,vietnamese,cn.linguee.com,8,0.20
4983,baidu,io games,vietnamese,blog.51cto.com,9,0.10


In [12]:
res_tuples = []

# include google, different languages, no proxy
f_list = %sx ls ./output/ask.com*.json
for f_json in f_list:
    lan = f_json.split("ask.com_")[1].split("_")[0]
    res_tuples.extend(json_to_result_tuples(f_json, "ask.com", lan))
    
#create dataframe from result tuples
df_res = pd.DataFrame(res_tuples)

#website with top score
df_topscore = df_res.groupby("website")['search_score'].sum().sort_values(ascending=False)

df_res

Unnamed: 0,engine,keywords,language,website,search_rank,search_score
0,ask.com,io games,english,www.askmoney.com,1,0.989011
1,ask.com,io games,english,www.askmoney.com,2,0.978022
2,ask.com,io games,english,www.questionsanswered.net,3,0.967033
3,ask.com,io games,english,www.freeonlinegames.com,4,0.956044
4,ask.com,io games,english,www.arkadium.com,5,0.945055
...,...,...,...,...,...,...
222,ask.com,io games,telugu,websetnet.net,29,0.121212
223,ask.com,io games,telugu,www.ntnews.com,30,0.090909
224,ask.com,io games,telugu,www.actualidadgadget.com,31,0.060606
225,ask.com,io games,telugu,chrome.google.com,32,0.030303


In [13]:
res_tuples = []

# include google, different languages, no proxy
f_list = %sx ls ./output/bing*.json
for f_json in f_list:
    lan = f_json.split("bing_")[1].split("_")[0]
    res_tuples.extend(json_to_result_tuples(f_json, "bing", lan))
    
#create dataframe from result tuples
df_res = pd.DataFrame(res_tuples)

#website with top score
df_topscore = df_res.groupby("website")['search_score'].sum().sort_values(ascending=False)

df_res

Unnamed: 0,engine,keywords,language,website,search_rank,search_score
0,bing,io games,arabic,mawdoo3.com,1,0.99
1,bing,io games,arabic,sabq.org,2,0.98
2,bing,io games,arabic,www.facebook.com,3,0.97
3,bing,io games,arabic,healthawarenessalaa.blogspot.com,4,0.96
4,bing,io games,arabic,apkgk.com,5,0.95
...,...,...,...,...,...,...
16350,bing,io games,vietnamese,www.pinterest.ca,96,0.04
16351,bing,io games,vietnamese,www.microsoft.com,97,0.03
16352,bing,io games,vietnamese,vi.m.wikipedia.org,98,0.02
16353,bing,io games,vietnamese,vi.m.wikipedia.org,99,0.01


#### Case 2: Google only, English only, 423 results 

(manual search, as no more than 100 results can be requested by crafting the request url)

In [14]:
res_tuples = []
f_json = "./output/google_english_423__res.json"

lan = f_json.split("google_")[1].split("_")[0]
res_tuples.extend(json_to_result_tuples(f_json, "google", lan))
    
#create dataframe from result tuples
df_res = pd.DataFrame(res_tuples)

#website with top score
df_topscore = df_res.groupby("website")['search_score'].sum().sort_values(ascending=False)

df_res

Unnamed: 0,engine,keywords,language,website,search_rank,search_score
0,google,io games,english,poki.com,1,0.997636
1,google,io games,english,www.crazygames.com,2,0.995272
2,google,io games,english,www.kiloo.com,3,0.992908
3,google,io games,english,www.quora.com,4,0.990544
4,google,io games,english,www.io-games.io,5,0.988180
...,...,...,...,...,...,...
418,google,io games,english,grad.center,419,0.009456
419,google,io games,english,padsreds.com,420,0.007092
420,google,io games,english,mktabah.com,421,0.004728
421,google,io games,english,cacti.forestjournal.org,422,0.002364


#### Case 2: Google+Bing+Baidu+Yahoo, English only, query for 1000 res

In [15]:
res_tuples = []

#add google (manual search)
f_json = "./output/google_english_423__res.json"
res_tuples.extend(json_to_result_tuples(f_json, "google", "english"))

#add yahoo, bing, baidu
for engine in ["baidu", "bing", "yahoo"]:
    f_json = "./output/" + engine + "_english_1000__res.json"
    res_tuples.extend(json_to_result_tuples(f_json, engine, "english"))

#create dataframe from result tuples
df_res = pd.DataFrame(res_tuples)

#website with top score (based frequency+ranking in search results)
df_topscore = df_res.groupby("website")['search_score'].sum().sort_values(ascending=False)

df_res

Unnamed: 0,engine,keywords,language,website,search_rank,search_score
0,google,io games,english,poki.com,1,0.997636
1,google,io games,english,www.crazygames.com,2,0.995272
2,google,io games,english,www.kiloo.com,3,0.992908
3,google,io games,english,www.quora.com,4,0.990544
4,google,io games,english,www.io-games.io,5,0.988180
...,...,...,...,...,...,...
3033,yahoo,io games,english,teamthetagames.itch.io,836,0.007126
3034,yahoo,io games,english,vollkorn-games.itch.io,837,0.005938
3035,yahoo,io games,english,throughgames.itch.io,838,0.004751
3036,yahoo,io games,english,razek-games.itch.io,840,0.002375


#### Case 3: Google+Bing+Baidu+Yahoo, top30 languages, all available res in ./output

In [16]:
res_tuples = []

#more engines, different languages, and no proxy
f_list = %sx ls ./output/*.json
for f_json in f_list:
    engine = f_json.split('_')[0]
    lan = f_json.split('_')[1]
    res_tuples.extend(json_to_result_tuples(f_json, engine, lan))
    
#create dataframe from result tuples
df_res = pd.DataFrame(res_tuples)

#website with top score (based frequency+ranking in search results)
df_topscore = df_res.groupby("website")['search_score'].sum().sort_values(ascending=False)

df_res

Unnamed: 0,engine,keywords,language,website,search_rank,search_score
0,./output/ask.com,io games,english,www.askmoney.com,1,0.989011
1,./output/ask.com,io games,english,www.askmoney.com,2,0.978022
2,./output/ask.com,io games,english,www.questionsanswered.net,3,0.967033
3,./output/ask.com,io games,english,www.freeonlinegames.com,4,0.956044
4,./output/ask.com,io games,english,www.arkadium.com,5,0.945055
...,...,...,...,...,...,...
28721,./output/yahoo,io games,vietnamese,www.hellokids.com,73,0.231579
28722,./output/yahoo,io games,vietnamese,www.y8.net,75,0.210526
28723,./output/yahoo,io games,vietnamese,slitherio.vi.softonic.com,76,0.200000
28724,./output/yahoo,io games,vietnamese,bba-game-studio-shooting-games.vn.aptoide.com,94,0.010526


### Annotate  websites in google searches against alexa top1M

In [17]:
def get_alexa_rank(website):
    #some websites may or may not appear with "www."
    alexa_rank = None
    if website[0:4] == "www.":
        df_alexa_select = df_alexa[df_alexa["Website"] == (website.split("www.")[1])].values
        if len(df_alexa_select)>0:
            alexa_rank = df_alexa_select[0][0]
        else:
            df_alexa_select = df_alexa[df_alexa["Website"] == website].values
            if len(df_alexa_select)>0:
                alexa_rank = df_alexa_select[0][0]
    else:
        df_alexa_select = df_alexa[df_alexa["Website"]==website].values
        if len(df_alexa_select)>0:
            alexa_rank = df_alexa_select[0][0]
        else:
            df_alexa_select = df_alexa[df_alexa["Website"] == "www."+website].values
            if len(df_alexa_select)>0:
                alexa_rank = df_alexa_select[0][0]
    return alexa_rank

def get_umbrella_rank(domain):
    #some domains may or may not appear with "www." 
    umbrella_rank = None
    if domain[0:4] == "www.":
        df_umbrella_select = df_umbrella[df_umbrella["Domain"] == (domain.split("www.")[1])].values
        if len(df_umbrella_select)>0:
            umbrella_rank = df_umbrella_select[0][0]
        else:
            df_umbrella_select = df_umbrella[df_umbrella["Domain"] == domain].values
            if len(df_umbrella_select)>0:
                umbrella_rank = df_umbrella_select[0][0]
    else:
        df_umbrella_select = df_umbrella[df_umbrella["Domain"] == domain].values
        if len(df_umbrella_select)>0:
            umbrella_rank = df_umbrella_select[0][0]
        else:
            df_umbrella_select = df_umbrella[df_umbrella["Domain"] == "www."+domain].values
            if len(df_umbrella_select)>0:
                umbrella_rank = df_umbrella_select[0][0]
    return umbrella_rank

def get_quantcast_rank(site):
    #some sites may or may not appear with "www."
    quantcast_rank = None
    if site[0:4] == "www.":
        df_quantcast_select = df_quantcast[df_quantcast["Site"] == (site.split("www.")[1])].values
        if len(df_quantcast_select)>0:
            quantcast_rank = df_quantcast_select[0][0]
        else:
            df_quantcast_select = df_quantcast[df_quantcast["Site"] == site].values
            if len(df_quantcast_select)>0:
                quantcast_rank = df_quantcast_select[0][0]
    else:
        df_quantcast_select = df_quantcast[df_quantcast["Site"]==site].values
        if len(df_quantcast_select)>0:
            quantcast_rank = df_quantcast_select[0][0]
        else:
            df_quantcast_select = df_quantcast[df_quantcast["Site"] == "www."+site].values
            if len(df_quantcast_select)>0:
                quantcast_rank = df_quantcast_select[0][0]
    return quantcast_rank

def get_majestic_rank(domain):
    #some domains may or may not appear with "www."
    majestic_rank = None
    if domain[0:4] == "www.":
        df_majestic_select = df_majestic[df_majestic["Domain"] == (domain.split("www.")[1])]['GlobalRank'].values
        if len(df_majestic_select)>0:
            majestic_rank = df_majestic_select[0]
        else:
            df_majestic_select = df_majestic[df_majestic["Domain"] == domain]['GlobalRank'].values
            if len(df_majestic_select)>0:
                majestic_rank = df_majestic_select[0]
    else:
        df_majestic_select = df_majestic[df_majestic["Domain"]==domain]['GlobalRank'].values
        if len(df_majestic_select)>0:
            majestic_rank = df_majestic_select[0]
        else:
            df_majestic_select = df_majestic[df_majestic["Domain"] == "www."+domain]['GlobalRank'].values
            if len(df_majestic_select)>0:
                majestic_rank = df_majestic_select[0]
    return majestic_rank

Annotate ranking from internet top lists (**this might take a few minutes**)

In [18]:
website

NameError: name 'website' is not defined

In [19]:
combo_data = []

for website, score in zip(df_topscore.index, df_topscore):
    #each of these return nan if no match in the top1M list
    alexa_rank = get_alexa_rank(website)
    umbrella_rank = get_umbrella_rank(website)
    majestic_rank = get_majestic_rank(website)
    quantcast_rank = get_quantcast_rank(website)
    combo_data.append({'website':website, 
                       'search_score': score, 
                       'alexa_rank': alexa_rank, 
                       'majestic_rank': majestic_rank, 
                       'umbrella_rank': umbrella_rank , 
                       'quantcast_rank': quantcast_rank})
#     plt.scatter(alexa_rank, score)
# plt.xlabel("Alexa rank")
# plt.ylabel("search score)")
# plt.xscale("log")
# plt.yscale("log")
# plt.show()

#combo search_score + internet toplist ranking 
df_combo = pd.DataFrame(combo_data)

KeyError: 'Domain'

In [20]:
df_combo['is_marketplace'] = 0
df_combo['is_iogame'] = 0
df_combo.sort_values('alexa_rank').to_csv("top_results_new.csv",)

NameError: name 'df_combo' is not defined

#### Define a set of "prominent" websites based on (i) alexa rank and/or (ii) frequency+ranking in search results
 
##### TODO: discuss criteria 

In [21]:
#search score + alexa ranking


#e.g., select based on some thresholds on rank and score
# df_combo = df_combo[(df_combo['search_score']>10) & (df_combo['alexa_rank']<10000)].sort_values('alexa_rank')

#e.g., select based on alexa rank only
# df_combo = df_combo[(df_combo['alexa_rank']<1e5)].sort_values('alexa_rank')

#e.g., select all, order by alexa rank
df_combo = df_combo.sort_values('alexa_rank')
print(len(df_combo))
#Manual checking of websites to discover io games marketplaces can start from here:
print("Website Alexa_rank")
for x,y in df_combo.iterrows():
    print(y['website'], y['alexa_rank'])

NameError: name 'df_combo' is not defined

In [22]:
import requests
"""
old, replaced 
"""
#GOOGLE
r = requests.get('https://www.crazygames.com/c/io', auth=('user', 'pass'))
print(r.text)

http_urls = re.findall('http://.+?/',r.text)
http_urls = [x for x in http_urls if (x[-2].isalpha() or x[-2].isdigit()) and ' ' not in x]

https_urls = re.findall('https://.+?/',r.text)
https_urls = [x for x in https_urls if (x[-2].isalpha() or x[-2].isdigit()) and ' ' not in x]

urls = list(set().union(http_urls, https_urls)) 
print("%d urls collected" % len(urls))
urls

#https://edmundmartin.com/scraping-baidu-with-python/

<!DOCTYPE html><html lang="en" dir="ltr"><head><meta charSet="utf-8" /><meta name="viewport" content="user-scalable=no, initial-scale=1.0, minimum-scale=1.0, maximum-scale=1.0, width=device-width, height=device-height" /><script async="" src="https://www.googletagmanager.com/gtag/js?id=AW-312835820"></script><script>
  window.dataLayer = window.dataLayer || [];
  function gtag() {dataLayer.push(arguments)};
  </script><title>.io Games - Play .io Games on CrazyGames</title><meta http-equiv="Accept-CH" content="DPR" /><meta name="description" content="We collected 412 of the best free online .io games. These games include browser games for both your computer and mobile devices, as well as apps for your Android and iOS phones and tablets. They include new .io games such as  and top .io games such as Shell Shockers, Smash Karts, and Rocket Bot Royale." /><link rel="canonical" href="https://www.crazygames.com/c/io" /><link rel="alternate" hrefLang="nl" href="https://www.crazygames.nl/catego

['https://www.crazygames.com.br/',
 'http://www.crazygames.com/',
 'https://www.crazygames.com/',
 'https://www.crazygames.hu/',
 'https://www.crazygames.fr/',
 'https://fonts.googleapis.com/',
 'https://www.crazygames.fi/',
 'https://fonts.gstatic.com/',
 'https://workers.crazygames.com/',
 'https://builds.crazygames.com/',
 'https://videos.crazygames.com/',
 'https://www.crazygames.no/',
 'https://www.googletagmanager.com/',
 'https://de.crazygames.com/',
 'https://www.crazygames.dk/',
 'https://www.crazygames.co.id/',
 'https://gr.crazygames.com/',
 'https://files.crazygames.com/',
 'https://www.crazygames.nl/',
 'https://www.googletagservices.com/',
 'https://th.crazygames.com/',
 'https://cdn.iubenda.com/',
 'https://ar.crazygames.com/',
 'http://www.w3.org/',
 'https://www.1001juegos.com/',
 'https://www.crazygames.ru/',
 'https://www.crazygames.cz/',
 'https://www.crazygames.com.ua/',
 'https://www.crazygames.ro/',
 'https://www.crazygames.com.vn/',
 'https://c2b85208be5148c1a11