### Country analysis
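* Each domain's country is determined by resolving the domain to an IP address and looking that address up in the GeoLite2 database (country codes are then mapped to continents)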

In [1]:
import pandas as pd
import pycountry_convert as pc
import socket
from geolite2 import geolite2
from collections import Counter
import os

In [2]:
def origin(ip, domain_str, result):
    """Print one lookup line in the form ``<domain> [<ip>]: <result>``.

    ``domain_str`` is stripped of surrounding whitespace before printing.
    """
    print(f"{domain_str.strip()} [{ip}]: {result}")

def getip(domain_str):
    """Resolve a domain name and return its IP address and country code.

    The name is stripped of surrounding whitespace, resolved with
    ``socket.gethostbyname`` and the resulting address is looked up through
    the ``geolite2`` reader. The record's ``country`` entry is preferred;
    ``registered_country`` is used only when ``country`` is absent.

    Returns:
        A ``(ip, iso_code)`` tuple.

    Raises:
        Exception: if the database has no record for the address, or the
            record contains neither country entry. Errors raised by the
            name resolution itself propagate unchanged.
    """
    hostname = domain_str.strip()
    ip = socket.gethostbyname(hostname)
    record = geolite2.reader().get(ip)
    if record is None:
        raise Exception('no DB output')
    # Try the keys in priority order and return on the first one present.
    for key in ('country', 'registered_country'):
        if key in record:
            return ip, record[key]['iso_code']
    raise Exception('no country code')

def getAllIPs(urls, resolver=None):
    """Look up ``(ip, country)`` for every domain in ``urls``.

    Each domain is passed to ``resolver``. When that raises a socket error,
    the text after the first ``.`` (the parent domain) is tried once as a
    fallback. Domains that still fail, or that fail for any other reason,
    are skipped so that one bad entry never aborts the whole batch.

    Args:
        urls: iterable of domain strings.
        resolver: callable taking a domain string and returning an
            ``(ip, country)`` pair; defaults to ``getip``.

    Returns:
        dict mapping each successfully resolved entry of ``urls`` (exactly
        as given, not stripped) to its ``(ip, country)`` tuple.
    """
    if resolver is None:
        resolver = getip
    results = {}
    for domain_str in urls:
        try:
            ip, country = resolver(domain_str)
            results[domain_str] = (ip, country)
        except socket.error:
            print("{0} [could not resolve]".format(domain_str.strip()))
            # partition() never raises: a name without a dot simply has no
            # parent to fall back to, instead of crashing with IndexError
            # from inside this handler.
            _, dot, parent = domain_str.partition('.')
            if len(domain_str) > 2 and dot:
                try:
                    ip, country = resolver(parent)
                    results[domain_str] = (ip, country)
                except Exception:
                    # Best effort: the fallback failed too, skip the entry.
                    continue
        except Exception:
            # Best effort: skip entries that fail for non-network reasons
            # (e.g. non-string values, missing database record).
            # KeyboardInterrupt/SystemExit are deliberately not caught so a
            # long run can still be interrupted.
            continue
    return results

def country_to_continent(country_alpha2):
    """Return the continent name for a two-letter country code.

    ``'VA'`` is answered directly with ``'Europe'`` without consulting
    ``pycountry_convert`` (presumably because the library has no continent
    mapping for it -- TODO confirm). Every other code is converted to a
    continent code and then to that continent's name.
    """
    if country_alpha2 != 'VA':
        continent_code = pc.country_alpha2_to_continent_code(country_alpha2)
        return pc.convert_continent_code_to_continent_name(continent_code)
    return 'Europe'

In [3]:
def _load_country_table(cache_csv, source_csv, show_tlds=False):
    """Return the url/country table cached in ``cache_csv``, building it if absent.

    When ``cache_csv`` does not exist, the ``url`` column of ``source_csv``
    is resolved with ``getAllIPs``, the resulting url/country pairs are
    written to ``cache_csv`` and the per-country counts are printed.
    With ``show_tlds`` the frequency of each URL's last dot-separated label
    is printed before the lookup starts.

    The cache is always re-read from disk so that a fresh build and a cached
    run yield the same frame. ``keep_default_na=False`` stops pandas from
    turning the ISO code ``NA`` (Namibia) into a missing value.
    """
    if not os.path.exists(cache_csv):
        urls = pd.read_csv(source_csv)['url']
        if show_tlds:
            print(Counter(domain.split('.')[-1] for domain in urls).most_common())
        lookups = getAllIPs(urls)
        table = pd.DataFrame({
            'url': list(lookups.keys()),
            'country': [country for (_ip, country) in lookups.values()],
        })
        table.to_csv(cache_csv, index=False)
        print(table['country'].value_counts())
    return pd.read_csv(cache_csv, keep_default_na=False)


# Newly discovered misinformation domains.
new = _load_country_table('country_discovered_misinfo.csv',
                          '../data/discovered_domains.csv',
                          show_tlds=True)

# Original misinformation list.
orig = _load_country_table('country_orig_misinfo.csv',
                           '../data/filtered_attrs.csv')

# Outlinks of the link schemes.
links = _load_country_table('country_ls_outlinks.csv',
                            '../data/link_scheme_outlink_attrs.csv')

# The link schemes themselves.
schemes = _load_country_table('country_scheme.csv',
                              '../analysis/schemes.csv')

In [4]:
# Add a 'continent' column, derived from 'country', to each of the four tables.
for frame in (new, orig, links, schemes):
    frame['continent'] = [country_to_continent(code) for code in frame['country']]

In [5]:
us = 'North America'
# NOTE(review): the labels say "US" and "%", but the value printed is the
# fraction (0-1) of rows whose continent is North America -- confirm which
# of the two is intended.
for label, frame in (
    ('Step 1. US original list %: ', orig),
    ('Step 2. US link scheme %: ', schemes),
    ('Step 3. US link scheme outlinks %: ', links),
    ('Step 4. US new list %: ', new),
):
    # Rows in North America divided by the number of non-null urls.
    print(label, frame['continent'].value_counts()[us] / frame.count()['url'])

Step 1. US original list %:  0.8857142857142857
Step 2. US link scheme %:  0.8109243697478992
Step 3. US link scheme outlinks %:  0.7853332127678814
Step 4. US new list %:  0.8132250580046404


In [6]:
# Per-continent summary of the newly discovered domains.
new.groupby('continent').describe() # ~81% of rows are North America

Unnamed: 0_level_0,url,url,url,url,country,country,country,country
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq
continent,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Africa,1,1,mfs-theothernews.com,1,1,1,ZA,1
Asia,32,32,semangatnews.com,1,32,9,TR,7
Europe,271,271,mintpressnews.ru,1,271,25,DE,84
North America,1402,1402,efinews.blogspot.com,1,1402,4,US,1380
Oceania,18,18,newsdigitalmedia.com.au,1,18,2,AU,17


In [7]:
# Per-continent summary of the original list.
orig.groupby('continent').describe() # ~89% of rows are North America

Unnamed: 0_level_0,url,url,url,url,country,country,country,country
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq
continent,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Asia,47,47,aa.com.tr,1,47,12,SG,11
Europe,301,301,24ur.com,1,301,23,SE,101
North America,2759,2759,100percentfedup.com,1,2759,3,US,2710
Oceania,7,7,abc.net.au,1,7,1,AU,7
South America,1,1,telesurtv.net,1,1,1,AR,1


In [8]:
# Per-continent summary of the link scheme outlinks.
links.groupby('continent').describe() # ~79% of rows are North America

Unnamed: 0_level_0,url,url,url,url,country,country,country,country
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq
continent,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Africa,10,10,von.gov.ng,1,10,3,ZA,8
Asia,537,537,theborneopost.com,1,537,24,IN,164
Europe,1709,1709,atr.org,1,1709,35,DE,351
North America,8685,8685,wnd.com,1,8685,9,US,8470
Oceania,86,86,greatclimatedebate.com,1,86,2,AU,83
South America,32,32,lanacion.com.ar,1,32,6,BR,11


In [9]:
# Per-continent summary of the link schemes.
schemes.groupby('continent').describe() # ~81% of rows are North America

Unnamed: 0_level_0,url,url,url,url,country,country,country,country
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq
continent,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Africa,1,1,business.go.tz,1,1,1,TZ,1
Asia,10,10,iasri.res.in,1,10,5,IN,6
Europe,31,31,oxfordeconomics.com,1,31,10,RU,6
North America,193,193,lgstarr.blogspot.com,1,193,2,US,183
Oceania,2,2,bigpulpit.com,1,2,1,AU,2
South America,1,1,comtur.cl,1,1,1,CL,1
