In [None]:
import json
import os
import time
from collections import defaultdict
from typing import Dict, List, Optional, Tuple

import numpy as np
import pandas as pd
from tqdm import tqdm
from tweepy import API, Cursor, OAuthHandler, TweepError

In [None]:
# connect to Twitter API
# Credentials are taken from the environment when the variables are set, so real
# keys never have to be typed into (or committed with) the notebook. The
# placeholder literals are kept as fallbacks for backward compatibility.
consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', 'CONSUMER_KEY')
secret_key = os.environ.get('TWITTER_SECRET_KEY', 'SECRET_KEY')
access_token = os.environ.get('TWITTER_ACCESS_TOKEN', 'ACCESS_TOKEN')
access_token_secret = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET', 'ACCESS_TOKEN_SECRET')
auth = OAuthHandler(consumer_key, secret_key)
auth.set_access_token(access_token, access_token_secret)

# Shared client used by every retrieval helper in this notebook.
api = API(auth, wait_on_rate_limit=True,
          wait_on_rate_limit_notify=True, compression=True)

In [None]:
def read_lines(path: str) -> List[str]:
    """Read a text file and return its lines without line terminators.

    Args:
        path: location of the file to read.

    Returns:
        The file content split into lines.

    Raises:
        Exception: if the file contains no lines at all.
    """
    # The context manager closes the handle even when reading fails.
    with open(path, 'r') as file:
        content = file.read().splitlines()
    if not content:
        raise Exception(f'file: {path} is empty.')
    return content

In [None]:
# load seed
screen_names = read_lines('seed_usernames.txt')
df = pd.read_csv('seed_ids.csv')
seed = df['id']
if len(seed) < len(screen_names):
    # Only names listed in the seed file but absent from the id table need a
    # lookup. A symmetric difference would also pick up names that exist only
    # in the CSV and append them a second time.
    missing = set(screen_names) - set(df['screen_name'])
    print(f'Found {len(missing)} missing user id(s).')
    new_users = ''
    for m in tqdm(missing, total=len(missing), ncols=80):
        print(f'Searching for {m}...')
        # Pass the name explicitly so an all-digit screen name is not taken
        # for a numeric user id.
        user = api.get_user(screen_name=m)
        # assumes seed_ids.csv has the columns screen_name,id in this order — TODO confirm
        new_users += f'{m},{user.id_str}\n'
    with open('seed_ids.csv', 'a') as f:
        f.write(new_users)
    print('Done.')
    # Reload so the freshly appended ids are part of the seed list.
    df = pd.read_csv('seed_ids.csv')
    seed = df['id']
seed = list(seed)
print(f'Loaded {len(seed)} seed users')

In [None]:
def dataset_info(path='tmp/dataset.csv'):
    """Print the node and edge counts of the edge-list CSV at ``path``.

    The file is expected to have ``source`` and ``target`` columns. When the
    file does not exist a short notice is printed instead.
    """
    if not os.path.exists(path):
        print('No data found.')
        return
    edges = pd.read_csv(path)
    # A node is any id that shows up on either end of an edge.
    nodes = set(edges.source.values) | set(edges.target.values)
    print(f'Number of nodes: {len(nodes)}')
    print(f'Number of edges: {len(edges)}')

In [None]:
def friendships_already_saved(user_id:int, friendship:str, path:str) -> bool:
    """Tell whether the edge CSV at ``path`` already holds edges for ``user_id``.

    For ``'followees'`` the user is looked up in the ``source`` column; for
    any other ``friendship`` value the ``target`` column is used. A missing
    file counts as "not saved".
    """
    if os.path.exists(path):
        edges = pd.read_csv(path)
        side = 'source' if friendship == 'followees' else 'target'
        return user_id in edges[side].values
    return False

In [None]:
def save_friendships(user_id:int, ids:List[int], friendship:str, path:str) -> None:
    """Append the friendships of ``user_id`` to the edge CSV at ``path``.

    Every row is written as ``source,target`` where the source follows the
    target: ``'followees'`` produces ``user_id,<id>`` rows and ``'followers'``
    produces ``<id>,user_id`` rows. Any other ``friendship`` value is ignored
    and nothing is written.

    Args:
        user_id: the user whose friendships were retrieved.
        ids: ids of the related users.
        friendship: ``'followees'`` or ``'followers'``.
        path: CSV file to append to; it is created (with its parent
            directory and a header line) when it does not exist yet.
    """
    if friendship == 'followees':
        rows = [f'{user_id},{i}\n' for i in ids]
    elif friendship == 'followers':
        rows = [f'{i},{user_id}\n' for i in ids]
    else:
        return
    # open(..., 'a') creates the file but not its directory.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    # An existing but empty file still needs the header to be readable as CSV.
    needs_header = not os.path.exists(path) or os.path.getsize(path) == 0
    with open(path, 'a') as f:
        if needs_header:
            f.write('source,target\n')
        f.writelines(rows)

In [None]:
def retrieve_friendships(user_id:int, friendship:str, limit:int, save_path:str):
    """Download up to ``limit`` friendships of ``user_id`` and append them to ``save_path``.

    Args:
        user_id: user whose friendships are requested.
        friendship: ``'followees'`` uses ``api.friends_ids``; any other value
            uses ``api.followers_ids``.
        limit: maximum number of ids pulled from the cursor.
        save_path: edge CSV the result is appended to.

    Users whose edges are already present in ``save_path`` are skipped. A
    ``TweepError`` is reported and the user is skipped; it is not re-raised.
    """
    # Look at the file first so that no rate-limited request is spent on a
    # user whose edges are already on disk.
    if friendships_already_saved(user_id, friendship, save_path):
        return
    try:
        f = api.friends_ids if friendship == 'followees' else api.followers_ids
        ids = list(Cursor(f, id=user_id).items(limit))
        save_friendships(user_id, ids, friendship, path=save_path)
    except TweepError as e:
        # Best effort: report the failure instead of hiding it, then move on.
        print(f'Could not retrieve {friendship} of {user_id}: {e.reason}')

In [None]:
def retrieve_friendships_from_file(file_path:str, save_path:str, column:str, friendship:str, limit:int) -> None:
    """Sample users from an edge CSV and retrieve the friendships of each one.

    Args:
        file_path: edge CSV produced by a previous crawl step.
        save_path: edge CSV the newly retrieved friendships are appended to.
        column: column of ``file_path`` (``'source'`` or ``'target'``) that
            holds the users to expand.
        friendship: forwarded to ``retrieve_friendships``.
        limit: used twice — as the maximum number of users sampled from
            ``column`` and as the per-user friendship limit.

    Prints a notice and returns when ``file_path`` is missing or contains no
    users. Sleeps 20 seconds after every user.
    """
    if not os.path.exists(file_path):
        print('No file found.')
        return
    df = pd.read_csv(file_path)
    user_ids = df[column].unique()
    if len(user_ids) == 0:
        print('No users found.')
        return
    # Sample without replacement so no user is requested twice, and never ask
    # for more users than are available.
    sample_size = min(limit, len(user_ids))
    user_ids = np.random.choice(user_ids, size=sample_size, replace=False)
    for user_id in tqdm(user_ids, total=len(user_ids), desc=f'{os.path.basename(file_path)}[{column}] {friendship}'):
        retrieve_friendships(user_id, friendship, limit, save_path)
        time.sleep(20)
    print('\nDone!')

In [None]:
# get seed followees
# Depth 0: up to 100 followees per seed user, pausing 20 s between users.
for seed_id in tqdm(seed, total=len(seed), desc='Followees'):
    retrieve_friendships(
        user_id=seed_id,
        friendship='followees',
        limit=100,
        save_path='tmp/followees/0.csv',
    )
    time.sleep(20)

In [None]:
# get depth 1 followees
# Expand users found as followees at depth 0 (the `target` column).
source, target = 'tmp/followees/0.csv', 'tmp/followees/1.csv'
retrieve_friendships_from_file(
    file_path=source,
    save_path=target,
    column='target',
    friendship='followees',
    limit=100,
)

In [None]:
# get depth 2 followees
# Expand users found as followees at depth 1 (the `target` column).
source, target = 'tmp/followees/1.csv', 'tmp/followees/2.csv'
retrieve_friendships_from_file(
    file_path=source,
    save_path=target,
    column='target',
    friendship='followees',
    limit=100,
)

In [None]:
# get seed followers
# Depth 0: up to 100 followers per seed user, pausing 20 s between users.
for seed_id in tqdm(seed, total=len(seed), desc='Followers'):
    retrieve_friendships(
        user_id=seed_id,
        friendship='followers',
        limit=100,
        save_path='tmp/followers/0.csv',
    )
    time.sleep(20)

In [None]:
# get depth 1 followers
# Expand users found as followers at depth 0 (the `source` column).
source, target = 'tmp/followers/0.csv', 'tmp/followers/1.csv'
retrieve_friendships_from_file(
    file_path=source,
    save_path=target,
    column='source',
    friendship='followers',
    limit=100,
)

In [None]:
# get depth 2 followers
# Expand users found as followers at depth 1 (the `source` column).
source, target = 'tmp/followers/1.csv', 'tmp/followers/2.csv'
retrieve_friendships_from_file(
    file_path=source,
    save_path=target,
    column='source',
    friendship='followers',
    limit=100,
)

In [None]:
# Merge the three followee crawl depths into a single edge list.
df0, df1, df2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        'tmp/followees/0.csv',
        'tmp/followees/1.csv',
        'tmp/followees/2.csv',
    )
)
df = pd.concat([df0, df1, df2])
df.to_csv('tmp/followees/followees.csv', index=False)
len(df)

In [None]:
# Merge the three follower crawl depths into a single edge list.
df0, df1, df2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        'tmp/followers/0.csv',
        'tmp/followers/1.csv',
        'tmp/followers/2.csv',
    )
)
df = pd.concat([df0, df1, df2])
df.to_csv('tmp/followers/followers.csv', index=False)
len(df)

In [None]:
# Combine the followee and follower edge lists into the final dataset.
followees_df = pd.read_csv('tmp/followees/followees.csv')
followers_df = pd.read_csv('tmp/followers/followers.csv')
# Both crawls store edges as source -> target (source follows target), so the
# same edge can be recorded by each of them, or at several depths. Keep one
# copy of every edge.
df = pd.concat([followees_df, followers_df], ignore_index=True).drop_duplicates()
df.to_csv('tmp/dataset.csv', index=False)
len(df)

In [None]:
# Report node and edge counts of the merged edge list.
dataset_info(path='tmp/dataset.csv')

In [None]:
def retrieve_user(user_id: int) -> Optional[Dict]:
    """Fetch one user profile through the Twitter API and flatten it into a dict.

    Args:
        user_id: identifier passed to ``api.get_user``.

    Returns:
        A dict with the profile fields used by the feature extraction
        (``urls`` holds the ``display_url`` of every URL entity found), or
        ``None`` when the API call raises ``TweepError``; the error reason is
        printed in that case.
    """
    try:
        user = api.get_user(user_id)
        # Read the entities defensively: a missing attribute, or one entity
        # without a 'urls' list, must not stop the remaining URLs from being
        # collected.
        entities = getattr(user, 'entities', None) or {}
        urls = []
        for entity in entities.values():
            if not isinstance(entity, dict):
                continue
            for url in entity.get('urls') or []:
                if isinstance(url, dict) and 'display_url' in url:
                    urls.append(url['display_url'])
        user_data = {
            'id': user.id,
            'name': user.name,
            'screen_name': user.screen_name,
            'location': user.location,
            'description': user.description,
            'urls': urls,
            'protected': user.protected,
            'verified': user.verified,
            'followers_count': user.followers_count,
            'friends_count': user.friends_count,
            'listed_count': user.listed_count,
            'statuses_count': user.statuses_count,
            'default_profile': user.default_profile,
            'default_profile_image': user.default_profile_image
        }
        return user_data
    except TweepError as e:
        print(e.reason)
    return None

In [None]:
def get_unique_users(path='tmp/dataset.csv'):
    """Return the set of all user ids found in the ``source`` or ``target`` column of the edge CSV."""
    edges = pd.read_csv(path)
    return set(edges.source.unique()).union(edges.target.unique())

In [None]:
def get_missing_users(cache: Dict, dataset_path='tmp/dataset.csv'):
    """Return the ids present in the edge CSV that have no entry in ``cache``.

    ``cache`` keys are converted with ``int`` before the comparison. When the
    dataset file does not exist a notice is printed and ``None`` is returned.
    """
    if not os.path.exists(dataset_path):
        print('No dataset found.')
        return
    edges = pd.read_csv(dataset_path)
    in_dataset = set(edges.source.values) | set(edges.target.values)
    already_cached = {int(key) for key in cache.keys()}
    return in_dataset - already_cached

In [None]:
def update_user_cache(user_cache: Dict, file_path='tmp/users.json'):
    """Merge ``user_cache`` into the JSON cache stored at ``file_path``.

    Args:
        user_cache: mapping of user id (string) to user data; its entries
            override entries already stored under the same key.
        file_path: JSON file holding the persistent cache. It is created,
            together with its parent directory, when missing.
    """
    cache = {}
    # A missing or empty file is simply an empty cache (first run).
    if os.path.exists(file_path) and os.path.getsize(file_path) > 0:
        with open(file_path, 'r') as f:
            cache = json.load(f)
    cache.update(user_cache)
    parent = os.path.dirname(file_path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    # Write to a sibling file and swap it in: the cache is rewritten from
    # scratch (no stale bytes left behind when the new content is shorter)
    # and an interrupted write cannot corrupt the existing cache.
    tmp_path = f'{file_path}.tmp'
    with open(tmp_path, 'w') as f:
        json.dump(cache, f, indent=2)
    os.replace(tmp_path, file_path)

In [None]:
# retrieve user data
cache_path = 'tmp/users.json'
if os.path.exists(cache_path):
    with open(cache_path, 'r') as f:
        user_cache = json.load(f)
else:
    user_cache = {}

# Only users of the dataset that are not cached yet have to be downloaded.
users = get_missing_users(user_cache, 'tmp/dataset.csv')
not_found = set()

# The loaded cache was only needed to compute `users`; start a fresh batch
# that holds just the profiles retrieved in this run.
user_cache = {}

if users:
    for i, user_id in tqdm(enumerate(users, start=1), total=len(users), ncols=80):
        user = retrieve_user(user_id)
        if not user:
            not_found.add(user_id)
        else:
            user_cache[str(user_id)] = user
        # Persist every 10 users so an interruption loses little work.
        if i % 10 == 0:
            update_user_cache(user_cache, cache_path)
    update_user_cache(user_cache, cache_path)
    print('Done.')
else:
    print('No users to retrieve.')

In [None]:
# filter out users with missing data
df = pd.read_csv('tmp/dataset.csv')
# One vectorised membership test replaces a full-frame scan and an in-place
# drop per missing user; every edge touching a user in `not_found` is removed.
missing_ids = list(not_found)
touches_missing = df.source.isin(missing_ids) | df.target.isin(missing_ids)
df = df[~touches_missing]
df.to_csv('tmp/dataset.csv', index=False)

In [None]:
# set user indices
# Give every user of the demo dataset a consecutive integer index and store
# the id -> index mapping (keys are the ids as strings).
users = get_unique_users(path='tmp/dataset_demo_5000.csv') # here

idx = {
    str(user_id): position
    for position, user_id in tqdm(enumerate(users), total=len(users))
}

with open('tmp/user_idx_demo_5000.json', 'w') as f:
    json.dump(idx, f, indent=2)

In [None]:
# build edge list
# Rewrite every edge of the demo dataset in terms of the user indices.
with open('tmp/user_idx_demo_5000.json', 'r') as f:
    idx = json.load(f)

edge_list = pd.read_csv('tmp/dataset_demo_5000.csv').to_numpy()
for row in tqdm(edge_list, total=len(edge_list)):
    # `row` is a view into `edge_list`, so the slice assignment updates it in place.
    row[:] = [idx[str(user_id)] for user_id in row]

df = pd.DataFrame(edge_list, columns=['source', 'target'])
df.to_csv('data/edges_demo_5000.csv', index=False)
print('\nDone!')

In [None]:
import tldextract
from ttp import ttp
import tldextract
from tldextract.tldextract import ExtractResult

# Shared parser instance; generate_feature_vector uses it to extract the
# mentions, hashtags and URLs of a user description.
parser = ttp.Parser()

In [None]:
# load research fields
# Plain module-level assignment: a `global` statement at top level has no
# effect, the name is visible to the functions below either way.
research_fields = read_lines('research_fields.txt')
research_fields[:5]

In [None]:
# url verifications
def is_google_research_url(url: ExtractResult) -> bool:
    """Return True for a ``google`` domain whose subdomain is ``sites`` or ``scholar``."""
    if url.domain != 'google':
        return False
    return url.subdomain in ['sites', 'scholar']

def is_linkedin_url(url: ExtractResult) -> bool:
    """Return True when the registered domain of ``url`` is ``linkedin``."""
    domain = url.domain
    return domain == 'linkedin'

def is_github_url(url: ExtractResult) -> bool:
    """Return True when the registered domain of ``url`` is ``github``."""
    domain = url.domain
    return domain == 'github'

def url_has_edu(url: ExtractResult) -> bool:
    """Return True when the public suffix of ``url`` is ``edu`` or starts with an ``edu`` label.

    Matches suffixes such as ``edu`` and ``edu.au``.
    """
    # Compare the whole first label: a plain prefix test would also accept
    # unrelated suffixes such as 'education'.
    return url.suffix.split('.')[0] == 'edu'

def url_is_academic(url: ExtractResult) -> bool:
    """Return True when the public suffix of ``url`` is ``ac`` or starts with an ``ac`` label.

    Matches suffixes such as ``ac.uk`` and ``ac.jp``.
    """
    # Compare the whole first label: a plain prefix test would also accept
    # unrelated suffixes such as 'academy' or 'accountant'.
    return url.suffix.split('.')[0] == 'ac'

def url_has_tilde(url: str) -> bool:
    """Return True when the raw URL string contains a ``~`` character."""
    tilde_found = '~' in url
    return tilde_found

def verify_urls(urls: List[str]) -> Tuple[List[int], bool]:
    """Summarise a list of profile URLs as seven 0/1 flags.

    Flag order: has any URL, Google Sites/Scholar, LinkedIn, GitHub,
    ``edu`` suffix, ``ac`` suffix, contains ``~``. A flag is set as soon as
    one URL satisfies it.

    Args:
        urls: URL strings to inspect.

    Returns:
        A tuple ``(flags, has_research_url)`` where ``flags`` is the list of
        seven flags and ``has_research_url`` is True when the Google
        research, ``edu`` or ``ac`` flag is set.
    """
    res = np.zeros(7, dtype=np.int8)
    for url in urls:
        ext = tldextract.extract(url)
        # OR this URL's flags into the running result.
        res |= np.array([
            1,
            is_google_research_url(ext),
            is_linkedin_url(ext),
            is_github_url(ext),
            url_has_edu(ext),
            url_is_academic(ext),
            url_has_tilde(url),
        ], dtype=np.int8)
    return list(res), any(res[[1, 4, 5]])

In [None]:
def get_description_research_fields_count(desc: str) -> int:
    """Count how many entries of ``research_fields`` occur in a description.

    The comparison is case-insensitive. A field also counts when it occurs
    with its whitespace removed (e.g. ``machinelearning``).

    Args:
        desc: profile description; ``None`` or an empty string yields 0.

    Returns:
        Number of research fields found in ``desc``.
    """
    if not desc:
        return 0
    desc = desc.lower()
    count = 0
    for field in research_fields:
        field = field.strip().lower()
        # A blank entry would be a substring of every description.
        if not field:
            continue
        if field in desc or ''.join(field.split()) in desc:
            count += 1
    return count

In [None]:
# normalization
def min_max_norm(df:pd.DataFrame, column:str):
    """Return a copy of ``df`` with ``column`` rescaled to the range [0, 1].

    A column whose values are all equal has no range to scale by; it is
    mapped to 0 instead of NaN. The input frame is left untouched.
    """
    df_min_max_scaled = df.copy()
    values = df_min_max_scaled[column]
    lower = values.min()
    span = values.max() - lower
    # Treat a zero (or undefined) range as 1 to avoid a 0/0 division.
    if pd.isna(span) or span == 0:
        span = 1
    df_min_max_scaled[column] = (values - lower) / span
    return df_min_max_scaled

def z_score_norm(df:pd.DataFrame, column:str):
    """Return a copy of ``df`` with ``column`` centred on its mean and divided by its standard deviation.

    The standard deviation is the one computed by ``Series.std``. When it is
    zero or undefined (constant or single-row column) the centred values are
    returned undivided instead of NaN/inf. The input frame is left untouched.
    """
    df_z_scaled = df.copy()
    values = df_z_scaled[column]
    std = values.std()
    # Treat a zero (or undefined) deviation as 1 to avoid dividing by it.
    if pd.isna(std) or std == 0:
        std = 1
    df_z_scaled[column] = (values - values.mean()) / std
    return df_z_scaled

def std_norm(df:pd.DataFrame, column:str):
    """Return a copy of ``df`` with ``column`` divided by its standard deviation.

    The standard deviation is the one computed by ``Series.std``. When it is
    zero or undefined (constant or single-row column) the values are kept
    as they are instead of becoming inf/NaN. The input frame is left
    untouched.
    """
    df_std_scaled = df.copy()
    values = df_std_scaled[column]
    std = values.std()
    # Treat a zero (or undefined) deviation as 1 to avoid dividing by it.
    if pd.isna(std) or std == 0:
        std = 1
    df_std_scaled[column] = values / std
    return df_std_scaled

In [None]:
# Min-max scale the numeric feature columns and write the file back in place.
# NOTE(review): data/features_demo_5000.csv is written by the feature
# generation cell further down, so this cell only works after that cell has run.
df = pd.read_csv('data/features_demo_5000.csv')

for column in (
    'followers_count',
    'friends_count',
    'listed_count',
    'statuses_count',
    'ratio_followers_friends',
    'description_research_fields_count',
    'description_mention_count',
    'description_hashtag_count',
    'description_url_count',
):
    df = min_max_norm(df, column)

df.to_csv('data/features_demo_5000.csv', index=False)

In [None]:
def generate_feature_vector(user: Dict) -> List :
    """Turn one cached user record into a flat list of 23 numeric features.

    Order of the result: protected, verified, followers count, friends
    count, followers/friends ratio (0 when the user has no friends), listed
    count, statuses count, default profile, default profile image, the seven
    URL flags from ``verify_urls``, has location, has description, research
    field count, mention count, hashtag count, URL count of the description,
    and finally the researcher label (1 when the description names a
    research field or a profile URL is a research URL).

    Args:
        user: record with the keys produced by ``retrieve_user``.

    Returns:
        The feature list described above.
    """
    followers = user['followers_count']
    friends = user['friends_count']
    features = [
        int(user['protected']),
        int(user['verified']),
        followers,
        friends,
        followers/friends if friends else 0,
        user['listed_count'],
        user['statuses_count'],
        int(user['default_profile']),
        int(user['default_profile_image']),
    ]
    verified_urls, has_research_url = verify_urls(user['urls'])
    features.extend(verified_urls)
    features.append(int(bool(user['location'])))
    # A record may store no description at all (None); treat it as empty
    # text so the string helpers below do not fail on it.
    desc = user['description'] or ''
    research_fields_count = get_description_research_fields_count(desc)
    parsed_desc = parser.parse(desc)
    features.append(int(bool(desc)))
    features.append(research_fields_count)
    features.append(len(parsed_desc.users))
    features.append(len(parsed_desc.tags))
    features.append(len(parsed_desc.urls))
    features.append(int(research_fields_count > 0 or has_research_url))
    return features

In [None]:
# generate user features
# Build one feature row per user of the demo dataset, prefixed with the
# user's index, then scale the numeric columns and store the table.
users = get_unique_users(path='tmp/dataset_demo_5000.csv')

with open('tmp/users.json', 'r') as f:
    user_data = json.load(f)

with open('tmp/user_idx_demo_5000.json', 'r') as f:
    idx = json.load(f)

user_features = []
for user_id in tqdm(users, total=len(users)):
    key = str(user_id)
    vector = generate_feature_vector(user_data[key])
    user_features.append([idx[key], *vector])

# 'id' followed by the features in the order generate_feature_vector emits them.
columns = [
    'id',
    'protected',
    'verified',
    'followers_count',
    'friends_count',
    'ratio_followers_friends',
    'listed_count',
    'statuses_count',
    'default_profile',
    'default_profile_image',
    'has_url',
    'url_is_google_research',
    'url_is_linkedin',
    'url_is_github',
    'url_has_edu',
    'url_is_academic',
    'url_has_tilde',
    'has_location',
    'has_description',
    'description_research_fields_count',
    'description_mention_count',
    'description_hashtag_count',
    'description_url_count',
    'researcher'
]

df = pd.DataFrame.from_records(user_features, columns=columns)

for scaled_column in (
    'followers_count',
    'friends_count',
    'listed_count',
    'statuses_count',
    'ratio_followers_friends',
    'description_research_fields_count',
    'description_mention_count',
    'description_hashtag_count',
    'description_url_count',
):
    df = std_norm(df, scaled_column)

df.to_csv('data/features_demo_5000.csv', index=False)
df.head()

In [None]:
# number of researchers found
# NOTE(review): this reads data/features_1.5.csv, while the feature cells in
# this notebook write data/features_demo_5000.csv — confirm which file is intended.
df = pd.read_csv('data/features_1.5.csv')
count = int((df.researcher == 1).sum())
total = len(df)
print(f'Found {count}/{total}(~{int(count/total*100)}%) researchers.')