## Dependencies

In [None]:
# standard library
import json
import re
from datetime import datetime
from math import ceil
from time import sleep, time
from urllib.parse import urlencode

# data collection and wrangling
import numpy as np
import pandas as pd
import requests

# visualisation and network analysis
import networkx as nx
import plotly.express as px

## Collecting the data; our own TwitterAPI Class

The `TwitterAPI` class cycles through a list of bearer tokens, switching to the next token when the rate limit of the current one is reached.  
It takes a list of bearer tokens as its required input.  

Important methods:
- __Connection_to_endpoint:__ Uses the `requests` module to query the Twitter API.
- __Connection_to_endpoint_loop:__ Connects to the endpoint, but tries a new bearer token if the rate limit is reached.
- __Snowball:__ Given a Twitter user (screen name) as seed, retrieves a network of users. The output is a pandas DataFrame.

The data collection is implemented entirely in the script `TwitterNetworkCollectionScript`, whose class can be imported with `from TwitterNetworkCollectionScript import TwitterAPI`.


## Visualisation of the network

In [None]:
### READ DATA ###

# NOTE(review): unpickling can execute arbitrary code - only load pickle files you produced yourself.
df = pd.read_pickle('data/results15062021')

#### FORMAT DATA ###

# Expand each entry of 'public_metrics' into its own columns
# (presumably one dict of counters per user - TODO confirm against the collection script).
df = pd.concat([df,df['public_metrics'].apply(pd.Series)], axis=1)
# Count how many rows each username appears in (non-null 'name' values per username) ...
df_username = df.groupby('username')
df_username = df_username['name'].count().sort_values(ascending=False)
# ... and attach that count to every row of the user as 'indegree'.
df = pd.merge(df,df_username,left_on='username',right_index = True, suffixes=('','_count')) 
df = df.rename(columns = {'name_count':'indegree'})

# Make a "wind" dummy: 1 when the description or the username contains "wind"
# (case-insensitive) that is not followed by "ow" (so "window" does not count).
# Rows without a match keep a missing value in 'wind'.
wind_pattern = r'(?i)Wind(?!ow)'
# na=False: a missing description/username counts as "no match", so a NaN on one
# side can no longer hide a match on the other side of the OR.
is_wind = (df['description'].str.contains(wind_pattern, na=False) |
           df['username'].str.contains(wind_pattern, na=False))
df.loc[is_wind, 'wind'] = 1

## Map the users

In [None]:
def gps_of_location(location, header = ''):
    """Look up coordinates for a free-text location with the Google Geocoding API.

    The request is sent through ``TwitterAPI.connect_to_endpoint``, which is
    reused here as a generic "GET this URL and return the JSON" helper.

    Parameters
    ----------
    location : str
        Free-text place description. It is URL-encoded before being placed in
        the query string, so characters such as ``&``, ``#`` or spaces are safe.
    header : optional
        Passed unchanged to the ``TwitterAPI`` constructor
        (presumably the bearer token(s) - TODO confirm).

    Returns
    -------
    dict or str
        * status ``'OK'``: the ``geometry.location`` dict of the first result,
          extended with the key ``'types_google_api_response'`` holding that
          result's ``types``.
        * status ``'REQUEST_DENIED'``: the raw JSON response, so the caller can
          inspect why the request was refused.
        * anything else (other status, failed request, unexpected payload):
          the string ``'No GPS coordinates found'``.
    """
    not_found = 'No GPS coordinates found'
    google_API_key = AppCred.GOOGLE_API_KEY
    # urlencode escapes the user-supplied text; raw interpolation would let
    # characters like '&' or '#' truncate or corrupt the query.
    query = urlencode({'address': location, 'key': google_API_key})
    url = f"https://maps.googleapis.com/maps/api/geocode/json?{query}"
    api = TwitterAPI(header)
    try:
        json_response = api.connect_to_endpoint(url=url)
    except Exception:
        # Best effort: a failed request is reported as "not found".
        # (Exception, not a bare except, so Ctrl-C still interrupts a long run.)
        return not_found
    if not isinstance(json_response, dict):
        return not_found

    status = json_response.get('status')
    if status == 'OK':
        results = json_response.get('results') or []
        if not results:
            return not_found
        best_match = results[0]
        d = best_match['geometry']['location']
        d['types_google_api_response'] = best_match['types']
        return d
    if status == 'REQUEST_DENIED':
        return json_response
    return not_found

In [None]:
## Collect gps coordinates from locations

from TwitterNetworkCollectionScript import TwitterAPI

#Use the twitter api collect to endpoint function - not a clean way to do it - but fun
api = TwitterAPI('')

#remove special characters in location
# The character class is a plain Python regex. The earlier '/[...]/g' form was
# JavaScript literal syntax and only matched the literal text "/<char>/g".
# Missing locations stay missing instead of turning into the string 'nan',
# which would otherwise be sent to the geocoder as if it were a place name.
df['location_trimmed'] = df['location'].apply(
    lambda x : x if pd.isna(x) else re.sub(r'[!@#$%^&*]', ' ', str(x)))
# only the "wind" users are geocoded, one request per distinct location
locations = df.loc[df['wind'] == 1, 'location_trimmed'].dropna().unique()

#get gps coordinates from the twitter users' own location description
gps = {x : gps_of_location(x) for x in locations}
#map to dataframe
# Keep successful lookups only: failures come back as a plain string and denied
# requests as the raw response dict, neither of which carries coordinates.
gps_columns = ['lat', 'lng', 'types_google_api_response']
gps_found = {loc : res for loc, res in gps.items() if isinstance(res, dict) and 'lat' in res}
df_gps = pd.DataFrame(list(gps_found.values()), index=list(gps_found.keys()), columns=gps_columns)
#merge with original dataframe
# Join on the trimmed text, which is the key the lookups were made with.
# how='left' keeps exactly the rows of df. Dropping the gps columns first means
# re-running this cell does not create lat_x / lat_y duplicates.
df = pd.merge(df.drop(columns=gps_columns, errors='ignore'), df_gps,
              left_on = 'location_trimmed', right_index = True, how = 'left')

In [None]:
#construct map

# Google result types used to decide which geocoded rows are drawn.
# NOTE(review): rows whose response types include at least one of these are the
# ones KEPT below, so this acts as an allow-list - confirm that is the intent.
filter_list = ['political','natural_feature']
# True when the geocoder returned a list of types containing any listed type;
# rows without a list (no lookup, failed lookup) become False.
df['geo_type_filter'] = [
    isinstance(types, list) and any(wanted in types for wanted in filter_list)
    for types in df['types_google_api_response']
]
# one row per wind user that passed the type check
keep_rows = df['wind'].eq(1) & df['geo_type_filter']
df_map = df[keep_rows].drop_duplicates(subset='username')

# legend label: ball_depth shifted by one and stored as text
# (plotly express colours string columns as discrete categories)
df_map['user_degree'] = (df_map['ball_depth'].astype(int) + 1).map(str)

# scatter the users on a world map, coloured by user_degree
fig = px.scatter_geo(
    df_map,
    lat='lat',
    lon='lng',
    color='user_degree',
    hover_name='username',
)

# save a static and an interactive copy, then display inline
fig.write_image('map/static_twitter_user_map.jpeg')
fig.write_html("map/interactive_twitter_user_map.html")
fig.show()

### Create a network

In [None]:
# NOTE(review): `df_results` is not defined in any cell visible in this notebook -
# presumably it is the snowball result frame; confirm it exists before "Run All".

# distinct accounts on each side of the follow relation (a missing value counts as one)
number_of_seeds = df_results['followed_by'].nunique(dropna=False)
number_of_nodes = df_results['username'].nunique(dropna=False)
# every (followed_by, username) pair is counted once as an edge
df_results_no_duplicates = df_results.drop_duplicates(subset=['followed_by','username'])
number_of_edges = df_results_no_duplicates.shape[0]
print(f'''__Network statistics__
Number of seeds used: {number_of_seeds}
Number of nodes: {number_of_nodes}
Number of edges in network: {number_of_edges}
''')

In [None]:
# NOTE(review): `df_results` is not defined in any cell visible in this notebook -
# presumably it is the snowball result frame; confirm it exists before "Run All".
# Work on a copy and keep only the rows flagged as wind users (wind == 1).
df_network = df_results.copy().loc[lambda frame: frame['wind'] == 1]

In [None]:
# Directed graph with one edge per row, pointing from 'followed_by' to 'username';
# the listed columns are stored as edge attributes.
G = nx.from_pandas_edgelist(
    df_network,
    source='followed_by',
    target='username',
    edge_attr=['username', 'followed_by', 'ball_depth'],
    create_using=nx.DiGraph(),
)
# Node attributes: the first row of each username, keyed by that username.
# Accounts that only ever appear in 'followed_by' get no attributes here.
nodes = df_network.drop_duplicates(subset='username').rename(columns={'username': 'node'})
node_attributes = nodes.set_index('node').to_dict(orient='index')
nx.set_node_attributes(G, node_attributes)
# export as GEXF so the graph can be opened in Gephi
nx.write_gexf(G, 'Graphs/WindWatchOrg_snowball_sample_v3_only_wind.gexf')