# Laboratory 1: setting up Twitter API

## Set up

In [None]:
import os
# Expose the Twitter API bearer token through the TOKEN environment variable.
# The value was redacted from this copy of the notebook (the assignment has no
# right-hand side), so paste your own token here before running the cell.
os.environ['TOKEN'] = #CENSORED

In [None]:
from google.colab import drive
# Mount Google Drive into the Colab runtime under /content/drive.
drive.mount('/content/drive')

### Import libraries

In [None]:
import requests 
import pandas as pd 
import time

### Set up headers

In [None]:
def create_headers(bearer_token):
    """Build the HTTP headers that authenticate a request with a bearer token.

    Args:
        bearer_token: the token to place in the ``Authorization`` header.

    Returns:
        A dict with a single ``Authorization`` entry of the form
        ``"Bearer <token>"``.
    """
    return {"Authorization": f"Bearer {bearer_token}"}

In [None]:
# Build the request headers once from the token stored in the TOKEN environment variable.
# get_data and get_single_response read this module-level `headers` variable.
headers = create_headers(os.environ['TOKEN'])

## Download data

In [None]:
def create_url(keyword, start_date, end_date, env_label, endpoint="fullarchive"):
    """Assemble the search URL and the query parameters for one request.

    Args:
        keyword: the search query, sent as the ``query`` parameter.
        start_date: sent as the ``fromDate`` parameter.
        end_date: sent as the ``toDate`` parameter.
        env_label: label appended to the endpoint in the URL path.
        endpoint: endpoint name placed before the label in the URL path.

    Returns:
        A ``(search_url, query_params)`` tuple.
    """
    path = endpoint + "/" + env_label
    search_url = f"https://api.twitter.com/1.1/tweets/search/{path}.json"

    # change params based on the endpoint you are using
    query_params = {'query': keyword, 'fromDate': start_date, 'toDate': end_date}
    return search_url, query_params

In [None]:
def connect_to_endpoint(url, headers, params, next_token = None):
    """Send one GET request to a search endpoint and return the decoded JSON.

    Args:
        url: the endpoint URL.
        headers: HTTP headers to send with the request.
        params: query parameters; this mapping is not modified.
        next_token: pagination token; when it is neither ``None`` nor an
            empty string it is sent as the ``next`` parameter.

    Returns:
        The response body parsed as JSON.

    Raises:
        Exception: with ``(status_code, text)`` as arguments when the
            response status is not 200.
    """
    # Work on a copy so the pagination token never leaks into the caller's dict.
    request_params = dict(params) if params is not None else {}
    if next_token is not None and next_token != '':
        request_params['next'] = next_token
    response = requests.request("GET", url, headers=headers, params=request_params)
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()

In [None]:
def get_data(keyword, start_time, end_time, next_token, env_label, endpoint):
  """Download every page of results for a query and return them as one list.

  Args:
    keyword: the search query.
    start_time: start of the search window, passed on to create_url.
    end_time: end of the search window, passed on to create_url.
    next_token: pagination token for the first request; pass "" or None to
      start from the first page.
    env_label: environment label, passed on to create_url.
    endpoint: endpoint name, passed on to create_url.

  Returns:
    A list with the items of the "results" field of every page.

  Raises:
    Exception: propagated from connect_to_endpoint on a non-200 response.
  """
  results = []
  # The URL and the base query are the same for every page, so build them once.
  search_url, query_params = create_url(keyword, start_time, end_time, env_label, endpoint)

  # The first request is always made, also when next_token is None; the loop
  # then continues for as long as the response carries a token for a next page.
  while True:
    # Hand over a fresh copy of the query so one page's token cannot carry over to the next request.
    json_response = connect_to_endpoint(search_url, headers, dict(query_params), next_token)
    results.extend(json_response.get("results", []))

    next_token = json_response.get("next")
    if not next_token:
      break
    # pause between consecutive requests (presumably to stay within the API rate limit)
    time.sleep(1)

  return results

In [None]:
def get_single_response(keyword, start_time, end_time, env_label, endpoint):
  """Send a single search request and return the items of its "results" field.

  The arguments are passed on to create_url unchanged. No pagination token is
  sent, and the module-level `headers` are used for the request.

  Returns:
    A new list with the results, or an empty list when the response has no
    "results" field.
  """
  search_url, query_params = create_url(keyword, start_time, end_time, env_label, endpoint)
  payload = connect_to_endpoint(search_url, headers, query_params)

  if "results" not in payload:
    return []
  return list(payload["results"])


In [None]:
# get_data needs all six arguments: keyword, start_time, end_time, next_token,
# env_label and endpoint. The empty string is the initial pagination token.
env_label = "dev"  # placeholder: replace with the label of your own developer environment
tweets = get_data("Sarajevo", "202002090000", "202002100000", "", env_label, "fullarchive")

In [None]:
# Fetch a single response for the query "COVID19 lang:en" between the two given timestamps.
# The fourth argument is the environment label and the fifth is the endpoint name (both "30day" here).
tweets = get_single_response("COVID19 lang:en", "202110060000", "202111050000", "30day", "30day")

### Inspect data

In [None]:
len(tweets)

In [None]:
tweets[0]

# Laboratory 2: working with Twitter data

First, we want to convert the data into a pandas DataFrame. This format makes it easy to manipulate the data as well as to save and load it.

Since we have our tweets saved as a list of dictionaries, we can easily convert it to a DataFrame by executing the cell below.

In [None]:
tweets_df = pd.DataFrame(tweets)

### Saving the results

Once we have our Tweets as a DataFrame it is a good idea to save it on the disk. 

Be mindful that the storage of a Colab notebook is deleted every time the runtime is interrupted or restarted, so you need to manually download the file to your computer or mount your Google Drive and save it there (this option is unavailable if you're using a university email account for Drive).

In [None]:
path = "/content/" #enter the path to your Drive or leave this as default

We can save it as a comma-separated values file, which enables opening it in a spreadsheet editor and inspecting it.

In [None]:
tweets_df.to_csv(path+"tweets.csv", index=False)

In order to preserve datatypes, we should save it as a parquet or pickle file.

In [None]:
tweets_df.to_pickle(path+"tweets.pkl")

### Loading the data

If you want to load the results you have previously saved, simply execute the next code, specifying the path to the file.

You will need to either upload it to the Colab workspace or copy the path to the file on Drive.

In [None]:
tweets_df = pd.read_pickle(path+"tweets.pkl")

### Preprocessing the data

In our dataframe we have the entire Tweet object. Some columns that might be of particular interest to us are: 

*   created_at - date when Tweet was posted
*   id/id_str - unique Tweet identifiers
*   text - the content of the Tweet
*   user - information about the user who posted the Tweet
*   retweeted_status  - information about the original Tweet
*   quote/reply/retweet/favorite count - Tweet metrics
*   entities - hashtags, urls, user_mentions present in Tweet

We can filter the dataframe and keep only the columns we are interested in. You can pick which columns you'd like to keep and put them in the column_list below.



In [None]:
#columns of the Tweet object that we keep for the analysis
column_list = [
    "created_at",
    "id_str",
    "text",
    "user",
    "retweeted_status",
    "quote_count",
    "reply_count",
    "retweet_count",
    "favorite_count",
    "entities",
]
#it's a good idea to work on a copy of the original dataframe, so we can always go back to it if we mess something up
tweets_filtered = tweets_df.copy()[column_list]

In [None]:
tweets_filtered

## Extracting words/hashtags

There are many ways to build networks from the data we download from Twitter.

One possibility is to have a bipartite network of Tweets and words/hashtags and then observe word, hashtag or word-hashtag projections.

### Extracting words

In order to extract words, we first need to clean the Tweet text. This way we will remove punctuation, hashtags/mentions/urls (they are preserved in the entity column anyway). We will also turn all letters to lowercase.

You can also consider removing stopwords, removing words that are not in English-language corpora, lemmatizing the words, etc. I suggest you research the nltk library and its possibilities.

In [None]:
import re
import emoji
import string

In [None]:
def cleaner(tweet):
    """Return the tweet text reduced to lowercase words separated by single spaces.

    Mentions, hashtags and links are removed, the text is lowercased,
    punctuation is stripped and whitespace is normalised.

    Args:
        tweet: the raw tweet text.

    Returns:
        The cleaned text; an empty string when nothing is left.
    """
    # \w also covers "_" and non-ASCII letters, so "@user_name" and "#my_tag"
    # are removed as a whole instead of leaving "_name" / "_tag" behind.
    tweet = re.sub(r"@\w+", "", tweet) # remove mentions
    tweet = re.sub(r"#\w+", "", tweet) # remove hashtags
    tweet = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", tweet) # remove http links
    tweet = tweet.lower() # to lowercase
    table = str.maketrans(dict.fromkeys(string.punctuation))
    tweet = tweet.translate(table) # remove punctuation
    # Collapse whitespace last: stripping punctuation can leave double spaces
    # behind (e.g. "a - b"), which would later show up as empty words.
    return " ".join(tweet.split())

In [None]:
tweets_filtered["clean_text"] = tweets_filtered["text"].map(cleaner)

We are going to loop through the dataframe and then through the words in the clean text. We are going to add the words as keys to dictionary and use their frequencies as values.

In [None]:
#count how often each word occurs across all cleaned tweets (word -> frequency)
unique_words = {}
for row in tweets_filtered.clean_text:
  #split() without an argument skips runs of whitespace, so no empty "words" are counted
  for word in row.split():
    #start at 0 the first time a word is seen, then add 1 for every occurrence
    unique_words[word] = unique_words.get(word, 0) + 1

In [None]:
#remove blank tokens and the word 'rt'
#passing a default to pop() avoids a KeyError when a token is not in the dictionary
for token in ("", " ", "rt"):
  unique_words.pop(token, None)

We can inspect the words as a dataframe. 


You can always save this dataframe as .csv for future reference.

In [None]:
#one row per word with its frequency, most frequent words first
uw_df = (
    pd.DataFrame.from_dict(unique_words, orient='index')
    .reset_index()
    .rename(columns={'index': 'Word', 0: 'Count'})
    .sort_values(by=['Count'], ascending=False)
)
uw_df

### Extracting the hashtags

We are going to loop through the dataframe and then through the hashtags in the entities. We are going to add the hashtags as keys to dictionary and use their frequencies as values. At the same time, we are going to save them in a list and add them to a separate column to facilitate our future work.

In [None]:
#hashtag (with leading '#') -> number of occurrences over all tweets
unique_hashtags = {}
#for every tweet, the list of its hashtags without the leading '#'
hashtags_per_tweet = []

for entities in tweets_filtered["entities"]:
  tags = [hashtag["text"] for hashtag in entities["hashtags"]]
  for tag in tags:
    key = "#" + tag
    unique_hashtags[key] = unique_hashtags.get(key, 0) + 1
  hashtags_per_tweet.append(tags)

#store the lists in their own column, aligned with the rows they were read from
tweets_filtered["hashtags"] = pd.Series(hashtags_per_tweet, index=tweets_filtered.index, dtype=object)


In [None]:
#one row per hashtag with its frequency
uh_df = (
    pd.DataFrame.from_dict(unique_hashtags, orient='index')
    .reset_index()
    .rename(columns={'index': 'Hashtag', 0: 'Count'})
)

In [None]:
uh_df

## Building the network

We are going to use the networkx library, which is a Python library that enables network science analysis of the data.

We are going to use it to create our network and extract edgelist from it, since we can easily import it to Gephi (a software we are going to see in visualization labs).

However, it offers implemented algorithms for analysis (for example PageRank) that you can use out-of-box to analyze your network.

But first, we will loop through our dataframe and connect words and hashtags if they appear together in the same Tweet.

In [None]:
import itertools
import networkx as nx

In [None]:
#key views over the two frequency dictionaries; the network cell uses `uw` to keep only counted words
uh = unique_hashtags.keys()
uw = unique_words.keys()  

In [None]:
#maps an undirected pair of nodes (word or hashtag) to the number of tweets in which both appear
network = {}
network_key = 0

for index, row in tweets_filtered.iterrows():
    #hashtags extracted from Tweet do not have the # sign in front of them but we will add it to differentiate hashtags from words
    hashtags = {'#' + hashtag for hashtag in row["hashtags"] if '#' + hashtag in unique_hashtags}
    words = {word for word in row["clean_text"].split(" ") if word in uw}
    #sets remove repeated tokens, so a pair is counted once per tweet;
    #sorting gives every undirected pair a single canonical orientation
    nodes = sorted(hashtags | words)
    #combinations yields each unordered pair exactly once and never pairs a node with itself
    for pair in itertools.combinations(nodes, 2):
        network[pair] = network.get(pair, 0) + 1 #* row["retweetCount"]

network_df = pd.DataFrame.from_dict(network, orient="index")

In [None]:
#turn the pair index into a regular column, name the columns and list the heaviest pairs first
network_df = network_df.reset_index()
network_df.columns = ["pair", "weight"]
network_df = network_df.sort_values(by="weight", ascending=False)
network_df

In [None]:
#to get a weighted graph we need a list of 3-element tuples (u,v,w) where u and v are nodes and w is a number representing weight
#we can filter edges by weight by adding a condition such as `if weight > 1` at the end of the comprehension
up_weighted = [(source, target, weight) for (source, target), weight in network.items()]

G = nx.Graph()
G.add_weighted_edges_from(up_weighted)

In [None]:
print(len(G.nodes()))
print(len(G.edges()))

#### SAVE EDGELIST

In [None]:
filename = "./edgelist.csv"

In [None]:
nx.write_weighted_edgelist(G, filename, delimiter=",")

In [None]:
#add header with appropriate column names (plain Python, so it does not depend on the sed flavour of the OS)
header = "Source,Target,Weight\n"
with open("./edgelist.csv", encoding="utf-8") as edge_file:
    edges = edge_file.read()
#prepend the header only once, so re-running the cell does not duplicate it
if not edges.startswith(header):
    with open("./edgelist.csv", "w", encoding="utf-8") as edge_file:
        edge_file.write(header + edges)

### Create Node List


In [None]:
#node table for the words: both the Id and the Label column hold the word itself
#built directly from the dictionary keys, so it also works when no words were counted
words = list(unique_words)
word_nodes = pd.DataFrame({"Id": words, "Label": words})

word_nodes

In [None]:
#node table for the hashtags: keep only the hashtag column, call it Id and repeat it as Label
hashtag_nodes = uh_df[["Hashtag"]].copy()
hashtag_nodes = hashtag_nodes.rename(columns={"Hashtag": "Id"})
hashtag_nodes["Label"] = hashtag_nodes["Id"]
hashtag_nodes

#### SAVE NODELIST

In [None]:
#DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to stack the two node tables
nodelist = pd.concat([hashtag_nodes, word_nodes], ignore_index=True)

nodelist.to_csv("nodelist.csv",index=False)

Tasks: 

*   Extract username of user who posted the tweet into a column "screen_name". Follow the procedure we used to get the hashtags.
*   Create a network of users using the mention relation. Is this a directed or undirected graph?
*   We created a network where nodes are mixed (both words and hashtags). Create network of words only and one of hashtags only.
* Pick one of these networks and rank the nodes using PageRank centrality. Extract information about the top-20 ranked nodes.



