<a href="https://colab.research.google.com/github/CALDISS-AAU/sdsphd19_coursematerials/blob/master/notebooks/networks_exercise_instagram.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preamble

In [0]:
# Standard stuff
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import itertools # Python's amazing iteration & combination library

In [0]:
# For visualization
!pip install -U bokeh
!pip install -q holoviews

# Import the libraries and link to the bokeh backend
import holoviews as hv
from holoviews import opts
hv.extension('bokeh')
from bokeh.plotting import show

# Setting the default figure size a bit larger
defaults = dict(width=750, height=750, padding=0.1,
                xaxis=None, yaxis=None)
hv.opts.defaults(
    opts.EdgePaths(**defaults), opts.Graph(**defaults), opts.Nodes(**defaults))

In [0]:
# Network Stuff
import networkx as nx
import community # `python-louvain` is implemented here
from networkx.algorithms import bipartite # bipartite NW algos

In [0]:
# Blockmodel Stuff
!wget https://github.com/CALDISS-AAU/sdsphd19_coursematerials/raw/master/wednesday_network-blockmodeling/blockmodeling_material.zip # downloading module and data files to googe drive session
!unzip 'blockmodeling_material.zip' # unzipping

# import the necessary modules
import blockmodeling as bm
import matplotlib.pyplot as plt
import scipy as sc
import scipy.cluster.hierarchy as sch

In [0]:
# API&Scraping&instagramm
!pip3 install instaloader # Installing instaloader
import instaloader
L = instaloader.Instaloader()

import requests as rq # The requests library handles "requests" to APIs similar to a browser that requests a webpage given a URL
from nltk.tokenize import TweetTokenizer # A bit of a transition into NLP. The tweet tokenizer from the NLTK library will help us extract the hashtags from post-text
tknzr = TweetTokenizer()


# Task

So guys, now it's time to put it all together. Take the two notebooks by Carl and Daniel, and do the following:

1. Extract Instagram post data for a tag of your choice
2. Generate a bipartite User-Tag network
3. Project it onto either the user or the tag mode (your choice)
4. Apply blockmodeling to it

# Getting the data

In [0]:
# Base URL of Instagram's tag-explore pages; the tag name is appended to it
tagurl_prefix = "https://www.instagram.com/explore/tags/"

# Suffix appended after the tag so that the data is returned in JSON format
tagurl_suffix = "/?__a=1"

# Query parameter that carries the end cursor when requesting more posts by tag
tagurl_endcursor = "&max_id="

# Generic media post prefix (concatenate with a media shortcode to view the post)
posturl_prefix = "https://www.instagram.com/p/"

In [0]:
# Seed hashtag(s) to explore.
# 'XXXXXXXXX' is only a placeholder: replace it with an Instagram tag of your own!
tags = [
    'XXXXXXXXX',
]

In [0]:
# One start URL per seed tag, glued together as prefix + tag + suffix
queries = [''.join((tagurl_prefix, tag, tagurl_suffix)) for tag in tags]

In [0]:
# Getting the data
# For every start URL, walk up to 10 result pages and collect the raw post
# entries ("edges") of each page in one flat list.
edges = []
for q in queries:
    page_url = q  # the first page is the plain tag URL
    for i in range(10): # how many iterations/depth?
        response = rq.get(page_url)
        # Fail with a clear HTTP error instead of an obscure JSON decoding error
        response.raise_for_status()
        media = response.json()['graphql']['hashtag']['edge_hashtag_to_media']
        edges.extend(media['edges'])
        print(i)

        end_cursor = media['page_info']['end_cursor']
        if not end_cursor:
            # No cursor means there is no further page to request
            break
        # Always build the next URL from the base URL `q`: appending to the
        # previous page URL would stack one more `&max_id=` onto it per page.
        page_url = q + tagurl_endcursor + end_cursor

In [0]:
# NOTE(review): this cell performs the same download as the preceding cell;
# running both fetches every page twice and discards the first result.
# For every start URL, walk up to 10 result pages and collect the raw post
# entries ("edges") of each page in one flat list.
edges = []
for q in queries:
    page_url = q  # the first page is the plain tag URL
    for i in range(10): # how many iterations/depth?
        response = rq.get(page_url)
        # Fail with a clear HTTP error instead of an obscure JSON decoding error
        response.raise_for_status()
        media = response.json()['graphql']['hashtag']['edge_hashtag_to_media']
        edges.extend(media['edges'])
        print(i)

        end_cursor = media['page_info']['end_cursor']
        if not end_cursor:
            # No cursor means there is no further page to request
            break
        # Always build the next URL from the base URL `q`: appending to the
        # previous page URL would stack one more `&max_id=` onto it per page.
        page_url = q + tagurl_endcursor + end_cursor

In [0]:
# Turn the raw post entries into one flat dictionary per post:
# owner id, shortcode, hashtags found in the caption, and the caption text.
post_dicts = []

for post in edges: # iterate over all raw posts
    node = post['node']
    caption_edges = node['edge_media_to_caption']['edges']

    if not caption_edges: # hop to the next post if this one has no caption text
        continue

    text = caption_edges[0]['node']['text'] # pick out post text

    # Pick hashtags from the text: every token starting with '#', with the
    # '#' characters stripped off.
    # The result is deliberately NOT stored in a variable called `tags`, so
    # that the seed-tag list used to build the query URLs is not overwritten.
    tokens = tknzr.tokenize(text)
    post_tags = [x.strip('#') for x in tokens if x.startswith('#')]

    # append the post's dictionary to the list of post-dictionaries
    post_dicts.append({
        'id_owner': node['owner']['id'],   # user-id of the poster
        'shortcode': node['shortcode'],    # short post identifier
        'tags': post_tags,
        'text': text,
    })

In [0]:
# Build the posts table in one chain:
#   1. one row per post dictionary
#   2. keep only purely alphanumeric hashtags (drops empty strings & mistakes)
#   3. kick out posts that are left with 0 hashtags
posts_df = (
    pd.DataFrame(post_dicts)
    .assign(
        tags=lambda df: df['tags'].map(
            lambda tag_list: [tag for tag in tag_list if tag.isalnum()]
        )
    )
    .loc[lambda df: df['tags'].map(len) != 0]
)

# Create a graph

In [0]:
# Create a new graph
B = nx.Graph()
# NOTE: each `***` below is a fill-in-the-blank placeholder of this exercise and
# is not valid Python; replace it with the level of the nodes being added
# before running this cell.
# We need to specify the nodes for level 0 - this will be our users
# (`set(...)` keeps each owner id only once)
B.add_nodes_from(list(set(posts_df.id_owner)), bipartite= ***)
# Then we need to add hashtags nodes as level 1 nodes
# (`itertools.chain(*...)` flattens the per-post tag lists into one sequence)
B.add_nodes_from(list(set(itertools.chain(*posts_df.tags))), bipartite= ***)

In [0]:
# Build the user-hashtag edge list: one (id_owner, hashtag) tuple for every
# hashtag of every post, i.e. a link each time someone mentions a #hashtag.
# Each row of posts_df is one post, so owners and tag lists are walked in parallel.
bi_edges = [
    (owner, hashtag)
    for owner, post_tags in zip(posts_df['id_owner'], posts_df['tags'])
    for hashtag in post_tags
]

# Let's add the edges to our graph
B.add_edges_from(bi_edges)

In [0]:
# Collect every node whose `bipartite` attribute is 0 (level 0)
top_nodes = set()
for node, attrs in B.nodes(data=True):
    if attrs['bipartite'] == 0:
        top_nodes.add(node)

# Whatever is left in the graph belongs to level 1
bottom_nodes = set(B).difference(top_nodes)

# Now your turn!

In [0]:
# Let's project this graph using a weighted projection
# NOTE: `***` is a fill-in-the-blank placeholder of this exercise and is not valid
# Python; replace it with the set of nodes to project onto, e.g. `top_nodes`
# (level 0) or `bottom_nodes` (the remaining nodes).
G_proj = bipartite.weighted_projected_graph(B, ***)