In [None]:
# Enable IPython's autoreload extension (mode 2) so that edited local modules are re-imported automatically.
%load_ext autoreload
%autoreload 2

# Introduction

We will use DeepWalk (a node-embedding technique based on word embeddings) to cluster author networks.

In [None]:
! pip install pyvis

# Load the Libraries

In [None]:
import pandas as pd 
import numpy as np 
from datetime import datetime
import sys
import ast

import plotly.express as px

import matplotlib.pyplot as plt
import seaborn as sns



import networkx as nx
from networkx.algorithms.components.connected import connected_components

import json
import dask.bag as db

import sys
import os

sys.path.append("..")

from pathlib import Path

import json


from itertools import combinations
from collections import Counter
from itertools import chain
import random

from tqdm.notebook import tqdm, trange
import time    # to be used in loop iterations

import multiprocessing
import smart_open

from gensim.models.word2vec import Word2Vec

from pyvis.network import Network

from IPython.core.display import display, HTML

from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity,cosine_distances
from sklearn.cluster import KMeans

import matplotlib.pyplot as plt


# Extract the Data from Kaggle 

In [None]:
# Extract Only the AI , ML PAPERS
def extractArxivData(categories=['stat.ML','cs.AI'],year=None,raw_data_path="../data/raw/",save_extracted_filename="../data/processed/AI_ML.json"):
    """Extract the records of the given categories from the raw JSON files and save them.

    Every ``*.json`` file directly inside ``raw_data_path`` is read line by line,
    each line being parsed as one JSON record.

    Parameters
    ----------
    categories : iterable of str
        A record is kept when at least one of these values is found in its
        ``categories`` field (Python ``in`` test).
    year : int, optional
        When given, only records whose latest version was created in ``year``
        or later are kept. ``None`` (default) disables the filter.
    raw_data_path : str
        Directory containing the raw ``*.json`` files.
    save_extracted_filename : str
        Path of the JSON file (``orient="records"``) the result is written to;
        missing parent directories are created.

    Returns
    -------
    pandas.DataFrame
        One row per kept record with the columns ``id``, ``title``, ``category``,
        ``abstract``, ``version``, ``doi``, ``authors_parsed`` and ``authors``.
    """
    # os.path.join avoids globbing the filesystem root ("/*.json") when raw_data_path is empty.
    records=db.read_text(os.path.join(raw_data_path,"*.json")).map(json.loads)
    docs=records.filter(lambda record:any(ele in record['categories'] for ele in categories))

    def latest_version_year(record):
        # The year is taken as the 4th space-separated token of the latest version's
        # 'created' string (assumed to look like "Mon, 2 Apr 2007 19:18:42 GMT").
        return int(record['versions'][-1]["created"].split(" ")[3])

    if year is not None:
        docs=docs.filter(lambda record:latest_version_year(record)>=year)

    def get_metadata(record):
        # Keep only the fields used downstream; 'version' is the creation date of the latest version.
        return {'id': record['id'],
                'title': record['title'],
                'category': record['categories'],
                'abstract': record['abstract'],
                'version': record['versions'][-1]['created'],
                'doi': record["doi"],
                'authors_parsed': record['authors_parsed']}

    # Each dask partition is converted to a frame on its own, so row labels can repeat
    # across partitions; renumber them so the returned frame has a unique index.
    data=docs.map(get_metadata).to_dataframe().compute().reset_index(drop=True)

    ## Create the 'authors' field by joining the name parts of every entry in 'authors_parsed'.
    data['authors']=data['authors_parsed'].apply(lambda authors:[(" ".join(author)).strip() for author in authors])

    print("Number of Records Extracted for Given Set of Categories ",data.shape[0])
    Path(save_extracted_filename).parent.mkdir(parents=True, exist_ok=True)
    data.to_json(save_extracted_filename,orient="records")
    return data


In [None]:
# Directory holding the raw arXiv metadata; passed as raw_data_path to extractArxivData, which reads every *.json file in it.
RAW_DATA_PATH="../input/arxiv/"


In [None]:
## Collect the papers tagged 'stat.ML' or 'cs.AI' whose latest version dates from 2015 or later.
data=extractArxivData(
    categories=['stat.ML','cs.AI'],
    year=2015,
    raw_data_path=RAW_DATA_PATH,
    save_extracted_filename="AI_ML_since2015.json",
)

# Creating a Co-Author Network

For the set of papers extracted, an edge is created for every pair of authors who share a paper. The edge weight is the number of papers the two authors have collaborated on.

## Load the Data


In [None]:
## For each paper, list every 2-author combination of its author list (kept in author-list order).
data['author_pairs']=[list(combinations(paper_authors, 2)) for paper_authors in data['authors']]
data.head()

## We consider authors who have published more than 3 papers since 2015.


In [None]:
def flattenList(nested_list):
    """Flatten one level of nesting: return a single list with the items of every sub-iterable, in order."""
    return list(chain.from_iterable(nested_list))

In [None]:
## One row per author occurrence across all papers, then the number of occurrences per author, largest first.
ai_authors=pd.DataFrame(flattenList(data['authors'].tolist())).rename(columns={0:'authors'})
papers_by_authors=(
    ai_authors
    .groupby(['authors'])
    .size()
    .reset_index(name='Number of Papers Published')
    .sort_values("Number of Papers Published",ascending=False)
)
papers_by_authors.shape

In [None]:
# Summary statistics of the per-author paper counts.
papers_by_authors['Number of Papers Published'].describe()


In [None]:
## Authors with strictly more than 3 papers (i.e. at least 4) become the nodes of the filtered graph.
nodes_to_keep=papers_by_authors.loc[
    papers_by_authors['Number of Papers Published'].gt(3),
    'authors',
].tolist()
len(nodes_to_keep)

In [None]:
## Preview only the first few names; printing the complete list floods the notebook output.
print(nodes_to_keep[:20])

### Generating the Edges of the Co-Author Network

In [None]:
## Build (author_a, author_b, number_of_shared_papers) triples for the undirected co-author graph.
authors_pairs=data['author_pairs'].tolist()
## The graph is undirected, so (A, B) and (B, A) must be counted as the same edge: every pair is
## stored in sorted order. Otherwise the two orderings get separate counts and the later one
## overwrites the earlier weight when the edges are added to the graph.
## Per paper, pairs are de-duplicated and self-pairs (a name repeated in one author list) are
## dropped, so each paper contributes at most 1 to an edge and no self-loops are created.
authors_edge_list=[
    pair
    for paper_pairs in authors_pairs
    for pair in sorted({tuple(sorted(p)) for p in paper_pairs if p[0]!=p[1]})
]
authors_weighted_edge_list=[
    (first_author,second_author,shared_papers)
    for (first_author,second_author),shared_papers in Counter(authors_edge_list).items()
]
authors_weighted_edge_list[0:10]

### Creating the Graph on the Complete Data

In [None]:
## Undirected co-author graph over all authors; the third element of each triple is stored as the edge weight.
G1=nx.Graph()
G1.add_weighted_edges_from(authors_weighted_edge_list)
print(G1.number_of_nodes())

### Filtering the graph to keep nodes (authors) who have published at least 4 papers. We also remove any isolated nodes.

In [None]:
## Restrict the complete graph to the authors in nodes_to_keep, then copy the view into a standalone graph.
sub_g=G1.subgraph(nodes_to_keep)
G=nx.Graph(sub_g)
print(G.number_of_nodes())
## Count the nodes that have no neighbour left inside the filtered graph.
isolated_node=nx.isolates(G)
sum(1 for _ in isolated_node)

In [None]:
## Drop the isolated nodes; list() snapshots the generator before the graph is modified.
G.remove_nodes_from(list(nx.isolates(G)))
G.number_of_nodes()

In [None]:
# Remove the names of the complete graph and its subgraph view; the cells below only use the filtered copy G.
del G1, sub_g

In [None]:
## Report the size of the filtered co-author graph.
print("Number of Nodes in Author Graph ",G.number_of_nodes())
print("Number of Edges in AUthor Graph ",G.number_of_edges())

## Implementing Deep Walk

**Deep walk uses the concept of Random Walks to assign an embedding to each node in the network.** 

1. In a random walk, starting from a given node, we pick one of its neighbours at random and move to it; from that node we again pick one of its neighbours at random, and so on. This continues for a fixed number of steps.



2. Once random walks have been generated for every node in the network, the next step in DeepWalk is to predict the probability of visiting node "v" on a random walk starting from node "u".
 
3. This is very similar to the Skip-Gram model used in Word2Vec Model in NLP, wherein we try to predict the neighbouring words given a particular target word.

In [None]:
def getRandomWalk(graph,node,length_of_random_walk,rng=None):
    """Generate a random walk on ``graph`` that starts at ``node``.

    At every step one neighbour of the current node is chosen uniformly at random
    and becomes the new current node.

    Parameters
    ----------
    graph : object exposing ``neighbors(node)`` (e.g. a NetworkX graph)
    node : the node the walk starts from
    length_of_random_walk : int
        Maximum number of steps to take.
    rng : object with a ``choice(sequence)`` method, optional
        Source of randomness, e.g. ``random.Random(seed)`` for reproducible walks.
        Defaults to the module-level ``random`` generator.

    Returns
    -------
    list
        The nodes traversed, starting node first. It holds
        ``length_of_random_walk + 1`` nodes unless the walk reaches a node
        without neighbours, in which case it ends early at that node.

    Note: The same node may occur more than once in a random walk.
    """
    chooser=random if rng is None else rng
    current_node=node
    random_walk=[node]
    for _ in range(length_of_random_walk):
        current_node_neighbours=list(graph.neighbors(current_node))
        if not current_node_neighbours:
            # Dead end: choice() raises IndexError on an empty sequence, so stop the walk here.
            break
        ## Choose a random neighbour of the current node and move to it
        current_node=chooser.choice(current_node_neighbours)
        random_walk.append(current_node)
    return random_walk




In [None]:
### Build the training corpus: for every node in the graph, sample `num_sampling` random walks,
### each taking `length_of_random_walk` steps from that node.
num_sampling= 10
length_of_random_walk= 3
random_walks=[]
for node in tqdm(G.nodes(),desc="Iterating Nodes"):
    random_walks.extend(getRandomWalk(G,node,length_of_random_walk) for _ in range(num_sampling))

Each random walk is now analogous to a sentence, i.e. a list of words, so we can use gensim to train a node-embedding model. Here each author is a node, and a node plays the role of a word in a sentence.

In [None]:
## Train a skip-gram (sg=1) Word2Vec model with negative sampling, using every random walk as one sentence.
deepwalk_model=Word2Vec(
    sentences=random_walks,
    vector_size=128,
    window=5,
    sg=1,
    negative=5,
    epochs= 20,
    compute_loss=True,
)

In [None]:
# Save the trained model to a file in the current working directory (relative path).
deepwalk_model.save("deepwalk_since2015.model")

## Similar authors 

In [None]:
def getSimilarNodes(model,node,topn=10):
    """Return the nodes (authors) most similar to ``node`` according to the DeepWalk model.

    Parameters
    ----------
    model : trained model exposing ``model.wv.most_similar``
    node : key of the node to look up
    topn : int, optional
        Number of similar nodes to return (default 10).

    Returns
    -------
    pandas.DataFrame
        One row per similar node, in the order given by the model, with the columns
        ``Similar_Node``, ``Similarity_Score`` and ``Source_Node`` (always ``node``).
    """
    # most_similar yields (node, score) pairs, which map directly onto the first two columns.
    similarity=model.wv.most_similar(node,topn=topn)
    similar_nodes=pd.DataFrame(similarity,columns=['Similar_Node','Similarity_Score'])
    similar_nodes['Source_Node']=node
    return similar_nodes



In [None]:
# Example lookup. Node keys are the author strings built in extractArxivData (name parts joined with spaces);
# presumably the last name comes first, hence "Bengio Yoshua" — confirm the part order against the data.
getSimilarNodes(deepwalk_model,"Bengio Yoshua")