# Stack Exchange Tag Recommendation 

# Importing Libraries

In [3]:
import json 
import requests
import networkx as nx
import matplotlib.pyplot as plt
import itertools

## Extracting data through API call 

In [4]:

json_list = []

def extract_raw_json():
    
    global json_list
    # make requests for 100 pages of size 100 records each 
    for i in range(1,101):
        url = f'https://api.stackexchange.com/2.2/questions?page={i}&pagesize=100&order=desc&sort=activity&site=datascience&key=4*MX83rbQ8xfUvuY*49ZKw(('
        data = requests.get(url)
        questions_api = data.json()
        json_list.append(questions_api)
    
    #Save the list of extracted records in a json file
    with open('stackExchangeAPI.json', 'w+') as f:
        json.dump(json_list, f, indent=4)
    

## Extracting and storing Tags from records

In [5]:

def store_tags():
    #Read each page and record in the file and extract only the tags
    with open('stackExchangeAPI.json','r') as json_file:
        tags_list = []
        data = json.load(json_file)
        for i in range(0,99): #Loop through 100 pages
            for j in range(0,99): # Loop through 100 records in each page
              tags = data[i]["items"][j]["tags"]
              with open('tags.txt','a+') as tags_file: 
                    tags_file.seek(0)
                    d = tags_file.read()
                    #If file contains data , add the record in a newline
                    if len(d) > 0 :
                        tags_file.write("\n")
                    out = ','.join(f'{tags[i]}' for i in range(0,(len(tags))))
                    tags_file.write(out)

In [6]:
#Get list of Unique Tags from tags.txt

unique_tags = []

def extract_unique_tags():
    
    global unique_tags
    fp  = open('tags.txt')
    #Read each line , extract tags split by a comma 
    tags = [word.strip() for line in fp.readlines() for word in line.split(',') if word.strip()]

    print(" Total number of tags : " , len(tags))
    unique_tags = set(tags)   # Find the unique tags
    print(" Number of Unique tags : " , len(unique_tags))


## Build a Weighted graph of Tags

In [7]:
#Create an empty graph with no vertices
G = nx.Graph()


def create_nodes():
    global G
    #Create nodes for G 
    for tag in unique_tags:
        G.add_node(tag)
    


def draw_edges():
    global G
    related_tags_in_record = []
    
    fp  = open('tags.txt')
    lines = fp.readlines()

    for line in lines:
            line.strip()
            related_tags_in_record.clear()
            related_tags_in_record.append(line.split(','))
            related_tags_in_record[0] = [e.replace('\n','')for e in related_tags_in_record[0]]
        
            #Checking for edges in all possible pairs of tags in a record
            for i in range(len(related_tags_in_record[0])):
                for j in range(i+1,len(related_tags_in_record[0])):
                    if G.has_edge(related_tags_in_record[0][i],related_tags_in_record[0][j]):
                    # If edge is already present , increase the edge weight by one
                        G[related_tags_in_record[0][i]][related_tags_in_record[0][j]]['weight'] += 1
                    else:
                    # else , draw a new edge
                        G.add_edge(related_tags_in_record[0][i],related_tags_in_record[0][j], weight=1)

    edges_list = G.edges.data('weight', default=1)
    edges_list = list(edges_list)
    print(edges_list[:20])

    #Pickling the graph 
    nx.write_gpickle(G,"tags_graph.pickle")



# Test Sample Query

In [8]:
#Query the neighbouring nodes(connected) of the query 

def sample_query(query_tag): 
    edges_of_query = G[query_tag]
    
    #Sort the edges based on edge weights
    neighbours = sorted(edges_of_query.items(), key=lambda edge: edge[1]['weight'],reverse=True)

    print(neighbours[:10])
    

In [7]:
if __name__ == '__main__':
    extract_raw_json()
    store_tags()
    

In [8]:
print(json.dumps(json_list[0]["items"][:4], indent=4))

[
    {
        "tags": [
            "xgboost",
            "boosting",
            "grid-search"
        ],
        "owner": {
            "reputation": 261,
            "user_id": 67931,
            "user_type": "registered",
            "profile_image": "https://www.gravatar.com/avatar/dd0f8e431d78371294bbc8aed3cf607f?s=128&d=identicon&r=PG&f=1",
            "display_name": "Maths12",
            "link": "https://datascience.stackexchange.com/users/67931/maths12"
        },
        "is_answered": false,
        "view_count": 2,
        "answer_count": 0,
        "score": 0,
        "last_activity_date": 1597336759,
        "creation_date": 1597336759,
        "question_id": 80243,
        "content_license": "CC BY-SA 4.0",
        "link": "https://datascience.stackexchange.com/questions/80243/why-do-i-need-to-find-update-number-of-boosting-rounds-each-time-i-update-a-para",
        "title": "why do i need to find update number of boosting rounds each time i update a parameter in xg

In [9]:
extract_unique_tags()

 Total number of tags :  123647
 Number of Unique tags :  589


In [10]:
print(list(unique_tags)[:15])

['cart', 'ensemble', 'automatic-summarization', 'corpus', 'heatmap', 'weighted-data', 'non-parametric', 'homework', 'genetic-programming', 'data-engineering', 'convergence', 'json', 'hana', 'etl', 'rstudio']


In [11]:
create_nodes()

nodes = list(G.nodes)

print(nodes[:20])

['cart', 'ensemble', 'automatic-summarization', 'corpus', 'heatmap', 'weighted-data', 'non-parametric', 'homework', 'genetic-programming', 'data-engineering', 'convergence', 'json', 'hana', 'etl', 'rstudio', 'search', 'one-hot-encoding', 'anomaly-detection', 'alex-net', 'hypothesis-testing']


In [12]:
 draw_edges()

[('cart', 'decision-trees', 13), ('cart', 'boosting', 1), ('cart', 'random-forest', 8), ('cart', 'r', 4), ('cart', 'imbalanced-learn', 4), ('cart', 'machine-learning', 4), ('ensemble', 'classification', 8), ('ensemble', 'prediction', 4), ('ensemble', 'ensemble-modeling', 17), ('ensemble', 'binary', 4), ('ensemble', 'grid-search', 4), ('ensemble', 'ensemble-learning', 2), ('ensemble', 'neural-network', 8), ('ensemble', 'regression', 8), ('ensemble', 'multi-output', 4), ('ensemble', 'machine-learning', 27), ('ensemble', 'python', 8), ('ensemble', 'scikit-learn', 4), ('ensemble', 'xgboost', 4), ('ensemble', 'boosting', 8)]


In [33]:
sample_query('machine-learning')

[('python', {'weight': 2186}), ('deep-learning', {'weight': 2160}), ('neural-network', {'weight': 1710}), ('classification', {'weight': 1162}), ('scikit-learn', {'weight': 872}), ('keras', {'weight': 820}), ('regression', {'weight': 612}), ('nlp', {'weight': 610}), ('tensorflow', {'weight': 606}), ('time-series', {'weight': 566})]


In [34]:
sample_query('deep-learning')

[('machine-learning', {'weight': 2707}), ('neural-network', {'weight': 1943}), ('keras', {'weight': 1269}), ('tensorflow', {'weight': 936}), ('python', {'weight': 814}), ('cnn', {'weight': 781}), ('lstm', {'weight': 499}), ('nlp', {'weight': 476}), ('image-classification', {'weight': 406}), ('classification', {'weight': 372})]


In [35]:
sample_query('nlp')

[('machine-learning', {'weight': 765}), ('python', {'weight': 532}), ('deep-learning', {'weight': 476}), ('word-embeddings', {'weight': 331}), ('text-mining', {'weight': 285}), ('bert', {'weight': 271}), ('neural-network', {'weight': 235}), ('natural-language-process', {'weight': 225}), ('classification', {'weight': 199}), ('word2vec', {'weight': 198})]


In [14]:
nx.write_gexf(G, "tags_graph.gexf")