# Import libraries and declare important variables

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from mpld3 import plugins
import mpld3
import networkx as nx
import operator
import ipywidgets as widgets
from IPython.display import display
from ipywidgets import interact
%matplotlib inline
mpld3.enable_notebook()

# Declare main data variables
# Column labels for the headerless, tab-separated qa-<topic>.txt files, in file order.
# The 11th field was originally also labelled 'quest_id'; pandas' read_csv does not
# allow duplicate entries in `names`, so it carries a distinct label here.
# NOTE(review): presumably that field is the question id repeated on the answer side
# of the export -- confirm against the data files.
column_names = ['quest_id', 'quest_user_id', 'quest_accepted_ans_id', 'quest_creation_date', 'quest_score',
                'quest_view_count', 'quest_last_act_date', 'quest_answer_count', 'quest_comment_count',
                'quest_favorite_count', 'quest_id_dup', 'ans_id', 'ans_user_id', 'ans_parent_id',
                'ans_creation_date', 'ans_score', 'ans_last_act_date', 'ans_comment_count']
# Columns that read_csv should parse as datetimes
date_columns = ['quest_creation_date', 'quest_last_act_date', 'ans_creation_date', 'ans_last_act_date']

# Create a graph for a specific topic using the user ids as nodes
Each edge connects the user who asked a question with the user who wrote its accepted answer.

In [None]:
@interact
def top_users(topic=['ros', 'android', 'apache-spark', 'arduino', 'c++', 'java', 'linux', 'mongodb',
                     'python', 'raspberry-pi']):
    """Print the ten highest-degree users in the Q&A graph of the selected topic.

    The list default is not used as a value: it is handed to ipywidgets'
    `interact`, which renders it as a dropdown and calls this function with
    the selected string.

    Parameters
    ----------
    topic : str
        Topic name; selects the file
        ~/Downloads/vis_stackoverflow/filtered_data/<topic>/qa-<topic>.txt.

    Notes
    -----
    The graph is built from the 'quest_user_id' and 'ans_user_id' columns and
    the ranking uses node degree. The output column keeps its original
    'answered questions' label.
    """
    data_path = '~/Downloads/vis_stackoverflow/filtered_data/' + topic + '/qa-' + topic + '.txt'
    # `sep` is given by keyword: pandas 2.0 made every read_csv argument after
    # the path keyword-only.
    qa = pd.read_csv(data_path, sep='\t', names=column_names, parse_dates=date_columns)

    # Create a graph using the question user id and the accepted answer user id.
    # networkx 2.1 replaced from_pandas_dataframe with from_pandas_edgelist;
    # use whichever the installed version provides.
    edges_to_graph = getattr(nx, 'from_pandas_edgelist', None) or nx.from_pandas_dataframe
    G = edges_to_graph(qa[['quest_user_id', 'ans_user_id']], 'quest_user_id', 'ans_user_id')

    # Computing the degree for every node. dict() normalises the result:
    # networkx 1.x returns a dict, 2.x returns a view of (node, degree) pairs.
    deg = dict(nx.degree(G))
    # Building the frame from a named Series also works for an empty graph,
    # where assigning `df.columns` afterwards would fail.
    df = pd.Series(deg, name='answered questions').to_frame()

    # Ordering to find the most influential users
    df = df.sort_values(by='answered questions', ascending=False)
    print(topic + " results, most influential users are: \n")
    print(df.head(10))

# Plot the degree distribution of the selected graph

In [None]:
@interact
def top_users(topic=['apache-spark']):
    """Draw the log-log degree rank plot of the selected topic's Q&A graph.

    The list default is handed to ipywidgets' `interact`, which renders it as
    a dropdown and calls this function with the selected string. Add more
    topic names to the list to offer them in the dropdown.

    Parameters
    ----------
    topic : str
        Topic name; selects the file
        ~/Downloads/vis_stackoverflow/filtered_data/<topic>/qa-<topic>.txt.
    """
    data_path = '~/Downloads/vis_stackoverflow/filtered_data/' + topic + '/qa-' + topic + '.txt'
    # `sep` is given by keyword: pandas 2.0 made every read_csv argument after
    # the path keyword-only.
    qa = pd.read_csv(data_path, sep='\t', names=column_names, parse_dates=date_columns)

    # Create a graph using the question user id and the accepted answer user id.
    # networkx 2.1 replaced from_pandas_dataframe with from_pandas_edgelist;
    # use whichever the installed version provides.
    edges_to_graph = getattr(nx, 'from_pandas_edgelist', None) or nx.from_pandas_dataframe
    G = edges_to_graph(qa[['quest_user_id', 'ans_user_id']], 'quest_user_id', 'ans_user_id')

    # Compute the degree sequence, highest degree first. dict() makes .values()
    # available on both networkx 1.x (dict) and 2.x (degree view).
    degree_sequence = sorted(dict(nx.degree(G)).values(), reverse=True)

    # Plot the degree distribution
    plt.loglog(degree_sequence, 'b-', marker='o')
    plt.title("Degree rank plot (" + topic + ")")
    plt.ylabel("degree")
    plt.xlabel("rank")


# Draw the graph
This drawing tool only works well with small networks, such as the ones for the 'ros' or 'raspberry-pi' topics; for larger networks, plotting may take a very long time.

In [None]:
# Draw the user graph of one topic as an interactive mpld3 figure:
# node size follows degree centrality and hovering a node shows its user id.
topic = 'ros'
data_path = '~/Downloads/vis_stackoverflow/filtered_data/' + topic + '/qa-' + topic + '.txt'
# `sep` is given by keyword: pandas 2.0 made every read_csv argument after
# the path keyword-only.
qa = pd.read_csv(data_path, sep='\t', names=column_names, parse_dates=date_columns)

# Create a graph using the question user id and the accepted answer user id.
# networkx 2.1 replaced from_pandas_dataframe with from_pandas_edgelist;
# use whichever the installed version provides.
edges_to_graph = getattr(nx, 'from_pandas_edgelist', None) or nx.from_pandas_dataframe
G = edges_to_graph(qa[['quest_user_id', 'ans_user_id']], 'quest_user_id', 'ans_user_id')

# Lay out and draw the network
pos = nx.spring_layout(G)
fig, ax = plt.subplots(figsize=(12, 9), subplot_kw=dict(facecolor='#EEEEEE'))
cent = nx.degree_centrality(G)
# Centralities are scaled by a fixed factor to get usable marker areas
scatter = nx.draw_networkx_nodes(G, pos, ax=ax, node_size=[v * 20000 for v in cent.values()])
nx.draw_networkx_edges(G, pos, ax=ax)

# Setup plot and display
ax.grid(color='white', linestyle='solid')
ax.set_title("Node size represent their degree centrality", size=20)
# Materialise the node ids: networkx 2.x returns a view object from G.nodes(),
# while the tooltip plugin needs a plain sequence of labels.
labels = list(G.nodes())
tooltip = mpld3.plugins.PointLabelTooltip(scatter, labels=labels)
mpld3.plugins.connect(fig, tooltip)
mpld3.display()

# Plot the evolution of the topics over the years

In [None]:
# Plot, for every topic, the number of questions created per month, with a
# shaded band around each line whose half-width is the monthly sum of
# question scores divided by 20. An interactive legend toggles the topics.

# Create new figure
fig, ax = plt.subplots(figsize=(9, 7), dpi=100)
fig.subplots_adjust(right=0.7)  # leave room on the right for the legend
ax.grid(True, alpha=0.3)

# Loop over array of topics
# (use ['ros', 'arduino', 'apache-spark'] for a quick run on the smallest sets)
topics = ['android', 'apache-spark', 'arduino', 'c++', 'java', 'linux', 'mongodb', 'python', 'raspberry-pi', 'ros']
for topic in topics:
    # Load data from files. `sep` is given by keyword: pandas 2.0 made every
    # read_csv argument after the path keyword-only.
    data_path = '~/Downloads/vis_stackoverflow/filtered_data/' + topic + '/qa-' + topic + '.txt'
    qa = pd.read_csv(data_path, sep='\t', names=column_names, parse_dates=date_columns)

    # Bucket the rows by month of question creation
    monthly_count = qa.resample('M', on='quest_creation_date').count()
    monthly_sum = qa[['quest_creation_date', 'quest_score']].resample('M', on='quest_creation_date').sum().fillna(0)

    # Plot data using the same figure
    l, = ax.plot(monthly_count.index, monthly_count['quest_id'], label=topic)
    ax.fill_between(monthly_count.index, monthly_count['quest_id'] - (monthly_sum['quest_score'] / 20),
                    monthly_count['quest_id'] + (monthly_sum['quest_score'] / 20),
                    color=l.get_color(), alpha=.4)

# Define interactive legend
handles, labels = ax.get_legend_handles_labels()  # return lines and labels
# Pair every line with its shaded band so one legend entry toggles both.
# The pairs are materialised into a list rather than passed as a one-shot zip iterator.
legend_elements = list(zip(handles, ax.collections))
interactive_legend = plugins.InteractiveLegendPlugin(legend_elements, labels, alpha_unsel=0.5,
                                                     alpha_over=1.5, start_visible=True)
plugins.connect(fig, interactive_legend)

# Setup the axis and display the plot
ax.set_xlabel('Yearly/Monthly')
ax.set_ylabel('Nr of questions')
mpld3.display()

# Scatter plot comparison

In [None]:
# Compare the topics in one scatter plot: x is the summed question score,
# y the summed answer score, and the marker size is the question count / 100.
# Hovering a marker shows the topic name and its question count.

# Create new figure
fig, ax = plt.subplots(figsize=(9, 7), dpi=100, subplot_kw=dict(facecolor='#EEEEEE'))
labels = []; xs = []; ys = []; sizes = []
ax.grid(True, alpha=0.3)

# Loop over array of topics
# (use ['ros', 'arduino', 'apache-spark'] for a quick run on the smallest sets)
topics = ['android', 'apache-spark', 'arduino', 'c++', 'java', 'linux', 'mongodb', 'python', 'raspberry-pi', 'ros']
for topic in topics:
    # Load data from files. `sep` is given by keyword: pandas 2.0 made every
    # read_csv argument after the path keyword-only.
    data_path = '~/Downloads/vis_stackoverflow/filtered_data/' + topic + '/qa-' + topic + '.txt'
    qa = pd.read_csv(data_path, sep='\t', names=column_names, parse_dates=date_columns)

    # Append new values to arrays used in scatter plot
    total_count = qa.count()
    total_sum = qa[['quest_score', 'ans_score']].sum()
    xs.append(total_sum['quest_score'])
    ys.append(total_sum['ans_score'])
    sizes.append(total_count['quest_id'] / 100)
    labels.append(topic + " (total questions = " + str(total_count['quest_id']) + ")")

# Create scatter plot
# Evenly spaced colormap positions give every topic a distinct colour that is
# the same on every run (random values could collide and changed per run).
topic_colors = np.linspace(0.0, 1.0, num=len(topics))
scatter = ax.scatter(xs,
                     ys,
                     c=topic_colors,
                     s=sizes,
                     alpha=0.3,
                     cmap=plt.cm.jet)

# Setup plot and display
ax.grid(color='white', linestyle='solid')
ax.set_title("Circle size represent total question number per topic", size=20)
ax.set_xlabel('Total question score')
ax.set_ylabel('Total answer score')
tooltip = mpld3.plugins.PointLabelTooltip(scatter, labels=labels)
mpld3.plugins.connect(fig, tooltip)
mpld3.display()